|
@@ -0,0 +1,130 @@
|
|
|
+import copy
|
|
|
+
|
|
|
+import pandas as pd
|
|
|
+
|
|
|
+from common.databases import mongo_table
|
|
|
+from common.tools import get_current_date
|
|
|
+from crawler.download import Downloader, RenderDownloader
|
|
|
+from crawler.services.channel import bfs
|
|
|
+
|
|
|
# Mongo collection ('shujuziyuan' db, 'channel' collection) where mined
# channel candidates are stored and later exported by `to_excel`.
shujuziyuan = mongo_table('shujuziyuan', 'channel')
|
|
|
+
|
|
|
+
|
|
|
def is_duplicate(seed, href):
    """Return True when the discovered href is identical to the seed URL."""
    return href == seed
|
|
|
+
|
|
|
+
|
|
|
def read_local_data(file):
    """Load crawl seed rows from the first sheet of an Excel workbook.

    Row layout (0-based columns): column 0 is the site name, column 3 is the
    seed href, and the remaining columns are carried through unchanged.

    :param file: path or buffer accepted by ``pandas.read_excel``.
    :return: list of ``(site, href, *other)`` tuples, de-duplicated while
        preserving first-seen order.
    """
    df = pd.read_excel(io=file, sheet_name=0)
    data_lst = []
    seen = set()  # O(1) membership test; `data not in data_lst` was O(n) per row
    for site, *item in df.values:
        href = item.pop(2)  # seed URL lives in the 4th workbook column
        data = (site, href, *item)
        if data not in seen:
            seen.add(data)
            data_lst.append(data)
    return data_lst
|
|
|
+
|
|
|
+
|
|
|
def excavate_data(tasks):
    """Fetch each seed URL, mine candidate channel links from the page, and
    persist the findings to the ``shujuziyuan`` Mongo collection.

    :param tasks: iterable of ``(site, href, spidercode, channel_name,
        spiderstatus)`` tuples, e.g. the output of ``read_local_data``.
    """
    static = Downloader()
    render = RenderDownloader()
    # Plain HTTP download first, JS-rendering downloader as the fallback.
    download_tools = [static, render]
    for site, href, *other in tasks:
        print("开始 >>> ", site, href, *other)

        # Try multiple page-fetch strategies for the same seed.
        insert_lst = []
        item = {
            'site': site,  # site name
            'seed': href,  # seed URL
            'seed_spidercode': other[0],  # seed spider code
            'seed_channel': other[1],  # seed channel name
            'seed_spiderstatus': other[2],  # seed spider status
            'remark': '',
        }
        for dl in download_tools:
            resp = dl.get(href, timeout=3)
            results = bfs(resp, href)
            for key, items in results.items():
                print(f"搜索 >>> {key} && {len(items)}")
                for val in items:
                    channel, url = val
                    copy_data = copy.deepcopy(item)
                    copy_data.update({
                        'channel': channel,  # discovered channel name
                        'href': url,  # discovered channel URL
                        'is_duplicate': is_duplicate(href, url),  # new URL identical to seed?
                    })
                    insert_lst.append(copy_data)
            # First downloader that yields any result wins; skip the fallback.
            if len(results) > 0:
                break

        if len(insert_lst) > 0:
            shujuziyuan.insert_many(insert_lst)
        else:
            # Nothing mined: record the bare seed so it still shows up in the
            # "no result" sheet of the Excel export.
            shujuziyuan.insert_one(item)
        print('结束 >>>\n')
|
|
|
+
|
|
|
+
|
|
|
def to_excel():
    """Export mined channel data from Mongo into a dated two-sheet Excel file.

    Sheet 1 ('栏目挖掘(有结果)') holds seeds for which new, non-duplicate
    channels were found; sheet 2 ('栏目挖掘(无结果)') holds seeds that
    yielded no channels at all.
    """
    date = get_current_date(fmt="%Y%m%d")
    file = f'{date}_栏目挖掘.xlsx'
    writer = pd.ExcelWriter(file)

    def _export(query, field_to_label, sheet_name):
        # Derive the projection from the label mapping so the fetched fields
        # and the column headers can never drift apart. (The original code
        # assigned `df.columns = [...]` positionally: the second sheet fetched
        # 6 fields but listed only 5 labels — a guaranteed length-mismatch
        # ValueError — and the first sheet relied on document key order,
        # mislabeling 'remark' as '栏目名称'.)
        projection = {field: 1 for field in field_to_label}
        projection['_id'] = 0
        cursor = shujuziyuan.find(query, projection=projection)
        # Explicit `columns=` pins column order and keeps an empty result set
        # (or documents with missing fields) from crashing.
        df = pd.DataFrame(list(cursor), columns=list(field_to_label))
        df.rename(columns=field_to_label, inplace=True)
        df.to_excel(writer, encoding='utf-8', sheet_name=sheet_name, index=False)

    # Sheet 1: seeds with newly discovered, non-duplicate channels.
    _export(
        {'channel': {'$exists': 1}, 'is_duplicate': False},
        {
            'site': '网站名称',
            'seed': '种子栏目地址',
            'seed_spidercode': '种子爬虫代码',
            'seed_channel': '种子栏目名称',
            'seed_spiderstatus': '种子爬虫状态',
            'channel': '栏目名称',
            'href': '栏目地址',
            'remark': '备注',
        },
        '栏目挖掘(有结果)',
    )

    # Sheet 2: seeds that produced no channel candidates.
    _export(
        {'channel': {'$exists': 0}},
        {
            'site': '网站名称',
            'seed': '种子栏目地址',
            'seed_spidercode': '种子爬虫代码',
            'seed_channel': '种子栏目名称',
            'seed_spiderstatus': '种子爬虫状态',
            'remark': '备注',
        },
        '栏目挖掘(无结果)',
    )
    writer.save()
    print(f"{file} 录入完成")
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Phase 1 (run once, currently disabled): read seed rows from an Excel
    # file and mine channel candidates into Mongo.
    # crawl_tasks = read_local_data('seed.xlsx')
    # excavate_data(crawl_tasks)
    # Phase 2: export the mined results to a dated Excel report.
    to_excel()
|