|
@@ -0,0 +1,130 @@
|
|
|
+import copy
|
|
|
+
|
|
|
+import pandas as pd
|
|
|
+
|
|
|
+from common.databases import mongo_table
|
|
|
+from common.tools import get_current_date
|
|
|
+from crawler.download import Downloader, RenderDownloader
|
|
|
+from crawler.services.channel import bfs
|
|
|
+
|
|
|
# Mongo collection ('shujuziyuan' db, 'channel' collection) where mined
# channel candidates are stored and later exported by `to_excel`.
shujuziyuan = mongo_table('shujuziyuan', 'channel')
|
|
|
+
|
|
|
+
|
|
|
def is_duplicate(seed, href):
    """Return True when the discovered href is identical to the seed URL."""
    return href == seed
|
|
|
+
|
|
|
+
|
|
|
def read_local_data(file):
    """Load crawl seed rows from the first sheet of an Excel workbook.

    Row layout (0-based columns): column 0 is the site name, column 3 is the
    seed href, and the remaining columns are carried through unchanged.

    :param file: path or buffer accepted by ``pandas.read_excel``.
    :return: list of ``(site, href, *other)`` tuples, de-duplicated while
        preserving first-seen order.
    """
    df = pd.read_excel(io=file, sheet_name=0)
    data_lst = []
    seen = set()  # O(1) membership test; `data not in data_lst` was O(n) per row
    for site, *item in df.values:
        href = item.pop(2)  # seed URL lives in the 4th workbook column
        data = (site, href, *item)
        if data not in seen:
            seen.add(data)
            data_lst.append(data)
    return data_lst
|
|
|
+
|
|
|
+
|
|
|
def excavate_data(tasks):
    """Fetch each seed URL, mine candidate channel links from the page, and
    persist the findings to the ``shujuziyuan`` Mongo collection.

    :param tasks: iterable of ``(site, href, spidercode, channel_name,
        spiderstatus)`` tuples, e.g. the output of ``read_local_data``.
    """
    static = Downloader()
    render = RenderDownloader()
    # Plain HTTP download first, JS-rendering downloader as the fallback.
    download_tools = [static, render]
    for site, href, *other in tasks:
        print("开始 >>> ", site, href, *other)

        # Try multiple page-fetch strategies for the same seed.
        insert_lst = []
        item = {
            'site': site,  # site name
            'seed': href,  # seed URL
            'seed_spidercode': other[0],  # seed spider code
            'seed_channel': other[1],  # seed channel name
            'seed_spiderstatus': other[2],  # seed spider status
            'remark': '',
        }
        for dl in download_tools:
            resp = dl.get(href, timeout=3)
            results = bfs(resp, href)
            for key, items in results.items():
                print(f"搜索 >>> {key} && {len(items)}")
                for val in items:
                    channel, url = val
                    copy_data = copy.deepcopy(item)
                    copy_data.update({
                        'channel': channel,  # discovered channel name
                        'href': url,  # discovered channel URL
                        'is_duplicate': is_duplicate(href, url),  # new URL identical to seed?
                    })
                    insert_lst.append(copy_data)
            # First downloader that yields any result wins; skip the fallback.
            if len(results) > 0:
                break

        if len(insert_lst) > 0:
            shujuziyuan.insert_many(insert_lst)
        else:
            # Nothing mined: record the bare seed so it still shows up in the
            # "no result" sheet of the Excel export.
            shujuziyuan.insert_one(item)
        print('结束 >>>\n')
|
|
|
+
|
|
|
+
|
|
|
def to_excel():
    """Export mined channel data from Mongo into a dated two-sheet Excel file.

    Sheet 1 ('栏目挖掘(有结果)') holds seeds for which new, non-duplicate
    channels were found; sheet 2 ('栏目挖掘(无结果)') holds seeds that
    yielded no channels at all.
    """
    date = get_current_date(fmt="%Y%m%d")
    file = f'{date}_栏目挖掘.xlsx'
    writer = pd.ExcelWriter(file)

    def _export(query, field_to_label, sheet_name):
        # Derive the projection from the label mapping so the fetched fields
        # and the column headers can never drift apart. (The original code
        # assigned `df.columns = [...]` positionally: the second sheet fetched
        # 6 fields but listed only 5 labels — a guaranteed length-mismatch
        # ValueError — and the first sheet relied on document key order,
        # mislabeling 'remark' as '栏目名称'.)
        projection = {field: 1 for field in field_to_label}
        projection['_id'] = 0
        cursor = shujuziyuan.find(query, projection=projection)
        # Explicit `columns=` pins column order and keeps an empty result set
        # (or documents with missing fields) from crashing.
        df = pd.DataFrame(list(cursor), columns=list(field_to_label))
        df.rename(columns=field_to_label, inplace=True)
        df.to_excel(writer, encoding='utf-8', sheet_name=sheet_name, index=False)

    # Sheet 1: seeds with newly discovered, non-duplicate channels.
    _export(
        {'channel': {'$exists': 1}, 'is_duplicate': False},
        {
            'site': '网站名称',
            'seed': '种子栏目地址',
            'seed_spidercode': '种子爬虫代码',
            'seed_channel': '种子栏目名称',
            'seed_spiderstatus': '种子爬虫状态',
            'channel': '栏目名称',
            'href': '栏目地址',
            'remark': '备注',
        },
        '栏目挖掘(有结果)',
    )

    # Sheet 2: seeds that produced no channel candidates.
    _export(
        {'channel': {'$exists': 0}},
        {
            'site': '网站名称',
            'seed': '种子栏目地址',
            'seed_spidercode': '种子爬虫代码',
            'seed_channel': '种子栏目名称',
            'seed_spiderstatus': '种子爬虫状态',
            'remark': '备注',
        },
        '栏目挖掘(无结果)',
    )
    writer.save()
    print(f"{file} 录入完成")
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Phase 1 (run once, currently disabled): read seed rows from an Excel
    # file and mine channel candidates into Mongo.
    # crawl_tasks = read_local_data('seed.xlsx')
    # excavate_data(crawl_tasks)
    # Phase 2: export the mined results to a dated Excel report.
    to_excel()
|