"""栏目挖掘 (channel mining) pipeline: read seed channel URLs from a local
Excel sheet, crawl each seed page to discover candidate channel links,
store the results in MongoDB, then export them back to Excel."""
import copy

import pandas as pd

from common.databases import mongo_table
from common.tools import get_current_date
from crawler.download import Downloader, RenderDownloader
from crawler.services.channel import bfs

shujuziyuan = mongo_table('shujuziyuan', 'channel')

# Spider states whose seeds should be skipped:
# 4 = 已作废 (voided), 6 = 已下架 (delisted), 10 = 已删除 (deleted)
SKIPPED_SPIDER_STATES = {
    4: '已作废',
    6: '已下架',
    10: '已删除',
}


def is_duplicate(seed, href):
    """Return True when the discovered channel URL is identical to the seed."""
    return href == seed


def read_local_data(file):
    """Read seed tasks from the first sheet of `file`.

    Expected column layout (inferred from the unpacking below):
    site name, spider code, channel name, seed URL, spider status.
    """
    data_lst = []
    browsing_history = {}
    df = pd.read_excel(io=file, sheet_name=0)
    for site, *item in df.values:
        if item[3] not in SKIPPED_SPIDER_STATES:
            print(site, item)
            href = item.pop(2)
            # Number each channel by its order of appearance within the site
            browsing_history.setdefault(site, {})
            channel = item[1]
            if channel not in browsing_history[site]:
                browsing_history[site][channel] = len(browsing_history[site]) + 1
            nums = browsing_history[site][channel]
            data = (site, href, *item, nums)
            if data not in data_lst:
                data_lst.append(data)
    return data_lst


def excavate_data(tasks):
    static = Downloader()
    render = RenderDownloader()
    download_tools = [static, render]
    for site, href, *other in tasks:
        print("开始 >>> ", site, href, *other)
        insert_lst = []
        item = {
            'site': site,                   # site name
            'seed': href,                   # seed URL
            'seed_spidercode': other[0],    # seed spider code
            'seed_channel': other[1],       # seed channel name
            'seed_spiderstatus': other[2],  # seed spider status
            'nums': other[-1],              # channel ordinal within the site
            'remark': '',
        }
        # Try each downloader in turn (static first, then rendered) and
        # stop at the first one whose page yields any channel links.
        for dl in download_tools:
            resp = dl.get(href, timeout=3)
            results = bfs(resp, href)
            for key, items in results.items():
                print(f"搜索 >>> {key} && {len(items)}")
                for channel, url in items:
                    copy_data = copy.deepcopy(item)
                    copy_data.update({
                        'channel': channel,  # channel name
                        'href': url,         # channel URL
                        'is_duplicate': is_duplicate(href, url),  # new URL equals the seed?
                    })
                    insert_lst.append(copy_data)
            if len(results) > 0:
                break
        if len(insert_lst) > 0:
            shujuziyuan.insert_many(insert_lst)
        else:
            # No channels found: record the bare seed so it still appears
            # on the "no result" sheet of the export.
            shujuziyuan.insert_one(item)
        print('结束 >>>\n')


def to_excel():
    date = get_current_date(fmt="%Y%m%d")
    file = f'{date}_栏目挖掘.xlsx'
    with pd.ExcelWriter(file) as writer:
        # q = {'channel': {'$exists': 1}, 'is_duplicate': False}
        q = {'channel': {'$exists': 1}, 'nums': 1}
        projection = {
            'site': 1, 'seed_spidercode': 1, 'seed': 1, 'seed_channel': 1,
            'href': 1, 'channel': 1, 'seed_spiderstatus': 1, 'remark': 1,
            '_id': 0,
        }
        cursor = shujuziyuan.find(q, projection=projection)
        df = pd.DataFrame(list(cursor))
        # Assigning `df.columns` renames positionally, so pin the column
        # order explicitly before applying the Chinese labels.
        df = df[['site', 'seed', 'seed_spidercode', 'seed_channel',
                 'seed_spiderstatus', 'channel', 'href', 'remark']]
        df.columns = [
            '网站名称', '种子栏目地址', '种子爬虫代码', '种子栏目名称',
            '种子爬虫状态', '栏目名称', '栏目地址', '备注',
        ]
        df.to_excel(writer, sheet_name='栏目挖掘(有结果)', index=False)

        q = {'channel': {'$exists': 0}}
        projection = {
            'site': 1, 'seed_spidercode': 1, 'seed': 1, 'seed_channel': 1,
            'seed_spiderstatus': 1, 'remark': 1,
            '_id': 0,
        }
        cursor = shujuziyuan.find(q, projection=projection)
        df = pd.DataFrame(list(cursor))
        df = df[['site', 'seed', 'seed_spidercode', 'seed_channel',
                 'seed_spiderstatus', 'remark']]
        df.columns = [
            '网站名称', '种子栏目地址', '种子爬虫代码', '种子栏目名称',
            '种子爬虫状态', '备注',
        ]
        df.to_excel(writer, sheet_name='栏目挖掘(无结果)', index=False)
    print(f"{file} 录入完成")


if __name__ == '__main__':
    # crawl_tasks = read_local_data('seed.xlsx')
    # excavate_data(crawl_tasks)
    to_excel()