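"""Channel-mining pipeline.

Reads seed channel URLs from an Excel sheet, crawls each seed to discover
new channel links, stores the results in MongoDB, and exports them to a
dated Excel report.
"""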

import copy

import pandas as pd

from common.databases import mongo_table
from common.tools import get_current_date
from crawler.download import Downloader, RenderDownloader
from crawler.services.channel import bfs
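
# Assumed contract for bfs, inferred from its use in excavate_data below:
# bfs(resp, seed_url) scans the downloaded page and returns a dict mapping
# a search key to a list of (channel_name, channel_url) tuples.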

# MongoDB collection that stores the mined channel records.
shujuziyuan = mongo_table('shujuziyuan', 'channel')


def is_duplicate(seed, href):
    """Return True when a discovered URL is identical to its seed URL."""
    return href == seed


def read_local_data(file):
    """Read seed tasks from the first sheet of an Excel workbook.

    Expected column order (inferred from how the values are unpacked):
    site, spider code, channel name, channel URL, spider status.
    """
    data_lst = []
    browsing_history = {}
    df = pd.read_excel(io=file, sheet_name=0)
    for site, *item in df.values:
        href = item.pop(2)  # seed channel URL
        channels = browsing_history.setdefault(site, {})
        channel = item[1]  # seed channel name
        if channel not in channels:
            # Number each site's channels in order of first appearance.
            channels[channel] = len(channels) + 1
        nums = channels[channel]
        data = (site, href, *item, nums)
        if data not in data_lst:  # skip exact duplicate rows
            data_lst.append(data)
    return data_lst
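

# Each task produced by read_local_data is a flat tuple, in the order
# (site, href, spidercode, channel, spiderstatus, nums), which is how
# excavate_data unpacks it below.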


def excavate_data(tasks):
    """Crawl each seed URL and store any discovered channels in MongoDB."""
    static = Downloader()
    render = RenderDownloader()
    # Try several fetch strategies: a plain HTTP download first, then a
    # browser-rendered download if the static page yields nothing.
    download_tools = [static, render]
    for site, href, *other in tasks:
        print("start >>> ", site, href, *other)
        insert_lst = []
        item = {
            'site': site,                   # site name
            'seed': href,                   # seed URL
            'seed_spidercode': other[0],    # seed spider code
            'seed_channel': other[1],       # seed channel name
            'seed_spiderstatus': other[2],  # seed spider status
            'nums': other[-1],              # channel sequence number
            'remark': '',
        }
        for dl in download_tools:
            resp = dl.get(href, timeout=3)
            results = bfs(resp, href)
            for key, items in results.items():
                print(f"search >>> {key} && {len(items)}")
                for channel, url in items:
                    copy_data = copy.deepcopy(item)
                    copy_data.update({
                        'channel': channel,  # discovered channel name
                        'href': url,         # discovered channel URL
                        'is_duplicate': is_duplicate(href, url),  # new URL equals the seed?
                    })
                    insert_lst.append(copy_data)
            if results:
                break  # the first downloader that finds anything wins
        if insert_lst:
            shujuziyuan.insert_many(insert_lst)
        else:
            # Nothing found: store the bare seed so it appears in the report.
            shujuziyuan.insert_one(item)
        print('done >>>\n')


def to_excel():
    """Export the mined channels from MongoDB into a two-sheet Excel report."""
    date = get_current_date(fmt="%Y%m%d")
    file = f'{date}_channel_mining.xlsx'
    # Rename fields to report headers by name: pymongo returns fields in
    # document order, not projection order, so assigning df.columns
    # positionally would mislabel them.
    headers = {
        'site': 'site name',
        'seed': 'seed channel URL',
        'seed_spidercode': 'seed spider code',
        'seed_channel': 'seed channel name',
        'seed_spiderstatus': 'seed spider status',
        'channel': 'channel name',
        'href': 'channel URL',
        'remark': 'remark',
    }
    # The with-block saves and closes the workbook on exit.
    with pd.ExcelWriter(file) as writer:
        # Sheet 1: seeds that produced channels (each site's first channel only).
        # Alternative filter: only channels whose URL differs from the seed:
        # q = {'channel': {'$exists': 1}, 'is_duplicate': False}
        q = {'channel': {'$exists': 1}, 'nums': 1}
        projection = {
            'site': 1, 'seed': 1, 'seed_spidercode': 1, 'seed_channel': 1,
            'seed_spiderstatus': 1, 'channel': 1, 'href': 1, 'remark': 1,
            '_id': 0,
        }
        df = pd.DataFrame(list(shujuziyuan.find(q, projection=projection)))
        df.rename(columns=headers).to_excel(
            writer, sheet_name='channel mining (with results)', index=False)
        # Sheet 2: seeds that produced no channels.
        q = {'channel': {'$exists': 0}}
        projection = {
            'site': 1, 'seed': 1, 'seed_spidercode': 1, 'seed_channel': 1,
            'seed_spiderstatus': 1, 'remark': 1, '_id': 0,
        }
        df = pd.DataFrame(list(shujuziyuan.find(q, projection=projection)))
        df.rename(columns=headers).to_excel(
            writer, sheet_name='channel mining (no results)', index=False)
    print(f"{file} written")


if __name__ == '__main__':
    # crawl_tasks = read_local_data('seed.xlsx')
    # excavate_data(crawl_tasks)
    to_excel()