import time

import pandas as pd
import urllib3

from common.databases import int2long, mongo_table
from crawler.Task import Task
from crawler.q import RedisQueue
from crawler.utils import extract_host, extract_domain, is_url
from settings import (
    REDIS_EXCAVATE,
    FILTER_WORDS,
    MGO_KEYWORDS,
    MGO_ORGS,
    MGO_URLS,
    MGO_COMPETING_GOODS,
    MGO_LUA_SPIDERS,
)


def push_task(file):
    """Read seed URLs from the first sheet of an Excel file and push them
    onto the excavation queue in Redis."""
    path = '/Users/dongzhaorui/Desktop/swordfish/数据寻源/'
    mrq = RedisQueue()
    df = pd.read_excel(path + str(file), sheet_name=0)
    tasks = [Task(url=nd[0], groups='seed_url') for nd in df.values]
    print(mrq.push_task(REDIS_EXCAVATE, tasks, level=9))


def make_seed_words_table(file):
    """Build the organization and keyword seed tables in MongoDB.

    Organization names come from the '企业名称' (organization name) column
    of the second sheet; keywords come from the FILTER_WORDS setting."""
    path = '/Users/dongzhaorui/Desktop/swordfish/数据寻源/'
    df = pd.read_excel(path + str(file), sheet_name=1)
    # Organization names
    for val in df.to_dict()['企业名称'].values():
        MGO_ORGS.insert_one({'name': val})
    print('organization name table created successfully')
    # Keywords
    for val in FILTER_WORDS:
        MGO_KEYWORDS.insert_one({'name': val})
    print('keyword table created successfully')


def make_seed_urls_table(file):
    """Insert the host of every valid URL in the first sheet into the
    seed URL table, together with its site name."""
    path = '/Users/dongzhaorui/Desktop/swordfish/数据寻源/陈佳康/'
    df = pd.read_excel(path + str(file), sheet_name=0)
    for nd in df.values:
        # Rows carry either (site name, url) or just a url
        if len(nd) > 1:
            name, url = nd[0], nd[1]
        else:
            name, url = '', nd[0]
        if is_url(url) and extract_domain(url) != '':
            try:
                MGO_URLS.insert_one({'name': extract_host(url), 'site_name': name})
            except urllib3.exceptions.LocationParseError:
                continue
    print('seed URL table created successfully')


def make_competing_goods_table(file):
    """Insert every URL in the first sheet into the competing goods table."""
    path = '/Users/dongzhaorui/Desktop/swordfish/数据寻源/陈佳康/'
    df = pd.read_excel(path + str(file), sheet_name=0)
    for nd in df.values:
        MGO_COMPETING_GOODS.insert_one({'name': nd[0]})
    print('competing goods URL table created successfully')


def _export_spider_domains(collection):
    """Copy the distinct domain of every lua spider entry URL
    (param_common[11]) into the given MongoDB collection."""
    q = {'param_common.11': {'$exists': True}}
    projection = {'param_common': 1, '_id': 0}
    cursor = MGO_LUA_SPIDERS.find(q, projection=projection)
    seen = set()  # deduplicate domains across spiders
    for item in cursor:
        href = item['param_common'][11]
        domain = extract_domain(href)
        if len(domain) > 0 and domain not in seen:
            collection.insert_one({
                'domain': domain,
                'create_at': int2long(int(time.time())),
            })
            seen.add(domain)
            print(f'href >> {href}; domain >> {domain}')


def make_garbage_tab():
    """Export spider domains to the local data_garbage table."""
    data_garbage = mongo_table('shujuziyuan', 'data_garbage', host='127.0.0.1', port=27017)
    _export_spider_domains(data_garbage)


def make_domain():
    """Export spider domains to the remote web_domain table."""
    web_domain = mongo_table('dzr', 'web_domain', host='baibai.ink', port=28082)
    _export_spider_domains(web_domain)


if __name__ == '__main__':
    # make_garbage_tab()
    make_seed_urls_table('元博网正文数据寻源网址类2022.8.12.xlsx')
    # make_seed_urls_table('剑鱼正文数据寻源网址类2022.8.25.xlsx')
    # make_competing_goods_table('元博网正文数据寻源网址类2022.2.23.xlsx')
    # make_seed_words_table('自动寻源程序种子提供2022.xlsx')