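"""One-off loaders for the data-sourcing ("数据寻源") pipeline: push seed
URLs onto the Redis excavation queue and bootstrap the MongoDB collections
for keywords, organizations, seed URLs, competitor URLs, and crawled-site
domains."""
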
import time

import urllib3
import pandas as pd

from crawler.Task import Task
from crawler.q import RedisQueue
from crawler.utils import extract_host, extract_domain, is_url
from settings import (
    REDIS_EXCAVATE,
    FILTER_WORDS,
    MGO_KEYWORDS,
    MGO_ORGS,
    MGO_URLS,
    MGO_COMPETING_GOODS,
    MGO_LUA_SPIDERS,
)
from common.databases import int2long, mongo_table


def push_task(file):
    """Read seed URLs from the first sheet of an Excel file and push
    them onto the Redis excavation queue at the highest priority."""
    path = '/Users/dongzhaorui/Desktop/swordfish/数据寻源/'
    mrq = RedisQueue()
    df = pd.read_excel(path + str(file), sheet_name=0)
    tasks = [Task(url=row[0], groups='seed_url') for row in df.values]
    print(mrq.push_task(REDIS_EXCAVATE, tasks, level=9))


def make_seed_words_table(file):
    """Create the seed-word collections: organization names come from
    the second sheet of the Excel file, keywords from FILTER_WORDS."""
    path = '/Users/dongzhaorui/Desktop/swordfish/数据寻源/'
    df = pd.read_excel(path + file, sheet_name=1)
    # Organization names (the "企业名称" column)
    for val in df.to_dict()['企业名称'].values():
        MGO_ORGS.insert_one({'name': val})
    print('organization table created')
    # Keywords
    for val in FILTER_WORDS:
        MGO_KEYWORDS.insert_one({'name': val})
    print('keyword table created')


def make_seed_urls_table(file):
    """Create the seed-URL collection. A row holds either
    (site name, url) or just (url,)."""
    path = '/Users/dongzhaorui/Desktop/swordfish/数据寻源/陈佳康/'
    df = pd.read_excel(path + str(file), sheet_name=0)
    for row in df.values:
        if len(row) > 1:
            name, url = row[0], row[1]
        else:
            name, url = '', row[0]
        if is_url(url) and extract_domain(url) != '':
            try:
                MGO_URLS.insert_one({'name': extract_host(url), 'site_name': name})
            except urllib3.exceptions.LocationParseError:
                # malformed addresses can fail URL parsing; skip them
                continue
    print('seed URL table created')


def make_competing_goods_table(file):
    """Create the competitor-URL collection from the first column of
    the first sheet."""
    path = '/Users/dongzhaorui/Desktop/swordfish/数据寻源/陈佳康/'
    df = pd.read_excel(path + str(file), sheet_name=0)
    for row in df.values:
        MGO_COMPETING_GOODS.insert_one({'name': row[0]})
    print('competitor URL table created')
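

# Optional hardening, a sketch not present in the original script (it
# assumes the MGO_* objects are pymongo collections, and the helper name
# is hypothetical): the loaders above call insert_one without any
# duplicate check, so a unique index on 'name' makes re-runs idempotent;
# repeat inserts then raise pymongo's DuplicateKeyError instead of
# creating copies.
def ensure_unique_name_index():
    for coll in (MGO_KEYWORDS, MGO_ORGS, MGO_URLS, MGO_COMPETING_GOODS):
        coll.create_index('name', unique=True)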


def _dump_unique_domains(collection):
    """Insert each distinct domain found in the spiders' list-page URL
    (param_common[11]) into the given collection, once per domain."""
    q = {'param_common.11': {'$exists': True}}
    projection = {'param_common': 1, '_id': 0}
    cursor = MGO_LUA_SPIDERS.find(q, projection=projection)
    seen = set()  # a set makes the membership check O(1)
    for item in cursor:
        href = item['param_common'][11]
        domain = extract_domain(href)
        if domain and domain not in seen:
            collection.insert_one({
                'domain': domain,
                'create_at': int2long(int(time.time())),
            })
            seen.add(domain)
            print(f'href >> {href}; domain >> {domain}')


def make_garbage_tab():
    data_garbage = mongo_table('shujuziyuan', 'data_garbage', host='127.0.0.1', port=27017)
    _dump_unique_domains(data_garbage)


def make_domain():
    web_domain = mongo_table('dzr', 'web_domain', host='baibai.ink', port=28082)
    _dump_unique_domains(web_domain)
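

# A batched variant, offered as an untested sketch (again assuming the
# MGO_* objects are pymongo collections; the function name is
# hypothetical): building the documents first and writing them with
# insert_many trades one round trip per domain for a single bulk write.
def dump_unique_domains_bulk(collection):
    q = {'param_common.11': {'$exists': True}}
    projection = {'param_common': 1, '_id': 0}
    now = int2long(int(time.time()))
    seen = set()
    docs = []
    for item in MGO_LUA_SPIDERS.find(q, projection=projection):
        domain = extract_domain(item['param_common'][11])
        if domain and domain not in seen:
            seen.add(domain)
            docs.append({'domain': domain, 'create_at': now})
    if docs:
        collection.insert_many(docs)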


if __name__ == '__main__':
    # make_garbage_tab()
    make_seed_urls_table('元博网正文数据寻源网址类2022.8.12.xlsx')
    # make_seed_urls_table('剑鱼正文数据寻源网址类2022.8.25.xlsx')
    # make_competing_goods_table('元博网正文数据寻源网址类2022.2.23.xlsx')
    # make_seed_words_table('自动寻源程序种子提供2022.xlsx')