@@ -5,11 +5,11 @@ from urllib.parse import quote
 import requests
 from lxml.html import fromstring, HtmlElement
-
+from utils.tools import sha1
 from config.load import crawler_url, region
 from crawler.crawl_scheduler import Scheduler
 from crawler.login import login, load_login_cookies, login_session_check
-from utils.databases import mongo_table, int2long, es_query
+from utils.databases import mongo_table, int2long, es_query, redis_client
 from utils.execptions import CustomCheckError, VoidCrawlError, JyBasicException
 from utils.log import logger
 from utils.socks5 import Proxy
 
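The new `sha1` helper is imported from the project's `utils.tools` module, which this diff does not show. A minimal sketch of what it presumably provides, assuming it hex-digests a string for use as a dedup fingerprint:

```python
# Assumed shape of utils.tools.sha1 — not shown in this diff.
import hashlib

def sha1(text: str) -> str:
    """Hex SHA-1 digest of a string, used below as a dedup fingerprint."""
    return hashlib.sha1(text.encode('utf-8')).hexdigest()
```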
@@ -22,17 +22,19 @@ class ListSpider:
     def __init__(self, db: str, crawl_tab: str, crawl_max_page=None):
         self.crawl_menus = [
             # CrawlMenu('企业采购', 'a_ybwcgyzbw_qycg', '7%2C'),
-            CrawlMenu('政府采购', 'a_ybwcgyzbw_zfcg', '6%2C'),
-            CrawlMenu('招标预告', 'a_ybwcgyzbw_zbyg', '5%2C'),
+            # CrawlMenu('政府采购', 'a_ybwcgyzbw_zfcg', '6%2C'),
+            # CrawlMenu('招标预告', 'a_ybwcgyzbw_zbyg', '5%2C'),
             CrawlMenu('中标公示', 'a_ybwcgyzbw_zbgs', '4%2C'),
-            CrawlMenu('服务招标', 'a_ybwcgyzbw_fwzb', '3%2C'),
-            CrawlMenu('货物招标', 'a_ybwcgyzbw_hwzb', '2%2C'),
-            CrawlMenu('工程招标', 'a_ybwcgyzbw_gczb', '1%2C'),
+            # CrawlMenu('服务招标', 'a_ybwcgyzbw_fwzb', '3%2C'),
+            # CrawlMenu('货物招标', 'a_ybwcgyzbw_hwzb', '2%2C'),
+            # CrawlMenu('工程招标', 'a_ybwcgyzbw_gczb', '1%2C'),
         ]
         self.crawl_max_page = crawl_max_page or 1
         self.crawl_tab = mongo_table(db, crawl_tab)
         self.user = None
         self.session = None
+        self.r = redis_client()
+        self.redis_key = 'ybw_2022'
 
     def crawl_request(self, url: str, refer: str, **kwargs):
         headers = {
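This hunk narrows the spider to a single channel: 中标公示 (award announcements) stays active, while 政府采购 (government procurement), 招标预告 (tender previews), 服务招标 (service tenders), 货物招标 (goods tenders), and 工程招标 (construction tenders) are commented out. It also wires in the dedup store: `self.r` holds a Redis connection and `self.redis_key` names the hash that fingerprints live under. `redis_client` comes from `utils.databases` and is not shown in the diff; a minimal sketch of the assumed factory, with placeholder connection settings:

```python
# Assumed shape of utils.databases.redis_client — host/port/db are placeholders.
import redis

def redis_client() -> redis.Redis:
    # decode_responses=True returns str instead of bytes for hash fields
    return redis.Redis(host='127.0.0.1', port=6379, db=0, decode_responses=True)
```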
@@ -113,20 +115,23 @@ class ListSpider:
             }
             if title is None:
                 raise CustomCheckError(code=10107, reason='发布标题为空')
-            item['count'] = es_query(item["title"], item["l_np_publishtime"])
-            item['crawl'] = False
-            # print(f'>>> {title} - {competehref}')
-            results.append(item)
+            sign = sha1(item['competehref'])
+            if not self.r.hexists(self.redis_key, sign):
+                item['count'] = es_query(item["title"], item["l_np_publishtime"])
+                item['crawl'] = False
+                # print(f'>>> {title} - {competehref}')
+                results.append(item)
+                self.r.hset(self.redis_key, sign, '')
 
         if len(results) > 0:
             self.crawl_tab.insert_many(results)
         return len(results)
 
     def crawl_spider(self, sc: Scheduler, menu: CrawlMenu):
+        self.session = requests.session()
         for region_id, region_name in region.items():
             previous_url = None
             crawl_total, cookies = 1, None
-            self.session = requests.session()
             '''每个普通账号仅能查询4000条数据,设置每页最大条数:100,共计40页'''
             page_size = 30
             for page in range(1, self.crawl_max_page + 1):
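Two behavioral changes land here. First, each list item is fingerprinted with `sha1(item['competehref'])` and checked against the Redis hash before being counted, flagged, and collected, so reruns skip URLs already seen in earlier sessions. Second, `requests.session()` moves out of the region loop: one session now serves the whole `crawl_spider` call instead of being recreated per region (the paired `close()` is dedented in the next hunk). The inline docstring translates as: an ordinary account can query at most 4,000 records, so at the 100-rows-per-page maximum that is 40 pages; note the code nevertheless sets `page_size = 30`. A standalone sketch of the dedup pattern, with the Redis connection and item layout assumed as above:

```python
import hashlib
import redis

r = redis.Redis(decode_responses=True)  # stand-in for redis_client()
redis_key = 'ybw_2022'

def keep_if_unseen(item: dict, results: list) -> None:
    """Append item to results only if its detail URL has not been seen before."""
    sign = hashlib.sha1(item['competehref'].encode('utf-8')).hexdigest()
    if not r.hexists(redis_key, sign):
        results.append(item)
        r.hset(redis_key, sign, '')  # mark as seen for later runs
```

One design note: the fingerprint is written before `insert_many` runs, so a failure between collection and insert would permanently skip those items; marking only after a successful insert would be the safer ordering.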
@@ -184,7 +189,7 @@ class ListSpider:
                     sc.err_record(e)
                 finally:
                     sc.wait_for_next_task(random.choice(range(2, 6)))
-            self.session.close()
+        self.session.close()
 
     def start(self):
         for menu in self.crawl_menus:
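The `close()` dedent completes the session change from the previous hunk: the session is now closed once after the region loop finishes rather than at the end of every region iteration. A context manager would express the same open-once/close-once pairing with less bookkeeping; a sketch of that alternative (not what the patch does), with a placeholder URL:

```python
import requests

def crawl_regions(regions: dict) -> None:
    # One session for all regions, closed automatically on exit.
    with requests.Session() as session:
        for region_id, region_name in regions.items():
            session.get(f'https://example.invalid/list/{region_id}')  # placeholder
```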