@@ -10,6 +10,7 @@ import sys
 import time
 from collections import namedtuple
 from concurrent.futures import ThreadPoolExecutor, wait
+from functools import partial
 from pathlib import Path
 
 import execjs
@@ -33,12 +34,13 @@ except AttributeError:
 
 class Spider:
 
-    def __init__(self, menus, threads=1, interval=0.5, date_items=None):
+    def __init__(self, menus, threads=1, interval=0.5, date=None, dates=None):
         self.theme_list = Mongo_client()['py_spider']['theme_list']
         self.RDS = RedisFilter()
 
         self.menus = menus
-        self.kwargs = {'date_items': (date_items or {})}
+        self.kwargs = {'date': date, 'dates': dates}
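+        # 'date': one (start_date, end_date) pair; 'dates': a list of such pairs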
 
         thread_name = Path(sys.argv[0]).name.replace('.py', '')
         self._executor = ThreadPoolExecutor(max_workers=threads,
@@ -64,11 +66,16 @@ class Spider:
         ctx = execjs.compile(ex_js)
         return ctx.call('getFullHref')
 
-    def download(self, page, menu, start_date=None, end_date=None):
+    def download(self, page, menu, date=None):
         logger.debug(f"网络请求|{menu.channel}|第{page}页")
-        if not start_date and not end_date:
+        if date is None:
             url = "https://bulletin.cebpubservice.com/xxfbcmses/search/" + menu.rid + ".html?searchDate=2000-03-20&dates=300&word=&categoryId=" + menu.cid + "&industryName=&area=&status=&publishMedia=&sourceInfo=&showStatus=0&page=" + str(page)
         else:
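+            # ranged mode: 'date' must unpack to (start_date, end_date)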
+            if not isinstance(date, (tuple, list)):
+                raise TypeError("date must be a (start_date, end_date) tuple or list")
+
+            start_date, end_date = date
             end_date = str(end_date).strip() + " 23:59:59"
             url = "https://bulletin.cebpubservice.com/xxfbcmses/search/" + menu.rid + ".html?searchDate=2000-03-20&dates=300&word=&categoryId=" + menu.cid + "&industryName=&area=&status=&publishMedia=&sourceInfo=&showStatus=0&startcheckDate=" + start_date + "&endcheckDate=" + end_date + "&page=" + str(page)
 
@@ -90,12 +97,28 @@ class Spider:
         response.raise_for_status()
         return response
 
-    def fetch_list(self, menu, page, max_retries=3, show_log=True):
-        date_items = self.kwargs["date_items"]
+    def fetch_max_pagination(self, menu):
+        logger.info(f"自动获取最大分页|{menu.channel}")
+        c = 0
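+        # poll page 1 until the pagination label yields a digit-only page count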
+        while True:
+            try:
+                response = self.fetch_list(menu, 1, 3, show_log=False)
+                if response is not None:
+                    html = response.content.decode()
+                    pagination = Selector(html).xpath(
+                        '//div[@class="pagination"]/label/text()'
+                    ).extract_first()
+                    if pagination is not None and str(pagination).isdigit():
+                        return int(pagination)
+            except Exception as e:
+                logger.error(f"网络请求|{menu.channel}|{type(e).__name__}|重试..{(c := c + 1)}")
+
+    def fetch_list(self, menu, page, max_retries=3, date=None, show_log=True):
         for i in range(max_retries):
             try:
                 time.sleep(self._interval)
-                return self.download(page=page, menu=menu, **date_items)
+                return self.download(page=page, menu=menu, date=date)
             except Exception as e:
                 if show_log:
                     logger.error(f"网络请求|{menu.channel}|第{page}页|{type(e).__name__}|重试..{i + 1}")
@@ -147,33 +170,20 @@ class Spider:
 
         logger.info(f'采集成功|{menu.channel}|第{page}页|发布{len(root[1:])}条|入库{count}条')
 
-    def _spider(self, menu, page, max_retries):
+    def _spider(self, menu, page, max_retries, date):
         try:
-            response = self.fetch_list(menu, page, max_retries)
+            response = self.fetch_list(menu, page, max_retries, date)
             if response is not None:
                 html = response.content.decode()
                 self.parse(html, page, menu)
         except Exception as why:
             logger.error(f"采集失败|{menu.channel}|第{page}页|原因|{type(why).__name__}")
 
-    def fetch_max_pagination(self, menu):
-        logger.info(f"自动获取最大分页|{menu.channel}")
-        c = 0
-        while True:
-            try:
-                response = self.fetch_list(menu, 1, 3, show_log=False)
-                if response is not None:
-                    html = response.content.decode()
-                    pagination = Selector(html).xpath(
-                        '//div[@class="pagination"]/label/text()'
-                    ).extract_first()
-                    if pagination is not None and str(pagination).isdigit():
-                        return int(pagination)
-            except Exception as e:
-                logger.error(f"网络请求|{menu.channel}|{type(e).__name__}|重试..{c + 1}")
-
     def start(self):
         logger.debug("********** 任务开始 **********")
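+        # shorthand: every _spider call below goes through self.add_task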
+        _spider = partial(self.add_task, self._spider)
+
         try:
             for menu in self.menus:
                 auto_paginate = getattr(menu, 'auto_paginate', False)
@@ -186,8 +196,17 @@ class Spider:
                     continue
 
                 max_page = max(max_page, 1)
-                for page in range(1, max_page + 1):
-                    self.add_task(self._spider, menu, page, max_retries=10)
+
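+                # a list of date ranges fans out into one task per (range, page)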
+                dates = self.kwargs["dates"]
+                if dates is not None and isinstance(dates, list):
+                    for date in dates:
+                        for page in range(1, max_page + 1):
+                            _spider(menu, page, max_retries=10, date=date)
+                else:
+                    date = self.kwargs['date']
+                    for page in range(1, max_page + 1):
+                        _spider(menu, page, max_retries=10, date=date)
 
             self.wait()
         finally:
@@ -206,5 +225,6 @@ if __name__ == '__main__':
         Menu('资格预审公告', 'a_zgzbtbggfwpt_zgysgg2', '92', 'qualify', 1),
     ]
 
-    # Spider(menu_lst, threads=2, date_items={'start_date': '2025-04-28', 'end_date': '2025-04-28'}).start()
+    # Spider(menu_lst, threads=2, date=('2025-04-28', '2025-04-28')).start()
+    # Spider(menu_lst, threads=2, dates=[('2025-04-28', '2025-04-28'), ('2025-04-27', '2025-04-27')]).start()
     Spider(menu_lst, threads=2).start()