
Optimize date parameter configuration; support both multi-day and single-day queries.

dzr, 3 months ago
parent commit 3bba603662
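
The commit replaces the old date_items dict with two keyword arguments on Spider and Crawl_Zgzb: date, a single (start_date, end_date) tuple, and dates, an optional list of such tuples for multi-day runs. A minimal usage sketch, assuming Spider is importable and menus is a list of Menu namedtuples like the menu_lst defined in spider_list.py's __main__ block:

    from lzz_theme.zgzbtbggfwpt.spider_list import Spider

    menus = [...]  # a list of Menu namedtuples, e.g. menu_lst from spider_list.py

    # Single range: one (start_date, end_date) tuple via the new `date` kwarg.
    Spider(menus, threads=2, date=('2025-04-28', '2025-04-28')).start()

    # Multiple ranges: a list of (start_date, end_date) tuples via `dates`.
    Spider(menus, threads=2, dates=[('2025-04-28', '2025-04-28'),
                                    ('2025-04-27', '2025-04-27')]).start()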

+ 2 - 8
lzz_theme/zgzbtbggfwpt/history_crawl2.py

@@ -26,11 +26,5 @@ if __name__ == '__main__':
     today_str = today.strftime("%Y-%m-%d")
     before_day_3 = today - timedelta(days=3)
     before_day_3_str = before_day_3.strftime("%Y-%m-%d")
-
-    Crawl_Zgzb(
-        menus,
-        date_items={
-            'start_date': before_day_3_str,
-            'end_date': today_str
-        }
-    ).start()
+    date = (before_day_3_str, today_str)
+    Crawl_Zgzb(menus, date=date, threads=10, interval=0.4).start()
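
The tuple passed here is a rolling window from three days ago through today. A self-contained sketch of just that computation (date_window is a hypothetical helper; the script inlines the same logic):

    from datetime import date, timedelta

    def date_window(days_back=3):
        # Return (start, end) as "YYYY-MM-DD" strings; end is today.
        today = date.today()
        start = today - timedelta(days=days_back)
        return start.strftime("%Y-%m-%d"), today.strftime("%Y-%m-%d")

    print(date_window())  # e.g. ('2025-07-25', '2025-07-28')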

+ 43 - 28
lzz_theme/zgzbtbggfwpt/spider_list.py

@@ -10,6 +10,7 @@ import sys
 import time
 from collections import namedtuple
 from concurrent.futures import ThreadPoolExecutor, wait
+from functools import partial
 from pathlib import Path
 
 import execjs
@@ -33,12 +34,12 @@ except AttributeError:
 
 class Spider:
 
-    def __init__(self, menus, threads=1, interval=0.5, date_items=None):
+    def __init__(self, menus, threads=1, interval=0.5, date=None, dates=None):
         self.theme_list = Mongo_client()['py_spider']['theme_list']
         self.RDS = RedisFilter()
 
         self.menus = menus
-        self.kwargs = {'date_items': (date_items or {})}
+        self.kwargs = {'date': date, 'dates': dates}
 
         thread_name = Path(sys.argv[0]).name.replace('.py', '')
         self._executor = ThreadPoolExecutor(max_workers=threads,
@@ -64,11 +65,15 @@ class Spider:
             ctx = execjs.compile(ex_js)
             return ctx.call('getFullHref')
 
-    def download(self, page, menu, start_date=None, end_date=None):
+    def download(self, page, menu, date=None):
         logger.debug(f"网络请求|{menu.channel}|第{page}页")
-        if not start_date and not end_date:
+        if date is None:
             url = "https://bulletin.cebpubservice.com/xxfbcmses/search/" + menu.rid + ".html?searchDate=2000-03-20&dates=300&word=&categoryId=" + menu.cid + "&industryName=&area=&status=&publishMedia=&sourceInfo=&showStatus=0&page=" + str(page)
         else:
+            if not isinstance(date, (tuple, list)):
+                raise TypeError("dates is not iterable")
+
+            start_date, end_date = date
             end_date = str(end_date).strip() + " 23:59:59"
             url = "https://bulletin.cebpubservice.com/xxfbcmses/search/" + menu.rid + ".html?searchDate=2000-03-20&dates=300&word=&categoryId=" + menu.cid + "&industryName=&area=&status=&publishMedia=&sourceInfo=&showStatus=0&startcheckDate=" + start_date + "&endcheckDate=" + end_date + "&page=" + str(page)
 
@@ -90,12 +95,27 @@ class Spider:
         response.raise_for_status()
         return response
 
-    def fetch_list(self, menu, page, max_retries=3, show_log=True):
-        date_items = self.kwargs["date_items"]
+    def fetch_max_pagination(self, menu):
+        logger.info(f"自动获取最大分页|{menu.channel}")
+        c = 0
+        while True:
+            try:
+                response = self.fetch_list(menu, 1, 3, show_log=False)
+                if response is not None:
+                    html = response.content.decode()
+                    pagination = Selector(html).xpath(
+                        '//div[@class="pagination"]/label/text()'
+                    ).extract_first()
+                    if pagination is not None and str(pagination).isdigit():
+                        return int(pagination)
+            except Exception as e:
+                c += 1  # advance the retry counter so the log shows the attempt number
+                logger.error(f"网络请求|{menu.channel}|{type(e).__name__}|重试..{c}")
+
+    def fetch_list(self, menu, page, max_retries=3, date=None, show_log=True):
         for i in range(max_retries):
             try:
                 time.sleep(self._interval)
-                return self.download(page=page, menu=menu, **date_items)
+                return self.download(page=page, menu=menu, date=date)
             except Exception as e:
                 if show_log:
                     logger.error(f"网络请求|{menu.channel}|第{page}页|{type(e).__name__}|重试..{i + 1}")
@@ -147,33 +167,19 @@ class Spider:
 
         logger.info(f'采集成功|{menu.channel}|第{page}页|发布{len(root[1:])}条|入库{count}条')
 
-    def _spider(self, menu, page, max_retries):
+    def _spider(self, menu, page, max_retries, date):
         try:
-            response = self.fetch_list(menu, page, max_retries)
+            response = self.fetch_list(menu, page, max_retries, date)
             if response is not None:
                 html = response.content.decode()
                 self.parse(html, page, menu)
         except Exception as why:
             logger.error(f"采集失败|{menu.channel}|第{page}页|原因|{type(why).__name__}")
 
-    def fetch_max_pagination(self, menu):
-        logger.info(f"自动获取最大分页|{menu.channel}")
-        c = 0
-        while True:
-            try:
-                response = self.fetch_list(menu, 1, 3, show_log=False)
-                if response is not None:
-                    html = response.content.decode()
-                    pagination = Selector(html).xpath(
-                        '//div[@class="pagination"]/label/text()'
-                    ).extract_first()
-                    if pagination is not None and str(pagination).isdigit():
-                        return int(pagination)
-            except Exception as e:
-                logger.error(f"网络请求|{menu.channel}|{type(e).__name__}|重试..{c + 1}")
-
     def start(self):
         logger.debug("********** 任务开始 **********")
+        _spider = partial(self.add_task, self._spider)
+
         try:
             for menu in self.menus:
                 auto_paginate = getattr(menu, 'auto_paginate', False)
@@ -186,8 +192,16 @@ class Spider:
                         continue
 
                 max_page = max(max_page, 1)
-                for page in range(1, max_page + 1):
-                    self.add_task(self._spider, menu, page, max_retries=10)
+
+                dates = self.kwargs["dates"]
+                if isinstance(dates, list):
+                    for date in dates:
+                        for page in range(1, max_page + 1):
+                            _spider(menu, page, max_retries=10, date=date)
+                else:
+                    date = self.kwargs['date']
+                    for page in range(1, max_page + 1):
+                        _spider(menu, page, max_retries=10, date=date)
 
             self.wait()
         finally:
@@ -206,5 +220,6 @@ if __name__ == '__main__':
         Menu('资格预审公告', 'a_zgzbtbggfwpt_zgysgg2', '92', 'qualify', 1),
     ]
 
-    # Spider(menu_lst, threads=2, date_items={'start_date': '2025-04-28', 'end_date': '2025-04-28'}).start()
+    # Spider(menu_lst, threads=2, date=('2025-04-28', '2025-04-28')).start()
+    # Spider(menu_lst, threads=2, dates=[('2025-04-28', '2025-04-28'), ('2025-04-27', '2025-04-27')]).start()
     Spider(menu_lst, threads=2).start()
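
The partial(self.add_task, self._spider) binding in start() pre-fills the task callable, so each scheduling call reads like a direct call to _spider while still going through add_task and the thread pool. A standalone sketch of the same pattern, with a hypothetical add_task that submits to a ThreadPoolExecutor:

    from concurrent.futures import ThreadPoolExecutor, wait
    from functools import partial

    executor = ThreadPoolExecutor(max_workers=2)
    futures = []

    def add_task(fn, *args, **kwargs):
        # Hypothetical stand-in for Spider.add_task: submit and track the future.
        futures.append(executor.submit(fn, *args, **kwargs))

    def crawl(menu, page, max_retries=3, date=None):
        print(f"crawl {menu} page={page} date={date}")

    _spider = partial(add_task, crawl)  # pre-bind the task function, as in start()
    _spider("qualify", 1, max_retries=10, date=("2025-04-25", "2025-04-28"))
    _spider("qualify", 2, max_retries=10, date=("2025-04-25", "2025-04-28"))
    wait(futures)
    executor.shutdown()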

+ 2 - 10
lzz_theme/zgzbtbggfwpt/zgzbtbggfwpt_list_date.py

@@ -24,13 +24,5 @@ if __name__ == '__main__':
     today_str = today.strftime("%Y-%m-%d")
     before_day_3 = today - timedelta(days=3)
     before_day_3_str = before_day_3.strftime("%Y-%m-%d")
-
-    Spider(
-        menus,
-        threads=10,
-        interval=1,
-        date_items={
-            'start_date': before_day_3_str,
-            'end_date': today_str
-        }
-    ).start()
+    date = (before_day_3_str, today_str)
+    Spider(menus, threads=10, interval=1, date=date).start()
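
This script queries one continuous window, so the multi-day path is unused here. If per-day results were wanted instead, the same window could be expanded into a dates list; a sketch, assuming Spider and menus are in scope as in this script, with per-day (start, end) tuples as in the commented example in spider_list.py:

    from datetime import date, timedelta

    today = date.today()
    # One (start, end) tuple per day, three days back through today, oldest first.
    day_ranges = []
    for i in range(3, -1, -1):
        d = (today - timedelta(days=i)).strftime("%Y-%m-%d")
        day_ranges.append((d, d))

    Spider(menus, threads=10, interval=1, dates=day_ranges).start()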