@@ -10,6 +10,7 @@ import sys
 import time
 from collections import namedtuple
 from concurrent.futures import ThreadPoolExecutor, wait
+from functools import partial
 from pathlib import Path
 
 import execjs
@@ -33,12 +34,13 @@ except AttributeError:
 
 class Spider:
 
-    def __init__(self, menus, threads=1, interval=0.5, date_items=None):
+    def __init__(self, menus, threads=1, interval=0.5, date=None, dates=None):
         self.theme_list = Mongo_client()['py_spider']['theme_list']
         self.RDS = RedisFilter()
 
         self.menus = menus
-        self.kwargs = {'date_items': (date_items or {})}
+        self.kwargs = {'date': date, 'dates': dates}
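+        # 'date': one (start_date, end_date) pair; 'dates': a list of such pairs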
 
         thread_name = Path(sys.argv[0]).name.replace('.py', '')
         self._executor = ThreadPoolExecutor(max_workers=threads,
@@ -64,11 +66,16 @@ class Spider:
         ctx = execjs.compile(ex_js)
         return ctx.call('getFullHref')
 
-    def download(self, page, menu, start_date=None, end_date=None):
+    def download(self, page, menu, date=None):
         logger.debug(f"网络请求|{menu.channel}|第{page}页")
-        if not start_date and not end_date:
+        if date is None:
             url = "https://bulletin.cebpubservice.com/xxfbcmses/search/" + menu.rid + ".html?searchDate=2000-03-20&dates=300&word=&categoryId=" + menu.cid + "&industryName=&area=&status=&publishMedia=&sourceInfo=&showStatus=0&page=" + str(page)
         else:
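+            # ranged mode: 'date' must unpack to (start_date, end_date)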
+            if not isinstance(date, (tuple, list)):
+                raise TypeError("date must be a (start_date, end_date) tuple or list")
+
+            start_date, end_date = date
             end_date = str(end_date).strip() + " 23:59:59"
             url = "https://bulletin.cebpubservice.com/xxfbcmses/search/" + menu.rid + ".html?searchDate=2000-03-20&dates=300&word=&categoryId=" + menu.cid + "&industryName=&area=&status=&publishMedia=&sourceInfo=&showStatus=0&startcheckDate=" + start_date + "&endcheckDate=" + end_date + "&page=" + str(page)
 
@@ -90,12 +97,28 @@ class Spider:
         response.raise_for_status()
         return response
 
-    def fetch_list(self, menu, page, max_retries=3, show_log=True):
-        date_items = self.kwargs["date_items"]
+    def fetch_max_pagination(self, menu):
+        logger.info(f"自动获取最大分页|{menu.channel}")
+        c = 0
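+        # poll page 1 until the pagination label yields a digit-only page count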
+        while True:
+            try:
+                response = self.fetch_list(menu, 1, 3, show_log=False)
+                if response is not None:
+                    html = response.content.decode()
+                    pagination = Selector(html).xpath(
+                        '//div[@class="pagination"]/label/text()'
+                    ).extract_first()
+                    if pagination is not None and str(pagination).isdigit():
+                        return int(pagination)
+            except Exception as e:
+                logger.error(f"网络请求|{menu.channel}|{type(e).__name__}|重试..{(c := c + 1)}")
+
+    def fetch_list(self, menu, page, max_retries=3, date=None, show_log=True):
         for i in range(max_retries):
             try:
                 time.sleep(self._interval)
-                return self.download(page=page, menu=menu, **date_items)
+                return self.download(page=page, menu=menu, date=date)
             except Exception as e:
                 if show_log:
                     logger.error(f"网络请求|{menu.channel}|第{page}页|{type(e).__name__}|重试..{i + 1}")
@@ -147,33 +170,20 @@ class Spider:
 
         logger.info(f'采集成功|{menu.channel}|第{page}页|发布{len(root[1:])}条|入库{count}条')
 
-    def _spider(self, menu, page, max_retries):
+    def _spider(self, menu, page, max_retries, date):
         try:
-            response = self.fetch_list(menu, page, max_retries)
+            response = self.fetch_list(menu, page, max_retries, date)
             if response is not None:
                 html = response.content.decode()
                 self.parse(html, page, menu)
         except Exception as why:
             logger.error(f"采集失败|{menu.channel}|第{page}页|原因|{type(why).__name__}")
 
-    def fetch_max_pagination(self, menu):
-        logger.info(f"自动获取最大分页|{menu.channel}")
-        c = 0
-        while True:
-            try:
-                response = self.fetch_list(menu, 1, 3, show_log=False)
-                if response is not None:
-                    html = response.content.decode()
-                    pagination = Selector(html).xpath(
-                        '//div[@class="pagination"]/label/text()'
-                    ).extract_first()
-                    if pagination is not None and str(pagination).isdigit():
-                        return int(pagination)
-            except Exception as e:
-                logger.error(f"网络请求|{menu.channel}|{type(e).__name__}|重试..{c + 1}")
-
     def start(self):
         logger.debug("********** 任务开始 **********")
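+        # shorthand: every _spider call below goes through self.add_task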
+        _spider = partial(self.add_task, self._spider)
+
         try:
             for menu in self.menus:
                 auto_paginate = getattr(menu, 'auto_paginate', False)
@@ -186,8 +196,17 @@ class Spider:
                     continue
 
                 max_page = max(max_page, 1)
-                for page in range(1, max_page + 1):
-                    self.add_task(self._spider, menu, page, max_retries=10)
+
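+                # a list of date ranges fans out into one task per (range, page)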
+                dates = self.kwargs["dates"]
+                if dates is not None and isinstance(dates, list):
+                    for date in dates:
+                        for page in range(1, max_page + 1):
+                            _spider(menu, page, max_retries=10, date=date)
+                else:
+                    date = self.kwargs['date']
+                    for page in range(1, max_page + 1):
+                        _spider(menu, page, max_retries=10, date=date)
 
             self.wait()
         finally:
@@ -206,5 +225,6 @@ if __name__ == '__main__':
         Menu('资格预审公告', 'a_zgzbtbggfwpt_zgysgg2', '92', 'qualify', 1),
     ]
 
-    # Spider(menu_lst, threads=2, date_items={'start_date': '2025-04-28', 'end_date': '2025-04-28'}).start()
+    # Spider(menu_lst, threads=2, date=('2025-04-28', '2025-04-28')).start()
+    # Spider(menu_lst, threads=2, dates=[('2025-04-28', '2025-04-28'), ('2025-04-27', '2025-04-27')]).start()
     Spider(menu_lst, threads=2).start()