
Optimize date parameter configuration; support both multi-day and single-day queries.

dzr, 3 months ago
parent commit 3bba603662
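
The commit replaces the old date_items dict with two keyword arguments on Spider and Crawl_Zgzb: date, a single (start_date, end_date) tuple, and dates, an optional list of such tuples for multi-day runs. A minimal usage sketch, assuming Spider is importable and menus is a list of Menu namedtuples like the menu_lst defined in spider_list.py's __main__ block:

    from lzz_theme.zgzbtbggfwpt.spider_list import Spider

    menus = [...]  # a list of Menu namedtuples, e.g. menu_lst from spider_list.py

    # Single range: one (start_date, end_date) tuple via the new `date` kwarg.
    Spider(menus, threads=2, date=('2025-04-28', '2025-04-28')).start()

    # Multiple ranges: a list of (start_date, end_date) tuples via `dates`.
    Spider(menus, threads=2, dates=[('2025-04-28', '2025-04-28'),
                                    ('2025-04-27', '2025-04-27')]).start()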

+ 2 - 8
lzz_theme/zgzbtbggfwpt/history_crawl2.py

@@ -26,11 +26,5 @@ if __name__ == '__main__':
     today_str = today.strftime("%Y-%m-%d")
     before_day_3 = today - timedelta(days=3)
     before_day_3_str = before_day_3.strftime("%Y-%m-%d")
-
-    Crawl_Zgzb(
-        menus,
-        date_items={
-            'start_date': before_day_3_str,
-            'end_date': today_str
-        }
-    ).start()
+    date = (before_day_3_str, today_str)
+    Crawl_Zgzb(menus, date=date, threads=10, interval=0.4).start()
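
The tuple passed here is a rolling window from three days ago through today. A self-contained sketch of just that computation (date_window is a hypothetical helper; the script inlines the same logic):

    from datetime import date, timedelta

    def date_window(days_back=3):
        # Return (start, end) as "YYYY-MM-DD" strings; end is today.
        today = date.today()
        start = today - timedelta(days=days_back)
        return start.strftime("%Y-%m-%d"), today.strftime("%Y-%m-%d")

    print(date_window())  # e.g. ('2025-07-25', '2025-07-28')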

+ 43 - 28
lzz_theme/zgzbtbggfwpt/spider_list.py

@@ -10,6 +10,7 @@ import sys
 import time
 from collections import namedtuple
 from concurrent.futures import ThreadPoolExecutor, wait
+from functools import partial
 from pathlib import Path
 
 import execjs
@@ -33,12 +34,12 @@ except AttributeError:
 
 class Spider:
 
-    def __init__(self, menus, threads=1, interval=0.5, date_items=None):
+    def __init__(self, menus, threads=1, interval=0.5, date=None, dates=None):
         self.theme_list = Mongo_client()['py_spider']['theme_list']
         self.RDS = RedisFilter()
 
         self.menus = menus
-        self.kwargs = {'date_items': (date_items or {})}
+        self.kwargs = {'date': date, 'dates': dates}
 
         thread_name = Path(sys.argv[0]).name.replace('.py', '')
         self._executor = ThreadPoolExecutor(max_workers=threads,
@@ -64,11 +65,15 @@ class Spider:
             ctx = execjs.compile(ex_js)
             return ctx.call('getFullHref')
 
-    def download(self, page, menu, start_date=None, end_date=None):
+    def download(self, page, menu, date=None):
         logger.debug(f"网络请求|{menu.channel}|第{page}页")
-        if not start_date and not end_date:
+        if date is None:
             url = "https://bulletin.cebpubservice.com/xxfbcmses/search/" + menu.rid + ".html?searchDate=2000-03-20&dates=300&word=&categoryId=" + menu.cid + "&industryName=&area=&status=&publishMedia=&sourceInfo=&showStatus=0&page=" + str(page)
         else:
+            if not isinstance(date, (tuple, list)):
+                raise TypeError("dates is not iterable")
+
+            start_date, end_date = date
             end_date = str(end_date).strip() + " 23:59:59"
             url = "https://bulletin.cebpubservice.com/xxfbcmses/search/" + menu.rid + ".html?searchDate=2000-03-20&dates=300&word=&categoryId=" + menu.cid + "&industryName=&area=&status=&publishMedia=&sourceInfo=&showStatus=0&startcheckDate=" + start_date + "&endcheckDate=" + end_date + "&page=" + str(page)
 
@@ -90,12 +95,27 @@ class Spider:
         response.raise_for_status()
         return response
 
-    def fetch_list(self, menu, page, max_retries=3, show_log=True):
-        date_items = self.kwargs["date_items"]
+    def fetch_max_pagination(self, menu):
+        logger.info(f"自动获取最大分页|{menu.channel}")
+        c = 0
+        while True:
+            try:
+                response = self.fetch_list(menu, 1, 3, show_log=False)
+                if response is not None:
+                    html = response.content.decode()
+                    pagination = Selector(html).xpath(
+                        '//div[@class="pagination"]/label/text()'
+                    ).extract_first()
+                    if pagination is not None and str(pagination).isdigit():
+                        return int(pagination)
+            except Exception as e:
+                c += 1  # advance the retry counter so the log shows the attempt number
+                logger.error(f"网络请求|{menu.channel}|{type(e).__name__}|重试..{c}")
+
+    def fetch_list(self, menu, page, max_retries=3, date=None, show_log=True):
         for i in range(max_retries):
             try:
                 time.sleep(self._interval)
-                return self.download(page=page, menu=menu, **date_items)
+                return self.download(page=page, menu=menu, date=date)
             except Exception as e:
                 if show_log:
                     logger.error(f"网络请求|{menu.channel}|第{page}页|{type(e).__name__}|重试..{i + 1}")
@@ -147,33 +167,19 @@ class Spider:
 
         logger.info(f'采集成功|{menu.channel}|第{page}页|发布{len(root[1:])}条|入库{count}条')
 
-    def _spider(self, menu, page, max_retries):
+    def _spider(self, menu, page, max_retries, date):
         try:
-            response = self.fetch_list(menu, page, max_retries)
+            response = self.fetch_list(menu, page, max_retries, date)
             if response is not None:
                 html = response.content.decode()
                 self.parse(html, page, menu)
         except Exception as why:
             logger.error(f"采集失败|{menu.channel}|第{page}页|原因|{type(why).__name__}")
 
-    def fetch_max_pagination(self, menu):
-        logger.info(f"自动获取最大分页|{menu.channel}")
-        c = 0
-        while True:
-            try:
-                response = self.fetch_list(menu, 1, 3, show_log=False)
-                if response is not None:
-                    html = response.content.decode()
-                    pagination = Selector(html).xpath(
-                        '//div[@class="pagination"]/label/text()'
-                    ).extract_first()
-                    if pagination is not None and str(pagination).isdigit():
-                        return int(pagination)
-            except Exception as e:
-                logger.error(f"网络请求|{menu.channel}|{type(e).__name__}|重试..{c + 1}")
-
     def start(self):
         logger.debug("********** 任务开始 **********")
+        _spider = partial(self.add_task, self._spider)
+
         try:
             for menu in self.menus:
                 auto_paginate = getattr(menu, 'auto_paginate', False)
@@ -186,8 +192,16 @@ class Spider:
                         continue
 
                 max_page = max(max_page, 1)
-                for page in range(1, max_page + 1):
-                    self.add_task(self._spider, menu, page, max_retries=10)
+
+                dates = self.kwargs["dates"]
+                if isinstance(dates, list):
+                    for date in dates:
+                        for page in range(1, max_page + 1):
+                            _spider(menu, page, max_retries=10, date=date)
+                else:
+                    date = self.kwargs['date']
+                    for page in range(1, max_page + 1):
+                        _spider(menu, page, max_retries=10, date=date)
 
             self.wait()
         finally:
@@ -206,5 +220,6 @@ if __name__ == '__main__':
         Menu('资格预审公告', 'a_zgzbtbggfwpt_zgysgg2', '92', 'qualify', 1),
     ]
 
-    # Spider(menu_lst, threads=2, date_items={'start_date': '2025-04-28', 'end_date': '2025-04-28'}).start()
+    # Spider(menu_lst, threads=2, date=('2025-04-28', '2025-04-28')).start()
+    # Spider(menu_lst, threads=2, dates=[('2025-04-28', '2025-04-28'), ('2025-04-27', '2025-04-27')]).start()
     Spider(menu_lst, threads=2).start()
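
The partial(self.add_task, self._spider) binding in start() pre-fills the task callable, so each scheduling call reads like a direct call to _spider while still going through add_task and the thread pool. A standalone sketch of the same pattern, with a hypothetical add_task that submits to a ThreadPoolExecutor:

    from concurrent.futures import ThreadPoolExecutor, wait
    from functools import partial

    executor = ThreadPoolExecutor(max_workers=2)
    futures = []

    def add_task(fn, *args, **kwargs):
        # Hypothetical stand-in for Spider.add_task: submit and track the future.
        futures.append(executor.submit(fn, *args, **kwargs))

    def crawl(menu, page, max_retries=3, date=None):
        print(f"crawl {menu} page={page} date={date}")

    _spider = partial(add_task, crawl)  # pre-bind the task function, as in start()
    _spider("qualify", 1, max_retries=10, date=("2025-04-25", "2025-04-28"))
    _spider("qualify", 2, max_retries=10, date=("2025-04-25", "2025-04-28"))
    wait(futures)
    executor.shutdown()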

+ 2 - 10
lzz_theme/zgzbtbggfwpt/zgzbtbggfwpt_list_date.py

@@ -24,13 +24,5 @@ if __name__ == '__main__':
     today_str = today.strftime("%Y-%m-%d")
     before_day_3 = today - timedelta(days=3)
     before_day_3_str = before_day_3.strftime("%Y-%m-%d")
-
-    Spider(
-        menus,
-        threads=10,
-        interval=1,
-        date_items={
-            'start_date': before_day_3_str,
-            'end_date': today_str
-        }
-    ).start()
+    date = (before_day_3_str, today_str)
+    Spider(menus, threads=10, interval=1, date=date).start()
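
This script queries one continuous window, so the multi-day path is unused here. If per-day results were wanted instead, the same window could be expanded into a dates list; a sketch, assuming Spider and menus are in scope as in this script, with per-day (start, end) tuples as in the commented example in spider_list.py:

    from datetime import date, timedelta

    today = date.today()
    # One (start, end) tuple per day, three days back through today, oldest first.
    day_ranges = []
    for i in range(3, -1, -1):
        d = (today - timedelta(days=i)).strftime("%Y-%m-%d")
        day_ranges.append((d, d))

    Spider(menus, threads=10, interval=1, dates=day_ranges).start()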