
ybw: check whether the session information persists

lizongze committed 3 years ago
commit 51e3503b13
2 files changed, 31 insertions(+), 14 deletions(-)
  1. ybw/list_spider.py (+18 −13)
  2. ybw/utils/tools.py (+13 −1)
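
The commit message refers to keeping the login session alive: in the diff below, `requests.session()` moves out of the per-region loop (and `session.close()` moves out to match), so a single logged-in session and its cookies are reused for an entire menu crawl instead of being rebuilt per region. A minimal sketch of that pattern follows; the endpoint URL and query parameter are hypothetical stand-ins, not the spider's real request.

import requests

def crawl_regions(regions: dict):
    # One session for the whole menu: login cookies set once persist
    # across every region, instead of being discarded per region.
    session = requests.session()
    try:
        for region_id, region_name in regions.items():
            # Hypothetical list endpoint; the real spider builds its own URL.
            resp = session.get(
                "https://example.com/search",
                params={"areaId": region_id},
                timeout=60,
            )
            resp.raise_for_status()
    finally:
        session.close()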

ybw/list_spider.py (+18 −13)

@@ -5,11 +5,11 @@ from urllib.parse import quote
 
 import requests
 from lxml.html import fromstring, HtmlElement
-
+from utils.tools import sha1
 from config.load import crawler_url, region
 from crawler.crawl_scheduler import Scheduler
 from crawler.login import login, load_login_cookies, login_session_check
-from utils.databases import mongo_table, int2long, es_query
+from utils.databases import mongo_table, int2long, es_query, redis_client
 from utils.execptions import CustomCheckError, VoidCrawlError, JyBasicException
 from utils.log import logger
 from utils.socks5 import Proxy
@@ -22,17 +22,19 @@ class ListSpider:
     def __init__(self, db: str, crawl_tab: str, crawl_max_page=None):
         self.crawl_menus = [
             # CrawlMenu('企业采购', 'a_ybwcgyzbw_qycg', '7%2C'),
-            CrawlMenu('政府采购', 'a_ybwcgyzbw_zfcg', '6%2C'),
-            CrawlMenu('招标预告', 'a_ybwcgyzbw_zbyg', '5%2C'),
+            # CrawlMenu('政府采购', 'a_ybwcgyzbw_zfcg', '6%2C'),
+            # CrawlMenu('招标预告', 'a_ybwcgyzbw_zbyg', '5%2C'),
             CrawlMenu('中标公示', 'a_ybwcgyzbw_zbgs', '4%2C'),
-            CrawlMenu('服务招标', 'a_ybwcgyzbw_fwzb', '3%2C'),
-            CrawlMenu('货物招标', 'a_ybwcgyzbw_hwzb', '2%2C'),
-            CrawlMenu('工程招标', 'a_ybwcgyzbw_gczb', '1%2C'),
+            # CrawlMenu('服务招标', 'a_ybwcgyzbw_fwzb', '3%2C'),
+            # CrawlMenu('货物招标', 'a_ybwcgyzbw_hwzb', '2%2C'),
+            # CrawlMenu('工程招标', 'a_ybwcgyzbw_gczb', '1%2C'),
         ]
         self.crawl_max_page = crawl_max_page or 1
         self.crawl_tab = mongo_table(db, crawl_tab)
         self.user = None
         self.session = None
+        self.r = redis_client()
+        self.redis_key = 'ybw_2022'
 
     def crawl_request(self, url: str, refer: str, **kwargs):
         headers = {
@@ -113,20 +115,23 @@ class ListSpider:
             }
             if title is None:
                 raise CustomCheckError(code=10107, reason='发布标题为空')
-            item['count'] = es_query(item["title"], item["l_np_publishtime"])
-            item['crawl'] = False
-            # print(f'>>> {title} - {competehref}')
-            results.append(item)
+            sign = sha1(item['competehref'])
+            if not self.r.hexists(self.redis_key, sign):
+                item['count'] = es_query(item["title"], item["l_np_publishtime"])
+                item['crawl'] = False
+                # print(f'>>> {title} - {competehref}')
+                results.append(item)
+                self.r.hset(self.redis_key, sign, '')
 
         if len(results) > 0:
             self.crawl_tab.insert_many(results)
         return len(results)
 
     def crawl_spider(self, sc: Scheduler, menu: CrawlMenu):
+        self.session = requests.session()
         for region_id, region_name in region.items():
             previous_url = None
             crawl_total, cookies = 1, None
-            self.session = requests.session()
             '''Each regular account can only query 4,000 records; with the per-page maximum of 100 rows, that is 40 pages in total'''
             page_size = 30
             for page in range(1, self.crawl_max_page + 1):
@@ -184,7 +189,7 @@ class ListSpider:
                     sc.err_record(e)
                 finally:
                     sc.wait_for_next_task(random.choice(range(2, 6)))
-            self.session.close()
+        self.session.close()
 
     def start(self):
         for menu in self.crawl_menus:

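The substantive change above is Redis-backed de-duplication: each row's `competehref` is hashed with the new `sha1` helper and used as a field in the `ybw_2022` Redis hash, so an item is written to Mongo only the first time its detail-page URL is seen. A minimal standalone sketch of that check, assuming a local Redis reachable via the redis-py client (the hash key name comes from the diff; the connection details and item layout are illustrative):

import hashlib

import redis


def sha1(text: str) -> str:
    # Same helper as ybw/utils/tools.py: hex digest of the UTF-8 bytes.
    digest = hashlib.sha1()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()


r = redis.Redis(host="localhost", port=6379, db=0)  # assumed local instance
REDIS_KEY = "ybw_2022"  # hash key used in the diff


def keep_if_new(item: dict) -> bool:
    """Return True only on first sight of the item's detail-page URL."""
    sign = sha1(item["competehref"])
    if r.hexists(REDIS_KEY, sign):
        return False  # seen in an earlier run: skip the Mongo insert
    r.hset(REDIS_KEY, sign, "")  # empty value: the field itself is the marker
    return True

Storing an empty value keeps the hash compact; membership of the field alone carries the "already crawled" signal, which is all `hexists` needs.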
ybw/utils/tools.py (+13 −1)

@@ -1,5 +1,5 @@
 import socket
-
+import hashlib
 
 def get_host_ip():
     s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
@@ -9,3 +9,15 @@ def get_host_ip():
     finally:
         s.close()
     return ip
+
+
+def sha1(text: str):
+    """
+    Return the SHA-1 digest of ``text`` as a hex string.
+
+    @param text: input string
+    @return: hex digest string
+    """
+    _sha1 = hashlib.sha1()
+    _sha1.update(text.encode("utf-8"))
+    return _sha1.hexdigest()
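
The helper is deterministic and always yields a 40-character hex string, which is what makes it usable as a stable Redis hash field. A quick check; the import path mirrors the `from utils.tools import sha1` line added in list_spider.py, and the URL is a hypothetical example:

from utils.tools import sha1  # assumes ybw/ is on sys.path

sign = sha1("https://example.com/notice/1")  # hypothetical detail-page URL
assert len(sign) == 40                       # SHA-1 hex digests are 40 chars
assert sign == sha1("https://example.com/notice/1")  # stable across runs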