Bladeren bron

拟建爬虫标题过滤限制

lizongze 1 jaar geleden
bovenliggende
commit
328d6952e6
2 gewijzigde bestanden met 11 toevoegingen en 2 verwijderingen
  1. 1 1
      FworkSpider/items/njpc_item.py
  2. 10 1
      FworkSpider/untils/check_data.py

+ 1 - 1
FworkSpider/items/njpc_item.py

@@ -120,7 +120,7 @@ class NjpcListItem(BaseListItem):
         self.publishtime = ""  # 文章发布时间
 
     def pre_to_db(self):
-        if CheckData.channel(self.channel, group="njpc"):
+        if CheckData.channel(self.channel, self.site, group="njpc"):
             code, reason = CheckData.title(self.projectname, group="njpc")
             if code == 10106:
                 log.warning(f"{self.projectname}--不可入库,原因:{reason}")

+ 10 - 1
FworkSpider/untils/check_data.py

@@ -22,6 +22,11 @@ class CheckData:
     __plan_to_build_channel_set = {"通知", "部门文件", "公示",
                                     "办件" ,"公司", "新闻", "最新资讯"}
 
+    __ignore_site = ['辽宁省投资项目在线审批监管平台', '河北省投资项目在线审批管理平台', '广西投资项目在线审批监管平台',
+                     '安徽省投资项目在线审批监管平台', '宁夏回族自治区投资项目在线审批办事大厅', '广州市规划和自然资源局',
+                     '湖北政务服务网', '贵州省投资项目在线审批监管平台', '海南省投资项目在线审批监管平台',
+                     '吉林省投资项目在线审批监管平台', '江苏省投资项目在线审批监管平台新网址']
+
     @classmethod
     def title(cls, name: str, group=None):
         check_texts = cls.__bidding_title_set
@@ -37,11 +42,15 @@ class CheckData:
         return 200, 'ok'
 
     @classmethod
-    def channel(cls, name: str, group=None):
+    def channel(cls, name: str, site: str, group=None):
         check_texts = cls.__bidding_channel_set
+        ignore_site = cls.__ignore_site
         if group and group.lower() in ["njpc", "plan_to_build"]:
             check_texts = cls.__plan_to_build_channel_set
 
+        if site in ignore_site:
+            return False
+
         for text in check_texts:
             valid_text = re.search(text, name)
             if valid_text is not None: