zbytb--update

lizongze committed 3 years ago
commit 9bd793433b
1 changed file with 20 additions and 23 deletions:

zbytb/crawler/spiders/DetailPageSpider.py (+20, -23)
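
For context on what the hunk below removes: the old branch skipped saving whenever the first attachment's original URL contained "method=downEnId", and it reached that URL through chained bare .get() calls, which raise AttributeError the moment any intermediate key is missing. A minimal sketch of the same lookup done defensively, assuming the rows layout shown in the removed line (the helper name attachment_org_url is hypothetical):

    def attachment_org_url(rows: dict) -> str:
        """Return the first attachment's original URL, or '' when any level is missing."""
        # A default at every level keeps a half-filled 'projectinfo' from
        # raising AttributeError the way chained bare .get() calls did.
        return (
            rows.get('projectinfo', {})
                .get('attachments', {})
                .get('1', {})
                .get('org_url', '')
        )

    # The deleted guard, restated on top of the safe lookup:
    # if "method=downEnId" in attachment_org_url(rows):
    #     log "[不采集]" and skip the page instead of saving it

After this commit no such guard runs at all: every page falls through to the cleaning and save steps.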

@@ -136,30 +136,27 @@ class CrawlDetailPageSpider:
 
     def process_content(self, content, rows: dict):
         self.process_attachment(content, rows)
-        if "method=downEnId" in rows.get('projectinfo').get('attachments').get('1').get('org_url'):
-            logger.warning("[不采集]{}-{}".format(rows['title'], rows['publishtime']))
-        else:
-            rows["contenthtml"] = clean_js(content)
-            special = {
-                '<iframe[^<>]*>[\s\S]*?</iframe>': ''
-            }
-            rows["detail"] = cleaner(content, special=special)
-            try:
-                CheckText(rows["detail"])
-            except CustomCheckError:
-                # The page renders a PDF viewer, e.g. https://www.zbytb.com/s-zhongbiao-10119392.html
-                rows["detail"] = "<br/>详细内容请访问原网页!"
+        rows["contenthtml"] = clean_js(content)
+        special = {
+            '<iframe[^<>]*>[\s\S]*?</iframe>': ''
+        }
+        rows["detail"] = cleaner(content, special=special)
+        try:
+            CheckText(rows["detail"])
+        except CustomCheckError:
+            # The page renders a PDF viewer, e.g. https://www.zbytb.com/s-zhongbiao-10119392.html
+            rows["detail"] = "<br/>详细内容请访问原网页!"
 
-            rows["comeintime"] = int2long(int(time.time()))
-            '''Remove crawl bookkeeping fields'''
-            if 'crawl_status' in rows:
-                del rows['crawl_status']
-            del rows['type_code'], rows['account'], rows['crawl'], rows['count']
-            try:
-                self.save_tab.insert_one(rows)
-            except DuplicateKeyError:
-                pass
-            logger.info("[采集成功]{}-{}".format(rows['title'], rows['publishtime']))
+        rows["comeintime"] = int2long(int(time.time()))
+        '''Remove crawl bookkeeping fields'''
+        if 'crawl_status' in rows:
+            del rows['crawl_status']
+        del rows['type_code'], rows['account'], rows['crawl'], rows['count']
+        try:
+            self.save_tab.insert_one(rows)
+        except DuplicateKeyError:
+            pass
+        logger.info("[采集成功]{}-{}".format(rows['title'], rows['publishtime']))
 
     def set_senior_privilege(self, item: dict):
         """