3 years ago · 9bd793433b
--- a/zbytb/crawler/spiders/DetailPageSpider.py
+++ b/zbytb/crawler/spiders/DetailPageSpider.py
@@ -136,30 +136,27 @@ class CrawlDetailPageSpider:
 
				 
			
 
				     def process_content(self, content, rows: dict):
			
 
				         self.process_attachment(content, rows)
			
 
				-        if "method=downEnId" in rows.get('projectinfo').get('attachments').get('1').get('org_url'):
			
 
				-            logger.warning("[不采集]{}-{}".format(rows['title'], rows['publishtime']))
			
 
				-        else:
			
 
				-            rows["contenthtml"] = clean_js(content)
			
 
				-            special = {
			
 
				-                '<iframe[^<>]*>[\s\S]*?</iframe>': ''
			
 
				-            }
			
 
				-            rows["detail"] = cleaner(content, special=special)
			
 
				-            try:
			
 
				-                CheckText(rows["detail"])
			
 
				-            except CustomCheckError:
			
 
				-                # 页面是一个pdf阅读器, eg: https://www.zbytb.com/s-zhongbiao-10119392.html
			
 
				-                rows["detail"] = "<br/>详细内容请访问原网页！"
			
 
				+        rows["contenthtml"] = clean_js(content)
			
 
				+        special = {
			
 
				+            '<iframe[^<>]*>[\s\S]*?</iframe>': ''
			
 
				+        }
			
 
				+        rows["detail"] = cleaner(content, special=special)
			
 
				+        try:
			
 
				+            CheckText(rows["detail"])
			
 
				+        except CustomCheckError:
			
 
				+            # 页面是一个pdf阅读器, eg: https://www.zbytb.com/s-zhongbiao-10119392.html
			
 
				+            rows["detail"] = "<br/>详细内容请访问原网页！"
			
 
				 
			
 
				-            rows["comeintime"] = int2long(int(time.time()))
			
 
				-            '''清除采集字段'''
			
 
				-            if 'crawl_status' in rows:
			
 
				-                del rows['crawl_status']
			
 
				-            del rows['type_code'], rows['account'], rows['crawl'], rows['count']
			
 
				-            try:
			
 
				-                self.save_tab.insert_one(rows)
			
 
				-            except DuplicateKeyError:
			
 
				-                pass
			
 
				-            logger.info("[采集成功]{}-{}".format(rows['title'], rows['publishtime']))
			
 
				+        rows["comeintime"] = int2long(int(time.time()))
			
 
				+        '''清除采集字段'''
			
 
				+        if 'crawl_status' in rows:
			
 
				+            del rows['crawl_status']
			
 
				+        del rows['type_code'], rows['account'], rows['crawl'], rows['count']
			
 
				+        try:
			
 
				+            self.save_tab.insert_one(rows)
			
 
				+        except DuplicateKeyError:
			
 
				+            pass
			
 
				+        logger.info("[采集成功]{}-{}".format(rows['title'], rows['publishtime']))
			
 
				 
			
 
				     def set_senior_privilege(self, item: dict):
			
 
				         """