|
@@ -136,30 +136,27 @@ class CrawlDetailPageSpider:
|
|
|
|
|
|
def process_content(self, content, rows: dict):
|
|
|
self.process_attachment(content, rows)
|
|
|
- if "method=downEnId" in rows.get('projectinfo').get('attachments').get('1').get('org_url'):
|
|
|
- logger.warning("[不采集]{}-{}".format(rows['title'], rows['publishtime']))
|
|
|
- else:
|
|
|
- rows["contenthtml"] = clean_js(content)
|
|
|
- special = {
|
|
|
- '<iframe[^<>]*>[\s\S]*?</iframe>': ''
|
|
|
- }
|
|
|
- rows["detail"] = cleaner(content, special=special)
|
|
|
- try:
|
|
|
- CheckText(rows["detail"])
|
|
|
- except CustomCheckError:
|
|
|
- # 页面是一个pdf阅读器, eg: https://www.zbytb.com/s-zhongbiao-10119392.html
|
|
|
- rows["detail"] = "<br/>详细内容请访问原网页!"
|
|
|
+ rows["contenthtml"] = clean_js(content)
|
|
|
+ special = {
|
|
|
+ '<iframe[^<>]*>[\s\S]*?</iframe>': ''
|
|
|
+ }
|
|
|
+ rows["detail"] = cleaner(content, special=special)
|
|
|
+ try:
|
|
|
+ CheckText(rows["detail"])
|
|
|
+ except CustomCheckError:
|
|
|
+ # 页面是一个pdf阅读器, eg: https://www.zbytb.com/s-zhongbiao-10119392.html
|
|
|
+ rows["detail"] = "<br/>详细内容请访问原网页!"
|
|
|
|
|
|
- rows["comeintime"] = int2long(int(time.time()))
|
|
|
- '''清除采集字段'''
|
|
|
- if 'crawl_status' in rows:
|
|
|
- del rows['crawl_status']
|
|
|
- del rows['type_code'], rows['account'], rows['crawl'], rows['count']
|
|
|
- try:
|
|
|
- self.save_tab.insert_one(rows)
|
|
|
- except DuplicateKeyError:
|
|
|
- pass
|
|
|
- logger.info("[采集成功]{}-{}".format(rows['title'], rows['publishtime']))
|
|
|
+ rows["comeintime"] = int2long(int(time.time()))
|
|
|
+ '''清除采集字段'''
|
|
|
+ if 'crawl_status' in rows:
|
|
|
+ del rows['crawl_status']
|
|
|
+ del rows['type_code'], rows['account'], rows['crawl'], rows['count']
|
|
|
+ try:
|
|
|
+ self.save_tab.insert_one(rows)
|
|
|
+ except DuplicateKeyError:
|
|
|
+ pass
|
|
|
+ logger.info("[采集成功]{}-{}".format(rows['title'], rows['publishtime']))
|
|
|
|
|
|
def set_senior_privilege(self, item: dict):
|
|
|
"""
|