2 years ago · e71f860a25
--- a/FworkSpider/items/njpc_item.py
+++ b/FworkSpider/items/njpc_item.py
@@ -21,6 +21,7 @@ class DataNjpcItem(Item):
 
				 
			
 
				         # 辅助字段 存储时的辅助字段
			
 
				         self.save = True  # 区县
			
 
				+        self.sendflag = False
			
 
				 
			
 
				         # 以下字段为 二类字段，没有则不做存储，不在存储结构中
			
 
				         # 附件，默认为Null 正确的格式为 projectinfo.attachments = [{
			
@@ -100,11 +101,8 @@ class DataNjpcItem(Item):
 
				             raise ValueError("The publication time format is incorrect -> %r " %(self.publishtime))
			
 
				 
			
 
				         # 数据获取失败处理：输出错误日志
			
 
				-        if self.contentlhtml is None:
			
 
				-            log.error(f"{self.href},此链接数据正文抓取失败")
			
 
				-            self.save=False
			
 
				         if not self.projectname or not self.publishtime or not self.href:
			
 
				-            log.error(f"部分数据抓取失败，数据详情：\n 链接：{self.href}\n 发布时间：{self.publishtime}\n标题:{self.title}")
			
 
				+            log.error(f"部分数据抓取失败，数据详情：\n 链接：{self.href}\n 发布时间：{self.publishtime}\n标题:{self.projectname}")
			
 
				             self.save=False
			
 
				         if self.contentlhtml is not None and self.detail =='':
			
 
				             self.detail = substitute(self.contentlhtml)
			
--- a/FworkSpider/items/spider_item.py
+++ b/FworkSpider/items/spider_item.py
@@ -1,11 +1,13 @@
 
				 from feapder import Item
			
 
				-from untils.tools import int2long,substitute,text_search,CheckPrePareRequest
			
 
				+from untils.tools import int2long, substitute, text_search, CheckPrePareRequest, HtmlEmptyError
			
 
				 import time
			
 
				 from feapder.utils.log import log
			
 
				 from feapder.utils.tools import get_current_date
			
 
				 from datetime import datetime
			
 
				 import os
			
 
				 from feapder import setting
			
 
				+global xxc
			
 
				+xxc = 0
			
 
				 
			
 
				 class DataBakItem(Item):
			
 
				 
			
@@ -30,8 +32,10 @@ class DataBakItem(Item):
 
				         self.contenthtml = ""  # 快照页源码
			
 
				         self.detail = ""  # 快照页源码清洗之后招投标文本
			
 
				         self.projectinfo = None  # 快照页源码清洗之后招投标文本
			
 
				+        self.save = True
			
 
				     def stop(self):
			
 
				-        print(self.cc_err)
			
 
				+        self.save=False
			
 
				+        raise HtmlEmptyError
			
 
				 
			
 
				     def pre_to_db(self):
			
 
				         # 生成入库时间戳（秒级）, 定义为long型
			
@@ -64,7 +68,6 @@ class DataBakItem(Item):
 
				                 self.sendflag = "true"
			
 
				 
			
 
				 
			
 
				-
			
 
				 class MgpListItem(Item):
			
 
				     def __init__(self):
			
 
				         # self.__table_name__='ggg_list'
			
@@ -79,7 +82,7 @@ class MgpListItem(Item):
 
				         self.parse_url = "" # 定义解析详情页主页内容的xpath
			
 
				         self.request_params = {} # 定义callback所需的参数，诸如render，headers，method，data，params等等，
			
 
				                                 # 必须与requests请求的参数名称对应，否则无法识别
			
 
				-        self.failed = 0 # 定义callback所需的参数，诸如render，headers，method等等
			
 
				+        self.failed = 0 #失败请求的计数
			
 
				         self.author = "开发及维护人员" # 开发及维护人员
			
 
				         self.ex_js = ''  # 定义需要执行的python代码时所需的参数、js_str、js文件路径 等
			
 
				         self.ex_python = None # 定义需要执行的python代码，生成params/date，如header和cookie特殊，最好使用特殊定义法
			
@@ -88,6 +91,8 @@ class MgpListItem(Item):
 
				         self.files = False # 附件采集配置
			
 
				         self.error = None
			
 
				         self.spidercode = ""
			
 
				+        self.save=True
			
 
				+
			
 
				         # self.error_info =
			
 
				     def pre_to_db(self):
			
 
				         # 生成入库时间戳（秒级）, 定义为long型
			
@@ -95,10 +100,19 @@ class MgpListItem(Item):
 
				         self.spidercode = self.item.get("spidercode")
			
 
				 
			
 
				         if "通知公告" in self.item.get("channel"):
			
 
				-            CheckPrePareRequest().check_crawl_title(self.item.get("title"))
			
 
				+            code,reason = CheckPrePareRequest().check_crawl_title(self.item.get("title"))
			
 
				+            if code == 10106:
			
 
				+                log.error(f"{self.item.get('title')}----不可入库，失败原因:{reason}")
			
 
				         elif "公告公示" in self.item.get("channel"):
			
 
				-            CheckPrePareRequest().check_crawl_title(self.item.get("title"))
			
 
				-        #  '''
			
 
				+            code,reason = CheckPrePareRequest().check_crawl_title(self.item.get("title"))
			
 
				+            if code == 10106:
			
 
				+                log.error(f"{self.item.get('title')}----不可入库，失败原因:{reason}")
			
 
				+
			
 
				+        global xxc
			
 
				+        xxc += 1
			
 
				+
			
 
				+    def open_spider(self):
			
 
				+        pass
			
 
				 
			
 
				 class ListItem(Item):
			
 
				     def __init__(self):
			
@@ -109,6 +123,7 @@ class ListItem(Item):
 
				         self.count=0
			
 
				         self.code=-1
			
 
				         self.rel_count = 0
			
 
				+        self.save=True
			
 
				 
			
 
				     def pre_to_db(self):
			
 
				         time.sleep(0.1)
			
@@ -116,3 +131,10 @@ class ListItem(Item):
 
				         if self.author is None:
			
 
				             self.author = os.path.basename(os.getcwd())
			
 
				         self.runtime = get_current_date(date_format="%Y-%m-%d")
			
 
				+        global xxc
			
 
				+        print("xxc___________________",xxc)
			
 
				+        self.rel_count = xxc
			
 
				+        xxc = 0
			
 
				+
			
 
				+
			
 
				+