maguopeng 2 년 전
부모
커밋
e71f860a25
2개의 변경된 파일, 31개의 추가 그리고 11개의 삭제
  1. 2 4
      FworkSpider/items/njpc_item.py
  2. 29 7
      FworkSpider/items/spider_item.py

+ 2 - 4
FworkSpider/items/njpc_item.py

@@ -21,6 +21,7 @@ class DataNjpcItem(Item):
 
         # 辅助字段 存储时的辅助字段
         self.save = True  # 区县
+        self.sendflag = False
 
         # 以下字段为 二类字段,没有则不做存储,不在存储结构中
         # 附件,默认为Null 正确的格式为 projectinfo.attachments = [{
@@ -100,11 +101,8 @@ class DataNjpcItem(Item):
             raise ValueError("The publication time format is incorrect -> %r " %(self.publishtime))
 
         # 数据获取失败处理:输出错误日志
-        if self.contentlhtml is None:
-            log.error(f"{self.href},此链接数据正文抓取失败")
-            self.save=False
         if not self.projectname or not self.publishtime or not self.href:
-            log.error(f"部分数据抓取失败,数据详情:\n 链接:{self.href}\n 发布时间:{self.publishtime}\n标题:{self.title}")
+            log.error(f"部分数据抓取失败,数据详情:\n 链接:{self.href}\n 发布时间:{self.publishtime}\n标题:{self.projectname}")
             self.save=False
         if self.contentlhtml is not None and self.detail =='':
             self.detail = substitute(self.contentlhtml)

+ 29 - 7
FworkSpider/items/spider_item.py

@@ -1,11 +1,13 @@
 from feapder import Item
-from untils.tools import int2long,substitute,text_search,CheckPrePareRequest
+from untils.tools import int2long, substitute, text_search, CheckPrePareRequest, HtmlEmptyError
 import time
 from feapder.utils.log import log
 from feapder.utils.tools import get_current_date
 from datetime import datetime
 import os
 from feapder import setting
+global xxc
+xxc = 0
 
 class DataBakItem(Item):
 
@@ -30,8 +32,10 @@ class DataBakItem(Item):
         self.contenthtml = ""  # 快照页源码
         self.detail = ""  # 快照页源码清洗之后招投标文本
         self.projectinfo = None  # 快照页源码清洗之后招投标文本
+        self.save = True
     def stop(self):
-        print(self.cc_err)
+        self.save=False
+        raise HtmlEmptyError
 
     def pre_to_db(self):
         # 生成入库时间戳(秒级), 定义为long型
@@ -64,7 +68,6 @@ class DataBakItem(Item):
                 self.sendflag = "true"
 
 
-
 class MgpListItem(Item):
     def __init__(self):
         # self.__table_name__='ggg_list'
@@ -79,7 +82,7 @@ class MgpListItem(Item):
         self.parse_url = "" # 定义解析详情页主页内容的xpath
         self.request_params = {} # 定义callback所需的参数,诸如render,headers,method,data,params等等,
                                 # 必须与requests请求的参数名称对应,否则无法识别
-        self.failed = 0 # 定义callback所需的参数,诸如render,headers,method等等
+        self.failed = 0 #失败请求的计数
         self.author = "开发及维护人员" # 开发及维护人员
         self.ex_js = ''  # 定义需要执行的python代码时所需的参数、js_str、js文件路径 等
         self.ex_python = None # 定义需要执行的python代码,生成params/date,如header和cookie特殊,最好使用特殊定义法
@@ -88,6 +91,8 @@ class MgpListItem(Item):
         self.files = False # 附件采集配置
         self.error = None
         self.spidercode = ""
+        self.save=True
+
         # self.error_info =
     def pre_to_db(self):
         # 生成入库时间戳(秒级), 定义为long型
@@ -95,10 +100,19 @@ class MgpListItem(Item):
         self.spidercode = self.item.get("spidercode")
 
         if "通知公告" in self.item.get("channel"):
-            CheckPrePareRequest().check_crawl_title(self.item.get("title"))
+            code,reason = CheckPrePareRequest().check_crawl_title(self.item.get("title"))
+            if code == 10106:
+                log.error(f"{self.item.get('title')}----不可入库,失败原因:{reason}")
         elif "公告公示" in self.item.get("channel"):
-            CheckPrePareRequest().check_crawl_title(self.item.get("title"))
-        #  '''
+            code,reason = CheckPrePareRequest().check_crawl_title(self.item.get("title"))
+            if code == 10106:
+                log.error(f"{self.item.get('title')}----不可入库,失败原因:{reason}")
+
+        global xxc
+        xxc += 1
+
+    def open_spider(self):
+        pass
 
 class ListItem(Item):
     def __init__(self):
@@ -109,6 +123,7 @@ class ListItem(Item):
         self.count=0
         self.code=-1
         self.rel_count = 0
+        self.save=True
 
     def pre_to_db(self):
         time.sleep(0.1)
@@ -116,3 +131,10 @@ class ListItem(Item):
         if self.author is None:
             self.author = os.path.basename(os.getcwd())
         self.runtime = get_current_date(date_format="%Y-%m-%d")
+        global xxc
+        print("xxc___________________",xxc)
+        self.rel_count = xxc
+        xxc = 0
+
+
+