|
@@ -1,11 +1,13 @@
|
|
|
from feapder import Item
|
|
|
-from untils.tools import int2long,substitute,text_search,CheckPrePareRequest
|
|
|
+from untils.tools import int2long, substitute, text_search, CheckPrePareRequest, HtmlEmptyError
|
|
|
import time
|
|
|
from feapder.utils.log import log
|
|
|
from feapder.utils.tools import get_current_date
|
|
|
from datetime import datetime
|
|
|
import os
|
|
|
from feapder import setting
|
|
|
# Module-level counter shared by the item classes in this file:
# MgpListItem.pre_to_db increments it, and ListItem.pre_to_db copies the
# current value into its rel_count attribute and then resets it to 0.
# No `global` statement is needed here: at module scope every assignment
# already binds a module global, so `global xxc` would be a no-op.
xxc = 0
|
|
|
|
|
|
class DataBakItem(Item):
|
|
|
|
|
@@ -30,8 +32,10 @@ class DataBakItem(Item):
|
|
|
self.contenthtml = "" # 快照页源码
|
|
|
self.detail = "" # 快照页源码清洗之后招投标文本
|
|
|
self.projectinfo = None # 快照页源码清洗之后招投标文本
|
|
|
+ self.save = True
|
|
|
def stop(self):
    """Clear the ``save`` flag on this item and abort with HtmlEmptyError.

    Raises:
        HtmlEmptyError: always, after ``self.save`` has been set to False.
    """
    # NOTE(review): presumably ``save`` is read elsewhere to decide whether
    # the item gets stored -- confirm against the code that consumes it.
    self.save = False
    raise HtmlEmptyError()
|
|
|
|
|
|
def pre_to_db(self):
|
|
|
# 生成入库时间戳(秒级), 定义为long型
|
|
@@ -64,7 +68,6 @@ class DataBakItem(Item):
|
|
|
self.sendflag = "true"
|
|
|
|
|
|
|
|
|
-
|
|
|
class MgpListItem(Item):
|
|
|
def __init__(self):
|
|
|
# self.__table_name__='ggg_list'
|
|
@@ -79,7 +82,7 @@ class MgpListItem(Item):
|
|
|
self.parse_url = "" # 定义解析详情页主页内容的xpath
|
|
|
self.request_params = {} # 定义callback所需的参数,诸如render,headers,method,data,params等等,
|
|
|
# 必须与requests请求的参数名称对应,否则无法识别
|
|
|
- self.failed = 0 # 定义callback所需的参数,诸如render,headers,method等等
|
|
|
+ self.failed = 0 #失败请求的计数
|
|
|
self.author = "开发及维护人员" # 开发及维护人员
|
|
|
self.ex_js = '' # 定义需要执行的python代码时所需的参数、js_str、js文件路径 等
|
|
|
self.ex_python = None # 定义需要执行的python代码,生成params/date,如header和cookie特殊,最好使用特殊定义法
|
|
@@ -88,6 +91,8 @@ class MgpListItem(Item):
|
|
|
self.files = False # 附件采集配置
|
|
|
self.error = None
|
|
|
self.spidercode = ""
|
|
|
+ self.save=True
|
|
|
+
|
|
|
# self.error_info =
|
|
|
def pre_to_db(self):
|
|
|
# 生成入库时间戳(秒级), 定义为long型
|
|
@@ -95,10 +100,19 @@ class MgpListItem(Item):
|
|
|
self.spidercode = self.item.get("spidercode")
|
|
|
|
|
|
if "通知公告" in self.item.get("channel"):
|
|
|
- CheckPrePareRequest().check_crawl_title(self.item.get("title"))
|
|
|
+ code,reason = CheckPrePareRequest().check_crawl_title(self.item.get("title"))
|
|
|
+ if code == 10106:
|
|
|
+ log.error(f"{self.item.get('title')}----不可入库,失败原因:{reason}")
|
|
|
elif "公告公示" in self.item.get("channel"):
|
|
|
- CheckPrePareRequest().check_crawl_title(self.item.get("title"))
|
|
|
- # '''
|
|
|
+ code,reason = CheckPrePareRequest().check_crawl_title(self.item.get("title"))
|
|
|
+ if code == 10106:
|
|
|
+ log.error(f"{self.item.get('title')}----不可入库,失败原因:{reason}")
|
|
|
+
|
|
|
+ global xxc
|
|
|
+ xxc += 1
|
|
|
+
|
|
|
def open_spider(self):
    """No-op: performs no work and returns None."""
    return None
|
|
|
|
|
|
class ListItem(Item):
|
|
|
def __init__(self):
|
|
@@ -109,6 +123,7 @@ class ListItem(Item):
|
|
|
self.count=0
|
|
|
self.code=-1
|
|
|
self.rel_count = 0
|
|
|
+ self.save=True
|
|
|
|
|
|
def pre_to_db(self):
|
|
|
time.sleep(0.1)
|
|
@@ -116,3 +131,10 @@ class ListItem(Item):
|
|
|
if self.author is None:
|
|
|
self.author = os.path.basename(os.getcwd())
|
|
|
self.runtime = get_current_date(date_format="%Y-%m-%d")
|
|
|
+ global xxc
|
|
|
+ print("xxc___________________",xxc)
|
|
|
+ self.rel_count = xxc
|
|
|
+ xxc = 0
|
|
|
+
|
|
|
+
|
|
|
+
|