@@ -1,140 +0,0 @@
-from feapder import Item
-from untils.tools import int2long, substitute, text_search, CheckPrePareRequest, HtmlEmptyError
-import time
-from feapder.utils.log import log
-from feapder.utils.tools import get_current_date
-from datetime import datetime
-import os
-from feapder import setting
-
-xxc = 0  # module-level counter: list items seen since the last ListItem snapshot
-
-
-class DataBakItem(Item):
-
-    def __init__(self):
-        self.title = ""  # article title
-        self.publishtime = ""  # article publish time (date format xxxx-xx-xx)
-        self.spidercode = ""  # spider code (defined on the editor spider platform)
-        self.site = ""  # crawled site (defined on the editor spider platform)
-        self.channel = ""  # crawled channel/section (defined on the editor spider platform)
-        self.area = "全国"  # province (defaults to nationwide)
-        self.city = ""  # city
-        self.competehref = None  # competitor snapshot page URL
-        self.href = ""  # non-competitor snapshot page URL
-        self.publishdept = ""
-        self.iscompete = True
-        self.type = ""
-        self.T = "bidding"
-        self.l_np_publishtime = ""  # publish-time timestamp (seconds), must be stored as long
-        self.comeintime = ""  # insertion timestamp (seconds), must be stored as long
-        self.sendflag = "false"
-        self._d = "comeintime"
-        self.contenthtml = ""  # snapshot page source
-        self.detail = ""  # bidding text extracted from the cleaned snapshot page source
-        self.projectinfo = None  # structured project info, e.g. attachments (None when absent)
-        self.save = True
-
-    def stop(self):
-        # mark the item as not to be saved and abort further processing
-        self.save = False
-        raise HtmlEmptyError
-
-    def pre_to_db(self):
-        # generate the insertion timestamp (seconds) as long
-        self.comeintime = int2long(time.time())
-        # derive the publish-time timestamp (seconds, long) from publishtime;
-        # if no publish time could be parsed, consider backfilling one
-        if ":" in self.publishtime:
-            self.l_np_publishtime = int2long(int(time.mktime(time.strptime(self.publishtime, "%Y-%m-%d %H:%M:%S"))))
-        else:
-            self.l_np_publishtime = int2long(int(time.mktime(time.strptime(self.publishtime, "%Y-%m-%d"))))
-
-        # fetch failure handling: log an error and abort
-        if self.contenthtml is None and self.projectinfo is None:
-            log.error(f"{self.href}, failed to fetch the body of this link")
-            # self.sendflag = "true"
-            self.stop()
-        if not self.title or not self.publishtime or not self.href:
-            # self.sendflag = "true"
-            log.error(f"Some fields were not captured, details:\n link: {self.href}\n publish time: {self.publishtime}\n title: {self.title}")
-            self.stop()
-        # derive the plain-text body from the html
-        if self.contenthtml is not None and self.detail == '':
-            self.detail = substitute(self.contenthtml)  # detail: header/footer stripped
-            if text_search(self.detail).total == 0:
-                # no body text: flag the item "true" so it is excluded from statistics
-                self.sendflag = "true"
-
-
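For reference, the publish-time handling in DataBakItem.pre_to_db above reduces to the following stand-alone sketch (stdlib only; the int2long wrapper from untils.tools is omitted because it is defined outside this file):

    import time

    def publishtime_to_ts(publishtime: str) -> int:
        # The feed may provide dates with or without a time-of-day part.
        fmt = "%Y-%m-%d %H:%M:%S" if ":" in publishtime else "%Y-%m-%d"
        return int(time.mktime(time.strptime(publishtime, fmt)))

    print(publishtime_to_ts("2023-01-05"))           # midnight, local time
    print(publishtime_to_ts("2023-01-05 14:30:00"))  # with a time component

Note that time.mktime interprets the parsed struct_time in the machine's local timezone, so workers running in different timezones would store different l_np_publishtime values for the same page.
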
-class MgpListItem(Item):
-    def __init__(self):
-        # self.__table_name__ = 'ggg_list'
-        self.parse = ""  # name of the callback method to invoke
-        self.item = ""  # payload passed along (dict)
-        self.parser_name = ""  # name of the spider that handles the detail page
-        self.date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # current datetime
-        self.comeintime = int2long(int(time.time()))  # current timestamp (seconds, long)
-        self.deal_detail = []  # how to parse the detail-page body: a list of xpaths for detail_get, a code snippet for detail_post
-        self.create_time = None  # xpath for the publish time on the detail page, used when the list page has none
-        self.parse_url = ""  # URL of the detail page to request
-        self.request_params = {}  # params for the callback request: render, headers, method, data, params, etc.;
-                                  # names must match the requests argument names exactly or they are not recognized
-        self.failed = 0  # count of failed requests
-        self.author = "开发及维护人员"  # developer / maintainer
-        self.ex_js = ''  # JS to execute: a js string or a js file path, plus any required parameters
-        self.ex_python = None  # Python code to execute to build params/data; headers and cookies are special and best set via their dedicated fields
-        self.pri = 1  # spider alert level (9 levels)
-        self.proxies = True  # whether to crawl through proxies
-        self.files = False  # attachment collection config
-        self.error = None
-        self.spidercode = ""
-        self.save = True
-
-    # self.error_info =
-    def pre_to_db(self):
-        self.author = os.path.basename(os.getcwd())
-        self.spidercode = self.item.get("spidercode")
-
-        # title pre-check for notice/announcement channels
-        channel = self.item.get("channel")
-        if "通知公告" in channel or "公告公示" in channel:
-            code, reason = CheckPrePareRequest().check_crawl_title(self.item.get("title"))
-            if code == 10106:
-                log.error(f"{self.item.get('title')}----cannot be stored, reason: {reason}")
-
-        global xxc
-        xxc += 1
-
-    def open_spider(self):
-        pass
-
-
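As a purely hypothetical illustration of how the fields above fit together, a list-page parser might build an MgpListItem like this (the import path, URL, xpaths and spidercode are placeholders, not values taken from this file):

    from items import MgpListItem  # assumed import path for the module above

    def build_detail_task(row_url: str, title: str) -> MgpListItem:
        item = MgpListItem()
        item.parse = "self.detail_get"   # callback to invoke for the detail page
        item.parser_name = "details"     # spider that handles the detail page
        item.parse_url = row_url         # detail page to request
        item.deal_detail = ['//div[@class="content"]']  # body xpaths (detail_get style)
        item.request_params = {"headers": {"User-Agent": "Mozilla/5.0"}}  # keys must match requests kwargs
        item.item = {"spidercode": "a_demo_code", "channel": "通知公告", "title": title}
        return item

Since pre_to_db reads self.item with .get(...), whatever is assigned to item.item must be a dict carrying at least spidercode, channel and title.
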
-class ListItem(Item):
-    def __init__(self):
-        self.spidercode = ""  # spider code (defined on the editor spider platform)
-        self.site = ""  # crawled site (defined on the editor spider platform)
-        self.channel = ""  # crawled channel/section (defined on the editor spider platform)
-        self.url = ''
-        self.count = 0
-        self.code = -1
-        self.rel_count = 0
-        self.save = True
-
-    def pre_to_db(self):
-        time.sleep(0.1)
-        # resolve the author from settings, falling back to the working directory name
-        self.author = setting.author.get(os.path.basename(os.getcwd()))
-        if self.author is None:
-            self.author = os.path.basename(os.getcwd())
-        self.runtime = get_current_date(date_format="%Y-%m-%d")
-        # snapshot and reset the global list-item counter
-        global xxc
-        log.debug(f"xxc: {xxc}")
-        self.rel_count = xxc
-        xxc = 0
-
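The global xxc counter is what ties MgpListItem and ListItem together: every list item that passes MgpListItem.pre_to_db increments it, and the next ListItem snapshots it into rel_count and resets it. Reduced to a minimal, single-process sketch (the stand-in function names are illustrative):

    xxc = 0

    def on_mgp_item():           # stands in for MgpListItem.pre_to_db
        global xxc
        xxc += 1

    def on_list_item() -> int:   # stands in for ListItem.pre_to_db
        global xxc
        rel_count, xxc = xxc, 0
        return rel_count

    for _ in range(3):
        on_mgp_item()
    print(on_list_item())  # -> 3

Because the counter is shared module state with no locking, running several spiders in one process, or parsing in threads, would cross-contaminate rel_count; a per-spider counter would avoid that.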