import feapder.utils.tools as tools
from feapder.utils.log import log
from items.base_item import SwordFishProjectItem
from untils.check_data import CheckData
from untils.tools import (
    int2long,
    substitute,
    text_search,
)


class DataBakItem(SwordFishProjectItem):
    """Bid announcement data"""

    def __init__(self):
        super(DataBakItem, self).__init__()

        self.spidercode = ""  # Spider code (defined on the editor / spider platform)
        self.site = ""  # Site being collected (defined on the editor / spider platform)
        self.channel = ""  # Channel / section being collected (defined on the editor / spider platform)
        self.title = ""  # Article title
        self.s_title = ""  # Detail-page title (fill when available); defaults to the list-page title
        self.area = "全国"  # Province; defaults to "全国" (nationwide)
        self.city = ""  # City
        self.district = ""  # District / county
        self.publishtime = ""  # Publish time of the article (from the list page or the detail page)
        self.l_np_publishtime = ""  # Publish-time timestamp (seconds); must be stored as a long
        self.comeintime = ""  # Ingestion timestamp (seconds); must be stored as a long
        self.contenthtml = ""  # Raw HTML of the detail page
        self.detail = ""  # Text extracted from the cleaned detail-page HTML
        self.href = ""  # Detail-page URL (non-competitor site)
        self.competehref = None  # Detail-page URL (competitor site)
        self.projectinfo = None  # Attachment info; see the SwordFish (剑鱼) bidding specification
        self.iscompete = True  # New-spider flag

        self.sendflag = "false"
        self.T = "bidding"
        self.infoformat = 1

        # Default settings
        self.type = ""
        self.publishdept = ""
        self._d = "comeintime"

    def pre_to_db(self):
        if not self.s_title:
            self.s_title = self.title
            log.debug("Please check that < s_title > is correct!")

        # Ingestion timestamp (seconds), stored as a long
        self.comeintime = int2long(tools.get_current_timestamp())

        # Publish time may include a time of day or be date-only
        if ":" in self.publishtime:
            self.l_np_publishtime = int2long(tools.date_to_timestamp(self.publishtime))
        else:
            self.l_np_publishtime = int2long(tools.date_to_timestamp(self.publishtime, "%Y-%m-%d"))

        # Clean the detail-page HTML into plain text
        if not self.contenthtml:
            log.warning(f"Empty content:\n url: {self.href}\n publish time: {self.publishtime}\n title: {self.title}")
            self.save = False
        else:
            if not self.detail:
                self.detail = substitute(self.contenthtml)

            if text_search(self.detail).total == 0:
                self.sendflag = "true"  # No usable text; do not push the record to the save service

        if not self.title or not self.publishtime or not self.href:
            log.warning(f"Missing base fields:\n url: {self.href}\n publish time: {self.publishtime}\n title: {self.title}")
            self.save = False

        # Competitor-site detail-page URL field
        if not self.competehref:
            del self.competehref

        # Detail page has no attachments, so the projectinfo field is not needed
        if not self.projectinfo:
            del self.projectinfo
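
# Usage sketch (illustrative only, not part of the project API): a spider would
# normally fill in at least title, publishtime, href and contenthtml before
# yielding a DataBakItem; pre_to_db() then derives comeintime and
# l_np_publishtime, extracts detail from contenthtml, and drops the optional
# competehref / projectinfo fields when they are empty. The values below are
# made-up placeholders.
#
#   data_item = DataBakItem()
#   data_item.title = "某某项目招标公告"
#   data_item.publishtime = "2024-01-01 12:00:00"
#   data_item.href = "http://www.example.com/detail/1.html"
#   data_item.contenthtml = "<div>正文内容</div>"
#   yield data_item  # pre_to_db() is expected to run in the item pipeline before saving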


class ExamineAndApproveItem(DataBakItem):
    """Examination and approval data"""

    def __init__(self):
        super(ExamineAndApproveItem, self).__init__()

        self.table_name = "data_bak"

        self.T = "bidding"
        self.infoformat = 2


class PropertyRightItem(DataBakItem):
    """Property rights (asset transfer) data"""

    def __init__(self):
        super(PropertyRightItem, self).__init__()

        self.table_name = "data_bak"

        self.T = "bidding_other"
        self.infoformat = 3


class MgpListItem(SwordFishProjectItem):

    def __init__(self):
        super(MgpListItem, self).__init__()
        self.spidercode = ""  # Spider code (defined on the editor / spider platform)
        self.parse_url = ""  # URL the detail spider will request
        self.parser_name = ""  # Unique key the detail spider uses to pull tasks from MongoDB; naming it after spidercode is recommended
        self.parse = ""  # Name of the detail spider's parse callback method
        self.request_params = {}  # Parameters needed by the callback (render, headers, method, data, params, ...); names must match the requests argument names or they will not be recognized
        self.proxies = True  # Whether to use a proxy
        self.comeintime = int2long(tools.get_current_timestamp())  # Ingestion timestamp
        self.deal_detail = []  # List of XPath expressions for extracting the main detail-page content
        self.ex_js = ""  # JavaScript to execute (a script string, a file path, etc.)
        self.ex_python = None  # Python code to execute to build params/data; for special cases such as headers and cookies, prefer the dedicated definition style
        self.files = False  # Attachment collection settings

    @property
    def item(self) -> dict:
        return self.__dict__["item"]

    @item.setter
    def item(self, data_item: DataBakItem):
        # Store the attached DataBakItem as a plain dict
        self.__dict__["item"] = data_item.to_dict

    def pre_to_db(self):
        self.spidercode = self.item["spidercode"]

        title = self.item.get("title")
        channel = self.item["channel"]
        if CheckData.channel(channel):
            code, reason = CheckData.title(title)
            if code == 10106:
                log.warning(f"{title} -- cannot be stored, reason: {reason}")
                self.save = False
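

# ---------------------------------------------------------------------------
# Minimal usage sketch (hedged, illustrative only): how a list-page spider
# might wire a DataBakItem into an MgpListItem so a detail spider can later
# pull the task from MongoDB. Field values, the callback name and the XPath
# are made-up placeholders, not project conventions.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    data_item = DataBakItem()
    data_item.site = "示例站点"                # placeholder site name
    data_item.channel = "招标公告"             # placeholder channel name
    data_item.spidercode = "a_example_zbgg"    # placeholder spider code
    data_item.title = "某某项目招标公告"
    data_item.publishtime = "2024-01-01 12:00:00"
    data_item.href = "http://www.example.com/detail/1.html"

    list_item = MgpListItem()
    list_item.parse = "self.detail_get"        # assumed callback name on the detail spider
    list_item.parser_name = data_item.spidercode
    list_item.parse_url = data_item.href
    list_item.deal_detail = ['//div[@class="content"]']  # placeholder XPath
    list_item.item = data_item                 # stored as a dict via DataBakItem.to_dict

    print(list_item.item["title"])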