|
@@ -12,48 +12,58 @@ from untils.tools import (
|
|
|
|
|
|
|
|
|
class DataBakItem(BaseDetailItem):
|
|
|
- """招标(标讯)类"""
|
|
|
- def __init__(self):
|
|
|
- super(DataBakItem, self).__init__()
|
|
|
-
|
|
|
- self.spidercode = "" # 爬虫代码(编辑器爬虫平台定义)
|
|
|
- self.site = "" # 采集的站点(编辑器爬虫平台定义)
|
|
|
- self.channel = "" # 采集的版块(编辑器爬虫平台定义)
|
|
|
+ """招投标(标讯)类"""
|
|
|
+
|
|
|
# Names of keyword arguments that __init__ forwards to the parent class
# constructor; keyword arguments with any other name are not forwarded.
__attr__ = {
    'pyuuid', 'save', 'site', 'channel', 'spidercode', 'area', 'city',
    'district', 'href', 'title', 'contenthtml', 'detail', 'sendflag',
    'projectinfo',
}
# Names of keyword arguments that _init must not copy onto the instance.
__ignore_attr__ = {
    'parse_url', 'parser_name', 'parse', 'deal_detail', 'files', 'proxies',
    'ex_python', 'ex_js', 'request_params', 'comeintime', 'failed', 'error'
}
|
|
|
+
|
|
|
def __init__(self, ignore=None, **kwargs):
    """Build a bidding item from keyword arguments.

    Keyword arguments named in ``__attr__`` are forwarded to the parent
    constructor. The bidding-specific defaults are then declared, and
    finally ``_init`` copies the keyword arguments that are not ignored
    onto the instance, so caller-supplied values override the defaults.

    @param ignore: extra attribute name (str) or iterable of names that
        must not be set on the instance
    @param kwargs: attribute values for the item
    """
    base_kwargs = {
        name: value
        for name, value in kwargs.items()
        if name in self.__attr__
    }
    super(DataBakItem, self).__init__(**base_kwargs)

    # Detail-page title (required); pre_to_db falls back to the list title.
    self.s_title = ""
    # Publish time taken from the list page or the detail page.
    self.publishtime = ""
    # Publish time as a second-level timestamp; must be stored as long.
    self.l_np_publishtime = ""

    # Detail-page URL of a competitor site.
    self.competehref = None

    self.T = "bidding"
    self.infoformat = 1

    # Default bidding attributes.
    self.iscompete = True  # marks the item as coming from a new spider
    self._d = "comeintime"
    self.publishdept = ""
    self.type = ""

    # Declare the instance attributes first, then apply the keyword
    # arguments while dropping the unrelated (ignored) ones.
    self._init(ignore, **kwargs)
|
|
|
+
|
|
|
+ def _init(self, ignore, **kwargs):
|
|
|
+ if ignore is not None:
|
|
|
+ ignore = [ignore] if isinstance(ignore, str) else ignore
|
|
|
+ for attr in ignore:
|
|
|
+ self.__ignore_attr__.add(attr)
|
|
|
+
|
|
|
+ kw = {k: v for k, v in kwargs.items() if k not in self.__ignore_attr__}
|
|
|
+ self.__dict__.update(kw)
|
|
|
|
|
|
def pre_to_db(self):
|
|
|
if not self.s_title:
|
|
|
self.s_title = self.title
|
|
|
log.debug("请检测 < s_title > 是否正确!")
|
|
|
|
|
|
- self.comeintime = int2long(tools.get_current_timestamp()) # 生成入库时间戳(秒级), 定义为long型
|
|
|
-
|
|
|
if ":" in self.publishtime:
|
|
|
self.l_np_publishtime = int2long(tools.date_to_timestamp(self.publishtime))
|
|
|
else:
|
|
@@ -90,8 +100,8 @@ class DataBakItem(BaseDetailItem):
|
|
|
|
|
|
class ExamineAndApproveItem(DataBakItem):
|
|
|
"""审批类"""
|
|
|
- def __init__(self):
|
|
|
- super(ExamineAndApproveItem, self).__init__()
|
|
|
+ def __init__(self, **kwargs):
|
|
|
+ super(ExamineAndApproveItem, self).__init__(**kwargs)
|
|
|
|
|
|
self.table_name = "data_bak"
|
|
|
|
|
@@ -101,8 +111,8 @@ class ExamineAndApproveItem(DataBakItem):
|
|
|
|
|
|
class PropertyRightItem(DataBakItem):
|
|
|
"""产权类"""
|
|
|
- def __init__(self):
|
|
|
- super(PropertyRightItem, self).__init__()
|
|
|
+ def __init__(self, **kwargs):
|
|
|
+ super(PropertyRightItem, self).__init__(**kwargs)
|
|
|
|
|
|
self.table_name = "data_bak"
|
|
|
|
|
@@ -110,48 +120,32 @@ class PropertyRightItem(DataBakItem):
|
|
|
self.infoformat = 3
|
|
|
|
|
|
|
|
|
-class MgpListItem(BaseListItem):
|
|
|
class BidingListItem(BaseListItem):
    """List-page item for bidding notices, carrying detail-spider settings."""

    def __init__(self):
        super().__init__()

        # Fields read from the list page.
        self.title = ""  # title
        self.publishtime = ""  # publish time shown on the list page

        # Settings for the detail spider.
        self.parse_url = ""  # URL the detail spider requests
        # Unique key the detail spider uses to pull tasks from MongoDB
        # (naming it after spidercode is recommended).
        self.parser_name = ""
        self.parse = ""  # name of the detail spider's parse callback

        self.proxies = True  # proxy switch

        # XPath list for extracting the main content of the detail page.
        self.deal_detail = []
        # JS to execute: script text, file path, and so on.
        self.ex_js = ""
        # Python code to execute for generating request params/data; special
        # headers and cookies are best defined separately.
        self.ex_python = None

        self.files = False  # attachment-collection setting

    def pre_to_db(self):
        """Mark the item as not saveable when its title fails validation.

        The title is only checked when ``CheckData.channel`` accepts the
        item's channel; result code 10106 sets ``save`` to False.
        """
        if not CheckData.channel(self.channel):
            return

        code, reason = CheckData.title(self.title)
        if code == 10106:
            log.warning(f"{self.title}--不可入库,原因:{reason}")
            self.save = False
|
|
|
|
|
|
|
|
|
# Alias: MgpListItem refers to the same class object as BidingListItem.
MgpListItem = BidingListItem
|