
Adjust data channel instance attributes

dongzhaorui committed 1 year ago
commit 7ac013be0e
2 files changed, 76 additions and 92 deletions
  1. FworkSpider/items/njpc_item.py (+27, -37)
  2. FworkSpider/items/spider_item.py (+49, -55)

FworkSpider/items/njpc_item.py  (+27, -37)

@@ -9,29 +9,27 @@ from untils.tools import int2long, substitute, text_search
 
 class DataNjpcItem(BaseDetailItem):
     """拟建类"""
-    def __init__(self):
-        super(DataNjpcItem, self).__init__()
 
-        self.table_name = "data_bak"  # storage table for proposed-project data
+    __attr__ = {
+        'pyuuid', 'save', 'site', 'channel', 'spidercode', 'area', 'city',
+        'district', 'href', 'title', 'contenthtml', 'detail', 'sendflag',
+        'projectinfo',
+    }
+    __ignore_attr__ = {
+        'parse_url', 'parser_name', 'parse', 'deal_detail', 'files', 'proxies',
+        'ex_python', 'ex_js', 'request_params', 'comeintime', 'failed', 'error'
+    }
 
-        self.spidercode = ""  # spider code (defined on the spider editor platform)
-        self.site = ""  # collected site (defined on the spider editor platform)
-        self.channel = ""  # collected channel/section (defined on the spider editor platform)
+    def __init__(self, ignore=None, **kwargs):
+        kw = {k: v for k, v in kwargs.items() if k in self.__attr__}  # required attributes
+        super(DataNjpcItem, self).__init__(**kw)
+
+        self.table_name = "data_bak"  # storage table for proposed-project data
 
-        # primary fields
-        self.href = ""  # detail page URL (non-competitor)
-        self.title = ""  # title
         self.projectname = ""  # project name
         self.publishtime = ""  # article publish time (timestamp, in seconds)
-        self.area = "全国"  # province
-        self.city = ""  # city
-        self.district = ""  # district/county
-        self.contenthtml = ""  # raw HTML of the detail page
-        self.detail = ""  # text cleaned from the detail page HTML
-        self.projectinfo = None  # attachment info, see the Jianyu proposed-project spec
-        self.list_comeintime = ""  # time the list-page record was stored
+
         # default settings
-        self.sendflag = "false"
         self.T = "bidding"
         self.infoformat = 2
 
@@ -92,17 +90,22 @@ class DataNjpcItem(BaseDetailItem):
         # construction unit contact number	constructionunittel
         # construction unit address	constructionunitaddr
 
-    def pre_to_db(self):
+        self._init(ignore, **kwargs)  # filter out custom instance attributes
+
+    def _init(self, ignore, **kwargs):
+        if ignore is not None:
+            ignore = [ignore] if isinstance(ignore, str) else ignore
+            for attr in ignore:
+                self.__ignore_attr__.add(attr)
 
+        kw = {k: v for k, v in kwargs.items() if k not in self.__ignore_attr__}
+        self.__dict__.update(kw)
+
+    def pre_to_db(self):
         if not self.title:
             self.title = self.projectname
             log.debug("请检测 < title > 是否正确!")
 
-        if not self.list_comeintime:
-            self.list_comeintime = int2long(tools.get_current_timestamp())  # list-page storage timestamp (seconds)
-
-        self.comeintime = int2long(tools.get_current_timestamp())  # generate the storage timestamp (seconds)
-
         if "-" in str(self.publishtime) and ":" in str(self.publishtime):
             self.publishtime = int2long(tools.date_to_timestamp(self.publishtime))
         elif "-" in str(self.publishtime) and ":" not in str(self.publishtime):
@@ -139,24 +142,11 @@ class NjpcListItem(BaseListItem):
     def __init__(self):
         super(NjpcListItem, self).__init__()
 
-        self.spidercode = ""  # spider code (defined on the spider editor platform)
-        self.site = ""  # collected site (defined on the spider editor platform)
-        self.channel = ""  # collected channel/section (defined on the spider editor platform)
-
-        self.parser_name = ""  # unique key the detail spider uses to pull tasks from MongoDB, naming it after spidercode is recommended
+        self.parser_name = ""  # unique key the detail spider uses to pull tasks from MongoDB (naming it after spidercode is recommended)
         self.parser_url = ""  # detail page data URL
 
-        self.comeintime = int2long(tools.get_current_timestamp())  # storage time
-        self.list_comeintime = int2long(tools.get_current_timestamp())  # list-page collection time
-        # primary fields
-        self.href = ""  # detail page URL (non-competitor)
         self.projectname = ""  # project name
         self.publishtime = ""  # article publish time
-        self.area = "全国"  # province
-        self.city = ""  # city
-        self.district = ""  # district/county
-
-        self.request_params = {}  # parameters required by the callback, e.g. render, headers, method, data, params; names must match the requests argument names or they will not be recognized
 
     def pre_to_db(self):
         if CheckData.channel(self.channel, group="njpc"):

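The reworked constructor follows one contract (the DataBakItem diff below applies the same pattern): keyword arguments whose names appear in __attr__ are forwarded to the base-class __init__, names in __ignore_attr__ (plus anything passed via ignore) are dropped, and whatever survives is copied onto the instance by _init(). A minimal usage sketch, assuming the import path below and purely illustrative field values (neither comes from the repository):

from items.njpc_item import DataNjpcItem  # assumed package layout under FworkSpider

task = {
    # whitelisted in __attr__: forwarded to BaseDetailItem.__init__ and kept
    "site": "example-site",
    "channel": "example-channel",
    "spidercode": "a_example_spidercode",
    "href": "http://example.com/detail/1",
    "title": "Example proposed project",
    # blacklisted in __ignore_attr__: detail-spider scheduling fields that never reach the stored item
    "parse_url": "http://example.com/detail/1",
    "parser_name": "a_example_spidercode",
}

item = DataNjpcItem(**task)
assert item.site == "example-site"  # whitelisted keys become instance attributes
# item.parse_url is never set: _init() skips every key found in __ignore_attr__

# `ignore` adds extra names to __ignore_attr__ before the kwargs are copied:
item = DataNjpcItem(ignore="list_comeintime", list_comeintime=1700000000, **task)
# list_comeintime is now filtered out as well and does not become an attribute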
FworkSpider/items/spider_item.py  (+49, -55)

@@ -12,48 +12,58 @@ from untils.tools import (
 
 
 class DataBakItem(BaseDetailItem):
-    """招标(标讯)类"""
-    def __init__(self):
-        super(DataBakItem, self).__init__()
-
-        self.spidercode = ""  # spider code (defined on the spider editor platform)
-        self.site = ""  # collected site (defined on the spider editor platform)
-        self.channel = ""  # collected channel/section (defined on the spider editor platform)
+    """Bidding and tendering (bid announcement) class"""
+
+    __attr__ = {
+        'pyuuid', 'save', 'site', 'channel', 'spidercode', 'area', 'city',
+        'district', 'href', 'title', 'contenthtml', 'detail', 'sendflag',
+        'projectinfo',
+    }
+    __ignore_attr__ = {
+        'parse_url', 'parser_name', 'parse', 'deal_detail', 'files', 'proxies',
+        'ex_python', 'ex_js', 'request_params', 'comeintime', 'failed', 'error'
+    }
+
+    def __init__(self, ignore=None, **kwargs):
+        """
+
+        @param ignore: custom instance attributes to remove
+        @param kwargs:
+        """
+        kw = {k: v for k, v in kwargs.items() if k in self.__attr__}
+        super(DataBakItem, self).__init__(**kw)
 
-        self.title = ""  # article title
         self.s_title = ""  # detail-page title (required when available), defaults to the list-page title
-        self.area = "全国"  # province
-        self.city = ""  # city
-        self.district = ""  # district/county
         self.publishtime = ""  # article publish time (from the list page or the detail page)
         self.l_np_publishtime = ""  # publish-time timestamp (seconds), must be a long
-        self.comeintime = ""  # storage timestamp (seconds), must be a long
-        self.contenthtml = ""  # raw HTML of the detail page
-        self.detail = ""  # text cleaned from the detail page HTML
-        self.list_comeintime = int2long(tools.get_current_timestamp())  # list-page collection time
 
-        self.href = ""  # detail page URL (non-competitor)
         self.competehref = None  # competitor detail page URL
-        self.projectinfo = None  # attachment info, see the Jianyu bidding spec
-
-        self.iscompete = True  # new spider
 
-        self.sendflag = "false"
         self.T = "bidding"
         self.infoformat = 1
 
-        # default settings
-        self.type = ""
-        self.publishdept = ""
+        '''bidding-specific default attributes'''
+        self.iscompete = True  # new-spider flag
         self._d = "comeintime"
+        self.publishdept = ""
+        self.type = ""
+
+        self._init(ignore, **kwargs)  # declare instance attributes first, then drop unrelated ones
+
+    def _init(self, ignore, **kwargs):
+        if ignore is not None:
+            ignore = [ignore] if isinstance(ignore, str) else ignore
+            for attr in ignore:
+                self.__ignore_attr__.add(attr)
+
+        kw = {k: v for k, v in kwargs.items() if k not in self.__ignore_attr__}
+        self.__dict__.update(kw)
 
     def pre_to_db(self):
         if not self.s_title:
             self.s_title = self.title
             log.debug("请检测 < s_title > 是否正确!")
 
-        self.comeintime = int2long(tools.get_current_timestamp())  # generate the storage timestamp (seconds), as a long
-
         if ":" in self.publishtime:
             self.l_np_publishtime = int2long(tools.date_to_timestamp(self.publishtime))
         else:
@@ -90,8 +100,8 @@ class DataBakItem(BaseDetailItem):
 
 class ExamineAndApproveItem(DataBakItem):
     """审批类"""
-    def __init__(self):
-        super(ExamineAndApproveItem, self).__init__()
+    def __init__(self, **kwargs):
+        super(ExamineAndApproveItem, self).__init__(**kwargs)
 
         self.table_name = "data_bak"
 
@@ -101,8 +111,8 @@ class ExamineAndApproveItem(DataBakItem):
 
 class PropertyRightItem(DataBakItem):
     """产权类"""
-    def __init__(self):
-        super(PropertyRightItem, self).__init__()
+    def __init__(self, **kwargs):
+        super(PropertyRightItem, self).__init__(**kwargs)
 
         self.table_name = "data_bak"
 
@@ -110,48 +120,32 @@ class PropertyRightItem(DataBakItem):
         self.infoformat = 3
 
 
-class MgpListItem(BaseListItem):
+class BidingListItem(BaseListItem):
 
     def __init__(self):
-        super(MgpListItem, self).__init__()
+        super(BidingListItem, self).__init__()
 
-        self.spidercode = ""  # spider code (defined on the spider editor platform)
+        self.title = ""  # title
+        self.publishtime = ""  # article publish time from the list page
 
         self.parse_url = ""  # URL the detail spider requests
-        self.parser_name = ""  # unique key the detail spider uses to pull tasks from MongoDB, naming it after spidercode is recommended
+        self.parser_name = ""  # unique key the detail spider uses to pull tasks from MongoDB (naming it after spidercode is recommended)
         self.parse = ""  # name of the detail spider's parse callback method
 
-        self.request_params = {}  # parameters required by the callback, e.g. render, headers, method, data, params; names must match the requests argument names or they will not be recognized
         self.proxies = True  # proxies
 
-        self.comeintime = int2long(tools.get_current_timestamp())  # storage time
-
         self.deal_detail = []  # list of XPath expressions for parsing the detail page body
         self.ex_js = ""  # JS code to execute, including but not limited to scripts, file paths, etc.
-        self.ex_python = None  # Python code to execute to generate params/data; for special cases such as header and cookie, prefer the dedicated definition approach
+        self.ex_python = None  # Python code to execute to generate params/data; for special cases such as headers and cookies, prefer the dedicated definition approach
 
         self.files = False  # attachment collection config
 
-        self.table_name = 'biding_list'
-
-    @property
-    def item(self) -> dict:
-        return self.__dict__["item"]
-
-    @item.setter
-    def item(self, data_item: DataBakItem):
-        self.__dict__["item"] = data_item.to_dict
-
     def pre_to_db(self):
-        self.spidercode = self.item["spidercode"]
-
-        title = self.item.get("title")
-        channel = self.item["channel"]
-        if CheckData.channel(channel):
-            code, reason = CheckData.title(title)
+        if CheckData.channel(self.channel):
+            code, reason = CheckData.title(self.title)
             if code == 10106:
-                log.warning(f"{title}--不可入库,原因:{reason}")
+                log.warning(f"{self.title}--不可入库,原因:{reason}")
                 self.save = False
 
 
-BidingListItem = MgpListItem
+MgpListItem = BidingListItem
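
With the item property and its setter removed, the list item no longer embeds a DataBakItem dict; pre_to_db now validates the list item's own channel and title, and MgpListItem survives only as a backward-compatible alias of the renamed BidingListItem. A rough sketch of how a list-page callback might now fill the class (the import path, the callback name "detail_get", and every field value are assumptions; site/channel/spidercode are presumed to still be declared by BaseListItem):

from items.spider_item import BidingListItem  # MgpListItem remains usable as an alias

list_item = BidingListItem()
list_item.site = "example-site"
list_item.channel = "example-channel"
list_item.spidercode = "a_example_spidercode"
list_item.title = "Example tender notice"            # now checked directly by pre_to_db
list_item.publishtime = "2023-01-01 00:00:00"
list_item.parse_url = "http://example.com/detail/1"  # address the detail spider will fetch
list_item.parser_name = "a_example_spidercode"       # task key the detail spider pulls from MongoDB
list_item.parse = "detail_get"                       # hypothetical detail-parse callback name
list_item.deal_detail = ['//div[@class="content"]']  # XPath list for the detail page body

# pre_to_db() is normally called by the framework before the record is written;
# it sets save = False when CheckData.title rejects the title for this channel.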