
Adjust data channel instance attributes

dongzhaorui committed 1 year ago
commit 7ac013be0e
2 files changed, 76 additions and 92 deletions
  1. FworkSpider/items/njpc_item.py (+27, -37)
  2. FworkSpider/items/spider_item.py (+49, -55)

FworkSpider/items/njpc_item.py  (+27, -37)

@@ -9,29 +9,27 @@ from untils.tools import int2long, substitute, text_search
 
 class DataNjpcItem(BaseDetailItem):
     """拟建类"""
-    def __init__(self):
-        super(DataNjpcItem, self).__init__()
 
-        self.table_name = "data_bak"  # storage table for proposed-project data
+    __attr__ = {
+        'pyuuid', 'save', 'site', 'channel', 'spidercode', 'area', 'city',
+        'district', 'href', 'title', 'contenthtml', 'detail', 'sendflag',
+        'projectinfo',
+    }
+    __ignore_attr__ = {
+        'parse_url', 'parser_name', 'parse', 'deal_detail', 'files', 'proxies',
+        'ex_python', 'ex_js', 'request_params', 'comeintime', 'failed', 'error'
+    }
 
-        self.spidercode = ""  # spider code (defined on the spider editor platform)
-        self.site = ""  # collected site (defined on the spider editor platform)
-        self.channel = ""  # collected channel/section (defined on the spider editor platform)
+    def __init__(self, ignore=None, **kwargs):
+        kw = {k: v for k, v in kwargs.items() if k in self.__attr__}  # required attributes
+        super(DataNjpcItem, self).__init__(**kw)
+
+        self.table_name = "data_bak"  # storage table for proposed-project data
 
-        # primary fields
-        self.href = ""  # detail page URL (non-competitor)
-        self.title = ""  # title
         self.projectname = ""  # project name
         self.publishtime = ""  # article publish time (timestamp, in seconds)
-        self.area = "全国"  # province
-        self.city = ""  # city
-        self.district = ""  # district/county
-        self.contenthtml = ""  # raw HTML of the detail page
-        self.detail = ""  # text cleaned from the detail page HTML
-        self.projectinfo = None  # attachment info, see the Jianyu proposed-project spec
-        self.list_comeintime = ""  # time the list-page record was stored
+
         # default settings
-        self.sendflag = "false"
         self.T = "bidding"
         self.infoformat = 2
 
@@ -92,17 +90,22 @@ class DataNjpcItem(BaseDetailItem):
         # construction unit contact number	constructionunittel
         # construction unit address	constructionunitaddr
 
-    def pre_to_db(self):
+        self._init(ignore, **kwargs)  # filter out custom instance attributes
+
+    def _init(self, ignore, **kwargs):
+        if ignore is not None:
+            ignore = [ignore] if isinstance(ignore, str) else ignore
+            for attr in ignore:
+                self.__ignore_attr__.add(attr)
 
+        kw = {k: v for k, v in kwargs.items() if k not in self.__ignore_attr__}
+        self.__dict__.update(kw)
+
+    def pre_to_db(self):
         if not self.title:
             self.title = self.projectname
             log.debug("请检测 < title > 是否正确!")
 
-        if not self.list_comeintime:
-            self.list_comeintime = int2long(tools.get_current_timestamp())  # list-page storage timestamp (seconds)
-
-        self.comeintime = int2long(tools.get_current_timestamp())  # generate the storage timestamp (seconds)
-
         if "-" in str(self.publishtime) and ":" in str(self.publishtime):
             self.publishtime = int2long(tools.date_to_timestamp(self.publishtime))
         elif "-" in str(self.publishtime) and ":" not in str(self.publishtime):
@@ -139,24 +142,11 @@ class NjpcListItem(BaseListItem):
     def __init__(self):
         super(NjpcListItem, self).__init__()
 
-        self.spidercode = ""  # spider code (defined on the spider editor platform)
-        self.site = ""  # collected site (defined on the spider editor platform)
-        self.channel = ""  # collected channel/section (defined on the spider editor platform)
-
-        self.parser_name = ""  # unique key the detail spider uses to pull tasks from MongoDB, naming it after spidercode is recommended
+        self.parser_name = ""  # unique key the detail spider uses to pull tasks from MongoDB (naming it after spidercode is recommended)
         self.parser_url = ""  # detail page data URL
 
-        self.comeintime = int2long(tools.get_current_timestamp())  # storage time
-        self.list_comeintime = int2long(tools.get_current_timestamp())  # list-page collection time
-        # primary fields
-        self.href = ""  # detail page URL (non-competitor)
         self.projectname = ""  # project name
         self.publishtime = ""  # article publish time
-        self.area = "全国"  # province
-        self.city = ""  # city
-        self.district = ""  # district/county
-
-        self.request_params = {}  # parameters required by the callback, e.g. render, headers, method, data, params; names must match the requests argument names or they will not be recognized
 
     def pre_to_db(self):
         if CheckData.channel(self.channel, group="njpc"):

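The reworked constructor follows one contract (the DataBakItem diff below applies the same pattern): keyword arguments whose names appear in __attr__ are forwarded to the base-class __init__, names in __ignore_attr__ (plus anything passed via ignore) are dropped, and whatever survives is copied onto the instance by _init(). A minimal usage sketch, assuming the import path below and purely illustrative field values (neither comes from the repository):

from items.njpc_item import DataNjpcItem  # assumed package layout under FworkSpider

task = {
    # whitelisted in __attr__: forwarded to BaseDetailItem.__init__ and kept
    "site": "example-site",
    "channel": "example-channel",
    "spidercode": "a_example_spidercode",
    "href": "http://example.com/detail/1",
    "title": "Example proposed project",
    # blacklisted in __ignore_attr__: detail-spider scheduling fields that never reach the stored item
    "parse_url": "http://example.com/detail/1",
    "parser_name": "a_example_spidercode",
}

item = DataNjpcItem(**task)
assert item.site == "example-site"  # whitelisted keys become instance attributes
# item.parse_url is never set: _init() skips every key found in __ignore_attr__

# `ignore` adds extra names to __ignore_attr__ before the kwargs are copied:
item = DataNjpcItem(ignore="list_comeintime", list_comeintime=1700000000, **task)
# list_comeintime is now filtered out as well and does not become an attribute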
FworkSpider/items/spider_item.py  (+49, -55)

@@ -12,48 +12,58 @@ from untils.tools import (
 
 
 class DataBakItem(BaseDetailItem):
-    """招标(标讯)类"""
-    def __init__(self):
-        super(DataBakItem, self).__init__()
-
-        self.spidercode = ""  # spider code (defined on the spider editor platform)
-        self.site = ""  # collected site (defined on the spider editor platform)
-        self.channel = ""  # collected channel/section (defined on the spider editor platform)
+    """Bidding and tendering (bid announcement) class"""
+
+    __attr__ = {
+        'pyuuid', 'save', 'site', 'channel', 'spidercode', 'area', 'city',
+        'district', 'href', 'title', 'contenthtml', 'detail', 'sendflag',
+        'projectinfo',
+    }
+    __ignore_attr__ = {
+        'parse_url', 'parser_name', 'parse', 'deal_detail', 'files', 'proxies',
+        'ex_python', 'ex_js', 'request_params', 'comeintime', 'failed', 'error'
+    }
+
+    def __init__(self, ignore=None, **kwargs):
+        """
+
+        @param ignore: custom instance attributes to remove
+        @param kwargs:
+        """
+        kw = {k: v for k, v in kwargs.items() if k in self.__attr__}
+        super(DataBakItem, self).__init__(**kw)
 
-        self.title = ""  # article title
         self.s_title = ""  # detail-page title (required when available), defaults to the list-page title
-        self.area = "全国"  # province
-        self.city = ""  # city
-        self.district = ""  # district/county
         self.publishtime = ""  # article publish time (from the list page or the detail page)
         self.l_np_publishtime = ""  # publish-time timestamp (seconds), must be a long
-        self.comeintime = ""  # storage timestamp (seconds), must be a long
-        self.contenthtml = ""  # raw HTML of the detail page
-        self.detail = ""  # text cleaned from the detail page HTML
-        self.list_comeintime = int2long(tools.get_current_timestamp())  # list-page collection time
 
-        self.href = ""  # detail page URL (non-competitor)
         self.competehref = None  # competitor detail page URL
-        self.projectinfo = None  # attachment info, see the Jianyu bidding spec
-
-        self.iscompete = True  # new spider
 
-        self.sendflag = "false"
         self.T = "bidding"
         self.infoformat = 1
 
-        # default settings
-        self.type = ""
-        self.publishdept = ""
+        '''bidding-specific default attributes'''
+        self.iscompete = True  # new-spider flag
         self._d = "comeintime"
+        self.publishdept = ""
+        self.type = ""
+
+        self._init(ignore, **kwargs)  # declare instance attributes first, then drop unrelated ones
+
+    def _init(self, ignore, **kwargs):
+        if ignore is not None:
+            ignore = [ignore] if isinstance(ignore, str) else ignore
+            for attr in ignore:
+                self.__ignore_attr__.add(attr)
+
+        kw = {k: v for k, v in kwargs.items() if k not in self.__ignore_attr__}
+        self.__dict__.update(kw)
 
     def pre_to_db(self):
         if not self.s_title:
             self.s_title = self.title
             log.debug("请检测 < s_title > 是否正确!")
 
-        self.comeintime = int2long(tools.get_current_timestamp())  # generate the storage timestamp (seconds), as a long
-
         if ":" in self.publishtime:
             self.l_np_publishtime = int2long(tools.date_to_timestamp(self.publishtime))
         else:
@@ -90,8 +100,8 @@ class DataBakItem(BaseDetailItem):
 
 class ExamineAndApproveItem(DataBakItem):
     """审批类"""
-    def __init__(self):
-        super(ExamineAndApproveItem, self).__init__()
+    def __init__(self, **kwargs):
+        super(ExamineAndApproveItem, self).__init__(**kwargs)
 
         self.table_name = "data_bak"
 
@@ -101,8 +111,8 @@ class ExamineAndApproveItem(DataBakItem):
 
 class PropertyRightItem(DataBakItem):
     """产权类"""
-    def __init__(self):
-        super(PropertyRightItem, self).__init__()
+    def __init__(self, **kwargs):
+        super(PropertyRightItem, self).__init__(**kwargs)
 
         self.table_name = "data_bak"
 
@@ -110,48 +120,32 @@ class PropertyRightItem(DataBakItem):
         self.infoformat = 3
 
 
-class MgpListItem(BaseListItem):
+class BidingListItem(BaseListItem):
 
     def __init__(self):
-        super(MgpListItem, self).__init__()
+        super(BidingListItem, self).__init__()
 
-        self.spidercode = ""  # spider code (defined on the spider editor platform)
+        self.title = ""  # title
+        self.publishtime = ""  # article publish time from the list page
 
         self.parse_url = ""  # URL the detail spider requests
-        self.parser_name = ""  # unique key the detail spider uses to pull tasks from MongoDB, naming it after spidercode is recommended
+        self.parser_name = ""  # unique key the detail spider uses to pull tasks from MongoDB (naming it after spidercode is recommended)
         self.parse = ""  # name of the detail spider's parse callback method
 
-        self.request_params = {}  # parameters required by the callback, e.g. render, headers, method, data, params; names must match the requests argument names or they will not be recognized
         self.proxies = True  # proxies
 
-        self.comeintime = int2long(tools.get_current_timestamp())  # storage time
-
         self.deal_detail = []  # list of XPath expressions for parsing the detail page body
         self.ex_js = ""  # JS code to execute, including but not limited to scripts, file paths, etc.
-        self.ex_python = None  # Python code to execute to generate params/data; for special cases such as header and cookie, prefer the dedicated definition approach
+        self.ex_python = None  # Python code to execute to generate params/data; for special cases such as headers and cookies, prefer the dedicated definition approach
 
         self.files = False  # attachment collection config
 
-        self.table_name = 'biding_list'
-
-    @property
-    def item(self) -> dict:
-        return self.__dict__["item"]
-
-    @item.setter
-    def item(self, data_item: DataBakItem):
-        self.__dict__["item"] = data_item.to_dict
-
     def pre_to_db(self):
-        self.spidercode = self.item["spidercode"]
-
-        title = self.item.get("title")
-        channel = self.item["channel"]
-        if CheckData.channel(channel):
-            code, reason = CheckData.title(title)
+        if CheckData.channel(self.channel):
+            code, reason = CheckData.title(self.title)
             if code == 10106:
-                log.warning(f"{title}--不可入库,原因:{reason}")
+                log.warning(f"{self.title}--不可入库,原因:{reason}")
                 self.save = False
 
 
-BidingListItem = MgpListItem
+MgpListItem = BidingListItem
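
With the item property and its setter removed, the list item no longer embeds a DataBakItem dict; pre_to_db now validates the list item's own channel and title, and MgpListItem survives only as a backward-compatible alias of the renamed BidingListItem. A rough sketch of how a list-page callback might now fill the class (the import path, the callback name "detail_get", and every field value are assumptions; site/channel/spidercode are presumed to still be declared by BaseListItem):

from items.spider_item import BidingListItem  # MgpListItem remains usable as an alias

list_item = BidingListItem()
list_item.site = "example-site"
list_item.channel = "example-channel"
list_item.spidercode = "a_example_spidercode"
list_item.title = "Example tender notice"            # now checked directly by pre_to_db
list_item.publishtime = "2023-01-01 00:00:00"
list_item.parse_url = "http://example.com/detail/1"  # address the detail spider will fetch
list_item.parser_name = "a_example_spidercode"       # task key the detail spider pulls from MongoDB
list_item.parse = "detail_get"                       # hypothetical detail-parse callback name
list_item.deal_detail = ['//div[@class="content"]']  # XPath list for the detail page body

# pre_to_db() is normally called by the framework before the record is written;
# it sets save = False when CheckData.title rejects the title for this channel.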