Browse Source

更新数据item结构

dongzhaorui 1 year ago
parent
commit
9ee3b1a893
3 changed files with 40 additions and 109 deletions
  1. 4 1
      FworkSpider/feapder/network/item.py
  2. 25 81
      FworkSpider/items/njpc_item.py
  3. 11 27
      FworkSpider/items/spider_item.py

+ 4 - 1
FworkSpider/feapder/network/item.py

@@ -159,7 +159,7 @@ class BaseItem(Item):
     """数据采集基础类"""
 
     def __init__(self, site='', channel='', spidercode='',
-                 area='全国', city='', district='', href='', pyuuid=None):
+                 area='全国', city='', district='', href='', pyuuid=None, **kwargs):
         """
 
         @param pyuuid: 采集数据唯一标识
@@ -188,6 +188,9 @@ class BaseItem(Item):
 
         self.href = href
 
+        kwargs = {k: v for k, v in kwargs.items() if k not in self.__dict__}
+        self.__dict__.update(kwargs)
+
     @property
     def fingerprint(self):
         args = []

+ 25 - 81
FworkSpider/items/njpc_item.py

@@ -11,97 +11,41 @@ class DataNjpcItem(BaseDetailItem):
     """拟建类"""
 
     __attr__ = {
-        'pyuuid', 'save', 'site', 'channel', 'spidercode', 'area', 'city',
-        'district', 'href', 'title', 'contenthtml', 'detail', 'sendflag',
-        'projectinfo',
-    }
-    __ignore_attr__ = {
-        'parse_url', 'parser_name', 'parse', 'deal_detail', 'files', 'proxies',
-        'ex_python', 'ex_js', 'request_params', 'comeintime', 'failed', 'error',
-        'failed_times', 'queue_name', 'state', 'update_at'
+        'pyuuid', 'save', 'site', 'channel', 'spidercode', 'area',
+        'city', 'district', 'href', 'title', 'contenthtml', 'detail',
+        'sendflag', 'projectinfo', 'phone', 'startdate',
+        'constructionunit', 'projecttype', 'ownertel', 'designunittel',
+        'scale', 'designunit', 'area', 'owneraddr', 'structure',
+        'house', 'building', 'investment', 'approvestatus', 'person',
+        'floors', 'materials', 'designunitaddr', 'approvecode', 'other',
+        'completedate', 'ownerperson', 'approvenumber',
+        'constructionunittel', 'heating', 'constructionunitaddr',
+        'approvecontent', 'construction', 'parking', 'floor', 'wall',
+        'designunitperson', 'constructionunitperson', 'steel', 'info',
+        'total', 'freshair', 'air', 'projectperiod', 'elevator',
+        'funds', 'pace', 'owner', 'projectaddr', 'system', 'exterior',
+        'method', 'passive', 'conditioner', 'approvedept', 'project',
+        'prefabricated'
     }
 
-    def __init__(self, ignore=None, **kwargs):
-        kw = {k: v for k, v in kwargs.items() if k in self.__attr__}  # 必需属性
-        super(DataNjpcItem, self).__init__(**kw)
+    def __init__(self, projectname='', publishtime='', **kwargs):
+        """
+
+        @param projectname: 项目名称
+        @param publishtime: 文章发布时间(时间戳),单位:秒
+        """
+        kwargs = {k: v for k, v in kwargs.items() if k in self.__attr__}
+        super(DataNjpcItem, self).__init__(**kwargs)
 
         self.table_name = "data_bak"  # 拟建数据存储表名
 
-        self.projectname = ""  # 项目名称
-        self.publishtime = ""  # 文章发布时间(时间戳),单位:秒
+        self.projectname = projectname
+        self.publishtime = publishtime
 
         # 默认设置
         self.T = "bidding"
         self.infoformat = 2
 
-        # 以下字段为 二类字段,没有则不做存储,不在存储结构中
-        # 附件,默认为Null 正确的格式为 projectinfo.attachments = [{
-        #                       "fid":"附件id"
-        #                       "filename":"附件名称"
-        #                       "ftype":"文件类型"
-        #                       "org_url":"附件原始地址"
-        #                       "size":"附件大小"
-        #                       "url":"附件地址"}]
-        # 事项名称(审批事项)	approvecontent
-        # 项目代码(审批代码)	approvecode
-        # 批准文号	approvenumber
-        # 总投资	total_investment
-        # 资金来源	funds
-        # 业主单位	owner
-        # 申报方式(项目类型)	projecttype
-        # 建设地点	projectaddr
-        # 建设年限	projectperiod
-        # 开工时间	project_startdate
-        # 竣工时间	project_completedate
-        # 审批部门	approvedept
-        # 审批结果	approvestatus
-        # 项目联系人  project_person
-        # 项目联系电话  project_phone
-
-        # 建设规模及主要内容	project_scale_info
-        # 	project_scale
-        # 建筑面积	construction_area
-        # 占地面积	floor_area
-        # 建筑层数	building_floors
-        # 钢结构	steel_structure
-        # 外墙材料	exterior_wall_materials
-        # 车库停车位	parking_pace
-        # 电梯	elevator
-        # 空调	air_conditioner
-        # 新风系统	freshair_system
-        # 供暖方式	heating_method
-        # 装配式建筑	prefabricated_building
-        # 被动房	passive_house
-        # 其它建设内容描述	other_project_scale
-
-        # 三类字段,难以处理时可以不处理
-        # 业主及其联系方式	owner_info
-        # 业主单位/建设单位	owner
-        # 业主单位联系人	ownerperson
-        # 业主单位联系方式	ownertel
-        # 业主单位地址	owneraddr
-        # 设计院及其联系方式	designunit_info
-        # 设计单位	designunit
-        # 设计单位联系人	designunitperson
-        # 设计单位联系方式	designunittel
-        # 设计单位地址	designunitaddr
-        # 施工单位及其联系方式	constructionunit_info
-        # 施工单位	constructionunit
-        # 施工单位联系人	constructionunitperson
-        # 施工单位联系方式	constructionunittel
-        # 施工单位地址	constructionunitaddr
-
-        self._init(ignore, **kwargs)  # 过滤自定义实例属性
-
-    def _init(self, ignore, **kwargs):
-        if ignore is not None:
-            ignore = [ignore] if isinstance(ignore, str) else ignore
-            for attr in ignore:
-                self.__ignore_attr__.add(attr)
-
-        kw = {k: v for k, v in kwargs.items() if k not in self.__ignore_attr__}
-        self.__dict__.update(kw)
-
     def handle_publish_time(self):
         # 时间格式处理
         publishtime = str(self.publishtime)

+ 11 - 27
FworkSpider/items/spider_item.py

@@ -14,28 +14,23 @@ class DataBakItem(BaseDetailItem):
     """招投标(标讯)类"""
 
     __attr__ = {
-        'pyuuid', 'save', 'site', 'channel', 'spidercode', 'area', 'city',
-        'district', 'href', 'title', 'contenthtml', 'detail', 'sendflag',
-        'projectinfo',
-    }
-    __ignore_attr__ = {
-        'parse_url', 'parser_name', 'parse', 'deal_detail', 'files', 'proxies',
-        'ex_python', 'ex_js', 'request_params', 'comeintime', 'failed', 'error',
-        'failed_times', 'queue_name', 'state', 'update_at'
+        'pyuuid', 'save', 'site', 'channel', 'spidercode', 'area',
+        'city', 'district', 'href', 'title', 'contenthtml', 'detail',
+        'sendflag', 'projectinfo'
     }
 
-    def __init__(self, ignore=None, **kwargs):
+    def __init__(self, s_title='', publishtime='', **kwargs):
         """
 
-        @param ignore: 自定义需删除的实例属性
+        @param s_title: 详情页标题(有必填),默认提供列表页标题
+        @param publishtime: 文章发布时间(列表页或者详情页发布时间)
         @param kwargs:
         """
-        kw = {k: v for k, v in kwargs.items() if k in self.__attr__}
-        super(DataBakItem, self).__init__(**kw)
-
-        self.s_title = ""  # 详情页标题(有必填),默认提供列表页标题
-        self.publishtime = ""  # 文章发布时间(列表页或者详情页发布时间)
-        self.l_np_publishtime = ""  # 发布时间的时间戳(秒级), 需定义为long型
+        kwargs = {k: v for k, v in kwargs.items() if k in self.__attr__}
+        super(DataBakItem, self).__init__(**kwargs)
+        self.s_title = s_title
+        self.publishtime = publishtime
+        self.l_np_publishtime = 0  # 发布时间的时间戳(秒级), 需定义为long型
 
         self.competehref = None  # 竞品详情页地址
 
@@ -48,17 +43,6 @@ class DataBakItem(BaseDetailItem):
         self.publishdept = ""
         self.type = ""
 
-        self._init(ignore, **kwargs)  # 先声明实例属性再处理无关实例属性
-
-    def _init(self, ignore, **kwargs):
-        if ignore is not None:
-            ignore = [ignore] if isinstance(ignore, str) else ignore
-            for attr in ignore:
-                self.__ignore_attr__.add(attr)
-
-        kw = {k: v for k, v in kwargs.items() if k not in self.__ignore_attr__}
-        self.__dict__.update(kw)
-
     def cleanup(self):
         # 竞品网站-详情页地址标识字段
         if not self.competehref: