2 년 전 · 867176958d
--- a/FworkSpider/items/njpc_item.py
+++ b/FworkSpider/items/njpc_item.py
@@ -0,0 +1,140 @@
 
				+from feapder import Item
			
 
				+from untils.tools import int2long,substitute,text_search,CheckPrePareRequest
			
 
				+import time
			
 
				+from feapder.utils.log import log
			
 
				+global xxc
			
 
				+xxc = 0
			
 
				+class DataNjpcItem(Item):
			
 
				+    def __init__(self):
			
 
				+        # 一类字段
			
 
				+        self.href = ""  # 非竞品快照页地址
			
 
				+        self.projectname = ""  # 项目名称
			
 
				+        self.publishtime = ""  # 文章发布时间（日期格式 xxxx-xx-xx）
			
 
				+        self.detail = ""  # 快照页源码清洗之后招投标文本
			
 
				+        self.contentlhtml = ""  # 快照页源码
			
 
				+        self.spidercode = ""  # 爬虫代码（编辑器爬虫平台定义）
			
 
				+        self.site = ""  # 采集的站点（编辑器爬虫平台定义）
			
 
				+        self.channel = ""  # 采集的版块（编辑器爬虫平台定义）
			
 
				+        self.area = "全国"  # 省
			
 
				+        self.city = ""  # 市
			
 
				+        self.district = ""  # 区县
			
 
				+
			
 
				+        # 辅助字段 存储时的辅助字段
			
 
				+        self.save = True  # 区县
			
 
				+
			
 
				+        # 以下字段为 二类字段，没有则不做存储，不在存储结构中
			
 
				+        # 附件，默认为Null 正确的格式为 projectinfo.attachments = [{
			
 
				+        #                       "fid":"附件id"
			
 
				+        #                       "filename":"附件名称"
			
 
				+        #                       "ftype":"文件类型"
			
 
				+        #                       "org_url":"附件原始地址"
			
 
				+        #                       "size":"附件大小"
			
 
				+        #                       "url":"附件地址"}]
			
 
				+        # 事项名称（审批事项）	approvecontent
			
 
				+        # 项目代码(审批代码)	approvecode
			
 
				+        # 批准文号	approvenumber
			
 
				+        # 总投资	total_investment
			
 
				+        # 资金来源	funds
			
 
				+        # 业主单位	owner
			
 
				+        # 申报方式（项目类型）	projecttype
			
 
				+        # 建设地点	projectaddr
			
 
				+        # 建设年限	projectperiod
			
 
				+        # 开工时间	project_startdate
			
 
				+        # 竣工时间	project_completedate
			
 
				+        # 审批部门	approvedept
			
 
				+        # 审批结果	approvestatus
			
 
				+        # 项目联系人  project_person
			
 
				+        # 项目联系电话  project_phone
			
 
				+
			
 
				+        # 建设规模及主要内容	project_scale_info
			
 
				+        # 	project_scale
			
 
				+        # 建筑面积	construction_area
			
 
				+        # 占地面积	floor_area
			
 
				+        # 建筑层数	building_floors
			
 
				+        # 钢结构	steel_structure
			
 
				+        # 外墙材料	exterior_wall_materials
			
 
				+        # 车库停车位	parking_pace
			
 
				+        # 电梯	elevator
			
 
				+        # 空调	air_conditioner
			
 
				+        # 新风系统	freshair_system
			
 
				+        # 供暖方式	heating_method
			
 
				+        # 装配式建筑	prefabricated_building
			
 
				+        # 被动房	passive_house
			
 
				+        # 其它建设内容描述	other_project_scale
			
 
				+
			
 
				+        # 三类字段，难以处理时可以不处理
			
 
				+        # 业主及其联系方式	owner_info
			
 
				+        # 业主单位/建设单位	owner
			
 
				+        # 业主单位联系人	ownerperson
			
 
				+        # 业主单位联系方式	ownertel
			
 
				+        # 业主单位地址	owneraddr
			
 
				+        # 设计院及其联系方式	designunit_info
			
 
				+        # 设计单位	designunit
			
 
				+        # 设计单位联系人	designunitperson
			
 
				+        # 设计单位联系方式	designunittel
			
 
				+        # 设计单位地址	designunitaddr
			
 
				+        # 施工单位及其联系方式	constructionunit_info
			
 
				+        # 施工单位	constructionunit
			
 
				+        # 施工单位联系人	constructionunitperson
			
 
				+        # 施工单位联系方式	constructionunittel
			
 
				+        # 施工单位地址	constructionunitaddr
			
 
				+    def pre_to_db(self):
			
 
				+        # 生成入库时间戳（秒级）, 定义为long型
			
 
				+        self.comeintime = int2long(time.time())
			
 
				+        # 根据文章发布时间 生成发布时间的时间戳（秒级）, 定义为long型
			
 
				+        '''
			
 
				+        如果无法解析到发布时间、可以考虑补一个发布时间
			
 
				+        '''
			
 
				+        # if "-" in self.publishtime:
			
 
				+        #     self.publishtime = int2long(int(time.mktime(time.strptime(self.publishtime, "%Y-%m-%d %H:%M:%S"))))
			
 
				+        # else:
			
 
				+        #     self.publishtime = int2long(int(time.mktime(time.strptime(self.publishtime, "%Y-%m-%d"))))
			
 
				+
			
 
				+        if "-" in str(self.publishtime) and ":" in str(self.publishtime):
			
 
				+            self.publishtime = int2long(int(time.mktime(time.strptime(self.publishtime, "%Y-%m-%d %H:%M:%S"))))
			
 
				+        elif "-" in str(self.publishtime) and ":" not in str(self.publishtime):
			
 
				+            self.publishtime = int2long(int(time.mktime(time.strptime(self.publishtime, "%Y-%m-%d"))))
			
 
				+        elif len(str(self.publishtime)) == 10 or len(str(self.publishtime)) == 13:
			
 
				+            self.publishtime = int2long(int(str(self.publishtime)[:10]))
			
 
				+        else:
			
 
				+            raise ValueError("The publication time format is incorrect -> %r " %(self.publishtime))
			
 
				+
			
 
				+        # 数据获取失败处理：输出错误日志
			
 
				+        if self.contentlhtml is None:
			
 
				+            log.error(f"{self.href},此链接数据正文抓取失败")
			
 
				+            self.save=False
			
 
				+        if not self.projectname or not self.publishtime or not self.href:
			
 
				+            log.error(f"部分数据抓取失败，数据详情：\n 链接：{self.href}\n 发布时间：{self.publishtime}\n标题:{self.title}")
			
 
				+            self.save=False
			
 
				+        if self.contentlhtml is not None and self.detail =='':
			
 
				+            self.detail = substitute(self.contentlhtml)
			
 
				+            '''
			
 
				+            detail:去头、去尾
			
 
				+            '''
			
 
				+            if text_search(self.detail).total == 0:
			
 
				+                # 无正文内容时，该内容直接标记true, 不在被统计、不入生产库
			
 
				+                self.sendflag = "true"
			
 
				+
			
 
				+class NjpcListItem(Item):
			
 
				+    def __init__(self):
			
 
				+        # 一类字段
			
 
				+        self.href = ""  # 非竞品快照页地址
			
 
				+        self.projectname = ""  # 项目名称
			
 
				+        self.publishtime = ""  # 文章发布时间（日期格式 xxxx-xx-xx）
			
 
				+        self.spidercode = ""  # 爬虫代码（编辑器爬虫平台定义）
			
 
				+        self.site = ""  # 采集的站点（编辑器爬虫平台定义）
			
 
				+        self.channel = ""  # 采集的版块（编辑器爬虫平台定义）
			
 
				+        self.area = "全国"  # 省
			
 
				+        self.city = ""  # 市
			
 
				+        self.district = ""  # 区县
			
 
				+
			
 
				+        # 辅助字段 存储时的辅助字段
			
 
				+        self.save = True  # 区县
			
 
				+        self.parser_name = ""  # 处理详情页爬虫的名称
			
 
				+        self.parser_url = ""  # 处理详情页的url
			
 
				+        self.failed = 0 #失败请求的计数
			
 
				+
			
 
				+
			
 
				+    def pre_to_db(self):
			
 
				+        pass
			
 
				+