|
@@ -0,0 +1,140 @@
|
|
|
|
+from feapder import Item
|
|
|
|
+from untils.tools import int2long,substitute,text_search,CheckPrePareRequest
|
|
|
|
+import time
|
|
|
|
+from feapder.utils.log import log
|
|
|
|
# Module-level integer, initialised to 0. Nothing in the visible part of this
# file reads or updates it.
# NOTE(review): confirm whether another module imports `xxc` before removing it.
# (The former `global xxc` statement was dropped: at module scope it is a no-op.)
xxc = 0
|
|
|
|
class DataNjpcItem(Item):
    """Detail-page item for a proposed-construction ("njpc") project record.

    ``__init__`` declares the tier-one fields with their defaults;
    ``pre_to_db`` normalises and validates them before the item is stored.
    """

    def __init__(self):
        # --- Tier-one fields ---
        self.href = ""  # URL of the (non-competitor) snapshot page
        self.projectname = ""  # project name
        self.publishtime = ""  # article publish time (date format xxxx-xx-xx)
        self.detail = ""  # tender text obtained by cleaning the snapshot page source
        self.contentlhtml = ""  # snapshot page source
        self.spidercode = ""  # spider code (defined on the editor/crawler platform)
        self.site = ""  # crawled site (defined on the editor/crawler platform)
        self.channel = ""  # crawled section (defined on the editor/crawler platform)
        self.area = "全国"  # province; the default literal means "nationwide"
        self.city = ""  # city
        self.district = ""  # district / county

        # --- Auxiliary field used at storage time ---
        # Set to False by pre_to_db() when required data is missing.
        # NOTE(review): presumably the pipeline skips items with save == False;
        # confirm against the pipeline code.
        self.save = True

        # --- Tier-two fields ---
        # Only stored when present; they are not part of the fixed storage
        # structure, so they are not initialised here.
        #
        # Attachments (default null). Expected format:
        #   projectinfo.attachments = [{
        #       "fid": "attachment id"
        #       "filename": "attachment name"
        #       "ftype": "file type"
        #       "org_url": "original attachment URL"
        #       "size": "attachment size"
        #       "url": "attachment URL"}]
        # Approval item name                        approvecontent
        # Project code (approval code)              approvecode
        # Approval document number                  approvenumber
        # Total investment                          total_investment
        # Funding source                            funds
        # Owner organisation                        owner
        # Declaration method (project type)         projecttype
        # Construction location                     projectaddr
        # Construction period                       projectperiod
        # Start-of-construction date                project_startdate
        # Completion date                           project_completedate
        # Approving department                      approvedept
        # Approval result                           approvestatus
        # Project contact person                    project_person
        # Project contact phone                     project_phone
        #
        # Construction scale and main content       project_scale_info
        #                                           project_scale
        # Building area                             construction_area
        # Site (land) area                          floor_area
        # Number of building floors                 building_floors
        # Steel structure                           steel_structure
        # Exterior wall materials                   exterior_wall_materials
        # Garage parking spaces                     parking_pace
        # Elevator                                  elevator
        # Air conditioning                          air_conditioner
        # Fresh-air system                          freshair_system
        # Heating method                            heating_method
        # Prefabricated building                    prefabricated_building
        # Passive house                             passive_house
        # Other construction content description    other_project_scale

        # --- Tier-three fields (may be skipped when hard to extract) ---
        # Owner and contact details                 owner_info
        # Owner / construction organisation         owner
        # Owner contact person                      ownerperson
        # Owner contact details                     ownertel
        # Owner address                             owneraddr
        # Design institute and contact details      designunit_info
        # Design organisation                       designunit
        # Design organisation contact person        designunitperson
        # Design organisation contact details       designunittel
        # Design organisation address               designunitaddr
        # Contractor and contact details            constructionunit_info
        # Contractor                                constructionunit
        # Contractor contact person                 constructionunitperson
        # Contractor contact details                constructionunittel
        # Contractor address                        constructionunitaddr

    def pre_to_db(self):
        """Normalise and validate the item before it is stored.

        * Sets ``comeintime`` to the current time (seconds) via ``int2long``.
        * Converts ``publishtime`` to epoch seconds via ``int2long``. Accepted
          inputs: ``"%Y-%m-%d %H:%M:%S"``, ``"%Y-%m-%d"``, or a value whose
          string form is 10 or 13 characters long (its first 10 characters
          are parsed as an integer).
        * Sets ``save`` to False and logs an error when ``contentlhtml`` is
          None, or when ``projectname``, ``publishtime`` or ``href`` is falsy.
        * Fills an empty ``detail`` from ``contentlhtml`` via ``substitute``.

        Raises:
            ValueError: if ``publishtime`` matches none of the accepted forms
                (this includes the empty default). If no publish time can be
                parsed from the page, consider supplying a fallback value
                before this method runs.
        """
        # Ingestion timestamp in seconds, passed through int2long.
        self.comeintime = int2long(time.time())

        # Inspect and parse the same string form, so a non-str value whose
        # text looks like a date is parsed instead of raising TypeError.
        publish_text = str(self.publishtime)
        if "-" in publish_text:
            time_format = "%Y-%m-%d %H:%M:%S" if ":" in publish_text else "%Y-%m-%d"
            parsed = time.strptime(publish_text, time_format)
            self.publishtime = int2long(int(time.mktime(parsed)))
        elif len(publish_text) in (10, 13):
            # 10 characters: second timestamp; 13: millisecond timestamp,
            # truncated to its first 10 digits.
            self.publishtime = int2long(int(publish_text[:10]))
        else:
            # One-element tuple keeps %-formatting correct for tuple values.
            raise ValueError("The publication time format is incorrect -> %r " % (self.publishtime,))

        # Fetch-failure handling: log the problem and clear the save flag.
        if self.contentlhtml is None:
            log.error(f"{self.href},此链接数据正文抓取失败")
            self.save = False
        if not self.projectname or not self.publishtime or not self.href:
            # The title of this item is `projectname`; the class defines no
            # `title` attribute, so reading one here would raise AttributeError.
            log.error(f"部分数据抓取失败,数据详情:\n 链接:{self.href}\n 发布时间:{self.publishtime}\n标题:{self.projectname}")
            self.save = False

        if self.contentlhtml is not None and self.detail == '':
            self.detail = substitute(self.contentlhtml)
            # Original note here: "detail: strip header and footer" (no
            # trimming code is present in this method).
            # NOTE(review): the source's indentation was collapsed; this check
            # is assumed to belong inside the branch above — confirm.
            if text_search(self.detail).total == 0:
                # No body text: flag the record as "true" so it is no longer
                # counted and is not pushed to the production store.
                self.sendflag = "true"
|
|
|
|
+
|
|
|
|
class NjpcListItem(Item):
    """List-page item for a proposed-construction ("njpc") project entry.

    Holds the tier-one fields known at list level plus the auxiliary fields
    that name the detail-page parser and URL.
    """

    def __init__(self):
        # (attribute name, default value) pairs, applied in this order.
        field_defaults = (
            # --- Tier-one fields ---
            ("href", ""),  # URL of the (non-competitor) snapshot page
            ("projectname", ""),  # project name
            ("publishtime", ""),  # article publish time (date format xxxx-xx-xx)
            ("spidercode", ""),  # spider code (defined on the editor/crawler platform)
            ("site", ""),  # crawled site (defined on the editor/crawler platform)
            ("channel", ""),  # crawled section (defined on the editor/crawler platform)
            ("area", "全国"),  # province; the default literal means "nationwide"
            ("city", ""),  # city
            ("district", ""),  # district / county
            # --- Auxiliary fields used at storage time ---
            ("save", True),  # storage flag, True by default
            ("parser_name", ""),  # name of the spider that handles the detail page
            ("parser_url", ""),  # URL of the detail page to process
            ("failed", 0),  # count of failed requests
        )
        for attr_name, default_value in field_defaults:
            setattr(self, attr_name, default_value)

    def pre_to_db(self):
        """Do nothing: list items need no processing before storage."""
        return None
|
|
|
|
+
|