|
@@ -4,7 +4,7 @@ import hashlib
|
|
|
import re
|
|
|
from collections import namedtuple
|
|
|
from string import whitespace
|
|
|
-
|
|
|
+from bs4 import BeautifulSoup
|
|
|
import bson
|
|
|
import requests
|
|
|
|
|
@@ -124,6 +124,7 @@ def text_search(content: str) -> SearchText:
|
|
|
return SearchText(len(results))
|
|
|
|
|
|
|
|
|
+
|
|
|
def int2long(param: int):
    """Wrap a Python int in a BSON 64-bit integer (``bson.int64.Int64``)."""
    int64_type = bson.int64.Int64
    return int64_type(param)
|
|
@@ -157,9 +158,9 @@ def njpc_fields_extract(html, data_item, is_clean=False):
|
|
|
|
|
|
data_item.title = data_item.projectname
|
|
|
projectname = re.findall('项目名称(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
|
|
|
- approvecode = re.findall('项目代码(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
|
|
|
+ approvecode = re.findall('项目(?:代码|编码)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
|
|
|
approvecontent = re.findall('(?:事项名称|审批事项)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
|
|
|
- owner = re.findall('建设(?:单位|单位名称)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
|
|
|
+ owner = re.findall('[建设|项目](?:单位|单位名称|业主)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
|
|
|
projectaddr = re.findall('建设(?:地点|地址)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
|
|
|
total_investment = re.findall('总投资(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
|
|
|
project_person = re.findall('联系人(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
|
|
@@ -216,6 +217,79 @@ def njpc_fields_extract(html, data_item, is_clean=False):
|
|
|
return data_item
|
|
|
|
|
|
|
|
|
# Regex field extraction for proposed-construction crawlers. Every extracted
# value must be followed by a terminator such as [,,.。;;、::], optionally
# preceded by a section numeral (一二三… or a digit).
def njpc_fields_extract_special(html, data_item):
    """
    Extract project fields from a detail page after stripping all markup.

    The page is reduced to whitespace-free plain text; each field is then
    located as ``<label>[extra Chinese chars]<colon><value><terminator>``.
    The terminator may be preceded by one section numeral so that a following
    heading such as ``二、`` is not swallowed into the value.

    Relies on ``BeautifulSoup`` plus the ``search`` and ``clean_chars``
    helpers, which are expected to be provided elsewhere in this module.

    :param str html: page source (``None``/empty is treated as empty text)
    :param Items data_item: detail-page item; must support both attribute
        access and item assignment
    :return: ``data_item`` with every successfully extracted field filled in
    """
    # Strip every tag, then drop all whitespace so labels and values are contiguous.
    soup = BeautifulSoup(html or "", 'html.parser')
    text = "".join(soup.get_text().split())

    # The title mirrors whatever project name the item already carries.
    data_item.title = data_item.projectname

    label_tail = r'(?:[\u4e00-\u9fa5]+|)'   # optional extra Chinese chars after the label
    separator = r'[::|]'                    # ASCII / full-width colon ('|' kept for compatibility)
    numeral = r'[一二三四五六七八九十\d]'      # section numbering that may precede a terminator
    default_end = r'[,,.。;;、::]'
    amount_end = r'[。;;、::]'             # no comma/period: amounts look like 1,000.5
    date_end = r'[,,。;;、]'               # no period/colon: dates look like 2022.01.01 10:00

    def find(label, end=default_end):
        """Return every value that follows *label* up to the first terminator."""
        pattern = label + label_tail + separator + r'(.*?)' + numeral + r'?' + end
        return re.findall(pattern, text, re.S)

    projectname = find(r'项目名称')
    approvecode = find(r'项目(?:代码|编码)')
    approvecontent = find(r'(?:事项名称|审批事项)')
    # A real alternation: the former character class [建设|项目] matched a
    # single character and therefore accepted labels such as "承建单位".
    owner = find(r'(?:建设|项目)(?:单位|单位名称|业主)')
    projectaddr = find(r'建设(?:地点|地址)')
    total_investment = find(r'总投资', amount_end)
    project_person = find(r'联系人')
    project_phone = find(r'联系(?:电话|方式)')
    approvedept = find(r'审批部门')
    approvenumber = find(r'(?:审批|批准)文号')
    approvetime = find(r'审批时间', date_end)
    project_completedate = find(r'竣工日期', date_end)

    # The scale description may itself contain commas and periods, so it only
    # ends at the next numbered heading (numeral followed by 、 or a colon).
    project_scale = "".join(re.findall(
        r'建设(?:内容|内容[及与和]规模|规模|规模[及与和]内容)'
        + label_tail + separator + r'(.*?)' + numeral + r'+[、::]',
        text, re.S))

    if project_scale:
        construction_area = search(
            r'[总]*\S*建筑[面|面积]*[约|为]*(.*?)[。|,|,|;]', project_scale)
        floor_area = search(
            r'[总]*\S*占地[面|面积]*[约|为]*(.*?)[。|,|,|;]', project_scale)
        # Normalise a missing match to "" and drop any colons from a found one.
        construction_area = re.sub(r'[::]', "", construction_area) if construction_area else ""
        floor_area = re.sub(r'[::]', "", floor_area) if floor_area else ""

        data_item.project_scale = project_scale
        data_item.project_scale_info = {
            "construction_area": construction_area,
            "floor_area": floor_area,
        }  # construction scale and main content

    fields_dict = {
        "projectname": projectname,
        "owner": owner,
        "total_investment": total_investment,
        "project_person": project_person,
        "project_phone": project_phone,
        "approvedept": approvedept,
        "approvetime": approvetime,
        "project_completedate": project_completedate,
        "projectaddr": projectaddr,
        "approvecode": approvecode,
        "approvecontent": approvecontent,
        "approvenumber": approvenumber,
    }

    # Trailing punctuation/symbols left over at the end of a value.
    trailing_junk = re.compile(r'[,,|.。));;??&$#@!!%*\'"‘’“¥ ]*$')

    for fields_k, fields_v in fields_dict.items():
        if not fields_v:
            continue
        # Only the first occurrence of each field is kept.
        value = clean_chars(fields_v[0])
        if not value:
            continue
        data_item[fields_k] = trailing_junk.sub("", value)

    return data_item
|
|
|
+
|
|
|
+
|
|
|
def get_proxy():
|
|
|
headers = {
|
|
|
"Authorization": "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
|
|
@@ -259,3 +333,26 @@ def get_construction_area(project_scale):
|
|
|
else:
|
|
|
construction_area = construction_area.replace(':', '').replace(':', '')
|
|
|
return construction_area
|
|
|
+
|
|
|
+
|
|
|
# Filter unwanted fragments out of a detail page.
def remove_htmldata(remove_info_list: list, html: str, response):
    """
    Remove unwanted fragments from detail-page text.

    Args:
        remove_info_list: fragments to delete. An entry starting with ``//``
            is treated as an XPath and resolved through
            ``response.xpath(...).extract_first()`` (first match only); any
            other entry is removed as literal text. Empty or non-string
            entries are skipped.
        html: text to clean
        response: original response object; only used for XPath entries

    Returns: the cleaned text; ``html`` is returned untouched when either
        ``html`` or ``remove_info_list`` is empty.

    """
    if not html or not remove_info_list:
        return html

    for extra_item in remove_info_list:
        # Skip None / empty / non-string entries instead of raising.
        if not extra_item or not isinstance(extra_item, str):
            continue
        if extra_item.startswith('//'):
            extra_html = response.xpath(extra_item).extract_first()
        else:
            extra_html = extra_item
        if extra_html:
            html = html.replace(extra_html, '')
    return html
|