tools.py

import copy
import functools
import hashlib
import re
from collections import namedtuple
from string import whitespace

import bson
import requests
from bs4 import BeautifulSoup

from untils.clean_html import cleaner

SearchText = namedtuple('SearchText', ['total'])


def substitute(html_str, special=None, completely=False):
    """HTML substitution (delegates to the shared cleaner)"""
    html_str = cleaner(html=html_str, special=special, completely=completely)
    return html_str


def merge_files(*files):
    """Merge attachment files"""
    res = {}
    for file_ in files:
        if isinstance(file_, dict):
            for _, attachment in file_.items():
                res[str(len(res) + 1)] = attachment
    return res
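
# Example (illustrative values): attachments are re-keyed sequentially, so
# >>> merge_files({"1": "a.pdf"}, {"1": "b.pdf"})
# {'1': 'a.pdf', '2': 'b.pdf'}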


def is_all_chinese(strs):
    """Check whether every character is a Chinese character"""
    for _char in strs:
        if not '\u4e00' <= _char <= '\u9fa5':
            return False
    return True
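
# Example: is_all_chinese("项目") is True; is_all_chinese("项目A") is False.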


def clean_document(*fields):
    """
    Clean a mongo document.

    :param fields: extra fields to strip

    # Usage:
    # >>> clean_document('dzr')(lambda *args, **kw: None)(document)
    """
    def clean(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            defaults = {
                "_id",
                "parser_name", "parser_url", "request_params",
                "failed", "error"
            }
            removes = defaults if not fields else {*defaults, *fields}
            item = args[0] if not kwargs else kwargs
            data_dict = item if isinstance(item, dict) else item.to_dict
            copy_data_dict = copy.deepcopy(data_dict)
            for k in copy_data_dict.keys():
                if k in removes:
                    del data_dict[k]
                    try:
                        delattr(item, k)  # delete the attribute from the Item instance
                    except AttributeError:
                        pass
            return func(*args, **kwargs)
        return wrapper
    return clean
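
# Usage sketch (the extra field name below is illustrative, not required by this module):
#
# >>> @clean_document('sendflag')
# ... def save(document):
# ...     ...  # default fields such as "_id", "parser_name" plus "sendflag" are gone here
#
# The decorator strips the listed keys (and matching Item attributes) from the
# document before calling the wrapped function.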


def clean_chars(text, charsets=whitespace):
    """
    Delete characters according to the given character set.

    :param str text: text
    :param charsets: character set
    :return: cleaned text
    """
    if text is not None:
        for char in charsets:
            if char in text:
                text = text.replace(char, '')
    return text


def get_signature(content: str) -> str:
    """
    Digest value as a hexadecimal string (SHA-1).

    @param content: string text
    @return: digest value
    """
    sha1 = hashlib.sha1()
    sha1.update(content.encode("utf-8"))
    return sha1.hexdigest()


def get_md5(val):
    md5 = hashlib.md5()
    if isinstance(val, bytes):
        md5.update(val)  # hash the raw bytes directly
    elif isinstance(val, str):
        md5.update(val.encode("utf-8"))
    return md5.hexdigest()


def text_search(content: str) -> SearchText:
    """
    Chinese-text search.

    :param content: text
    :return: number of Chinese characters
    """
    if not content:
        return SearchText(0)
    results = re.findall('[\u4e00-\u9fa5]', content, re.S)
    # the list length is the number of Chinese characters
    return SearchText(len(results))
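
# Example: text_search("合同公告2024").total == 4 (only CJK characters are counted).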


def int2long(param: int):
    """Convert int to BSON long (Int64)"""
    return bson.int64.Int64(param)


def njpc_hpsj_filt_keywords(text: str, special_kw=None):
    if special_kw is None:
        special_kw = set()
    keywords = {'项目', '工程', '验收', '评价', *special_kw}
    for keyword in keywords:
        result = re.match(f'.*{keyword}', text, re.S)
        if result is not None:
            return True  # keyword hit: collect this page
    return False  # no keyword matched: discard
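
# Example: njpc_hpsj_filt_keywords('某某中学新建工程') -> True (hits "工程"),
# while njpc_hpsj_filt_keywords('人事任免公示') -> False and the page is dropped.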


# Regex field extraction for the proposed-project (njpc) spider
def njpc_fields_extract(html, data_item, is_clean=False):
    """
    Regex field extraction for the proposed-project spider.

    :param str html: page source
    :param Items data_item: detail-page item
    :param bool is_clean: whether to clean the source first
    :return:
    """
    if is_clean:
        html = substitute(html)

    data_item.title = data_item.projectname
    projectname = re.findall('项目名称(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    approvecode = re.findall('项目(?:代码|编码)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    approvecontent = re.findall('(?:事项名称|审批事项)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    owner = re.findall('[建设|项目](?:单位|单位名称|业主)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    projectaddr = re.findall('建设(?:地点|地址)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    total_investment = re.findall('总投资(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    project_person = re.findall('联系人(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    project_phone = re.findall('联系(?:电话|方式)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    approvedept = re.findall('审批部门(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    approvenumber = re.findall('(?:审批|批准)文号(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    approvetime = re.findall('审批时间(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    project_scale = "".join(re.findall('建设(?:内容|内容[及|与|和]规模|规模|规模[及|与|和]内容)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S))
    project_completedate = re.findall('竣工日期(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)

    if project_scale:
        construction_area = search(r'[总]*\S*建筑[面|面积]*[约|为]*(.*?)[。|,|,|;]', project_scale)
        floor_area = search(r'[总]*\S*占地[面|面积]*[约|为]*(.*?)[。|,|,|;]', project_scale)
        if not construction_area:
            construction_area = ""
        else:
            construction_area = re.sub(":|:", "", construction_area)
        if not floor_area:
            floor_area = ""
        else:
            floor_area = re.sub(":|:", "", floor_area)

        data_item.project_scale = project_scale
        data_item.project_scale_info = {
            "construction_area": construction_area,
            "floor_area": floor_area,
        }  # construction scale and main content

    fields_dict = {
        "projectname": projectname,
        "owner": owner,
        "total_investment": total_investment,
        "project_person": project_person,
        "project_phone": project_phone,
        "approvedept": approvedept,
        "approvetime": approvetime,
        "project_completedate": project_completedate,
        "projectaddr": projectaddr,
        "approvecode": approvecode,
        "approvecontent": approvecontent,
        "approvenumber": approvenumber
    }
    for fields_k, fields_v in fields_dict.items():
        if fields_v:
            fields_v[0] = clean_chars(fields_v[0])
            if not fields_v[0]:
                continue
            data_item[fields_k] = re.sub(
                r'([,|.|。|)|)|,|;|;|?|&|$|#|@|!|!|%|*|\'|"|‘|’|“|¥|?| ]*?)$',
                "", fields_v[0])
    return data_item
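
# Usage sketch (the HTML snippet is illustrative; ``data_item`` is the detail-page
# Item used by the spider and must already carry ``projectname``):
#
# >>> html = "<p>项目名称:示例学校新建项目</p><p>总投资:5000万元</p>"
# >>> item = njpc_fields_extract(html, data_item)
# >>> item["projectname"], item["total_investment"]
# ('示例学校新建项目', '5000万元')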


# Regex field extraction for the proposed-project spider
# (extracted values must end with a delimiter such as [,,.。;;一二三四五六七八九十、::])
def njpc_fields_extract_special(html, data_item):
    """
    Extracted values must end with one of the delimiters [,,.。;;一二三四五六七八九十、::].

    :param str html: page source
    :param Items data_item: detail-page item
    :return: item with the extracted fields filled in
    """
    # strip all tags
    soup = BeautifulSoup(html, 'html.parser')
    html = "".join(soup.get_text().split()).strip()
    # extract the fields
    data_item.title = data_item.projectname
    projectname = re.findall(r'项目名称(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,.。;;、::]', html, re.S)
    approvecode = re.findall(r'项目(?:代码|编码)(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,.。;;、::]', html, re.S)
    approvecontent = re.findall(r'(?:事项名称|审批事项)(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,.。;;、::]', html, re.S)
    owner = re.findall(r'[建设|项目](?:单位|单位名称|业主)(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,.。;;、::]', html, re.S)
    projectaddr = re.findall(r'建设(?:地点|地址)(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,.。;;、::]', html, re.S)
    total_investment = re.findall(r'总投资(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[。;;、::]', html, re.S)
    project_person = re.findall(r'联系人(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,.。;;、::]', html, re.S)
    project_phone = re.findall(r'联系(?:电话|方式)(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,.。;;、::]', html, re.S)
    approvedept = re.findall(r'审批部门(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,.。;;、::]', html, re.S)
    approvenumber = re.findall(r'(?:审批|批准)文号(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,.。;;、::]', html, re.S)
    approvetime = re.findall(r'审批时间(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,。;;、]', html, re.S)
    project_scale = "".join(re.findall(r'建设(?:内容|内容[及|与|和]规模|规模|规模[及|与|和]内容)(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]+[、::]', html, re.S))
    project_completedate = re.findall(r'竣工日期(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,。;;、]', html, re.S)

    if project_scale:
        construction_area = search(r'[总]*\S*建筑[面|面积]*[约|为]*(.*?)[。|,|,|;]', project_scale)
        floor_area = search(r'[总]*\S*占地[面|面积]*[约|为]*(.*?)[。|,|,|;]', project_scale)
        if not construction_area:
            construction_area = ""
        else:
            construction_area = re.sub(":|:", "", construction_area)
        if not floor_area:
            floor_area = ""
        else:
            floor_area = re.sub(":|:", "", floor_area)

        data_item.project_scale = project_scale
        data_item.project_scale_info = {
            "construction_area": construction_area,
            "floor_area": floor_area,
        }  # construction scale and main content

    fields_dict = {
        "projectname": projectname,
        "owner": owner,
        "total_investment": total_investment,
        "project_person": project_person,
        "project_phone": project_phone,
        "approvedept": approvedept,
        "approvetime": approvetime,
        "project_completedate": project_completedate,
        "projectaddr": projectaddr,
        "approvecode": approvecode,
        "approvecontent": approvecontent,
        "approvenumber": approvenumber
    }
    for fields_k, fields_v in fields_dict.items():
        if fields_v:
            fields_v[0] = clean_chars(fields_v[0])
            if not fields_v[0]:
                continue
            data_item[fields_k] = re.sub(
                r'([,|.|。|)|)|,|;|;|?|&|$|#|@|!|!|%|*|\'|"|‘|’|“|¥|?| ]*?)$',
                "", fields_v[0])
    return data_item


def get_proxy():
    headers = {
        "Authorization": "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
    }
    proxy = requests.get("http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch", headers=headers).json()
    print(f"Switching proxy: {proxy.get('data')}")
    return proxy.get("data").get("http")


def search(pattern, string):
    result = re.search(pattern, string)
    if result:
        return result.groups()[0]


def search_construction(string):
    # the literal 'pattern' below is a placeholder with no capture group,
    # so this stub never returns a value; use search() with a real pattern instead
    result = re.search('pattern', string)
    if result:
        return result.groups()[0]


def search_floor(string):
    # same placeholder pattern as search_construction; effectively a stub
    result = re.search('pattern', string)
    if result:
        return result.groups()[0]


def get_floor_area(project_scale):
    floor_area = search(r'[总]*\S*占地[面|面积]*[约|为]*(.*?)[。|,|,|;]', project_scale)
    if not floor_area:
        floor_area = ""
    else:
        floor_area = floor_area.replace(':', '').replace(':', '')
    return floor_area


def get_construction_area(project_scale):
    construction_area = search(r'[总]*\S*建筑[面|面积]*[约|为]*(.*?)[。|,|,|;]', project_scale)
    if not construction_area:
        construction_area = ""
    else:
        construction_area = construction_area.replace(':', '').replace(':', '')
    return construction_area
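
# Example (illustrative project_scale text):
# >>> scale = "总建筑面积约5000平方米,占地面积2000平方米。"
# >>> get_construction_area(scale), get_floor_area(scale)
# ('5000平方米', '2000平方米')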


# Filter out unwanted content from the detail page
def remove_htmldata(remove_info_list: list, html: str, response):
    """
    Args:
        remove_info_list: xpath expressions or literal text of the content to remove -> list
        html: text to clean
        response: original response object

    Returns: cleaned text
    """
    if html and remove_info_list:
        for extra_item in remove_info_list:
            if re.search('^//.*', extra_item):
                extra_html = response.xpath(extra_item).extract_first()
            else:
                extra_html = extra_item
            if extra_html:
                html = html.replace(extra_html, '')
    return html
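

# Smoke test for the dependency-free helpers (assumes the module-level imports resolve).
if __name__ == "__main__":
    assert clean_chars(" a\tb\nc ") == "abc"
    assert get_signature("abc") == "a9993e364706816aba3e25717850c26c9cd0d89d"  # SHA-1 test vector
    assert get_md5("abc") == "900150983cd24fb0d6963f7d28e17f72"  # MD5 test vector
    assert remove_htmldata(["<span>打印</span>"], "<div>正文<span>打印</span></div>", None) == "<div>正文</div>"
    print("tools.py smoke test passed")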