# tools.py

import copy
import functools
import hashlib
import re
from collections import namedtuple
from string import whitespace

import bson
import requests

from untils.clean_html import cleaner

SearchText = namedtuple('SearchText', ['total'])


def substitute(html_str, special=None, completely=False):
    """Clean/substitute HTML markup via the shared cleaner."""
    html_str = cleaner(html=html_str, special=special, completely=completely)
    return html_str


def merge_files(*files):
    """Merge attachment dicts into one dict, re-keyed by 1-based index."""
    res = {}
    for file_ in files:
        if isinstance(file_, dict):
            for _, attachment in file_.items():
                res[str(len(res) + 1)] = attachment
    return res
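# Usage sketch (hypothetical attachment dicts; values are re-keyed from "1"):
# >>> merge_files({'1': {'filename': 'a.pdf'}}, {'1': {'filename': 'b.pdf'}})
# {'1': {'filename': 'a.pdf'}, '2': {'filename': 'b.pdf'}}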


def is_all_chinese(strs):
    """Check that every character in the string is a Chinese character."""
    for _char in strs:
        if not '\u4e00' <= _char <= '\u9fa5':
            return False
    return True
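# Examples:
# >>> is_all_chinese('项目')
# True
# >>> is_all_chinese('项目2024')
# False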


def clean_document(*fields):
    """
    Clean a mongo document.

    :param fields: extra fields to strip, besides the defaults
    # Usage:
    # >>> clean_document('dzr')(lambda *args, **kw: None)(document)
    """
    def clean(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            defaults = {
                "_id",
                "parser_name", "parser_url", "request_params",
                "failed", "error"
            }
            removes = defaults if not fields else {*defaults, *fields}
            item = args[0] if not kwargs else kwargs
            data_dict = item if isinstance(item, dict) else item.to_dict
            copy_data_dict = copy.deepcopy(data_dict)
            for k in copy_data_dict.keys():
                if k in removes:
                    del data_dict[k]
                    try:
                        delattr(item, k)  # drop the attribute from the Item instance
                    except AttributeError:
                        pass
            return func(*args, **kwargs)
        return wrapper
    return clean
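# Usage sketch (``doc`` is a hypothetical mongo document; the default fields
# plus 'dzr' are stripped in place before the wrapped function runs):
# >>> @clean_document('dzr')
# ... def save(doc):
# ...     pass
# >>> doc = {'_id': 1, 'title': 't', 'dzr': 'x'}
# >>> save(doc)   # doc is now {'title': 't'}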


def clean_chars(text, charsets=whitespace):
    """
    Delete every character of ``charsets`` from the text.

    :param str text: text
    :param charsets: characters to remove
    :return: the cleaned text
    """
    if text is not None:
        for char in charsets:
            if char in text:
                text = text.replace(char, '')
    return text
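# Example (the default charset is string.whitespace, so blanks and newlines go):
# >>> clean_chars(' a b\nc ')
# 'abc'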


def get_signature(content: str) -> str:
    """
    SHA-1 digest of the text, as a hex string.

    @param content: text
    @return: digest value
    """
    sha1 = hashlib.sha1()
    sha1.update(content.encode("utf-8"))
    return sha1.hexdigest()
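# Example: a stable 40-character fingerprint, e.g. for deduplication:
# >>> get_signature('hello')
# 'aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d'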


def get_md5(val):
    md5 = hashlib.md5()
    if isinstance(val, bytes):
        md5.update(val)  # hash the raw bytes directly
    elif isinstance(val, str):
        md5.update(val.encode("utf-8"))
    return md5.hexdigest()
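# Example (a str and its utf-8 bytes hash identically):
# >>> get_md5('hello')
# '5d41402abc4b2a76b9719d911017c592'
# >>> get_md5(b'hello') == get_md5('hello')
# True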


def text_search(content: str) -> SearchText:
    """
    Chinese text search.

    :param content: text
    :return: the number of Chinese characters
    """
    if not content:
        return SearchText(0)
    results = re.findall('[\u4e00-\u9fa5]', content, re.S)
    # the length of the list is the Chinese character count
    return SearchText(len(results))
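# Example: counts only CJK ideographs, ignoring ASCII and punctuation:
# >>> text_search('项目 abc 123').total
# 2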


def int2long(param: int):
    """Convert int to a BSON long (Int64)."""
    return bson.int64.Int64(param)
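# Example (bson.int64.Int64 subclasses int, so comparisons still work):
# >>> int2long(1) == 1
# True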


def njpc_hpsj_filt_keywords(text: str, special_kw=None):
    if special_kw is None:
        special_kw = {}
    keywords = {'项目', '工程', '验收', '评价', *special_kw}
    for keyword in keywords:
        # keep the record as soon as any keyword appears in the text
        if re.match(f'.*{keyword}', text, re.S) is not None:
            return True  # needs collecting
    return False  # discard
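# Examples (any default keyword anywhere in the text keeps the record):
# >>> njpc_hpsj_filt_keywords('某某大桥工程竣工公示')
# True
# >>> njpc_hpsj_filt_keywords('人员招聘公告')
# False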


# Regex field extraction for the proposed-project ("njpc") crawler
def njpc_fields_extract(html, data_item, is_clean=False):
    """
    Regex field extraction for the proposed-project crawler.

    :param str html: page source
    :param Items data_item: detail-page item
    :param bool is_clean: whether to clean the page source first
    :return:
    """
    if is_clean:
        html = substitute(html)
    # default the title to the projectname already carried on the item
    data_item.title = data_item.projectname
    projectname = re.findall(r'项目名称(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    approvecode = re.findall(r'项目代码(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    approvecontent = re.findall(r'(?:事项名称|审批事项)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    owner = re.findall(r'建设(?:单位|单位名称)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    projectaddr = re.findall(r'建设(?:地点|地址)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    total_investment = re.findall(r'总投资(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    project_person = re.findall(r'联系人(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    project_phone = re.findall(r'联系(?:电话|方式)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    approvedept = re.findall(r'审批部门(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    approvenumber = re.findall(r'(?:审批|批准)文号(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    approvetime = re.findall(r'审批时间(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    project_scale = "".join(re.findall(
        r'建设(?:内容|内容[及|与|和]规模|规模|规模[及|与|和]内容)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<',
        html, re.S))
    project_completedate = re.findall(r'竣工日期(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    if project_scale:
        construction_area = search(r'[总]*\S*建筑[面|面积]*[约|为]*(.*?)[。|,|,|;]', project_scale)
        floor_area = search(r'[总]*\S*占地[面|面积]*[约|为]*(.*?)[。|,|,|;]', project_scale)
        if not construction_area:
            construction_area = ""
        else:
            construction_area = re.sub(":|:", "", construction_area)
        if not floor_area:
            floor_area = ""
        else:
            floor_area = re.sub(":|:", "", floor_area)
        data_item.project_scale = project_scale
        data_item.project_scale_info = {
            "construction_area": construction_area,
            "floor_area": floor_area,
        }  # construction scale and main content
    fields_dict = {
        "projectname": projectname,
        "owner": owner,
        "total_investment": total_investment,
        "project_person": project_person,
        "project_phone": project_phone,
        "approvedept": approvedept,
        "approvetime": approvetime,
        "project_completedate": project_completedate,
        "projectaddr": projectaddr,
        "approvecode": approvecode,
        "approvecontent": approvecontent,
        "approvenumber": approvenumber
    }
    for fields_k, fields_v in fields_dict.items():
        if fields_v:
            fields_v[0] = clean_chars(fields_v[0])
            if not fields_v[0]:
                continue
            # strip trailing punctuation/symbols from the captured value
            data_item[fields_k] = re.sub(
                r'([,|.|。|)|)|,|;|;|?|&|$|#|@|!|!|%|*|\'|"|‘|’|“|¥|?| ]*?)$',
                "", fields_v[0])
    return data_item
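# Usage sketch (``NjpcDetailItem`` and ``page_source`` are hypothetical; any
# item supporting attribute and key assignment works):
# >>> item = NjpcDetailItem()
# >>> item.projectname = '某某安置房项目'
# >>> item = njpc_fields_extract(html=page_source, data_item=item, is_clean=True)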


def get_proxy():
    headers = {
        "Authorization": "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
    }
    proxy = requests.get(
        "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch",
        headers=headers
    ).json()
    print(f"switching proxy: {proxy.get('data')}")
    return proxy.get("data").get("http")
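# Usage sketch (assumes the endpoint replies with {'data': {'http': ...}},
# as the accessors above imply; ``url`` is a placeholder target):
# >>> addr = get_proxy()
# >>> requests.get(url, proxies={'http': addr, 'https': addr}, timeout=30)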


def search(pattern, string):
    result = re.search(pattern, string)
    if result:
        return result.groups()[0]
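# Example: returns the first capture group, or None when nothing matches:
# >>> search(r'面积(\d+)平方米', '建筑面积1200平方米')
# '1200'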


def search_construction(string):
    # assumed pattern: mirrors the construction-area regex in get_construction_area below
    result = re.search(r'[总]*\S*建筑[面|面积]*[约|为]*(.*?)[。|,|,|;]', string)
    if result:
        return result.groups()[0]


def search_floor(string):
    # assumed pattern: mirrors the floor-area regex in get_floor_area below
    result = re.search(r'[总]*\S*占地[面|面积]*[约|为]*(.*?)[。|,|,|;]', string)
    if result:
        return result.groups()[0]


def get_floor_area(project_scale):
    floor_area = search(r'[总]*\S*占地[面|面积]*[约|为]*(.*?)[。|,|,|;]', project_scale)
    if not floor_area:
        floor_area = ""
    else:
        floor_area = floor_area.replace(':', '').replace(':', '')
    return floor_area


def get_construction_area(project_scale):
    construction_area = search(r'[总]*\S*建筑[面|面积]*[约|为]*(.*?)[。|,|,|;]', project_scale)
    if not construction_area:
        construction_area = ""
    else:
        construction_area = construction_area.replace(':', '').replace(':', '')
    return construction_area
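# Example: pulls the clause after "总建筑面积" up to the first delimiter:
# >>> get_construction_area('总建筑面积约5万平方米,总投资1亿元。')
# '5万平方米'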