# tools.py

import copy
import functools
import hashlib
import re
from collections import namedtuple
from string import whitespace

import bson
from bs4 import BeautifulSoup
from feapder.network.proxy_pool import DirectProxyPool
from feapder.utils.clean_html import cleaner

SearchText = namedtuple('SearchText', ['total'])


def substitute(html_str, special=None, completely=False):
    """HTML substitution: clean the page source with feapder's cleaner."""
    html_str = cleaner(html=html_str, special=special, completely=completely)
    return html_str


def merge_files(*files):
    """Merge attachment dicts, re-keying every attachment with a consecutive string index."""
    res = {}
    for file_ in files:
        if isinstance(file_, dict):
            for _, attachment in file_.items():
                res[str(len(res) + 1)] = attachment
    return res
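
# A minimal usage sketch (the attachment dicts are hypothetical): attachments are
# re-keyed with consecutive string indexes, so
#   merge_files({'1': {'filename': 'a.pdf'}}, {'1': {'filename': 'b.doc'}})
# yields {'1': {'filename': 'a.pdf'}, '2': {'filename': 'b.doc'}}.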


def is_all_chinese(strs):
    """Check whether the string consists solely of Chinese characters."""
    for _char in strs:
        if not '\u4e00' <= _char <= '\u9fa5':
            return False
    return True


def clean_document(*fields):
    """
    Clean a mongo document before handing it to the wrapped function.
    :param fields: extra field names to strip (on top of the defaults)
    # Usage:
    # >>> clean_document('dzr')(lambda *args, **kw: None)(document)
    """
    def clean(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            defaults = {
                "_id",
                "parser_name", "parser_url", "request_params",
                "failed", "error"
            }
            removes = defaults if not fields else {*defaults, *fields}
            item = args[0] if not kwargs else kwargs
            data_dict = item if isinstance(item, dict) else item.to_dict
            copy_data_dict = copy.deepcopy(data_dict)
            for k in copy_data_dict.keys():
                if k in removes:
                    del data_dict[k]
                    try:
                        delattr(item, k)  # also drop the attribute from Item instances
                    except AttributeError:
                        pass
            return func(*args, **kwargs)
        return wrapper
    return clean
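
# A minimal usage sketch (handler and document are hypothetical): the decorator strips
# the default fields plus any named extras from the document before the body runs.
#   @clean_document('comeintime')
#   def save(document):
#       ...  # '_id', 'parser_name', ..., and 'comeintime' are already removed here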


def clean_chars(text, charsets=whitespace):
    """
    Remove every character of the given character set from the text.
    :param str text: text
    :param charsets: characters to strip
    :return: the cleaned text
    """
    if text is not None:
        for char in charsets:
            if char in text:
                text = text.replace(char, '')
    return text


def get_signature(content: str) -> str:
    """
    SHA-1 digest as a hexadecimal string.
    @param content: text
    @return: hex digest
    """
    sha1 = hashlib.sha1()
    sha1.update(content.encode("utf-8"))
    return sha1.hexdigest()


def get_md5(val):
    """MD5 digest of a str or bytes value as a hexadecimal string."""
    md5 = hashlib.md5()
    if isinstance(val, bytes):
        md5.update(val)  # hash the raw bytes directly
    elif isinstance(val, str):
        md5.update(val.encode("utf-8"))
    return md5.hexdigest()
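
# Usage sketch for the digest helpers (the values are the standard digests of "abc"):
#   get_signature("abc")  # -> 'a9993e364706816aba3e25717850c26c9cd0d89d'
#   get_md5("abc")        # -> '900150983cd24fb0d6963f7d28e17f72'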


def text_search(content: str) -> SearchText:
    """
    Chinese text search.
    :param content: text
    :return: SearchText(total=<number of Chinese characters>)
    """
    if not content:
        return SearchText(0)
    results = re.findall('[\u4e00-\u9fa5]', content, re.S)
    # the length of the match list is the number of Chinese characters
    return SearchText(len(results))
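
# Usage sketch: text_search("招标公告 2024").total == 4, since only the four Chinese
# characters are counted (digits and spaces are ignored).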


def int2long(param: int):
    """Convert an int to a BSON 64-bit long."""
    return bson.int64.Int64(param)


def njpc_hpsj_filt_keywords(text: str, special_kw=None):
    """Keyword filter: keep the text only if it mentions any of the target keywords."""
    if special_kw is None:
        special_kw = {}
    keywords = {'项目', '工程', '验收', '评价', *special_kw}
    for keyword in keywords:
        result = re.match(f'.*{keyword}', text, re.S)
        if result is not None:
            return True  # keep for crawling
    return False  # discard
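
# Usage sketch (the titles are hypothetical):
#   njpc_hpsj_filt_keywords('某某医院改扩建工程环评公示')  # -> True, hits '工程'
#   njpc_hpsj_filt_keywords('关于调整办公时间的通知')      # -> False, no keyword hit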


# Regex field extraction for the proposed-construction (拟建) crawler
def njpc_fields_extract(html, data_item, is_clean=False):
    """
    Regex field extraction for the proposed-construction crawler.
    :param str html: page source
    :param Items data_item: detail-page item
    :param bool is_clean: whether to clean the page source first
    :return:
    """
    if is_clean:
        html = substitute(html)
    data_item.title = data_item.projectname
    projectname = re.findall('项目名称(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    approvecode = re.findall('项目(?:代码|编码)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    approvecontent = re.findall('(?:事项名称|审批事项)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    owner = re.findall('[建设|项目](?:单位|单位名称|业主)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    projectaddr = re.findall('建设(?:地点|地址)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    total_investment = re.findall('总投资(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    project_person = re.findall('联系人(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    project_phone = re.findall('联系(?:电话|方式)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    approvedept = re.findall('审批部门(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    approvenumber = re.findall('(?:审批|批准)文号(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    approvetime = re.findall('审批时间(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    project_scale = "".join(re.findall('建设(?:内容|内容[及|与|和]规模|规模|规模[及|与|和]内容)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S))
    project_completedate = re.findall('竣工日期(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
    if project_scale:
        construction_area = search('[总]*\S*建筑[面|面积]*[约|为]*(.*?)[。|,|,|;]', project_scale)
        floor_area = search('[总]*\S*占地[面|面积]*[约|为]*(.*?)[。|,|,|;]', project_scale)
        if not construction_area:
            construction_area = ""
        else:
            construction_area = re.sub(":|:", "", construction_area)
        if not floor_area:
            floor_area = ""
        else:
            floor_area = re.sub(":|:", "", floor_area)
        data_item.project_scale = project_scale
        data_item.project_scale_info = {
            "construction_area": construction_area,
            "floor_area": floor_area,
        }  # construction scale and main contents
    fields_dict = {
        "projectname": projectname,
        "owner": owner,
        "total_investment": total_investment,
        "project_person": project_person,
        "project_phone": project_phone,
        "approvedept": approvedept,
        "approvetime": approvetime,
        "project_completedate": project_completedate,
        "projectaddr": projectaddr,
        "approvecode": approvecode,
        "approvecontent": approvecontent,
        "approvenumber": approvenumber
    }
    for fields_k, fields_v in fields_dict.items():
        if fields_v:
            fields_v[0] = clean_chars(fields_v[0])
            if not fields_v[0]:
                continue
            data_item[fields_k] = re.sub(
                r'([,|.|。|)|)|,|;|;|?|&|$|#|@|!|!|%|*|\'|"|‘|’|“|¥|?| ]*?)$',
                "", fields_v[0])
    return data_item
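
# A minimal usage sketch (item class and page snippet are hypothetical; data_item must
# support both attribute access and item assignment):
#   item = NjpcDetailItem()
#   item.projectname = '某某产业园建设项目'
#   html = '<p>项目名称:某某产业园建设项目</p><p>建设单位:某某置业有限公司</p>'
#   item = njpc_fields_extract(html, item)
#   # item.owner -> '某某置业有限公司'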


# Regex field extraction for the proposed-construction crawler
# (the extracted value must be terminated by a marker such as [,,.。;;一二三四五六七八九十、::])
def njpc_fields_extract_special(html, data_item):
    """
    Extracted values must be terminated by a marker such as [,,.。;;一二三四五六七八九十、::].
    :param str html: page source
    :param Items data_item: detail-page item
    :return: the item with the extracted fields filled in
    """
    # strip all tags
    soup = BeautifulSoup(html, 'html.parser')
    html = "".join(soup.get_text().split()).strip()
    # extract fields
    data_item.title = data_item.projectname
    projectname = re.findall('项目名称(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]+[,,.。;;、::]', html, re.S)
    approvecode = re.findall('项目(?:代码|编码)(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,.。;;、::]', html, re.S)
    approvecontent = re.findall('(?:事项名称|审批事项)(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]+[,,.。;;、::]', html, re.S)
    owner = re.findall('[建设|项目](?:单位|单位名称|业主)(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,.。;;、::]', html, re.S)
    projectaddr = re.findall('建设(?:地点|地址)(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]+[,,.。;;、::]', html, re.S)
    total_investment = re.findall('总投资(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[。;;、::]', html, re.S)
    project_person = re.findall('联系人(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,.。;;、::]', html, re.S)
    project_phone = re.findall('联系(?:电话|方式)(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,.。;;、::]', html, re.S)
    approvedept = re.findall('审批部门(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,.。;;、::]', html, re.S)
    approvenumber = re.findall('(?:审批|批准)文号(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,.。;;、::]', html, re.S)
    approvetime = re.findall('审批时间(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,。;;、]', html, re.S)
    project_scale = "".join(re.findall('建设(?:内容|内容[及|与|和]规模|规模|规模[及|与|和]内容)(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]+[、::]', html, re.S))
    project_completedate = re.findall('竣工日期(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,。;;、]', html, re.S)
    if project_scale:
        construction_area = search('[总]*\S*建筑[面|面积]*[约|为]*(.*?)[。|,|,|;]', project_scale)
        floor_area = search('[总]*\S*占地[面|面积]*[约|为]*(.*?)[。|,|,|;]', project_scale)
        if not construction_area:
            construction_area = ""
        else:
            construction_area = re.sub(":|:", "", construction_area)
        if not floor_area:
            floor_area = ""
        else:
            floor_area = re.sub(":|:", "", floor_area)
        data_item.project_scale = project_scale
        data_item.project_scale_info = {
            "construction_area": construction_area,
            "floor_area": floor_area,
        }  # construction scale and main contents
    fields_dict = {
        "projectname": projectname,
        "owner": owner,
        "total_investment": total_investment,
        "project_person": project_person,
        "project_phone": project_phone,
        "approvedept": approvedept,
        "approvetime": approvetime,
        "project_completedate": project_completedate,
        "projectaddr": projectaddr,
        "approvecode": approvecode,
        "approvecontent": approvecontent,
        "approvenumber": approvenumber
    }
    for fields_k, fields_v in fields_dict.items():
        if fields_v:
            fields_v[0] = clean_chars(fields_v[0])
            if not fields_v[0]:
                continue
            data_item[fields_k] = re.sub(
                r'([,|.|。|)|)|,|;|;|?|&|$|#|@|!|!|%|*|\'|"|‘|’|“|¥|?| ]*?)$',
                "", fields_v[0])
    return data_item


def get_proxy(scheme=None, default=None, socks5h=False):
    """Fetch a proxy dict from the proxy pool; fall back to ``default`` on failure."""
    proxy = DirectProxyPool(
        "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch",
        "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
    )
    try:
        proxies = proxy.get_proxy()
    except Exception:
        proxies = default
    print(f"Switching proxy: {proxies}")
    if proxies is not None:
        if socks5h:
            proxyh = {
                "http": proxies.get("http").replace("socks5", "socks5h"),
                "https": proxies.get("http").replace("socks5", "socks5h")
            }
            proxies = proxyh
        if not scheme:
            return proxies
        else:
            return proxies.get(scheme, default)
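
# Usage sketch (the fallback dict is hypothetical):
#   proxies = get_proxy(socks5h=True)             # full dict with socks5h:// addresses
#   http_proxy = get_proxy(scheme='http')         # only the 'http' entry
#   proxies = get_proxy(default={'http': None})   # returned when the pool is unreachable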


def search(pattern, string):
    """Return the first capture group of the first match, or None if nothing matches."""
    result = re.search(pattern, string)
    if result:
        return result.groups()[0]


def search_construction(string):
    # Assumed pattern, mirroring get_construction_area() below (same 建筑面积 extraction).
    result = re.search('[总]*\S*建筑[面|面积]*[约|为]*(.*?)[。|,|,|;]', string)
    if result:
        return result.groups()[0]


def search_floor(string):
    # Assumed pattern, mirroring get_floor_area() below (same 占地面积 extraction).
    result = re.search('[总]*\S*占地[面|面积]*[约|为]*(.*?)[。|,|,|;]', string)
    if result:
        return result.groups()[0]


def get_floor_area(project_scale):
    floor_area = search('[总]*\S*占地[面|面积]*[约|为]*(.*?)[。|,|,|;]', project_scale)
    if not floor_area:
        floor_area = ""
    else:
        floor_area = floor_area.replace(':', '').replace(':', '')
    return floor_area


def get_construction_area(project_scale):
    construction_area = search('[总]*\S*建筑[面|面积]*[约|为]*(.*?)[。|,|,|;]', project_scale)
    if not construction_area:
        construction_area = ""
    else:
        construction_area = construction_area.replace(':', '').replace(':', '')
    return construction_area


def remove_htmldata(remove_info_list: list, html: str, response):
    """
    Filter invalid data out of a detail page.
    Args:
        remove_info_list: xpath expressions, regexes or plain text to remove -> list [xpath, re, str]
            e.g. ['<re>data:image/(.*?)"',]
        html: text to clean
        response: the original response object
    Returns: the cleaned text
    """
    if html and remove_info_list:
        for extra_item in remove_info_list:
            if re.search('^//.*', extra_item):
                extra_html_list = response.xpath(extra_item).extract()
                for extra_html in extra_html_list:
                    if extra_html:
                        html = html.replace(extra_html, '')
            elif re.search('^<re>.*', extra_item):
                extra_item = extra_item.replace('<re>', '')
                extra_html_list = re.findall(f'{extra_item}', html, re.S | re.I | re.M)
                if extra_html_list:
                    for exhtml in extra_html_list:
                        html = html.replace(exhtml, '')
            else:
                extra_html = extra_item
                if extra_html:
                    html = html.replace(extra_html, '')
    return html
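
# Usage sketch (values are hypothetical): entries starting with '//' are applied as
# xpath against `response`, entries starting with '<re>' as regexes, and anything else
# is removed as literal text:
#   html = remove_htmldata(
#       ['//div[@class="ad"]', '<re>data:image/(.*?)"', '免责声明'],
#       html, response)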


def extract_file_type(file_name="附件名", file_url="附件地址", file_type_list=[]):
    """
    Extract the attachment type.
    Args:
        file_name: attachment name
        file_url: attachment URL
        file_type_list: extra attachment suffixes
    Returns: the attachment type
    """
    if file_name and file_url:
        file_name = file_name.strip()
        file_types = ['zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'hzzbs',
                      'jpg', 'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg']
        if file_type_list:
            ftp_list = list(map(lambda x: x.lower(), file_type_list))
            file_types.extend(ftp_list)
        file_type = file_url.split('?')[0].split('.')[-1].lower()
        if file_type not in file_types:
            file_type = file_url.split('?')[-1].split('.')[-1].lower()
            if file_type in file_types:
                return file_type
            else:
                for ftp in file_types:
                    file_type = re.search(ftp, file_name) or re.search("\." + ftp, file_url)
                    if file_type:
                        return file_type.group(0).replace('.', '')
        else:
            return file_type
    return None
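
# Usage sketch (URLs are hypothetical):
#   extract_file_type('评标报告.pdf', 'http://example.com/files/report.pdf')  # -> 'pdf'
#   extract_file_type('附件', 'http://example.com/download?id=1')             # -> None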