# tools.py

import hashlib
import re
import socket

import requests
from loguru import logger


def get_proxy():
    """Fetch a SOCKS5 proxy from the proxy service and return a requests-style proxies dict."""
    headers = {
        "Authorization": "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
    }
    proxy = requests.get("http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch", headers=headers).json()
    proxy = proxy.get("data")
    proxyh = {}
    # socks5h routes DNS resolution through the proxy as well
    proxyh["http"] = proxy.get("http").replace("socks5", "socks5h")
    proxyh["https"] = proxy.get("http").replace("socks5", "socks5h")
    logger.info("Switching proxy: {}".format(proxyh))
    return proxyh


def sha1(text: str):
    """
    Return the SHA-1 digest as a hexadecimal string.
    @param text: input string
    @return: digest value
    """
    _sha1 = hashlib.sha1()
    _sha1.update(text.encode("utf-8"))
    return _sha1.hexdigest()


def get_host_ip():
    """Return the local outbound IP address via a UDP socket aimed at a public DNS server."""
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        # connect() on a UDP socket sends no packets; it only selects the outbound interface
        s.connect(('8.8.8.8', 80))
        ip = s.getsockname()[0]
    finally:
        s.close()
    return ip


def check_crawl_title(title: str):
    """Return a regex match object if the title contains any crawl keyword, otherwise None."""
    crawl_keywords = {
        '招标', '流标', '评标', '询价', '中标候选人', '抽签', '谈判', '中选', '意见征询',
        '更正公告', '废标', '补遗', '议价', '邀请', '资格预审', '竞标', '变更', '遴选',
        '磋商', '项目', '评审', '询比', '开标', '澄清', '比选', '中止', '采购', '竟价',
        '招投标', '拟建', '成交', '中标', '竞争性谈判', '工程', '验收公告', '更正',
        '单一来源', '变更公告', '合同', '违规', '评判', '监理', '竞价', '答疑',
        '终止', '系统'
    }
    for keyword in crawl_keywords:
        valid_keyword = re.search(keyword, title)
        if valid_keyword is not None:
            return valid_keyword
    # No keyword matched anywhere in the title
    return None
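

# Minimal usage sketch (illustrative only): the sample title below is an assumption, and
# get_proxy() is left commented out because calling it requires access to the internal
# proxy API used above.
if __name__ == "__main__":
    print(get_host_ip())                         # local outbound IP of this machine
    print(sha1("某某项目招标公告"))                 # hex digest of a sample title
    print(check_crawl_title("某某项目招标公告"))    # match object if a crawl keyword is present, else None
    # proxies = get_proxy()                      # uncomment when the proxy service is reachable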