3
0

tools.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198
  1. import hashlib
  2. import json
  3. import re
  4. from collections import namedtuple
  5. import requests
  6. from setting import WECHAT_WARNING_URL,WECHAT_WARNING_PHONE,WARNING_INTERVAL,WECHAT_WARNING_ALL
  7. import bson
  8. from feapder.utils.log import log
  9. from feapder.db.mongodb import MongoDB
  10. from .cleaner import cleaner
  11. import sys
# Result container returned by text_search(); ``total`` holds the match count.
SearchText = namedtuple('SearchText', ['total'])
  13. def substitute(html_str,special=None, completely=False):
  14. """HTML 替换"""
  15. html_str = cleaner(html=html_str,special=None, completely=False)
  16. return html_str
  17. def get_signature(content: str) -> str:
  18. """
  19. 十六进制数字字符串形式摘要值
  20. @param content: 字符串文本
  21. @return: 摘要值
  22. """
  23. sha1 = hashlib.sha1()
  24. sha1.update(content.encode("utf-8"))
  25. return sha1.hexdigest()
  26. def text_search(content: str) -> SearchText:
  27. """
  28. 中文检索
  29. :param content: 文本
  30. :return: 中文数量
  31. """
  32. if not content:
  33. return SearchText(0)
  34. results = re.findall('[\u4e00-\u9fa5]', content, re.S)
  35. # 列表长度即是中文的字数
  36. return SearchText(len(results))
  37. def int2long(param: int):
  38. """int 转换成 long """
  39. return bson.int64.Int64(param)
  40. def get_spiders(menus):
  41. db = MongoDB(db="editor")
  42. for menu in menus:
  43. spider_info = db.find('luaconfig',{"code":menu.code})
  44. if len(spider_info) >0:
  45. if spider_info[0].get("state") not in (11,):
  46. menus.remove(menu)
  47. def wechat_warning(
  48. message,
  49. message_prefix=None,
  50. rate_limit=None,
  51. url=None,
  52. user_phone=None,
  53. all_users: bool = None,
  54. ):
  55. """企业微信报警"""
  56. # 为了加载最新的配置
  57. rate_limit = rate_limit if rate_limit is not None else WARNING_INTERVAL
  58. url = url or WECHAT_WARNING_URL
  59. user_phone = user_phone or WECHAT_WARNING_PHONE
  60. all_users = all_users if all_users is not None else WECHAT_WARNING_ALL
  61. if isinstance(user_phone, str):
  62. user_phone = [user_phone] if user_phone else []
  63. if all_users is True or not user_phone:
  64. user_phone = ["@all"]
  65. if not all([url, message]):
  66. return
  67. data = {
  68. "msgtype": "text",
  69. "text": {"content": message, "mentioned_mobile_list": user_phone},
  70. }
  71. headers = {"Content-Type": "application/json"}
  72. try:
  73. response = requests.post(
  74. url, headers=headers, data=json.dumps(data).encode("utf8")
  75. )
  76. result = response.json()
  77. response.close()
  78. if result.get("errcode") == 0:
  79. return True
  80. else:
  81. raise Exception(result.get("errmsg"))
  82. except Exception as e:
  83. log.error("报警发送失败。 报警内容 {}, error: {}".format(message, e))
  84. return False
  85. class JyBasicException(Exception):
  86. def __init__(self, code: int, reason: str, **kwargs):
  87. self.code = code
  88. self.reason = reason
  89. self.err_details = kwargs
  90. for key, val in kwargs.items():
  91. setattr(self, key, val)
  92. class CustomCheckError(JyBasicException):
  93. def __init__(self, code: int = 10002, reason: str = '特征条件检查异常', **kwargs):
  94. self.code = code
  95. self.reason = reason
  96. self.err_details = kwargs
  97. for key, val in kwargs.items():
  98. setattr(self, key, val)
  99. class HtmlEmptyError(JyBasicException):
  100. def __init__(self, code: int = 10002, reason: str = '正文获取异常,正文为空', **kwargs):
  101. self.code = code
  102. self.reason = reason
  103. self.err_details = kwargs
  104. for key, val in kwargs.items():
  105. setattr(self, key, val)
  106. class CheckPrePareRequest:
  107. def __init__(self):
  108. self.crawl_keywords = {
  109. '招标', '流标', '评标', '询价', '中标候选人', '抽签', '谈判', '中选', '意见征询',
  110. '更正公告', '废标', '补遗', '议价', '邀请', '资格预审', '竞标', '变更', '遴选',
  111. '磋商', '项目', '评审', '询比', '开标', '澄清', '比选', '中止', '采购', '竟价',
  112. '招投标', '拟建', '成交', '中标', '竞争性谈判', '工程', '验收公告', '更正',
  113. '单一来源', '变更公告', '合同', '违规', '评判', '监理', '竞价', '答疑',
  114. '终止', '系统'
  115. }
  116. def check_crawl_title(self, title: str):
  117. for keyword in self.crawl_keywords:
  118. valid_keyword = re.search(keyword, title)
  119. if valid_keyword is not None:
  120. break
  121. else:
  122. # raise CustomCheckError(code=10106, reason='标题未检索到采集关键词', title=title)
  123. return 10106,'标题未检索到采集关键词'
  124. return 200,'ok'
  125. def __check(self, rows: dict):
  126. title, publish_time = rows['title'], rows['l_np_publishtime']
  127. self.check_crawl_title(title)
  128. def __call__(self, rows: dict, *args, **kwargs):
  129. self.__check(rows)
  130. def get_proxy():
  131. headers = {
  132. "Authorization": "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
  133. }
  134. proxy = requests.get("http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch", headers=headers).json()
  135. print(f"切换代理:{proxy.get('data')}")
  136. return proxy.get("data").get("http")
  137. import json
  138. class Obj(object):
  139. def __init__(self, dict_):
  140. self.__dict__.update(dict_)
  141. def get_argvs():
  142. argvs = {"next_page":False,"max_page":10}
  143. for item in sys.argv[1:]:
  144. print(item)
  145. if item.startswith("--"):
  146. argvs[item.replace("--", "").split('=')[0]] = int(item.split('=')[-1])
  147. return json.loads(json.dumps(argvs), object_hook=Obj)
  148. def search(pattern, string):
  149. result = re.search(pattern, string)
  150. if result:
  151. return result.groups()[0]
  152. def search_construction(string):
  153. result = re.search('pattern', string)
  154. if result:
  155. return result.groups()[0]
  156. def search_floor(string):
  157. result = re.search('pattern', string)
  158. if result:
  159. return result.groups()[0]