import hashlib
import re
import socket

import requests
from loguru import logger
def get_proxy():
    """Fetch a SOCKS5 proxy from the proxy pool and return it as a requests-style proxies dict."""
    headers = {
        "Authorization": "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
    }
    proxy = requests.get(
        "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch",
        headers=headers
    ).json()
    proxy = proxy.get("data")
    # The pool returns a single SOCKS5 endpoint; reuse it for both http and https traffic.
    # Use the socks5h scheme so hostname resolution happens on the proxy side.
    proxyh = {
        "http": proxy.get("http").replace("socks5", "socks5h"),
        "https": proxy.get("http").replace("socks5", "socks5h"),
    }
    logger.info("Switching proxy: {}".format(proxyh))
    return proxyh
def sha1(text: str):
    """
    Return the SHA-1 digest of a string as a hexadecimal string.
    @param text: input text
    @return: hex digest
    """
    _sha1 = hashlib.sha1()
    _sha1.update(text.encode("utf-8"))
    return _sha1.hexdigest()
def get_host_ip():
    """Return the local host's outbound IP address."""
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        # Connecting a UDP socket sends no packets; it only selects the outbound interface.
        s.connect(('8.8.8.8', 80))
        ip = s.getsockname()[0]
    finally:
        s.close()
    return ip
def check_crawl_title(title: str):
    """Return the first bidding/procurement keyword matched in the title, or None."""
    crawl_keywords = {
        '招标', '流标', '评标', '询价', '中标候选人', '抽签', '谈判', '中选', '意见征询',
        '更正公告', '废标', '补遗', '议价', '邀请', '资格预审', '竞标', '变更', '遴选',
        '磋商', '项目', '评审', '询比', '开标', '澄清', '比选', '中止', '采购', '竟价',
        '招投标', '拟建', '成交', '中标', '竞争性谈判', '工程', '验收公告', '更正',
        '单一来源', '变更公告', '合同', '违规', '评判', '监理', '竞价', '答疑',
        '终止', '系统'
    }
    for keyword in crawl_keywords:
        valid_keyword = re.search(keyword, title)
        if valid_keyword is not None:
            return valid_keyword
    # Only give up after every keyword has been tried.
    return None
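

# A minimal usage sketch, not part of the original module: it wires the helpers
# together the way a crawler entry point might, fetching a page through the proxy
# pool only when the title looks like a bidding notice. The sample URL and title
# below are hypothetical placeholders.
if __name__ == "__main__":
    sample_title = "XX项目招标公告"  # hypothetical title
    if check_crawl_title(sample_title):
        proxies = get_proxy()
        resp = requests.get("https://example.com/notice", proxies=proxies, timeout=30)
        # Deduplicate pages by the SHA-1 of their text content.
        fingerprint = sha1(resp.text)
        logger.info("host={} fingerprint={}".format(get_host_ip(), fingerprint))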