
千里马招标网 - list-page collection crawler

dongzhaorui 3 years ago
commit 56a0673f4e

+ 0 - 0
qlm/config/__init__.py


+ 39 - 0
qlm/config/conf.yaml

@@ -0,0 +1,39 @@
+# mongo
+mongo:
+  host: 172.17.4.87
+  port: !!int 27080
+#  host: 127.0.0.1
+#  port: !!int 27017
+
+
+# redis
+redis:
+  host: 127.0.0.1
+  port: !!int 8379
+  pwd: ""
+  db: !!int 0
+
+
+# Aliyun OSS
+ali_oss:
+  key_id: LTAI4G5x9aoZx8dDamQ7vfZi
+  key_secret: Bk98FsbPYXcJe72n1bG3Ssf73acuNh
+#  endpoint: oss-cn-beijing.aliyuncs.com    # public endpoint
+  endpoint: oss-cn-beijing-internal.aliyuncs.com    # internal (VPC) endpoint
+  bucket_name: jy-datafile
+
+
+es:
+  host: 172.17.145.170
+#  host: 127.0.0.1
+#  host: 192.168.3.206
+  port: !!int 9800
+  db: biddingall
+
+
+# proxy
+proxy:
+  socks5:
+    url: http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch
+    auth:
+      Authorization: Basic amlhbnl1MDAxOjEyM3F3ZSFB
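Note: the `!!int` tags force the ports to load as integers; a plain scalar like `27080` already parses as an int, so the tag mainly guards against quoted values. A quick sanity check of how `load.py` below reads this file:

    import yaml

    with open('qlm/config/conf.yaml', encoding='utf-8') as f:
        conf = yaml.safe_load(f)
    assert conf['mongo']['port'] == 27080
    assert conf['proxy']['socks5']['auth']['Authorization'].startswith('Basic')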

+ 2 - 0
qlm/config/constants.yaml

@@ -0,0 +1,2 @@
+headers:
+  User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36

+ 36 - 0
qlm/config/load.py

@@ -0,0 +1,36 @@
+from pathlib import Path
+
+import yaml
+
+__all__ = [
+    'mongo_conf',
+    'redis_conf',
+    'oss_conf',
+    'jy_proxy',
+    'es_conf',
+    'headers',
+    'analyze_url',
+    'node_module_path'
+]
+
+_base_path = Path(__file__).parent
+_yaml_conf = (_base_path / 'conf.yaml').resolve()
+_yaml_constants = (_base_path / 'constants.yaml').resolve()
+_node_modules = (_base_path.parent / 'node_modules').resolve()
+
+with open(_yaml_conf, encoding="utf-8") as f:
+    conf = yaml.safe_load(f)
+    mongo_conf = conf['mongo']
+    redis_conf = conf['redis']
+    oss_conf: dict = conf['ali_oss']
+    es_conf: dict = conf['es']
+    jy_proxy: dict = conf['proxy']
+
+
+with open(_yaml_constants, encoding="utf-8") as fp:
+    constants = yaml.safe_load(fp)
+    headers: dict = constants['headers']
+
+# derived from the es section of conf.yaml, not from constants.yaml
+analyze_url = f'http://{es_conf["host"]}:{es_conf["port"]}/{es_conf["db"]}/_analyze'
+
+
+node_module_path = _node_modules
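Given the conf.yaml values above, the composed `analyze_url` points at the `_analyze` endpoint of the `biddingall` index:

    from config.load import analyze_url

    print(analyze_url)
    # http://172.17.145.170:9800/biddingall/_analyze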

+ 99 - 0
qlm/source_qianlima.py

@@ -0,0 +1,99 @@
+# coding: utf-8
+import time
+
+import requests
+
+from utils.databases import mongo_table, redis_client
+from utils.log import logger
+from utils.tools import sha1
+
+qlm = mongo_table('qlm', 'qlm_2021')
+r = redis_client()
+redis_key = "qianlima_2021"
+
+'''
+Search API query parameters:
+# areas        region code
+# currentPage  page number
+# numPerPage   items per page
+# types        announcement category:
+#   (no value) 全部 (all)
+#   0          公告 (notice)
+#   1          预告 (pre-announcement)
+#   2          变更 (change)
+#   3          中标 (award)
+#   5          其他 (other)
+'''
+PROXIES = None
+
+
+def crawl_request(url, headers):
+    """
+    公共方法,get获取url 解析json 数据
+
+    :param url: 访问的url
+    :param headers: 携带参数url
+    :return:
+    """
+    while True:
+        try:
+            # NOTE: requests timeouts are in seconds; the original
+            # timeout=5000 presumably meant milliseconds
+            get_html = requests.get(url, headers=headers, timeout=5)
+            # adopt the detected character encoding
+            get_html.encoding = get_html.apparent_encoding
+            logger.info(get_html.status_code)
+            if get_html.status_code in [403, 404, 400, 502, 302]:
+                time.sleep(3)  # back off instead of spinning
+                continue
+            elif get_html.status_code in [200]:
+                return get_html
+        except requests.exceptions.ConnectTimeout:
+            logger.error("Reacquire proxy")
+        except requests.RequestException:
+            time.sleep(3)
+            continue
+
+
+def crawl_spider(area, _type, i):
+    headers = {
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+        "Accept-Encoding": "gzip, deflate",
+        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
+        "Cache-Control": "no-cache",
+        "Connection": "keep-alive",
+        "DNT": "1",
+        "Host": "search.qianlima.com",
+        "Pragma": "no-cache",
+        "Upgrade-Insecure-Requests": "1",
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.68",
+        "Cookie": 'UM_distinctid=178af0c6f6f2f3-0e81be36d60604-7166786d-144000-178af0c6f70294; BAIDU_SSP_lcr=https://cn.bing.com/; guest_id=ac5769d7-b906-499d-ab85-47809ee9bc56; gr_user_id=d2cc35f6-ffa2-441b-a9ff-f836345e6f75; Hm_lvt_5dc1b78c0ab996bd6536c3a37f9ceda7=1617844534; seo_refUrl=https%3A//cn.bing.com/; seo_curUrl=www.qianlima.com; qlm_referrer=https://cn.bing.com/; delClose200811=firstGoIn; __jsluid_h=5f702d3c66f33654fc8d1f109062bb23; __jsl_clearance=1617844553.848|0|oACHKEqjLj1O5rc480L59DWlTO4%3D; CNZZDATA1277608403=736687752-1617840159-http%253A%252F%252Fsearch.qianlima.com%252F%7C1617840159; nts_login_tip=1; fromWhereUrl="http://www.qianlima.com/mfzb/"; Hm_lpvt_5dc1b78c0ab996bd6536c3a37f9ceda7=1617844615'
+    }
+    url = "http://search.qianlima.com/api/v1/website/search?filtermode=1&timeType=-1&areas={}&types={}&isfirst=false&searchMode=2&keywords=%20&currentPage={}&numPerPage=1000"
+    list_url = url.format(area, _type, i)
+    req = crawl_request(list_url, headers)
+    info_list = req.json()["data"]["data"]
+    item_list = []
+    for info in info_list:
+        tmid = sha1(str(info["contentid"]))
+        if r.hget(redis_key, tmid) is None:
+            r.hset(redis_key, tmid, str(info["contentid"]))
+            if "popTitle" in info:
+                info["title"] = info["popTitle"]
+            else:
+                info["title"] = info["showTitle"]
+            item_list.append(info)
+    if item_list:
+        qlm.insert_many(item_list)
+    logger.info("{}--{}抓取第{}页数据,共{}条".format(area, _type, i, len(item_list)))
+
+
+def start():
+    # iterate over region codes
+    for area in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 11, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]:
+        # iterate over announcement categories
+        for _type in [0, 1, 2, 3, 5]:
+            # iterate over pages (only page 1 for now)
+            # for i in range(1, 11):
+            crawl_spider(area, _type, 1)
+
+
+if __name__ == '__main__':
+    start()
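For illustration, the list request for area 1, category 0, page 1 is just the template with those values substituted, and each record is deduplicated through the `qianlima_2021` Redis hash keyed by the sha1 of its `contentid`. A minimal sketch reusing `r`, `redis_key`, and `sha1` from the module above (the record is hypothetical):

    url = ("http://search.qianlima.com/api/v1/website/search?filtermode=1"
           "&timeType=-1&areas={}&types={}&isfirst=false&searchMode=2"
           "&keywords=%20&currentPage={}&numPerPage=1000")
    list_url = url.format(1, 0, 1)   # areas=1, types=0, currentPage=1

    info = {"contentid": 123456, "showTitle": "..."}   # hypothetical record
    tmid = sha1(str(info["contentid"]))
    if r.hget(redis_key, tmid) is None:   # first time this record is seen
        r.hset(redis_key, tmid, str(info["contentid"]))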

+ 0 - 0
qlm/utils/__init__.py


+ 23 - 0
qlm/utils/aliyun.py

@@ -0,0 +1,23 @@
+import oss2
+
+from config.load import oss_conf
+
+
+class AliYunService:
+
+    def __init__(self):
+        self.__acc_key_id = oss_conf['key_id']
+        self.__acc_key_secret = oss_conf['key_secret']
+        self.__endpoint = oss_conf['endpoint']
+        self.__bucket_name = oss_conf['bucket_name']
+
+    def _push_oss_from_local(self, key, filename):
+        """
+        上传一个本地文件到OSS的普通文件
+
+        :param str key: 上传到OSS的文件名
+        :param str filename: 本地文件名,需要有可读权限
+        """
+        auth = oss2.Auth(self.__acc_key_id, self.__acc_key_secret)
+        bucket = oss2.Bucket(auth, self.__endpoint, self.__bucket_name)
+        bucket.put_object_from_file(key, filename)
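A minimal usage sketch, assuming the credentials in conf.yaml are valid and `file/report.pdf` is a hypothetical local file:

    svc = AliYunService()
    # uploads the local file to bucket jy-datafile under the key 'abc123.pdf'
    svc._push_oss_from_local('abc123.pdf', 'file/report.pdf')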

+ 153 - 0
qlm/utils/attachment.py

@@ -0,0 +1,153 @@
+import os
+import traceback
+import uuid
+
+import requests
+import urllib3
+
+from config.load import headers
+from utils.aliyun import AliYunService
+# explicit imports instead of the original wildcard, so the module's real
+# dependencies (os, requests, logger arrived via `import *`) are visible
+from utils.clean_file import (
+    clean_file_name, getsize, judge_file_url, limit_file_size,
+    modify_file_url_list, remove, req_keywords, sha1,
+)
+from utils.execptions import AttachmentNullError
+from utils.log import logger
+from utils.socks5 import Proxy
+
+urllib3.disable_warnings()
+
+
+class AttachmentDownloader(AliYunService):
+
+    def __init__(self):
+        super(AttachmentDownloader, self).__init__()
+        self.dir_name = 'file'
+
+    def _create_file(self, filename, filetype):
+        os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
+        file = "{filename}.{filetype}".format(
+            filename=sha1("{}_{}".format(filename, uuid.uuid4())),
+            filetype=filetype
+        )
+        return "{}/{}".format(self.dir_name, file)
+
+    @staticmethod
+    def _create_fid(file_stream: bytes):
+        return sha1(file_stream)
+
+    @staticmethod
+    def _origin_filename(fid: str, filetype: str):
+        return "{}.{}".format(fid, filetype)
+
+    @staticmethod
+    def _file_size(file: str):
+        _kb = float(getsize(file)) / 1024
+        if _kb >= 1024:
+            _M = _kb / 1024
+            if _M >= 1024:
+                _G = _M / 1024
+                return "{:.1f} G".format(_G)
+            else:
+                return "{:.1f} M".format(_M)
+        else:
+            return "{:.1f} kb".format(_kb)
+
+    @staticmethod
+    def _download(
+            url: str,
+            file: str,
+            enable_proxy=False,
+            allow_show_exception=False,
+            **kwargs
+    ):
+        request_params = {}
+        request_params.setdefault('headers', kwargs.get('headers') or headers)
+        request_params.setdefault('proxies', kwargs.get('proxies'))
+        request_params.setdefault('timeout', kwargs.get('timeout') or 60)
+        request_params.setdefault('stream', kwargs.get('stream', True))
+        request_params.setdefault('verify', kwargs.get('verify', False))
+        proxy = Proxy(enable_proxy)
+        retries = 0
+        while retries < 3:
+            try:
+                with requests.get(url, **request_params) as req:
+                    # some sites answer a plain GET with a guard page; if a
+                    # guard keyword is present, retry the download as a POST
+                    if any(kw in req.text for kw in req_keywords):
+                        resp = requests.post(url, **request_params)
+                    else:
+                        resp = req
+                    if resp.status_code == 200:
+                        stream = resp.content
+                        with open(file, 'wb') as f:
+                            f.write(stream)
+                        return stream
+                    retries += 1
+            except requests.RequestException:
+                if allow_show_exception:
+                    traceback.print_exc()
+                if enable_proxy:
+                    proxy.switch()
+                    request_params.update({'proxies': proxy.proxies})
+                retries += 1
+        return b''
+
+    def download(
+            self,
+            file_name: str,
+            file_type: str,
+            download_url: str,
+            enable_proxy=False,
+            allow_request_exception=False,
+            **kwargs
+    ):
+        if not file_name or not file_type or not download_url:
+            raise AttachmentNullError
+
+        file_type = file_type.strip()
+        file_name = clean_file_name(file_name, file_type)
+        download_url = judge_file_url(download_url)
+        for app_param in modify_file_url_list:
+            download_url = app_param(download_url)
+
+        local_tmp_file = self._create_file(file_name, file_type)
+
+        file_stream = self._download(
+            download_url,
+            local_tmp_file,
+            enable_proxy,
+            allow_request_exception,
+            **kwargs
+        )
+        result = {
+            'filename': '{}.{}'.format(file_name, file_type),
+            'org_url': download_url
+        }
+        if len(file_stream) > 0:
+            try:
+                fid = self._create_fid(file_stream)
+                key = self._origin_filename(fid, file_type)
+                result.setdefault('fid', key)
+                result.setdefault('ftype', file_type)
+                result.setdefault('size', self._file_size(local_tmp_file))
+                result.setdefault('url', 'oss')
+                super()._push_oss_from_local(key, local_tmp_file)
+            except Exception as e:
+                logger.warning(
+                    "[{}] download error: {}".format(file_name, e.__class__.__name__)
+                )
+        remove(local_tmp_file)
+        # always return the attachment info, whether the transfer succeeded
+        # or failed; only files under the size limit yield {}
+        if "size" not in result:
+            return result
+        elif limit_file_size(result.get('size')):
+            return result
+        else:
+            return {}
+
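`download()` always returns a dict describing the attachment: on a successful upload it carries the OSS key and size, on failure only `filename` and `org_url` remain, and files under the 5 KB limit collapse to `{}`. A sketch of the success shape (values hypothetical):

    downloader = AttachmentDownloader()
    result = downloader.download('招标文件', 'pdf', 'http://example.com/a.pdf')
    # {
    #     'filename': '招标文件.pdf',
    #     'org_url': 'http://example.com/a.pdf',
    #     'fid': '<sha1 of content>.pdf',
    #     'ftype': 'pdf',
    #     'size': '1.2 M',
    #     'url': 'oss',
    # }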

+ 243 - 0
qlm/utils/clean_file.py

@@ -0,0 +1,243 @@
+import hashlib
+import os
+import re
+from urllib.parse import urlparse, unquote
+
+import requests
+
+from utils.log import logger
+
+# document file types
+DOCTYPE = {
+    'txt', 'rtf', 'dps', 'et', 'ett', 'xls',
+    'xlsx', 'xlsb', 'xlsm', 'xlt', 'ods', 'pmd', 'pmdx',
+    'doc', 'docm', 'docx', 'dot', 'dotm', 'dotx',
+    'odt', 'wps', 'csv', 'xml', 'xps'
+}
+# archive types
+COMPRESSION_TYPE = {
+    'rar', 'zip', 'gzzb', '7z', 'tar', 'gz', 'bz2', 'jar', 'iso', 'cab',
+    'arj', 'lzh', 'ace', 'uue', 'edxz',
+}
+# image types
+IMAGE_TYPE = {
+    'jpg', 'png', 'jpeg', 'tiff', 'gif', 'psd', 'raw', 'eps', 'svg', 'bmp',
+    'pdf'
+}
+# other types
+OTHER_TYPE = {
+    'swf', 'nxzf', 'xezf', 'nxcf'
+}
+
+
+def sha1(val):
+    _sha1 = hashlib.sha1()
+    if isinstance(val, bytes):
+        # hash the raw bytes directly; wrapping them in str() would hash
+        # the "b'...'" repr instead of the content
+        _sha1.update(val)
+    elif isinstance(val, str):
+        _sha1.update(val.encode("utf-8"))
+    return _sha1.hexdigest()
+
+
+def remove(file_path: str):
+    os.remove(file_path)
+
+
+def getsize(file):
+    try:
+        return os.path.getsize(file)
+    except FileNotFoundError:
+        return 0
+
+
+def discern_file_format(text):
+    text = text.strip()
+    file_types = {
+        *DOCTYPE,
+        *COMPRESSION_TYPE,
+        *IMAGE_TYPE,
+        *OTHER_TYPE
+    }
+    for file_type in file_types:
+        for t in (file_type, file_type.upper()):
+            if re.match(f'.*{t}$', text, re.S) is not None:
+                return t
+    unknown_type = re.findall('[^.\\/:*?"<>|\r\n]+$', text, re.S)
+    logger.warning(f'[attachment type detection] unrecognized file type {unknown_type}')
+    return None
+
+
+def extract_file_type(text):
+    if text is None:
+        return None
+    return discern_file_format(text)
+
+
+def extract_file_name_by_href(href: str, file_type: str):
+    """从url中抽取文件名称"""
+    # 中文标点符号:[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]
+    # 中文字符:[\u4e00 -\u9fa5]
+    zh_char_pattern = '[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+'
+    parser = urlparse(href)
+    query = (parser.query or parser.path)
+    result = re.search(f'.*\\.{file_type}', query, re.S)
+    if result is not None:
+        encode_str = unquote(result.group())
+        name = re.search(zh_char_pattern, encode_str)
+        if name is not None:
+            return unquote(name.group())
+    return None
+
+
+def extract_file_name(text):
+    file_type = discern_file_format(text)
+    if file_type is not None:
+        repl = '.{}'.format(file_type)
+        text = text.replace(repl, '')
+    return text
+
+
+def verify_file_name(name):
+    if extract_file_type(name) is None:
+        raise ValueError
+
+
+# strip whitespace and a duplicated extension from the attachment name
+def clean_file_name(file_name: str, file_type: str):
+    file_name = file_name.strip()
+    if f'.{file_type}' in file_name:
+        file_name = file_name.replace(f'.{file_type}', '')
+    return file_name
+
+
+# size limit: attachments under 5 KB are not stored in the database
+def limit_file_size(file_size: str):
+    # file_size is a human-readable string, e.g. "3.4 M", "12.0 kb", "1.1 G"
+    number = "".join(re.findall(r'\d+\.?\d*', file_size))
+    if not number:
+        return False
+    size_kb = float(number)
+    if "m" in file_size.lower():
+        size_kb *= 1024
+    elif "g" in file_size.lower():
+        size_kb *= 1024 * 1024
+    return size_kb >= 5
+
+
+# sanitize the attachment url (keep only the part before any space)
+def judge_file_url(file_url: str):
+    file_url = file_url.strip()
+    if " " in file_url:
+        file_url = file_url.split(" ")[0]
+    return file_url
+
+
+# urls that need a second request: rewrite the download action and append
+# the appUrlFlag content parameter
+def add_appUrlFlag_param(file_url):
+    if "appUrlFlag" in file_url and "downloadztbattach" in file_url and "attachGuid" in file_url:
+        file_url = file_url.replace('downloadztbattach', 'ztbAttachDownloadAction.action') + "&cmd=getContent"
+    return file_url
+
+
+
+# attachment downloads that require a captcha
+session = requests.session()
+
+headers = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
+}
+
+# captcha-solving API
+def get_code(file_path: str) -> dict:
+    upload_address = "http://123.57.163.80:2119/v1/images/verify"
+    with open(file_path, 'rb') as f:
+        image_bytes = f.read()
+    content = {'file': image_bytes}
+    headers = {'accept': 'application/json'}
+    response = session.post(upload_address, headers=headers, files=content, stream=True)
+    return response.json()
+
+
+# fetch the captcha image and have it solved
+def get_dealcode(img_url):
+    res = session.get(img_url, headers=headers)
+    img_path = 'image'
+    if not os.path.exists(img_path):
+        os.mkdir(img_path)
+    with open(img_path + '/zgzbycgw.jpg', 'wb') as f:
+        f.write(res.content)
+    res = get_code(img_path + '/zgzbycgw.jpg')
+    if res.get("msg") == "success":
+        img_code = res.get("r").get("code")
+    else:
+        img_code = None
+    return img_code
+
+
+# Tianjin Municipal Government Procurement site (ccgp-tianjin.gov.cn); disabled, kept for reference
+# def tjzfcgw_file_yzm(file_url):
+#     img_url = 'http://www.ccgp-tianjin.gov.cn/commons/image.jsp'
+#     session.get(file_url, headers=headers, verify=False)
+#
+#     # download endpoint
+#     file_url_yzm = "http://www.ccgp-tianjin.gov.cn/portal/documentView.do"
+#
+#     Yzm_result = get_dealcode(img_url).replace("=", "").replace("?", "")
+#     if "x" in Yzm_result:
+#         Yzm_result = Yzm_result.replace("x", "*")
+#     try:
+#         yzm = eval(Yzm_result)
+#     except:
+#         yzm = ""
+#
+#     params_yzm = {
+#         "imageString": f"{yzm}",
+#         "method": "downNewFiles"
+#     }
+#
+#     file_result = session.get(file_url_yzm, headers=headers, params=params_yzm, verify=False)
+#
+#     req_count = 1
+#     while "请输入验证码" in file_result.text:
+#         if req_count >= 10:
+#             break
+#         Yzm_result = get_dealcode(img_url).replace("=", "").replace("?", "")
+#         if "x" in Yzm_result:
+#             Yzm_result = Yzm_result.replace("x", "*")
+#         try:
+#             yzm = eval(Yzm_result)
+#         except:
+#             yzm = ""
+#
+#         params_yzm = {
+#             "imageString": f"{yzm}",
+#             "method": "downNewFiles"
+#         }
+#
+#         file_result = session.get(file_url_yzm, headers=headers, params=params_yzm, verify=False)
+#         # the site rate-limits requests, hence the long sleep
+#         time.sleep(random.randint(10,20))
+#         req_count += 1
+#
+#     return file_result.content
+
+
+# keywords marking downloads that need a captcha (disabled)
+# yzm_keywords = ['method=downEnId']
+
+# captcha-protected download handlers (disabled)
+# site_list_yzm = [tjzfcgw_file_yzm]
+
+# response keywords indicating the request method must be switched to POST;
+# the string is guard-page text matched verbatim, so it stays in Chinese
+req_keywords = ['请求类型防御']
+
+# handlers that rewrite attachment urls before downloading
+modify_file_url_list = [add_appUrlFlag_param]
+
+
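Two quick worked examples for the helpers above (the url is hypothetical):

    limit_file_size('3.4 M')    # 3.4 * 1024 KB >= 5 KB  -> True
    limit_file_size('2.0 kb')   # 2.0 KB < 5 KB          -> False

    u = 'http://example.gov.cn/downloadztbattach?appUrlFlag=1&attachGuid=abc'
    add_appUrlFlag_param(u)
    # 'http://example.gov.cn/ztbAttachDownloadAction.action?appUrlFlag=1&attachGuid=abc&cmd=getContent'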

+ 109 - 0
qlm/utils/databases.py

@@ -0,0 +1,109 @@
+import bson
+import pymongo
+import redis
+import requests
+from elasticsearch import Elasticsearch
+
+from config.load import mongo_conf, redis_conf, es_conf, analyze_url
+
+
+# ---------------------------------- mongo ----------------------------------
+def mongo_client(cfg=None):
+    if cfg is None:
+        cfg = mongo_conf
+    return pymongo.MongoClient(host=cfg['host'], port=cfg['port'])
+
+
+def mongo_database(db: str):
+    client = mongo_client()
+    return client[db]
+
+
+def mongo_table(db: str, coll: str):
+    client = mongo_client()
+    return client[db][coll]
+
+
+def int2long(param: int):
+    """Convert int to long (BSON Int64)"""
+    return bson.int64.Int64(param)
+
+
+def object_id(_id: str):
+    return bson.objectid.ObjectId(_id)
+
+
+# ---------------------------------- es ----------------------------------
+def es_client(cfg=None):
+    if cfg is None:
+        cfg = es_conf
+    return Elasticsearch([{"host": cfg['host'], "port": cfg['port']}])
+
+
+def es_participles_service(text: str):
+    """
+    Tokenize text via the ES _analyze endpoint
+
+    :param text: text to tokenize
+    :return: list of tokens (pure-ASCII alphabetic tokens are dropped)
+    """
+    result = []
+    params = {"text": text, "analyzer": "ik_smart"}
+    res = requests.get(analyze_url, params=params, timeout=60)
+    if res.status_code == 200:
+        tokens = res.json().get('tokens', [])
+        for x in tokens:
+            if x["token"].encode('utf-8').isalpha():
+                continue
+            result.append(x["token"])
+    return result
+
+
+def es_query(title: str, publish_time: int):
+    """
+    Query ES for near-duplicate documents
+
+    :param title: document title
+    :param publish_time: publish timestamp
+    :return: hit count
+    """
+    client = es_client()
+    stime = publish_time - 432000  # 5 days earlier
+    etime = publish_time + 432000  # 5 days later
+    conditions = []
+    participles = es_participles_service(title)
+    for word in participles:
+        conditions.append({
+            "multi_match": {
+                "query": word,
+                "type": "phrase",
+                "fields": ["title"]
+            }
+        })
+    conditions.append({
+        "range": {"publishtime": {"from": stime, "to": etime}}
+    })
+    query = {
+        "query": {
+            "bool": {
+                "must": conditions,
+                "minimum_should_match": 1
+            }
+        }
+    }
+    result = client.search(index='bidding', body=query, request_timeout=100)
+    count = len(result['hits']['hits'])
+    return count
+
+
+# ---------------------------------- redis ----------------------------------
+def redis_client(cfg=None):
+    if cfg is None:
+        cfg = redis_conf
+    pool = redis.ConnectionPool(
+        host=cfg['host'],
+        port=cfg['port'],
+        password=cfg['pwd'],
+        db=cfg['db'],
+        # decode_responses must be set on the pool; passing it to Redis()
+        # alongside an explicit connection_pool has no effect
+        decode_responses=True
+    )
+    return redis.Redis(connection_pool=pool)
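Together these helpers implement a fuzzy duplicate check: tokenize the title via the index's `_analyze` endpoint, then count documents whose title contains every token within a ±5-day publish window. A sketch (title, timestamp, and token output hypothetical):

    tokens = es_participles_service('设备采购招标公告')
    # e.g. ['设备', '采购', '招标', '公告']
    if es_query('设备采购招标公告', 1617844615) > 0:
        pass  # a near-duplicate already exists in ES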

+ 35 - 0
qlm/utils/execptions.py

@@ -0,0 +1,35 @@
+
+class JyBasicException(Exception):
+
+    def __init__(self, code: int, reason: str, **kwargs):
+        super().__init__(reason)
+        self.code = code
+        self.reason = reason
+        self.err_details = kwargs
+        for key, val in kwargs.items():
+            setattr(self, key, val)
+
+
+class CustomCheckError(JyBasicException):
+
+    def __init__(self, code: int = 10002, reason: str = 'feature-condition check error', **kwargs):
+        super().__init__(code, reason, **kwargs)
+
+
+class AttachmentNullError(JyBasicException):
+
+    def __init__(self, code: int = 10004, reason: str = 'attachment download error', **kwargs):
+        super().__init__(code, reason, **kwargs)
+
+
+class CustomAccountPrivilegeError(JyBasicException):
+
+    def __init__(self, *args, **kwargs):
+        pass
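Because the base class stores `code`, `reason`, and any keyword details as attributes, callers can log structured errors; a brief sketch (the `url` detail is hypothetical):

    try:
        raise AttachmentNullError(url='http://example.com/a.pdf')
    except JyBasicException as e:
        print(e.code, e.reason, e.err_details)
        # 10004 attachment download error {'url': 'http://example.com/a.pdf'}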

+ 14 - 0
qlm/utils/log.py

@@ -0,0 +1,14 @@
+from pathlib import Path
+
+from loguru import logger
+
+_absolute = Path(__file__).absolute().parent.parent
+_log_path = (_absolute / 'logs/crawl-{time:YYYY-MM-DD}.log').resolve()
+logger.add(
+    _log_path,
+    format='{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}',
+    level='INFO',
+    rotation='00:00',
+    retention='1 week',
+    encoding='utf-8',
+)
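Importing `logger` anywhere in the project logs both to stderr and to a dated file under `qlm/logs/`, rotated at midnight and kept for one week (timestamp illustrative):

    from utils.log import logger

    logger.info('spider started')
    # logs/crawl-2021-04-08.log: 2021-04-08 10:00:00 - INFO - spider started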

+ 44 - 0
qlm/utils/socks5.py

@@ -0,0 +1,44 @@
+import threading
+
+import requests
+
+from config.load import jy_proxy
+from utils.log import logger
+
+__all__ = ['Proxy']
+
+
+class Socks5Proxy:
+
+    def __init__(self):
+        self._lock = threading.RLock()
+        self._enable_proxy = False
+        self._url = jy_proxy['socks5']['url']
+        self._auth = jy_proxy['socks5']['auth']
+        self._proxies = None
+
+    @property
+    def proxies(self):
+        return self._proxies
+
+    def switch(self):
+        with self._lock:
+            if self._enable_proxy:
+                self._proxies = self._fetch_proxies()
+
+    def _fetch_proxies(self):
+        try:
+            resp = requests.get(self._url, headers=self._auth, timeout=10)
+            return resp.json().get("data")
+        except Exception:
+            # the bare return in the original ``finally`` swallowed every
+            # error; a failed fetch simply yields no proxies
+            return None
+
+    def __call__(self, enable_proxy: bool = False, *args, **kwargs):
+        self._enable_proxy = enable_proxy
+        if self._enable_proxy:
+            logger.info("[socks5代理 - 开启]")
+            self._proxies = self._fetch_proxies()
+        return self
+
+
+Proxy = Socks5Proxy()
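`Proxy` is a module-level singleton; calling it with `True` turns fetching on. A minimal sketch matching how `AttachmentDownloader._download` drives it (the target url is hypothetical):

    import requests

    proxy = Proxy(True)          # enable and fetch an initial proxy dict
    requests.get('http://example.com', proxies=proxy.proxies, timeout=10)
    proxy.switch()               # rotate to a fresh proxy on failure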

+ 24 - 0
qlm/utils/tools.py

@@ -0,0 +1,24 @@
+import hashlib
+import socket
+
+
+def sha1(text: str):
+    """
+    Hex string digest of the SHA-1 hash
+
+    @param text: input string
+    @return: hex digest
+    """
+    _sha1 = hashlib.sha1()
+    _sha1.update(text.encode("utf-8"))
+    return _sha1.hexdigest()
+
+
+def get_host_ip():
+    # a UDP "connect" sends no packets; it only binds the socket to the
+    # interface that would route to 8.8.8.8, revealing the local IP
+    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+    try:
+        s.connect(('8.8.8.8', 80))
+        ip = s.getsockname()[0]
+    finally:
+        s.close()
+    return ip
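For reference, `sha1` against the standard test vector:

    sha1('abc')
    # 'a9993e364706816aba3e25717850c26c9cd0d89d'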