소스 검색

更新项目配置

dongzhaorui 1 년 전
부모
커밋
19a0bdb4f9
6개의 변경된 파일, 0개의 추가 그리고 492개의 삭제
  1. 0 22
      qlm/config/conf.yaml
  2. 0 7
      qlm/config/load.py
  3. 0 23
      qlm/utils/aliyun.py
  4. 0 153
      qlm/utils/attachment.py
  5. 0 243
      qlm/utils/clean_file.py
  6. 0 44
      qlm/utils/socks5.py

+ 0 - 22
qlm/config/conf.yaml

@@ -13,25 +13,3 @@ redis:
 #  port: !!int 6379
 #  pwd: ""
   db: !!int 3
-
-
-ali_oss:
-  key_id: LTAI4G5x9aoZx8dDamQ7vfZi
-  key_secret: Bk98FsbPYXcJe72n1bG3Ssf73acuNh
-  endpoint: oss-cn-beijing-internal.aliyuncs.com
-#  endpoint: oss-cn-beijing.aliyuncs.com
-  bucket_name: jy-datafile
-
-
-es:
-  host: 172.17.145.178
-  port: !!int 9800
-  db: biddingall # es库别名
-
-
-# 代理
-proxy:
-  socks5:
-    url: http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch
-    auth:
-      Authorization: Basic amlhbnl1MDAxOjEyM3F3ZSFB

+ 0 - 7
qlm/config/load.py

@@ -5,9 +5,6 @@ import yaml
 __all__ = [
     'mongo_conf',
     'redis_conf',
-    'oss_conf',
-    'jy_proxy',
-    'es_conf',
     'headers',
     'node_module_path'
 ]
@@ -21,10 +18,6 @@ with open(_yaml_conf, encoding="utf-8") as f:
     conf = yaml.safe_load(f)
     mongo_conf = conf['mongo']
     redis_conf = conf['redis']
-    oss_conf: dict = conf['ali_oss']
-    es_conf: dict = conf['es']
-    jy_proxy: dict = conf['proxy']
-
 
 with open(_yaml_constants, encoding="utf-8") as fp:
     constants = yaml.safe_load(fp)

+ 0 - 23
qlm/utils/aliyun.py

@@ -1,23 +0,0 @@
-import oss2
-
-from config.load import oss_conf
-
-
-class AliYunService:
-    """Thin wrapper that uploads local files to the OSS bucket named in ``oss_conf``."""
-
-    def __init__(self):
-        # Read the four connection settings from the loaded configuration.
-        key_id, key_secret, endpoint, bucket_name = (
-            oss_conf[field]
-            for field in ('key_id', 'key_secret', 'endpoint', 'bucket_name')
-        )
-        self.__acc_key_id = key_id
-        self.__acc_key_secret = key_secret
-        self.__endpoint = endpoint
-        self.__bucket_name = bucket_name
-
-    def _push_oss_from_local(self, key, filename):
-        """
-        Upload a local file to OSS as a regular object.
-
-        :param str key: object name the file is stored under in OSS
-        :param str filename: path of the local file; it must be readable
-        """
-        credentials = oss2.Auth(self.__acc_key_id, self.__acc_key_secret)
-        oss2.Bucket(
-            credentials, self.__endpoint, self.__bucket_name
-        ).put_object_from_file(key, filename)

+ 0 - 153
qlm/utils/attachment.py

@@ -1,153 +0,0 @@
-import traceback
-import uuid
-
-import urllib3
-
-from config.load import headers
-from utils.aliyun import AliYunService
-from utils.clean_file import *
-from utils.execptions import AttachmentNullError
-from utils.socks5 import Proxy
-
-urllib3.disable_warnings()
-
-
-class AttachmentDownloader(AliYunService):
-    """Download an attachment to a temporary local file and push it to OSS."""
-
-    def __init__(self):
-        super(AttachmentDownloader, self).__init__()
-        # Directory (relative to the working directory) for temporary files.
-        self.dir_name = 'file'
-
-    def _create_file(self, filename, filetype):
-        """Return a unique temporary path ``<dir_name>/<sha1>.<filetype>``.
-
-        The directory is created on demand. A random UUID is mixed into the
-        hashed name so two downloads of the same attachment never collide.
-        """
-        os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
-        file = "{filename}.{filetype}".format(
-            filename=sha1("{}_{}".format(filename, uuid.uuid4())),
-            filetype=filetype
-        )
-        return "{}/{}".format(self.dir_name, file)
-
-    @staticmethod
-    def _create_fid(file_stream: bytes):
-        """Return the content-derived id (``sha1`` helper) of the downloaded bytes."""
-        return sha1(file_stream)
-
-    @staticmethod
-    def _origin_filename(fid: str, filetype: str):
-        """Return the OSS object name ``<fid>.<filetype>``."""
-        return "{}.{}".format(fid, filetype)
-
-    @staticmethod
-    def _file_size(file: str):
-        """Return the size of ``file`` as text: ``'x.x kb'``, ``'x.x M'`` or ``'x.x G'``."""
-        _kb = float(getsize(file)) / 1024
-        if _kb >= 1024:
-            _M = _kb / 1024
-            if _M >= 1024:
-                _G = _M / 1024
-                return "{:.1f} G".format(_G)
-            else:
-                return "{:.1f} M".format(_M)
-        else:
-            return "{:.1f} kb".format(_kb)
-
-    @staticmethod
-    def _save_if_ok(response, file: str):
-        """Write the body of a 200 response to ``file`` and return it, else ``None``."""
-        if response.status_code != 200:
-            return None
-        stream = response.content
-        with open(file, 'wb') as f:
-            f.write(stream)
-        return stream
-
-    @staticmethod
-    def _download(
-            url: str,
-            file: str,
-            enable_proxy=False,
-            allow_show_exception=False,
-            **kwargs
-    ):
-        """Fetch ``url`` into ``file`` and return the downloaded bytes.
-
-        The URL is requested with GET. When the reply contains one of
-        ``req_keywords`` the same URL is requested again with POST. Up to
-        three failed attempts are made; ``b''`` is returned when all fail.
-
-        :param url: address to download
-        :param file: local path the body is written to
-        :param enable_proxy: switch to a fresh proxy after a request error
-        :param allow_show_exception: print the traceback of request errors
-        :param kwargs: optional ``headers``, ``proxies``, ``timeout``,
-            ``stream`` and ``verify`` overrides for ``requests``
-        """
-        # ``stream`` defaults to True but an explicit False is now honoured
-        # (the previous ``... or True`` made it impossible to disable).
-        stream_flag = kwargs.get('stream')
-        request_params = {
-            'headers': kwargs.get('headers') or headers,
-            'proxies': kwargs.get('proxies'),
-            'timeout': kwargs.get('timeout') or 60,
-            'stream': True if stream_flag is None else stream_flag,
-            'verify': kwargs.get('verify') or False,
-        }
-        # NOTE(review): the proxies fetched here are only applied after the
-        # first request error; confirm whether the first attempt should
-        # already go through the proxy.
-        proxy = Proxy(enable_proxy)
-        retries = 0
-        while retries < 3:
-            try:
-                with requests.get(url, **request_params) as resp:
-                    needs_post = any(kw in resp.text for kw in req_keywords)
-                    stream = (
-                        None if needs_post
-                        else AttachmentDownloader._save_if_ok(resp, file)
-                    )
-                if needs_post:
-                    with requests.post(url, **request_params) as resp:
-                        stream = AttachmentDownloader._save_if_ok(resp, file)
-                if stream is not None:
-                    return stream
-                # Exactly one retry is consumed per failed attempt (a failed
-                # POST used to be counted twice).
-                retries += 1
-            except requests.RequestException:
-                if allow_show_exception:
-                    traceback.print_exc()
-                if enable_proxy:
-                    proxy.switch()
-                    request_params.update({'proxies': proxy.proxies})
-                retries += 1
-        return b''
-
-    def download(
-            self,
-            file_name: str,
-            file_type: str,
-            download_url: str,
-            enable_proxy=False,
-            allow_request_exception=False,
-            **kwargs
-    ):
-        """Download one attachment, upload it to OSS and describe the result.
-
-        :param file_name: attachment name (with or without its extension)
-        :param file_type: attachment extension
-        :param download_url: address of the attachment
-        :param enable_proxy: forwarded to ``_download``
-        :param allow_request_exception: print request tracebacks
-        :param kwargs: request overrides forwarded to ``_download``
-        :return: dict with ``filename`` and ``org_url``; after a successful
-            download also ``fid``, ``ftype``, ``size`` and ``url``. An empty
-            dict is returned when the file is below the size limit.
-        :raises AttachmentNullError: when name, type or URL is empty
-        """
-        if not file_name or not file_type or not download_url:
-            raise AttachmentNullError
-
-        file_type = file_type.strip()
-        file_name = clean_file_name(file_name, file_type)
-        download_url = judge_file_url(download_url)
-
-        for app_param in modify_file_url_list:
-            download_url = app_param(download_url)
-
-        local_tmp_file = self._create_file(file_name, file_type)
-
-        file_stream = self._download(
-            download_url,
-            local_tmp_file,
-            enable_proxy,
-            allow_request_exception,
-            **kwargs
-        )
-        result = {
-            'filename': '{}.{}'.format(file_name, file_type),
-            'org_url': download_url
-        }
-        if len(file_stream) > 0:
-            try:
-                fid = self._create_fid(file_stream)
-                key = self._origin_filename(fid, file_type)
-                result.setdefault('fid', key)
-                result.setdefault('ftype', file_type)
-                result.setdefault('size', self._file_size(local_tmp_file))
-                result.setdefault('url', 'oss')
-                super()._push_oss_from_local(key, local_tmp_file)
-            except Exception as e:
-                logger.warning(
-                    "[{}]下载异常,原因:{}".format(file_name, e.__class__.__name__)
-                )
-        try:
-            remove(local_tmp_file)
-        except FileNotFoundError:
-            # A failed download never creates the temporary file; that must
-            # not prevent the attachment info from being returned.
-            pass
-        # Attachment info must be returned whether upload/download succeeded
-        # or failed.
-        if "size" not in result:
-            return result
-        elif limit_file_size(result.get('size')):
-            return result
-        else:
-            return {}
-

+ 0 - 243
qlm/utils/clean_file.py

@@ -1,243 +0,0 @@
-import hashlib
-import os
-import re
-from urllib.parse import urlparse, unquote
-
-import requests
-
-from utils.log import logger
-
-# Document file types
-DOCTYPE = {
-    'txt', 'rtf', 'dps', 'et', 'ett', 'xls',
-    'xlsx', 'xlsb', 'xlsm', 'xlt', 'ods', 'pmd', 'pmdx',
-    'doc', 'docm', 'docx', 'dot', 'dotm', 'dotx',
-    'odt', 'wps', 'csv', 'xml', 'xps'
-}
-# Archive / compressed file types
-COMPRESSION_TYPE = {
-    'rar', 'zip', 'gzzb', '7z', 'tar', 'gz', 'bz2', 'jar', 'iso', 'cab',
-    'arj', 'lzh', 'ace', 'uue', 'edxz',
-}
-# Image file types (note: 'pdf' is grouped here as well)
-IMAGE_TYPE = {
-    'jpg', 'png', 'jpeg', 'tiff', 'gif', 'psd', 'raw', 'eps', 'svg', 'bmp',
-    'pdf'
-}
-# Other file types
-OTHER_TYPE = {
-    'swf', 'nxzf', 'xezf', 'nxcf'
-}
-
-
-def sha1(val):
-    """Return the hexadecimal SHA-1 digest of ``val``.
-
-    ``str`` input is hashed as UTF-8. ``bytes`` input is hashed through its
-    ``str()`` representation (the ``b'...'`` literal text), not the raw
-    bytes. Any other type feeds nothing into the hash, so the digest of
-    empty input is returned.
-    """
-    if isinstance(val, bytes):
-        payload = str(val).encode("utf-8")
-    elif isinstance(val, str):
-        payload = val.encode("utf-8")
-    else:
-        payload = b""
-    return hashlib.sha1(payload).hexdigest()
-
-
-def remove(file_path: str):
-    """Delete the file at ``file_path``; errors from the OS propagate."""
-    os.unlink(file_path)
-
-
-def getsize(file):
-    """Return the size of ``file`` in bytes, or 0 when it does not exist."""
-    try:
-        info = os.stat(file)
-    except FileNotFoundError:
-        return 0
-    return info.st_size
-
-
-def discern_file_format(text):
-    """Return the known file type that ``text`` ends with, or ``None``.
-
-    Surrounding whitespace is ignored. Each known type is tried in its
-    lower-case and then upper-case spelling, and the spelling that matched
-    is returned. When nothing matches, a warning naming the trailing
-    segment of ``text`` is logged.
-    """
-    stripped = text.strip()
-    known_types = DOCTYPE | COMPRESSION_TYPE | IMAGE_TYPE | OTHER_TYPE
-    for lowered in known_types:
-        for candidate in (lowered, lowered.upper()):
-            if stripped.endswith(candidate):
-                return candidate
-    unknown_type = re.findall('[^.\\/:*?"<>|\r\n]+$', stripped, re.S)
-    logger.warning(f'[附件类型识别]未定义的文件类型{unknown_type}')
-    return None
-
-
-def extract_file_type(text):
-    """Return the file type detected in ``text``; ``None`` input yields ``None``."""
-    return None if text is None else discern_file_format(text)
-
-
-def extract_file_name_by_href(href: str, file_type: str):
-    """Extract a Chinese file name from a URL.
-
-    The query string (or, when it is empty, the path) is searched for text
-    ending in ``.<file_type>``. That text is URL-decoded and its first run
-    of Chinese characters / Chinese punctuation is returned. ``None`` is
-    returned when either step finds nothing.
-    """
-    # Chinese punctuation: [\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]
-    # Chinese characters: [\u4e00-\u9fa5]
-    zh_char_pattern = '[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+'
-    parts = urlparse(href)
-    haystack = parts.query or parts.path
-    matched = re.search(f'.*\\.{file_type}', haystack, re.S)
-    if matched is None:
-        return None
-    decoded = unquote(matched.group())
-    name = re.search(zh_char_pattern, decoded)
-    return None if name is None else unquote(name.group())
-
-
-def extract_file_name(text):
-    """Return ``text`` with every ``.<type>`` occurrence of its detected type removed.
-
-    ``text`` is returned unchanged when no known type is detected.
-    """
-    detected = discern_file_format(text)
-    if detected is None:
-        return text
-    return text.replace(f'.{detected}', '')
-
-
-def verify_file_name(name):
-    """Raise ``ValueError`` when no known file type can be detected in ``name``."""
-    detected = extract_file_type(name)
-    if detected is None:
-        raise ValueError
-
-
-# Strip whitespace around an attachment name and drop a duplicated extension
-def clean_file_name(file_name:str,file_type:str):
-    """Return ``file_name`` stripped, with every ``.<file_type>`` occurrence removed."""
-    # ``replace`` is a no-op when the suffix is absent, so no guard is needed.
-    return file_name.strip().replace(f'.{file_type}', '')
-
-
-# Size limit: attachments smaller than 5 kb are not stored in the database
-def limit_file_size(file_size:str):
-    """Return ``True`` when ``file_size`` describes at least 5 kb.
-
-    :param file_size: size text such as ``'12.3 kb'``, ``'1.5 M'`` or
-        ``'2.0 G'`` (the unit letter is matched case-insensitively)
-    :return: ``False`` for sizes below 5 kb, ``True`` otherwise
-    :raises ValueError: when ``file_size`` contains no number
-    """
-    matched = re.search(r'\d+(?:\.\d+)?', file_size)
-    if matched is None:
-        raise ValueError(f'no numeric size found in {file_size!r}')
-    size_kb = float(matched.group())
-    unit = file_size.upper()
-    # Convert to kb before comparing. "G" sizes used to fall through to the
-    # kb branch, so e.g. '1.0 G' was wrongly rejected as smaller than 5 kb.
-    if 'G' in unit:
-        size_kb *= 1000 * 1000
-    elif 'M' in unit:
-        size_kb *= 1000
-    return size_kb >= 5
-
-
-# Sanitise an attachment URL
-def judge_file_url(file_url:str):
-    """Return ``file_url`` stripped and cut at its first inner space, if any."""
-    # Splitting a string without spaces yields the whole string as item 0.
-    return file_url.strip().split(" ")[0]
-
-
-# Downloads that need a second request: rewrite appUrlFlag attachment URLs
-def add_appUrlFlag_param(file_url):
-    """Rewrite a ``downloadztbattach`` URL into its content-download form.
-
-    Only URLs containing ``appUrlFlag``, ``downloadztbattach`` and
-    ``attachGuid`` are rewritten; any other URL is returned unchanged.
-    """
-    markers = ("appUrlFlag", "downloadztbattach", "attachGuid")
-    if not all(marker in file_url for marker in markers):
-        return file_url
-    rewritten = file_url.replace('downloadztbattach', 'ztbAttachDownloadAction.action')
-    return rewritten + "&cmd=getContent"
-
-
-
-# Attachment downloads that require a captcha
-session = requests.session()
-
-headers = {
-    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
-}
-
-# Captcha-solving service endpoint
-def get_code(file_path: str) -> dict:
-    """Upload the image at ``file_path`` to the captcha service and return its JSON reply."""
-    upload_address = "http://pycaptcha.spdata.jianyu360.com/v1/images/verify"
-    with open(file_path, 'rb') as image:
-        payload = {'file': image.read()}
-    response = session.post(
-        upload_address,
-        headers={'accept': 'application/json'},
-        files=payload,
-        stream=True,
-    )
-    return response.json()
-
-
-# Recognise a captcha image
-def get_dealcode(img_url):
-    """Download the captcha at ``img_url`` and return the recognised code.
-
-    The image is saved as ``image/zgzbycgw.jpg`` and sent to ``get_code``.
-    ``None`` is returned unless the service replies with ``msg == "success"``.
-    """
-    image_response = session.get(img_url, headers=headers)
-    img_path = 'image'
-    if not os.path.exists(img_path):
-        os.mkdir(img_path)
-    image_file = img_path + '/zgzbycgw.jpg'
-    with open(image_file, 'wb') as f:
-        f.write(image_response.content)
-    verdict = get_code(image_file)
-    if verdict.get("msg") != "success":
-        return None
-    return verdict.get("r").get("code")
-
-
-# 天津市政府采购网
-# def tjzfcgw_file_yzm(file_url):
-#     img_url = 'http://www.ccgp-tianjin.gov.cn/commons/image.jsp'
-#     session.get(file_url, headers=headers, verify=False)
-#
-#     # 下载地址
-#     file_url_yzm = "http://www.ccgp-tianjin.gov.cn/portal/documentView.do"
-#
-#     Yzm_result = get_dealcode(img_url).replace("=", "").replace("?", "")
-#     if "x" in Yzm_result:
-#         Yzm_result = Yzm_result.replace("x", "*")
-#     try:
-#         yzm = eval(Yzm_result)
-#     except:
-#         yzm = ""
-#
-#     params_yzm = {
-#         "imageString": f"{yzm}",
-#         "method": "downNewFiles"
-#     }
-#
-#     file_result = session.get(file_url_yzm, headers=headers, params=params_yzm, verify=False)
-#
-#     req_count = 1
-#     while "请输入验证码" in file_result.text:
-#         if req_count >= 10:
-#             break
-#         Yzm_result = get_dealcode(img_url).replace("=", "").replace("?", "")
-#         if "x" in Yzm_result:
-#             Yzm_result = Yzm_result.replace("x", "*")
-#         try:
-#             yzm = eval(Yzm_result)
-#         except:
-#             yzm = ""
-#
-#         params_yzm = {
-#             "imageString": f"{yzm}",
-#             "method": "downNewFiles"
-#         }
-#
-#         file_result = session.get(file_url_yzm, headers=headers, params=params_yzm, verify=False)
-#         # 站点限制 访问频率 ,故休眠时间较大
-#         time.sleep(random.randint(10,20))
-#         req_count += 1
-#
-#     return file_result.content
-
-
-# 判断 附件下载 是否需要 验证码
-# yzm_keywords = ['method=downEnId']
-
-# 附件下载 需要 验证码 的方法
-# site_list_yzm = [tjzfcgw_file_yzm]
-
-# Response-body markers showing that the download must be repeated with a
-# different request method
-
-
-req_keywords = ['请求类型防御']
-
-# Functions applied in turn to an attachment URL to rewrite it before download
-modify_file_url_list = [add_appUrlFlag_param]
-
-

+ 0 - 44
qlm/utils/socks5.py

@@ -1,44 +0,0 @@
-import threading
-
-import requests
-
-from config.load import jy_proxy
-from utils.log import logger
-
-__all__ = ['Proxy']
-
-
-class Socks5Proxy:
-    """Holder for proxy settings fetched from the configured SOCKS5 proxy service.
-
-    Calling an instance (``Proxy(True)``) switches proxy use on or off and
-    returns that same instance, so state is shared by every caller.
-    """
-
-    def __init__(self):
-        # Guards proxy replacement in ``switch``.
-        self._lock = threading.RLock()
-        self._enable_proxy = False
-        self._url = jy_proxy['socks5']['url']
-        # Sent as the request headers when fetching a proxy.
-        self._auth = jy_proxy['socks5']['auth']
-        self._proxies = None
-
-    @property
-    def proxies(self):
-        """Most recently fetched proxies, or ``None`` when none were fetched."""
-        return self._proxies
-
-    def switch(self):
-        """Replace the current proxies with freshly fetched ones if enabled."""
-        with self._lock:
-            if self._enable_proxy:
-                self._proxies = self._fetch_proxies()
-
-    def _fetch_proxies(self):
-        """Ask the proxy service for proxies; return its ``data`` field.
-
-        Best effort: a failed request or an unusable reply is logged and
-        yields ``None``. Unlike the former ``return`` inside ``finally``,
-        ``KeyboardInterrupt`` and ``SystemExit`` are no longer swallowed.
-        """
-        try:
-            payload = requests.get(self._url, headers=self._auth, timeout=10).json()
-        except Exception as e:
-            logger.warning(
-                "[socks5代理 - 获取失败]原因:{}".format(e.__class__.__name__)
-            )
-            return None
-        if not isinstance(payload, dict):
-            # A non-object JSON reply has no ``data`` field to read.
-            return None
-        return payload.get("data")
-
-    def __call__(self, enable_proxy: bool = False, *args, **kwargs):
-        """Enable or disable proxy use; fetch proxies at once when enabling."""
-        self._enable_proxy = enable_proxy
-        if self._enable_proxy:
-            logger.info("[socks5代理 - 开启]")
-            self._proxies = self._fetch_proxies()
-        return self
-
-
-Proxy = Socks5Proxy()