浏览代码

浙移集成

dzr 8 月之前
父节点
当前提交
013ab9cebc
共有 7 个文件被更改,包括 578 次插入0 次删除
  1. 二进制
      zyjc/rgg/.DS_Store
  2. 8 0
      zyjc/rgg/__init__.py
  3. 109 0
      zyjc/rgg/account.py
  4. 二进制
      zyjc/rgg/backup/bloomfilter.f
  5. 177 0
      zyjc/rgg/clean_html.py
  6. 14 0
      zyjc/rgg/log.py
  7. 270 0
      zyjc/rgg/net.py

二进制
zyjc/rgg/.DS_Store


+ 8 - 0
zyjc/rgg/__init__.py

@@ -0,0 +1,8 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-10-17 
+---------
+@summary:  
+---------
+
+"""

+ 109 - 0
zyjc/rgg/account.py

@@ -0,0 +1,109 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-10-10 
+---------
+@summary:  
+---------
+
+"""
+
+import json
+from pathlib import Path
+
+from DrissionPage import ChromiumPage, ChromiumOptions
+
+account_pool = [
+    ('DUDUDU101613', 'Admin330022'),
+]
+
+
def auto_login(username, password, headless=False, proxy=False, auto_quit=False):
    """Log in to vip.qianlima.com with a real Chrome session and dump the
    captured cookies/headers/proxies to ``account/<username>.json``.

    :param username: account login name
    :param password: account password
    :param headless: run Chrome headless with a desktop UA when True
    :param proxy: route traffic through the hard-coded socks5 proxy when True
    :param auto_quit: when False, block on console input before closing so the
                      live session can be inspected manually
    """
    co = ChromiumOptions()

    # One debug port / profile dir per account so parallel sessions don't collide.
    co.auto_port(tmp_path=f'./download/{username}')
    co.set_user_data_path(f'./chrome/{username}')

    # Disable the password-save prompt and other Chrome UI noise.
    co.set_argument('--disable-infobars')
    co.set_argument('--disable-extensions')
    co.set_argument('--disable-popup-blocking')

    if proxy:
        proxies = {
            'https': 'socks5://27.54.248.242:8860',
            'http': 'socks5://27.54.248.242:8860'
        }
        co.set_argument('--proxy-server', value=proxies['https'])
    else:
        proxies = None

    if headless:
        # Headless Chrome needs an explicit desktop UA to avoid bot detection.
        co.set_user_agent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36')
        co.set_argument('--headless', value='new')
        co.set_argument('--incognito')
        co.set_argument('--no-sandbox')
        co.set_argument('--disable-gpu')
        co.set_argument('--disable-dev-shm-usage')

    page = ChromiumPage(addr_or_opts=co)
    try:
        # API endpoints whose request headers we want to capture for replay.
        targets = [
            'website-seo/v2/cm/getcatid',
            'rest/detail/alltypesdetail/detail',
            'rest/account/companySpace/checkNewUser'
        ]
        page.listen.start(targets=targets, res_type=['Document', 'XHR'])  # start network capture

        success = page.get('https://vip.qianlima.com/')  # open the member portal
        if not success:
            return

        # If the login button is still visible we are not logged in yet.
        login = page.wait.ele_displayed('x://span[text()="登录"]', timeout=5)
        if login:
            page.ele('x://input[@name="username"]').input(username, clear=True)
            page.ele('x://input[@name="password"]').input(password, clear=True)
            page.ele('x://span[text()="登录"]/parent::*').click()

        # Wait until the page shows the username, i.e. login completed.
        loaded = page.wait.ele_displayed(f'x://p[contains(text(), "{username}")]')
        if not loaded:
            print(f'登录失败>{username}')
            return

        # page.get('http://www.qianlima.com/zb/detail/20241016_454396207.html')

        packet = page.listen.wait()
        root = Path(__file__).parent
        if not (root / 'account').exists():
            (root / 'account').mkdir(exist_ok=True)

        # Persist the session (cookies + request headers + proxy config) as JSON.
        file = (root / f'account/{username}.json').absolute()
        with open(file, 'w') as f:
            print(packet.url)  # print captured packet url
            # print(packet.response.body)
            headers = dict(packet.request.headers)
            print(f'** headers ** \n{json.dumps(headers, indent=4)}')
            cookies = page.cookies(as_dict=True)
            print(f'** cookies ** \n{json.dumps(cookies, indent=4)}')
            user = {
                'cookies': cookies,
                'headers': headers,
                'proxies': proxies
            }
            f.write(json.dumps(user, indent=4))
            if not auto_quit:
                # Keep the browser open until the operator confirms.
                f.flush()
                while True:
                    if input("退出>"):
                        break

    except KeyboardInterrupt:
        pass

    finally:
        page.quit()
        print('关闭浏览器')
+
+
# Capture a fresh session for every account in the pool (headless, via proxy).
if __name__ == '__main__':
    for username, password in account_pool:
        auto_login(username, password, proxy=True, auto_quit=True, headless=True)

二进制
zyjc/rgg/backup/bloomfilter.f


+ 177 - 0
zyjc/rgg/clean_html.py

@@ -0,0 +1,177 @@
+# -*- coding: utf-8 -*-
+import re
+
__all__ = ['cleaner']

# Standalone elements removed outright (head/meta/script/style/...); images become <br>.
'''独立元素'''
INDEPENDENT_TAGS = {
    '<head>[\s\S]*?</head>': '',
    '<html>|<html [^>]*>|</html>': '',
    '<body>|<body [^>]*>|</body>': '',
    '<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # metadata
    '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # whitespace entities
    '\\xa0|\\u3000': '',  # non-breaking / full-width spaces
    '<!--[\s\S]*?-->': '',  # comments
    '<style[^<>]*>[\s\S]*?</style>': '',  # styles
    '<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
    '<input>': '',  # input box
    '</input>': '',  # input box
    '<img[^>]*>': '<br>',  # images
}
# Inline elements: links/spans/fonts dropped, labels collapse to a line break.
'''行内元素'''
INLINE_TAGS = {
    '<a>|<a [^>]*>|</a>': '',  # hyperlinks
    '<link>|<link [^>]*>|</link>': '',  # link tags
    '<span>|<span [^>]*>|</span>': '',  # span
    '<label>|<label [^>]*>|</label>': '<br>',  # label
    '<font>|<font [^>]*>|</font>': '',  # font
    'data:image(.*?) ': '',  # base64-encoded images
}
# Block-level elements: opening <p>/<div> become <br>, closings and headings dropped.
'''块级元素'''
BLOCK_TAGS = {
    '<div>\s*?</div>': '',
    '<h[1-6][^>]*>|</h[1-6]>': '',  # headings
    '<p>|<p [^>]*>': '<br>',  # paragraph (open)
    '</p>': '',  # paragraph (close)
    '<div>|<div [^>]*>': '<br>',  # division (open)
    '</div>': '',  # division (close)
    '<o:p>|<o:p [^>]*>|</o:p>': ''  # MS Office WORD paragraph
}
# Site boilerplate fragments (close/print buttons, source credits, share links).
'''其他'''
OTHER = {
    '<?xml[^>]*>|<?xml [^>]*>|<?xml:.*?>': '',
    '<epointform>': '',
    '<!doctype html>|<!doctype html [^>]*>': '',
    '【关闭】|关闭': '',
    '【打印】|打印本页': '',
    '【字体:[\s\S]*】': '',
    '文章来源:[\u4e00-\u9fa5]+': '',
    '浏览次数:.*[<]+': '',
    '(责任编辑:.*?)': '',
    '分享到[:]': '',
}
# Presentation attributes stripped from the tags that survive.
'''样式'''
CSS_STYLE = {
    'style="[\s\S]*?"|style ="[\s\S]*?"': '',
    'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
    'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
    'class="[\s\S]*?"|class ="[\s\S]*?"': '',
    'align="[\s\S]*?"|align ="[\s\S]*?"': '',
    'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
}
# Whitespace normalization rules (applied together with the removal rules).
'''空白符'''
BLANKS = {
    '\n\s*\n': '\n',
    '\s*\n\s*': '\n',
    '[^\S\n]': ' ',
    '\s+': ' ',
}
# Tag/attribute names used by _repair_tag to fix glued "tagattr" sequences.
'''css标签集合'''
TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
'''css属性集合'''
ATTRS = {'id', 'class', 'style', 'width'}
# Pre-compiled patterns for tags that should be dropped when del_tag=True.
'''特殊样式的标签'''
SPECIAL_TAGS = {
    re.compile('(?i)<[^>]+style="display: none".*[^>]+>'): '<br>',
}
+
+
def _repair_tag():
    """Map glued "tagattr" sequences (e.g. 'divclass') to their fixed form
    ('div class') — used to patch non-standard pages before regex cleaning."""
    return {f'{tag}{attr}': f'{tag} {attr}' for tag in TAGS for attr in ATTRS}
+
+
+def _escape_character(html):
+    """转义字符"""
+    html = html.replace('&lt;', '<')
+    html = html.replace('&gt;', '>')
+    html = html.replace('&quot;', '"')
+    html = html.replace('&amp;', '&')
+    return html
+
+
def _lowercase_tag(html):
    """Lowercase every element tag in-place without touching the page text,
    then apply the glued tag/attribute repairs from ``_repair_tag``."""
    unique_tags = set(re.findall("<[^>]+>", html))

    if len(unique_tags) <= 10000:
        # Replace each distinct raw tag with its lowercase form.
        for raw in unique_tags:
            html = html.replace(raw, raw.lower())
    else:
        # Huge pages: let BeautifulSoup re-serialize (it lowercases tags itself).
        from bs4 import BeautifulSoup
        html = str(BeautifulSoup(html, 'lxml').body.next_element)

    for broken, fixed in _repair_tag().items():
        html = html.replace(broken, fixed)

    return html
+
+
def _clear_special_tag(html):
    """Apply every pre-compiled SPECIAL_TAGS substitution (e.g. drop
    ``display: none`` elements, replacing them with ``<br>``)."""
    for pattern, replacement in SPECIAL_TAGS.items():
        html = pattern.sub(replacement, html)
    return html
+
+
+def _clear_input_tag(html, display=False):
+    """提取value值,替换input标签"""
+    if not display:
+        html = html.replace('<input', '<input style="border-color: transparent;"')  # 不显示输入框边框
+
+    tag = re.compile(r'<input .*?>', re.S)
+    value = re.compile(r'value=["|\'](.*?)["|\']')
+
+    lst = re.findall(tag, html) or []
+    for ipt in lst:
+        val = re.findall(value, ipt)
+        if val and 'hidden' not in ipt and 'hide' not in ipt and 'display: none' not in ipt:
+            html = html.replace(ipt, val[0])
+    return html
+
+
def cleaner(html, special=None, completely=False, del_tag=False, **kwargs):
    """
    Clean raw page source down to readable markup.

    :param html: page source to clean
    :param special: extra {pattern: replacement} rules for this call only
    :param completely: also strip canvas/iframe and font-name pseudo tags
    :param del_tag: drop tags matched by SPECIAL_TAGS (e.g. display:none)
    :param kwargs: forwarded to ``_clear_input_tag`` (``display=...``)
    :return: cleaned page source
    """
    # BUG FIX: the original did ``OTHER.update(special)``, mutating the
    # module-level dict so per-call rules leaked into every later call.
    extra = dict(special) if special else {}
    remove_tags = {
        **INDEPENDENT_TAGS,
        **INLINE_TAGS,
        **BLOCK_TAGS,
        **OTHER,
        **extra,  # caller rules keep the same precedence slot as before
        **CSS_STYLE,
        **BLANKS,
    }

    html = _lowercase_tag(html)
    if del_tag:
        html = _clear_special_tag(html)

    for tag, repl in remove_tags.items():
        html = re.sub(tag, repl, html)

    if completely:
        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # canvas
        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # inline frames
        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)

    html = _escape_character(html)
    html = _clear_input_tag(html, **kwargs)
    return html

+ 14 - 0
zyjc/rgg/log.py

@@ -0,0 +1,14 @@
from pathlib import Path

from loguru import logger

# Log directory lives next to the package root: <project>/logs/.
_absolute = Path(__file__).absolute().parent.parent
_log_path = (_absolute / 'logs/log_{time:YYYYMMDD}.log').resolve()
# File sink configured at import time: daily rotation at midnight,
# entries kept for one week, INFO and above.
logger.add(
    _log_path,
    format='{time:YYYY-MM-DD HH:mm:ss} - {level} - {thread.name} - {name}:{function}:{line} - {message}',
    level='INFO',
    rotation='00:00',
    retention='1 week',
    encoding='utf-8',
)

+ 270 - 0
zyjc/rgg/net.py

@@ -0,0 +1,270 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-10-10 
+---------
+@summary:  千里马详情页专用下载器
+---------
+
+"""
+import copy
+import functools
+from datetime import datetime
+
+import execjs
+import requests
+
+from rgg.log import logger
+
+_cookies = {}
+_headers = {}
+_proxies = None
+
+
def _account_supervision(func):
    """Decorator guarding account-bound requests.

    Retries ``func`` up to three times on network errors and re-raises the
    last one if every attempt fails.  An ``AssertionError`` (raised by the
    wrapped functions on a non-200 status) is treated as a broken account:
    log it, alert the WeCom group, and return None.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        last_error = None
        for _ in range(3):
            try:
                return func(*args, **kwargs)
            except requests.exceptions.RequestException as exc:
                last_error = exc
            except AssertionError:
                logger.error('账号异常')
                send_wechat_warning('浙移集成|访问失败|账号异常')
                return None
        if last_error is not None:
            raise last_error

    return wrapper
+
+
def set_cookies(ck):
    """Install the cookie dict used by every request in this module."""
    global _cookies
    _cookies = ck
+
+
def set_headers(h):
    """Install the header dict used by every request in this module."""
    global _headers
    _headers = h
+
+
def set_proxies(p):
    """Install the proxy mapping used by every request in this module."""
    global _proxies
    _proxies = p
+
+
def get_proxies(scheme=None):
    """Return the configured proxy mapping, or one scheme's address.

    :param scheme: None -> the whole mapping; 'http'/'https' -> that entry
                   with any 'socks5://' prefix stripped
    :return: dict, str, or None when proxies are unset / the scheme is missing
    """
    if _proxies is None:
        return None
    if scheme is None:
        return _proxies
    entry = _proxies.get(scheme)
    if entry is None:
        # BUG FIX: previously ``None.replace(...)`` raised AttributeError
        # when the requested scheme was not configured.
        return None
    return entry.replace('socks5://', '')
+
+
+def _extract_cid(href):
+    script = '''
+    function extractCid(url) {
+        if(url.indexOf('/zb/detail') != -1){
+            var cidArr = url.split('_');
+            if (cidArr.length > 1) {
+                var cid = cidArr[1].replace('.html', '');
+                if (cid.indexOf('-') != -1) {
+                    cid = cid.split("-")[1];
+                }
+                return cid
+            }
+        }
+        
+        if (url.indexOf('-') != -1) {
+            t = url.lastIndexOf("-")
+            n = url.substring(t + 1)
+            cid = n.split(".html")[0]
+            return cid
+        }
+        
+    }
+    '''
+    ctx = execjs.compile(script)
+    result = ctx.call('extractCid', href)
+    return result
+
+
def _extract_referer(href, cid):
    """Build the vip detail-page url for ``cid`` (used as a Referer).

    Asks the seo endpoint for the announcement's category id, then maps it
    to the matching detail page name.  Pure-Python port of the embedded JS
    switch: catId 101/202 -> projectDetail.html, everything else (including
    201/301/601 and unknown ids) -> tenderDetail.html.

    :param href: original announcement url (sent as Referer to the seo api)
    :param cid: announcement id extracted from ``href``
    :return: detail page url string
    :raises AssertionError: on a non-200 response (handled by the supervisor)
    """
    href = str(href).replace('http:', 'https:')

    url = 'https://www.qianlima.com/website-seo/v2/cm/getcatid/' + cid
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,sq;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Referer': href,
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    response = requests.get(url, timeout=10, headers=headers, cookies=_cookies, proxies=_proxies)
    assert response.status_code == 200

    cat_id = response.json().get('data')
    page_name = 'projectDetail.html' if cat_id in (202, 101) else 'tenderDetail.html'
    return 'https://detail.vip.qianlima.com/' + page_name + '?id=' + cid
+
+
def _download_detail(href, referer=False, timeout=10):
    """Fetch the detail payload for one announcement url.

    :param href: source page url (the cid is parsed out of it)
    :param referer: when True, resolve and attach a Referer header
    :param timeout: request timeout in seconds
    :return: the response's 'data' payload (falsy when the download failed)
    :raises ValueError: when no cid can be parsed from ``href``
    :raises AssertionError: on a non-200 response (caught by the supervisor)
    """
    cid = _extract_cid(href)
    if not cid:
        raise ValueError('cid is not exist')

    headers = copy.deepcopy(_headers)
    if referer:
        headers['Referer'] = _extract_referer(href, cid)

    response = requests.post(
        'https://detail.vip.qianlima.com/rest/detail/alltypesdetail/detail/' + cid,
        headers=headers,
        cookies=_cookies,
        proxies=_proxies,
        timeout=timeout,
    )
    assert response.status_code == 200

    payload = response.json()
    data = payload['data']
    if not data:
        logger.warning(f'下载异常|{payload}')
        return data

    logger.info(f'下载成功|{href}')
    return data
+
+
@_account_supervision
def download_html(href, **kwargs):
    """Download one announcement and return its html content (None on failure)."""
    data = _download_detail(href, **kwargs)
    return data['content'] if data else None
+
+
@_account_supervision
def download_json(href, **kwargs):
    """Download one announcement and return the raw json payload.

    Returns False when the detail download failed outright, and None when
    the announcement has been withdrawn (site code 700053).
    """
    result = _download_detail(href, timeout=30, **kwargs)
    if result is None:
        return False

    # Withdrawn notice: {"code":700053,"msg":"该条信息已被撤销,请重新检索","data":null}
    if 'code' in result and result['code'] == 700053:
        logger.warning(f'检索失败|{result}')
        return None

    return result
+
+
@_account_supervision
def download_list(keywords, page, page_size, **kwargs):
    """Search the vip solr endpoint for announcements matching ``keywords``.

    :param keywords: title search terms
    :param page: 1-based page number
    :param page_size: rows per page
    :param kwargs: optional ``begin_time`` / ``end_time`` ('YYYY-MM-DD');
                   both default to today
    :return: list of row dicts, or None on failure / empty data
    :raises AssertionError: on a non-200 response (handled by the supervisor)
    """
    today = datetime.now().strftime('%Y-%m-%d')
    begin_time = kwargs.get('begin_time') or today
    end_time = kwargs.get('end_time') or today

    url = 'https://search.vip.qianlima.com/rest/service/website/search/solr'
    data = {
        "keywords": keywords,  # title search terms
        "timeType": 4,  # custom time range
        "beginTime": begin_time,
        "endTime": end_time,
        "filtermode": 2,
        "searchMode": 0,
        "currentPage": page,  # page number
        "numPerPage": page_size,  # max rows per page
        "sortType": 6,
        "allType": -1,
        "noticeSegmentTypeStr": "",
        "beginAmount": "",
        "endAmount": "",
        "purchasingUnitIdList": "",
        "threeClassifyTagStr": "",
        "fourLevelCategoryIdListStr": "",
        "threeLevelCategoryIdListStr": "",
        "levelId": "",
        "tab": 0,
        "searchDataType": 0,
        "types": "-1",
        "showContent": 1,
        "hasTenderTransferProject": 1,
        "newAreas": "",
        "hasChooseSortType": 1,
        "summaryType": 0
    }
    response = requests.post(
        url,
        cookies=_cookies,
        headers=_headers,
        json=data,
        timeout=60,
        proxies=_proxies
    )
    assert response.status_code == 200
    result = response.json()

    try:
        result['data']['rowCount']
    # BUG FIX: a response without a 'data' key raised KeyError, which the
    # supervisor decorator does not catch and therefore escaped to callers.
    except (TypeError, KeyError):
        logger.error(f'下载失败|{keywords}|第{page}页|{result}')
        return None

    lst = result['data']
    if not lst:
        logger.warning(f'数据异常|{keywords}|第{page}页|{result}')
        return None

    logger.debug(f'下载成功|{keywords}|第{page}页')
    return result['data']['data']
+
+
def send_wechat_warning(msg, send=True):
    """Push an account-failure alert to the WeCom group webhook.

    :param msg: short failure description embedded in the markdown body
    :param send: when False, only log the message instead of posting it
    """
    markdown = f'千里马会员账号采集异常,请相关同事注意。'
    markdown += f'\n>异常详情:<font color=\"warning\">**{msg}**</font>'

    if not send:
        logger.info(markdown)
        return

    # NOTE(review): the webhook key is hard-coded here — consider moving it
    # to configuration/environment so it can be rotated without a code change.
    url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=bf53d666-bfa7-4176-b3e2-2d4b9d8a3bea'
    headers_ = {'Content-Type': 'application/json'}
    json_data = {'msgtype': 'markdown', 'markdown': {'content': markdown}}
    request_params = dict(headers=headers_, json=json_data, timeout=10)
    response = requests.post(url, **request_params)
    logger.info(response.json())