dzr 5 ヶ月 前
コミット
c3355914d8

+ 32 - 0
account/18530014520.json

@@ -0,0 +1,32 @@
+{
+    "cookies": {
+        "userInfo": "{%22userId%22:6174239%2C%22username%22:%2218530014520%22%2C%22userIcon%22:%22%22%2C%22linkName%22:%22%E5%BE%90%E5%BF%97%E6%81%92%22%2C%22companyName%22:%22%E7%99%BE%E7%82%89%E5%B1%AF%22%2C%22areaId%22:%222703%22%2C%22areaName%22:%22%E5%85%A8%E5%9B%BD%22%2C%22roleId%22:1%2C%22roleName%22:%22%E7%AE%A1%E7%90%86%E5%91%98%22%2C%22sex%22:%22m%22%2C%22expireDate%22:%22%E6%97%A0%22%2C%22isExpired%22:null%2C%22maxChildCount%22:0%2C%22isUsedCount%22:0%2C%22userStatus%22:1%2C%22memberLevel%22:5%2C%22memberLevelName%22:%22%E5%85%8D%E8%B4%B9%E4%BC%9A%E5%91%98%22%2C%22registerTime%22:%222019-12-14%22%2C%22isSuperSupplier%22:0%2C%22isNewUser%22:1%2C%22welcomeMsg%22:%22%E6%AC%A2%E8%BF%8E%E8%BF%9B%E5%85%A5%E5%8D%83%E9%87%8C%E9%A9%AC%E6%8B%9B%E6%A0%87%E7%BD%91%EF%BD%9E%22%2C%22customerServiceInfo%22:{%22id%22:42%2C%22customerServiceName%22:%22%E5%8D%83%E9%87%8C%E9%A9%AC%E5%AE%A2%E6%9C%8D%22%2C%22weChatIcon%22:%22https://gw-static.qianlima.com/gw/invoice/1721198286_1d8b871dc0.jpg%22%2C%22customerServicePhone%22:%22%20400-688-2000%22%2C%22customerServiceQQ%22:%22%22%2C%22customerServiceEmail%22:%22qianlima_service@qianlima.com%22%2C%22deptType%22:0}%2C%22shouji%22:%2218530014520%22%2C%22email%22:%221151584137@qq.com%22%2C%22dwmc%22:%22%E7%99%BE%E7%82%89%E5%B1%AF%22%2C%22zhiwu%22:%22%E5%90%88%E4%BC%99%E4%BA%BA%22%2C%22types%22:6%2C%22isPayBefore%22:0%2C%22memberOpenTime%22:null%2C%22showExpireDate%22:true%2C%22companyNature%22:null%2C%22companyArea%22:%22%22%2C%22companyType%22:null%2C%22industry%22:null%2C%22product%22:null%2C%22contacts%22:%22%E5%BE%90%E5%BF%97%E6%81%92%22%2C%22contactNumber%22:%2218530014520%22%2C%22contactAddress%22:null%2C%22mainCustomerGroups%22:null%2C%22informationTypePreferences%22:null%2C%22registerSource%22:%22miniwechat%22%2C%22businessUserType%22:null%2C%22businessCompanyName%22:null%2C%22isBusinessUser%22:null}",
+        "login_time": "1733897438",
+        "useragent_hash": "0845b309c7b9b957afd9ecf775a4c21f",
+        "source": "1",
+        "qlm_password": "3KUEoC37mfjCEmmCCBffUp77fEoCE3C8",
+        "HWWAFSESTIME": "1733897419511",
+        "Hm_lpvt_5dc1b78c0ab996bd6536c3a37f9ceda7": "1733897435",
+        "qlm_username": "18530014520",
+        "HMACCOUNT": "FC1B7E7FD8E10E3C",
+        "qlm_visitor_id": "383096361",
+        "Hm_lvt_5dc1b78c0ab996bd6536c3a37f9ceda7": "1733897435",
+        "xAuthToken": "2cae03be-8afa-4d3c-8f4e-cbe8926577e9",
+        "HWWAFSESID": "a918373c075b3fa3fc"
+    },
+    "headers": {
+        "X-Auth-Token": "2cae03be-8afa-4d3c-8f4e-cbe8926577e9",
+        "sec-ch-ua-platform": "\"macOS\"",
+        "Referer": "https://vip.qianlima.com/",
+        "Access-Captcha-Permission": "None",
+        "sec-ch-ua": "\"Google Chrome\";v=\"131\", \"Chromium\";v=\"131\", \"Not_A Brand\";v=\"24\"",
+        "sec-ch-ua-mobile": "?0",
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+        "Accept": "*/*",
+        "Content-Type": "application/json"
+    },
+    "proxies": {
+        "https": "socks5://58.221.59.179:8860",
+        "http": "socks5://58.221.59.179:8860"
+    }
+}

BIN
backup/bloomfilter.f


+ 41 - 0
bidding表查询数据保存情况.py

@@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-10-10
+---------
+@summary: Check whether data_bak records were uploaded into the bidding_nomal collection
+---------
+@author: Dzr
+"""
+import bson
+from pymongo import MongoClient
+from urllib.parse import quote_plus
+
+user = 'dataFx'
+password = 'data@fenxi'
+uri = "mongodb://%s:%s@%s" % (quote_plus(user), quote_plus(password), '127.0.0.1:27089')
+bidding_client = MongoClient(uri)
+bidding_db = bidding_client['qfw']
+bidding_nomal = bidding_db['bidding_nomal']
+
+client = MongoClient('192.168.3.182', 27080)
+coll = client['py_spider']['data_bak']
+
+# data_bak
+p = {'biddingid': 1, '_id': 0, 'biddingcoll': 1}
+q = {'spidercode': 'sdxzbiddingsjzypc', 'comeintime': 1733319761}
+with coll.find(q, projection=p) as cursor:
+    items = list(cursor)  # a pymongo cursor is exhausted after one pass, so materialize it first
+    nomal_ids = [item['biddingid'] for item in items if 'biddingid' in item]
+    not_nomal_ids = [1 for item in items if 'biddingid' not in item]
+
+print(f'未上传数据量:{sum(not_nomal_ids)}')
+
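+# Verify that every biddingid recorded in data_bak actually exists in the bidding_nomal collection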
+# bidding_nomal
+p = {'biddingid': 1, '_id': 0}
+not_bidding_ids = []
+for _id in nomal_ids:
+    ret = bidding_nomal.find_one({'_id': bson.ObjectId(_id)}, projection=p)
+    if not ret:
+        not_bidding_ids.append(_id)
+
+for _id in not_bidding_ids:
+    print('未入bidding库>>>', _id)

+ 189 - 0
clean_html.py

@@ -0,0 +1,189 @@
+# -*- coding: utf-8 -*-
+import re
+
+from lxml.html import fromstring, HtmlElement, tostring
+
+__all__ = ['cleaner', 'drop_tree_by_lxml']
+
+'''Stand-alone elements'''
+INDEPENDENT_TAGS = {
+    '<head>[\s\S]*?</head>': '',
+    '<html>|<html [^>]*>|</html>': '',
+    '<body>|<body [^>]*>|</body>': '',
+    '<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # metadata
+    '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # whitespace entities
+    '\\xa0|\\u3000': '',  # non-breaking / full-width spaces
+    '<!--[\s\S]*?-->': '',  # comments
+    '<style[^<>]*>[\s\S]*?</style>': '',  # styles
+    '<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
+    '<input>': '',  # input
+    '</input>': '',  # input
+    '<img[^>]*>': '<br>',  # images
+}
+'''Inline elements'''
+INLINE_TAGS = {
+    '<a>|<a [^>]*>|</a>': '',  # anchors
+    '<link>|<link [^>]*>|</link>': '',  # link tags
+    '<span>|<span [^>]*>|</span>': '',  # span
+    '<label>|<label [^>]*>|</label>': '<br>',  # label
+    '<font>|<font [^>]*>|</font>': '',  # font
+    'data:image(.*?) ': '',  # base64-encoded images
+}
+'''Block-level elements'''
+BLOCK_TAGS = {
+    '<div>\s*?</div>': '',
+    '<h[1-6][^>]*>|</h[1-6]>': '',  # headings
+    '<p>|<p [^>]*>': '<br>',  # paragraphs
+    '</p>': '',  # paragraphs
+    '<div>|<div [^>]*>': '<br>',  # divisions
+    '</div>': '',  # divisions
+    '<o:p>|<o:p [^>]*>|</o:p>': ''  # Microsoft Word paragraphs
+}
+'''Miscellaneous'''
+OTHER = {
+    '<?xml[^>]*>|<?xml [^>]*>|<?xml:.*?>': '',
+    '<epointform>': '',
+    '<!doctype html>|<!doctype html [^>]*>': '',
+    '【关闭】|关闭': '',
+    '【打印】|打印本页': '',
+    '【字体:[\s\S]*】': '',
+    '文章来源:[\u4e00-\u9fa5]+': '',
+    '浏览次数:.*[<]+': '',
+    '(责任编辑:.*?)': '',
+    '分享到[:]': '',
+}
+'''Style attributes'''
+CSS_STYLE = {
+    'style="[\s\S]*?"|style ="[\s\S]*?"': '',
+    'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
+    'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
+    'class="[\s\S]*?"|class ="[\s\S]*?"': '',
+    'align="[\s\S]*?"|align ="[\s\S]*?"': '',
+    'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
+}
+'''Whitespace'''
+BLANKS = {
+    '\n\s*\n': '\n',
+    '\s*\n\s*': '\n',
+    '[^\S\n]': ' ',
+    '\s+': ' ',
+}
+'''Tags considered for attribute repair'''
+TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
+'''Attributes considered for attribute repair'''
+ATTRS = {'id', 'class', 'style', 'width'}
+'''Tags with special styles'''
+SPECIAL_TAGS = {
+    re.compile('(?i)<[^>]+style="display: none".*[^>]+>'): '<br>',
+}
+
+
+def _repair_tag():
+    """异常的标签组合,用来替换非标准页面的标签"""
+    _repairs = {}
+    for tag in TAGS:
+        for attr in ATTRS:
+            key = '{}{}'.format(tag, attr)
+            val = '{} {}'.format(tag, attr)
+            _repairs[key] = val
+    return _repairs
+
+
+def _escape_character(html):
+    """转义字符"""
+    html = html.replace('&lt;', '<')
+    html = html.replace('&gt;', '>')
+    html = html.replace('&quot;', '"')
+    html = html.replace('&amp;', '&')
+    return html
+
+
+def _lowercase_tag(html):
+    """元素标签转成小写,不影响页面文本"""
+    tags = re.findall("<[^>]+>", html)
+    tag_sets = set(tags)
+
+    if len(tag_sets) > 10000:
+        from bs4 import BeautifulSoup
+        soup = BeautifulSoup(html, 'lxml')
+        html = str(soup.body.next_element)
+    else:
+        for tag in tag_sets:
+            html = html.replace(tag, str(tag).lower())
+
+    repair_tags = _repair_tag()  # tag repair
+    for err, right in repair_tags.items():
+        html = html.replace(err, right)
+
+    return html
+
+
+def _clear_special_tag(html):
+    """删除特殊元素标签"""
+    for tag, repl in SPECIAL_TAGS.items():
+        html = tag.sub(repl, html)
+    return html
+
+
+def _clear_input_tag(html, display=False):
+    """提取value值,替换input标签"""
+    if not display:
+        html = html.replace('<input', '<input style="border-color: transparent;"')  # 不显示输入框边框
+
+    tag = re.compile(r'<input .*?>', re.S)
+    value = re.compile(r'value=["\'](.*?)["\']')
+
+    lst = re.findall(tag, html) or []
+    for ipt in lst:
+        val = re.findall(value, ipt)
+        if val and 'hidden' not in ipt and 'hide' not in ipt and 'display: none' not in ipt:
+            html = html.replace(ipt, val[0])
+    return html
+
+
+def drop_tree_by_lxml(html, feature):
+    tree: HtmlElement = fromstring(html)
+    tag_lst = tree.xpath(feature)
+    for tag in tag_lst:
+        tag.drop_tree()
+
+    html = tostring(tree, encoding='utf8').decode('utf8')
+    return html
+
+
+def cleaner(html, special=None, completely=False, del_tag=False, **kwargs):
+    """
+    源码清洗
+
+    :param html: 清洗的页面
+    :param special: 额外指定页面清洗规则
+    :param completely: 是否完全清洗页面
+    :param del_tag: 删除标签
+    :return: 页面源码
+    """
+    special = {} if special is None else special
+    remove_tags = {
+        **INDEPENDENT_TAGS,
+        **INLINE_TAGS,
+        **BLOCK_TAGS,
+        **OTHER,
+        **special,  # per-call rules; the module-level OTHER dict is not mutated
+        **CSS_STYLE,
+        **BLANKS,
+    }
+
+    html = _lowercase_tag(html)
+    if del_tag:
+        html = _clear_special_tag(html)
+
+    for tag, repl in remove_tags.items():
+        html = re.sub(tag, repl, html)
+
+    if completely:
+        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # canvas
+        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # iframe
+        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)  # stray tags and font names
+
+    html = _escape_character(html)
+    html = _clear_input_tag(html, **kwargs)
+    return html
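+
+
+# Minimal usage sketch (raw_html and the xpath below are illustrative):
+#     html = drop_tree_by_lxml(raw_html, '//div[@class="ad"]')  # drop unwanted nodes by xpath
+#     text = cleaner(html, completely=True)  # strip tags and styles down to readable content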

+ 115 - 0
login.py

@@ -0,0 +1,115 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-10-10
+---------
+@summary: Qianlima auto-login; saves each account's session cookies/headers to account/<username>.json
+---------
+@author: Dzr
+"""
+
+import json
+from pathlib import Path
+
+from DrissionPage import ChromiumPage, ChromiumOptions
+from DrissionPage._functions.tools import PortFinder
+
+
+account_pool = [
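+# (username, password) pairs consumed by the list/detail spiders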
+    ('18530014520', 'qp!4LXH_'),
+]
+
+
+def auto_login(username, password, proxy=False, headless=False, auto_quit=False, accident_url=None):
+    co = ChromiumOptions()
+
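+    # give each account its own browser profile on a free debugging port so sessions don't collide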
+    port, _ = PortFinder(path='./').get_port()
+    co.set_paths(
+        local_port=port,
+        user_data_path=f'./chrome/{username}',
+        download_path=f'./download/{username}'
+    )
+    # suppress the password-save popup
+    co.set_argument('--disable-extensions')
+
+    if proxy:
+        proxies = {
+            'https': 'socks5://58.221.59.179:8860',
+            'http': 'socks5://58.221.59.179:8860'
+        }
+        co.set_argument('--proxy-server', value=proxies['https'])
+    else:
+        proxies = None
+
+    if headless:
+        co.set_argument('--headless', value='new')
+        co.set_user_agent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36')
+
+    page = ChromiumPage(addr_or_opts=co)
+    try:
+        targets = [
+            'website-seo/v2/cm/getcatid',
+            'rest/detail/alltypesdetail/detail',
+            'rest/account/companySpace/checkNewUser'
+        ]
+        page.listen.start(targets=targets, res_type=['Document', 'XHR'])  # start network listening
+
+        success = page.get('https://vip.qianlima.com/')  # open the user management page
+        if not success:
+            return
+
+        login = page.wait.ele_displayed('x://span[text()="登录"]', timeout=5)
+        if login:
+            page.ele('x://input[@name="username"]').input(username, clear=True)
+            page.ele('x://input[@name="password"]').input(password, clear=True)
+            page.ele('x://span[text()="登录"]/parent::*').click()
+
+        loaded = page.wait.ele_displayed(f'x://p[contains(text(), "{username}")]')  # wait for the logged-in page to finish loading
+        if not loaded:
+            print(f'登录失败>{username}')
+            return
+
+        # page.get('http://www.qianlima.com/zb/detail/20241016_454396207.html')
+
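+        # block until one of the target API requests fires, then persist its headers plus the session cookies and proxy for the requests-based spiders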
+        packet = page.listen.wait()
+        root = Path(__file__).parent
+        if not (root / 'account').exists():
+            (root / 'account').mkdir(exist_ok=True)
+
+        file = (root / f'account/{username}.json').absolute()
+        with open(file, 'w') as f:
+            print(packet.url)  # print the captured request url
+            # print(packet.response.body)
+            headers = dict(packet.request.headers)
+            print(f'** headers ** \n{json.dumps(headers, indent=4)}')
+            cookies = page.cookies(as_dict=True)
+            print(f'** cookies ** \n{json.dumps(cookies, indent=4)}')
+            user = {
+                'cookies': cookies,
+                'headers': headers,
+                'proxies': proxies
+            }
+            f.write(json.dumps(user, indent=4))
+
+            if accident_url is not None:
+                while True:
+                    page.get(accident_url)  # handle the unexpected situation manually
+                    if input('异常已处理?[Y|N]').upper() == 'Y':
+                        break
+
+            if not auto_quit:
+                f.flush()
+                while True:
+                    if input("退出? >>>"):
+                        break
+
+    except KeyboardInterrupt:
+        pass
+
+    finally:
+        page.quit()
+        print('关闭浏览器')
+
+
+if __name__ == '__main__':
+    for username, password in account_pool:
+        auto_login(username, password, proxy=True, auto_quit=False, headless=False)

+ 191 - 0
net.py

@@ -0,0 +1,191 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-10-10
+---------
+@summary: Dedicated downloader for Qianlima detail pages
+---------
+@author: Dzr
+"""
+import copy
+import functools
+import json
+
+import execjs
+import requests
+from loguru import logger
+
+
+# session data captured from rest/detail/alltypesdetail/detail
+_cookies = {}
+_headers = {}
+_proxies = None
+
+
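+# Wraps the public download helpers: account and network errors are logged and collapsed into a plain False return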
+def router(func):
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        try:
+            return func(*args, **kwargs)
+        except AssertionError:
+            logger.exception('账号异常')
+            return False  # same failure value as the wrapped helpers
+
+        except KeyboardInterrupt:
+            pass
+
+        except requests.exceptions.RequestException as e:
+            logger.exception(f'网络请求错误, 原因:{e}')
+            return False
+
+    return wrapper
+
+
+def set_cookies(ck):
+    global _cookies
+    _cookies = ck
+
+
+def set_headers(h):
+    global _headers
+    _headers = h
+
+
+def set_proxies(p):
+    global _proxies
+    _proxies = p
+
+
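+# Pull the numeric cid out of a detail-page URL; kept as JS (run via execjs), presumably to mirror the site's own parsing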
+def _extract_cid(href):
+    script = '''
+    function extractCid(url) {
+        if(url.indexOf('/zb/detail') != -1){
+            var cidArr = url.split('_');
+            if (cidArr.length > 1) {
+                var cid = cidArr[1].replace('.html', '');
+                if (cid.indexOf('-') != -1) {
+                    cid = cid.split("-")[1];
+                }
+                return cid
+            }
+        }
+        
+        if (url.indexOf('-') != -1) {
+            t = url.lastIndexOf("-")
+            n = url.substring(t + 1)
+            cid = n.split(".html")[0]
+            return cid
+        }
+        
+    }
+    '''
+    ctx = execjs.compile(script)
+    result = ctx.call('extractCid', href)
+    return result
+
+
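+# Query the seo getcatid endpoint for the category id, then build the VIP detail-page URL that is sent as the Referer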
+def _extract_referer(href, cid):
+    global _cookies, _proxies
+    href = str(href).replace('http:', 'https:')
+
+    url = 'https://www.qianlima.com/website-seo/v2/cm/getcatid/' + cid
+    headers = {
+        'Accept': '*/*',
+        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,sq;q=0.7',
+        'Cache-Control': 'no-cache',
+        'Connection': 'keep-alive',
+        'Pragma': 'no-cache',
+        'Referer': href,
+        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
+        'X-Requested-With': 'XMLHttpRequest',
+    }
+    requests_params = dict(headers=headers, cookies=_cookies, proxies=_proxies)
+    response = requests.get(url, timeout=10, **requests_params)
+    assert response.status_code == 200
+    text = response.content.decode()
+
+    script = '''
+    function extractDetailUrl(cid, dataStr) {
+        var data = JSON.parse(dataStr)
+        var catId = data.data;
+        var pageName;
+        switch (catId) {
+            case 301:
+            case 601:
+                pageName = "tenderDetail.html";
+                break;
+            case 202:
+                pageName = "projectDetail.html";
+                break;
+            case 201:
+                pageName = "tenderDetail.html";
+                break;
+            case 101:
+                pageName = "projectDetail.html";
+                break;
+            default:
+                pageName = "tenderDetail.html";
+                break;
+        }
+        return 'https://detail.vip.qianlima.com/' + pageName + '?id=' + cid;
+    }
+    '''
+    ctx = execjs.compile(script)
+    result = ctx.call('extractDetailUrl', cid, text)
+    return result
+
+
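+# Core fetch: POST the detail API with the saved session; optionally resolve and attach the proper Referer first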
+def _download_detail(href, referer=False):
+    global _cookies, _headers, _proxies
+    headers = copy.deepcopy(_headers)
+    cid = _extract_cid(href)
+    if not cid:
+        raise ValueError('cid is not exist')
+
+    url = 'https://detail.vip.qianlima.com/rest/detail/alltypesdetail/detail/' + cid
+    if referer:
+        referer = _extract_referer(href, cid)
+        headers['Referer'] = referer
+
+    requests_params = dict(headers=headers, cookies=_cookies, proxies=_proxies)
+    try:
+        response = requests.post(url, timeout=10, **requests_params)
+    except requests.exceptions.Timeout:
+        logger.error(f'采集失败|访问超时|{href}')
+        return False, None  # request timed out
+
+    username = _cookies['qlm_username']
+    status_code = response.status_code
+    if status_code != 200:
+        result = response.content.decode()
+        logger.error(f'采集失败|{username}|状态码|{status_code}|请求响应|{result}')
+        return False, status_code
+
+    result = response.json()
+    data = result['data']
+    if not data:
+        logger.warning(f'数据异常|{result}')
+        return False, data  # data is None when the account has run out of quota
+
+    logger.info(f'采集成功[{href}]')
+    return True, data
+
+
+@router
+def download_json(href, **kwargs):
+    _, result = _download_detail(href, **kwargs)
+    if not result:
+        return False
+
+    return result  # either the detail dict or an HTTP status code (e.g. 429)
+
+
+@router
+def download_html(href, **kwargs):
+    _, result = _download_detail(href, **kwargs)
+    if not result:
+        return False
+
+    return result['content']

+ 6 - 0
requirements.txt

@@ -0,0 +1,6 @@
+pymongo==3.12.0
+pybloom_live==4.0.0
+loguru==0.5.3
+DrissionPage==4.0.5.6
+beautifulsoup4==4.12.3
+PyExecJS==1.5.1

+ 70 - 0
数据处理.py

@@ -0,0 +1,70 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-10-19
+---------
+@summary: Convert qlm_data_lst rows into jy_data_lst download tasks
+---------
+@author: Dzr
+"""
+import time
+from datetime import datetime
+
+import bson
+from pymongo import MongoClient
+
+Int64 = bson.int64.Int64
+
+client = MongoClient('192.168.3.182', 27017)
+qlm_coll = client['zjb_poc']['qlm_data_lst']
+jy_coll = client['zjb_poc']['jy_data_lst']
+
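+# Map every qlm_data_lst row onto the jy_data_lst task schema and bulk-insert in batches of 100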
+count = 0
+insert_lst = []
+with qlm_coll.find() as cursor:
+    for item in cursor:
+        href = item['url']
+        title = item['popTitle'] if 'popTitle' in item else item['showTitle']
+        publishtime = item['updateTime']
+        l_np_publishtime = datetime.strptime(publishtime, '%Y-%m-%d').timestamp()
+
+        addr = str(item['areaName']).split('-')
+        area = addr[0] if len(addr) > 0 else ''
+        city = addr[1] if len(addr) > 1 else ''
+        if '国土' in item.get('progName', ''):
+            toptype = item['progName']
+        else:
+            toptype = (item['noticeSegmentTypeName'] or item['progName'])
+
+        data = {
+            'site': '千里马',
+            'channel': item['channel'],
+            'spidercode': 'sdxzbiddingsjzypc',
+            'area': area,
+            'city': city,
+            'district': '',
+            'comeintime': Int64(int(time.time())),
+            'isdownload': False,  # downloaded yet?
+            'isfailed': False,  # download failed?
+            'title': title,  # title
+            'href': href,  # detail link
+            'publishtime': publishtime,  # publish time (string)
+            'l_np_publishtime': Int64(l_np_publishtime),  # publish time (timestamp)
+            'buyer': item['tenderees'],  # purchasing unit
+            'toptype': toptype,  # notice type
+            'winner': item['bidder'] if item.get('bidder') is not None else '',  # winning bidder
+            'agency': item['agent'] if item.get('agent') is not None else '',  # agency
+        }
+
+        insert_lst.append(data)
+        if len(insert_lst) == 100:
+            jy_coll.insert_many(insert_lst, ordered=False)
+            count += len(insert_lst)
+            insert_lst = []
+            print('已处理{}条'.format(count))
+
+    if len(insert_lst) > 0:
+        jy_coll.insert_many(insert_lst, ordered=False)
+        count += len(insert_lst)
+        print('已处理{}条'.format(count))
+
+print('数据处理结束')

+ 50 - 0
采集任务清单.py

@@ -0,0 +1,50 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-10-18
+---------
+@summary: Build the keyword task list (company name x machinery suffix)
+---------
+@author: Dzr
+"""
+from pymongo import MongoClient
+import pandas as pd
+
+
+to_db = MongoClient('192.168.3.182', 27017)
+coll = to_db['31zg_poc']['keyword_company']
+
+suffix_lst = '重机,挖掘机,装载机,泵送,桥泵车,搅拌车,拖泵,搅拌站,车载泵,搅拌车,重起,汽车起重机,履带起重机,桩机,旋挖钻,大旋挖,中旋挖,小旋挖,路机,铣刨机,摊铺机,平地机,压路机,沥青站'.split(',')
+suffix_set = set(suffix_lst)
+
+f = '/Users/dongzhaorui/Desktop/qlm数据采集.xlsx'
+df = pd.read_excel(f)
+df.fillna('', inplace=True)
+
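+# Collect unique group and subsidiary names from the spreadsheet, then cross them with the machinery suffixes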
+company_lst = []
+for _, i in df.iterrows():
+    items = i.to_dict()
+    s_company = str(items['集团名称']).strip()
+    s_sub_company = str(items['二级局名称']).strip()
+
+    if s_company and s_company not in company_lst:
+        company_lst.append(s_company)
+
+    if s_sub_company and s_sub_company not in company_lst:
+        company_lst.append(s_sub_company)
+
+data = []
+for suffix in suffix_set:
+    print(suffix)
+
+    for company in company_lst:
+        data.append({'s_suffix': suffix, 's_company': company, 's_keyword': f'{company}+{suffix}'})
+
+    if len(data) >= 100:  # flush in batches of roughly 100
+        coll.insert_many(data, ordered=False)
+        data = []
+
+if len(data) > 0:
+    coll.insert_many(data, ordered=False)
+
+print('1234')
+

+ 217 - 0
采集列表页(关键词).py

@@ -0,0 +1,217 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-06-01
+---------
+@summary: Qianlima list-page spider (single keyword)
+---------
+@author: Dzr
+"""
+import json
+import math
+import random
+import time
+from pathlib import Path
+
+import requests
+from loguru import logger
+from pybloom_live import BloomFilter
+from pymongo import MongoClient
+from login import auto_login, account_pool
+
+
+_cookies = None
+_headers = None
+_proxies = None
+
+
+def send_wechat_warning(msg, send=True):
+    markdown = f'采集异常中断,请切换d模式处理。'
+    markdown += f'\n>异常详情:<font color=\"warning\">**{msg}**</font>'
+
+    if not send:
+        logger.info(markdown)
+        return
+
+    url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=079193d8-1856-443e-9f6d-ecc5c883bf11'
+    headers_ = {'Content-Type': 'application/json'}
+    json_data = {'msgtype': 'markdown', 'markdown': {'content': markdown, "mentioned_mobile_list":["17610673271"]}}
+    request_params = dict(headers=headers_, json=json_data, timeout=10)
+    response = requests.post(url, **request_params)
+    logger.info(response.json())
+
+
+def setup_cfg(username):
+    global _cookies, _headers, _proxies
+    file = (Path(__file__).parent / f'account/{username}.json').absolute()
+    with open(file, encoding='utf-8') as rp:
+        json_data = json.load(rp)
+        _cookies = json_data['cookies']
+        _headers = json_data['headers']
+        _proxies = json_data['proxies']
+
+
+def launch_filter():
+    """创建布隆过滤器"""
+    logger.debug('创建布隆过滤器...')
+    backup = (Path(__file__).parent / 'backup')
+    if not backup.exists():
+        backup.mkdir(exist_ok=True)
+
+    file = (backup / 'bloomfilter.f')
+    if not file.exists():
+        file.touch()  # create the backing file on first run
+        bf = BloomFilter(capacity=1000000, error_rate=0.001)  # sized for ~1,000,000 urls at a 0.1% error rate
+    else:
+        if file.stat().st_size == 0:
+            bf = BloomFilter(capacity=1000000, error_rate=0.001)
+        else:
+            bf = BloomFilter.fromfile(file.open('rb'))
+
+    return file, bf
+
+
+def fetch(collection, username, keywords, page, page_size, channel, bf):
+    # cookies captured for rest/service/website/search/solr
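+    # returns (ok, total, count); on failure `total` carries an error code: -1 no data in the response, -2 non-200 status, -3 network error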
+    global _cookies, _headers, _proxies
+    response = None
+    try:
+        json_data = {
+            'keywords': keywords,
+            'timeType': '4',
+            'beginTime': '2024-01-01',
+            'endTime': '2024-12-04',
+            'filtermode': 5,
+            'searchMode': 1,
+            'currentPage': page,
+            'numPerPage': page_size,
+            'sortType': 2,
+            'allType': -1,
+            'noticeSegmentTypeStr': '',
+            'beginAmount': '',
+            'endAmount': '',
+            'purchasingUnitIdList': '',
+            'threeClassifyTagStr': '',
+            'fourLevelCategoryIdListStr': '',
+            'threeLevelCategoryIdListStr': '',
+            'levelId': '',
+            'tab': 0,
+            'searchDataType': 0,
+            'types': '-1',
+            'showContent': 1,
+            'hasTenderTransferProject': 1,
+            'newAreas': '',
+            'hasChooseSortType': 1,
+            'summaryType': 0,
+        }
+
+        response = requests.post(
+            'https://search.vip.qianlima.com/rest/service/website/search/solr',
+            cookies=_cookies,
+            headers=_headers,
+            json=json_data,
+            proxies=_proxies,
+            timeout=60
+        )
+        assert response.status_code == 200
+        result = response.json()
+        try:
+            total = result['data']['rowCount']
+        except TypeError:
+            return False, -1, 0
+
+        dedup_count = 0
+        count = 0
+        insert_lst = []
+        data = result['data']['data']
+        for item in data:
+            href = item.get('url')
+            if href is None or href in bf:
+                dedup_count += 1
+                # logger.debug(f'重复数据[{href}]')
+                continue
+
+            item['channel'] = channel
+            insert_lst.append(item)
+            if len(insert_lst) == page_size:
+                collection.insert_many(insert_lst, ordered=False)
+                count += len(insert_lst)
+                insert_lst = []
+
+            bf.add(href)
+
+        if len(insert_lst) > 0:
+            collection.insert_many(insert_lst, ordered=False)
+            count += len(insert_lst)
+
+        logger.info(f'自动翻页|第{page}页|入库{count}条|重复{dedup_count}条')
+        return True, total, len(data)
+
+    except AssertionError:
+        logger.error(f'{username}|账号异常|请求失败')
+        # send_wechat_warning(msg=response.content.decode())
+        return False, -2, 0
+
+    except requests.exceptions.RequestException as e:
+        logger.exception(f'网络请求错误, 原因:{e}')
+        return False, -3, 0
+
+
+def spider(username, keywords, bf, coll, channel):
+    setup_cfg(username)
+
+    page = 1
+    page_size = 100
+
+    # paginate
+    retries = 0
+    while True:
+        ok, total, count = fetch(coll, username, keywords, page, page_size, channel, bf)
+        if ok is False:
+            state = total
+            if state == -1:
+                logger.info(f'{username}|请求参数错误|修改参数')
+                return False
+            elif state == -2:
+                logger.info(f'{username}|访问频繁|3秒后切换账号')
+                time.sleep(3)
+                return
+            else:
+                logger.error(f'{username}|网络异常|准备重试~{retries}')
+                if retries > 3:
+                    return
+                else:
+                    retries += 1
+                    continue
+
+        # time.sleep(math.log(random.randint(100, 2400), 2))
+        time.sleep(.5)
+        if ok is True and count < page_size:
+            logger.info(f'采集完成|保存{total}条')
+            break
+        else:
+            page += 1
+
+    return True
+
+
+def main():
+    f, bf = launch_filter()  # Bloom filter: ~1,000,000 urls, 0.1% error rate
+
+    client = MongoClient('192.168.3.182', 27017)
+    coll = client['zjb_poc']['qlm_data_lst']
+    channel = '综合'
+    keywords = '黑龙江省八目科技开发有限公司'
+    try:
+        username, password = account_pool.pop(0)
+        auto_login(username, password, proxy=True, headless=True, auto_quit=True)
+        spider(username, keywords, bf, coll, channel)
+    except KeyboardInterrupt:
+        pass
+
+    finally:
+        bf.tofile(f.open('wb'))  # persist the Bloom filter to disk
+        logger.info('采集结束')
+
+
+if __name__ == '__main__':
+    main()

+ 253 - 0
采集列表页(地域).py

@@ -0,0 +1,253 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-06-01
+---------
+@summary: Qianlima list-page spider (by region)
+---------
+@author: Dzr
+"""
+import json
+import math
+import random
+import time
+from pathlib import Path
+
+import requests
+from loguru import logger
+from pybloom_live import BloomFilter
+from pymongo import MongoClient
+from login import auto_login, account_pool
+
+
+_cookies = None
+_headers = None
+_proxies = None
+
+
+def send_wechat_warning(msg, send=True):
+    markdown = f'采集异常中断,请切换d模式处理。'
+    markdown += f'\n>异常详情:<font color=\"warning\">**{msg}**</font>'
+
+    if not send:
+        logger.info(markdown)
+        return
+
+    url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=079193d8-1856-443e-9f6d-ecc5c883bf11'
+    headers_ = {'Content-Type': 'application/json'}
+    json_data = {'msgtype': 'markdown', 'markdown': {'content': markdown, "mentioned_mobile_list":["17610673271"]}}
+    request_params = dict(headers=headers_, json=json_data, timeout=10)
+    response = requests.post(url, **request_params)
+    logger.info(response.json())
+
+
+def setup_cfg(username):
+    global _cookies, _headers, _proxies
+    file = (Path(__file__).parent / f'account/{username}.json').absolute()
+    with open(file, encoding='utf-8') as rp:
+        json_data = json.load(rp)
+        _cookies = json_data['cookies']
+        _headers = json_data['headers']
+        _proxies = json_data['proxies']
+
+
+def launch_filter():
+    """创建布隆过滤器"""
+    logger.debug('创建布隆过滤器...')
+    backup = (Path(__file__).parent / 'backup')
+    if not backup.exists():
+        backup.mkdir(exist_ok=True)
+
+    file = (backup / 'bloomfilter.f')
+    if not file.exists():
+        file.touch()  # create the backing file on first run
+        bf = BloomFilter(capacity=1000000, error_rate=0.001)  # sized for ~1,000,000 urls at a 0.1% error rate
+    else:
+        if file.stat().st_size == 0:
+            bf = BloomFilter(capacity=1000000, error_rate=0.001)
+        else:
+            bf = BloomFilter.fromfile(file.open('rb'))
+
+    return file, bf
+
+
+def fetch(collection, username, page, page_size, channel, bf):
+    # cookies captured for rest/service/website/search/solr
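+    # returns (ok, total, count); on failure `total` carries an error code: -1 no data in the response, -2 non-200 status, -3 network error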
+    global _cookies, _headers, _proxies
+    response = None
+    try:
+        if channel == '中标信息':
+            json_data = {
+                'keywords': '',
+                'timeType': 4,
+                'beginTime': '2024-09-01',
+                'endTime': '2024-09-30',
+                'filtermode': '8',
+                'searchMode': 1,
+                'currentPage': page,
+                'numPerPage': page_size,
+                'sortType': 1,
+                'allType': 3,
+                'beginAmount': '',
+                'endAmount': '',
+                'purchasingUnitIdList': '',
+                'threeClassifyTagStr': '',
+                'fourLevelCategoryIdListStr': '',
+                'threeLevelCategoryIdListStr': '',
+                'levelId': '',
+                'tab': 0,
+                'searchDataType': 1,
+                'types': 3,
+                'showContent': 1,
+                'newAreas': '1744',
+                'hasChooseSortType': 1,
+                'progIdAndNoticeSegmentTypeMaps': {
+                    '3': [],
+                },
+                'summaryType': 0,
+            }
+        elif channel == '招标信息':
+            json_data = {
+                'keywords': '',
+                'timeType': 4,
+                'beginTime': '2024-09-01',
+                'endTime': '2024-09-30',
+                'filtermode': '8',
+                'searchMode': 1,
+                'currentPage': page,
+                'numPerPage': page_size,
+                'sortType': 1,
+                'allType': 0,
+                'beginAmount': '',
+                'endAmount': '',
+                'purchasingUnitIdList': '',
+                'threeClassifyTagStr': '',
+                'fourLevelCategoryIdListStr': '',
+                'threeLevelCategoryIdListStr': '',
+                'levelId': '',
+                'tab': 0,
+                'searchDataType': 1,
+                'types': -1,
+                'showContent': 1,
+                'newAreas': '1744',
+                'hasChooseSortType': 1,
+                'progIdAndNoticeSegmentTypeMaps': {
+                    '0': [],
+                    '1': [],
+                },
+                'summaryType': 0,
+            }
+        else:
+            raise ValueError(f'unsupported channel: {channel}')
+
+        response = requests.post(
+            'https://search.vip.qianlima.com/rest/service/website/search/solr',
+            cookies=_cookies,
+            headers=_headers,
+            json=json_data,
+            proxies=_proxies,
+            timeout=60
+        )
+        assert response.status_code == 200
+        result = response.json()
+        try:
+            total = result['data']['rowCount']
+        except TypeError:
+            return False, -1, 0
+
+        dedup_count = 0
+        count = 0
+        insert_lst = []
+        data = result['data']['data']
+        for item in data:
+            href = item.get('url')
+            if href is None or href in bf:
+                dedup_count += 1
+                # logger.debug(f'重复数据[{href}]')
+                continue
+
+            item['channel'] = channel
+            insert_lst.append(item)
+            if len(insert_lst) == page_size:
+                collection.insert_many(insert_lst, ordered=False)
+                count += len(insert_lst)
+                insert_lst = []
+
+            bf.add(href)
+
+        if len(insert_lst) > 0:
+            collection.insert_many(insert_lst, ordered=False)
+            count += len(insert_lst)
+
+        logger.info(f'自动翻页|第{page}页|入库{count}条|重复{dedup_count}条')
+        return True, total, len(data)
+
+    except AssertionError:
+        logger.error(f'{username}|账号异常|请求失败')
+        # send_wechat_warning(msg=response.content.decode())
+        return False, -2, 0
+
+    except requests.exceptions.RequestException as e:
+        logger.exception(f'网络请求错误, 原因:{e}')
+        return False, -3, 0
+
+
+def spider(username, bf, coll, channel):
+    setup_cfg(username)
+
+    page = 1
+    page_size = 100
+
+    # paginate
+    retries = 0
+    while True:
+        ok, total, count = fetch(coll, username, page, page_size, channel, bf)
+        if ok is False:
+            state = total
+            if state == -1:
+                logger.info(f'{username}|请求参数错误|修改参数')
+                return False
+            elif state == -2:
+                logger.info(f'{username}|访问频繁|3秒后切换账号')
+                time.sleep(3)
+                return
+            else:
+                logger.error(f'{username}|网络异常|准备重试~{retries}')
+                if retries > 3:
+                    return
+                else:
+                    retries += 1
+                    continue
+
+        # time.sleep(math.log(random.randint(100, 2400), 2))
+        time.sleep(.5)
+        if ok is True and count < page_size:
+            logger.info(f'采集完成|保存{total}条')
+            break
+        else:
+            page += 1
+
+    return True
+
+
+def main():
+    f, bf = launch_filter()  # Bloom filter: ~1,000,000 urls, 0.1% error rate
+
+    client = MongoClient('192.168.3.182', 27017)
+    coll = client['sdlt_poc']['qlm_data_lst']
+    # channel = '招标信息'
+    channel = '中标信息'
+
+    try:
+        username, password = account_pool.pop(0)
+        auto_login(username, password, proxy=True, headless=True, auto_quit=True)
+        spider(username, bf, coll, channel)
+    except KeyboardInterrupt:
+        pass
+
+    finally:
+        bf.tofile(f.open('wb'))  # persist the Bloom filter to disk
+        logger.info('采集结束')
+
+
+if __name__ == '__main__':
+    main()

+ 268 - 0
采集列表页.py

@@ -0,0 +1,268 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-06-01
+---------
+@summary: Qianlima list-page spider (keyword tasks from MongoDB)
+---------
+@author: Dzr
+"""
+import json
+import math
+import random
+import time
+from pathlib import Path
+
+import requests
+from loguru import logger
+from pybloom_live import BloomFilter
+from pymongo import MongoClient
+from login import auto_login, account_pool
+
+
+_cookies = None
+_headers = None
+_proxies = None
+
+
+def send_wechat_warning(msg, send=True):
+    markdown = f'采集异常中断,请切换d模式处理。'
+    markdown += f'\n>异常详情:<font color=\"warning\">**{msg}**</font>'
+
+    if not send:
+        logger.info(markdown)
+        return
+
+    url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=079193d8-1856-443e-9f6d-ecc5c883bf11'
+    headers_ = {'Content-Type': 'application/json'}
+    json_data = {'msgtype': 'markdown', 'markdown': {'content': markdown, "mentioned_mobile_list":["17610673271"]}}
+    request_params = dict(headers=headers_, json=json_data, timeout=10)
+    response = requests.post(url, **request_params)
+    logger.info(response.json())
+
+
+def setup_cfg(username):
+    global _cookies, _headers, _proxies
+    file = (Path(__file__).parent / f'account/{username}.json').absolute()
+    with open(file, encoding='utf-8') as rp:
+        json_data = json.load(rp)
+        _cookies = json_data['cookies']
+        _headers = json_data['headers']
+        _proxies = json_data['proxies']
+
+
+def launch_filter():
+    """创建布隆过滤器"""
+    logger.debug('创建布隆过滤器...')
+    backup = (Path(__file__).parent / 'backup')
+    if not backup.exists():
+        backup.mkdir(exist_ok=True)
+
+    file = (backup / 'bloomfilter.f')
+    if not file.exists():
+        file.touch()  # create the backing file on first run
+        bf = BloomFilter(capacity=1000000, error_rate=0.001)  # sized for ~1,000,000 urls at a 0.1% error rate
+    else:
+        if file.stat().st_size == 0:
+            bf = BloomFilter(capacity=1000000, error_rate=0.001)
+        else:
+            bf = BloomFilter.fromfile(file.open('rb'))
+
+    return file, bf
+
+
+def fetch(collection, username, page, page_size, keywords, bf):
+    # cookies captured for rest/service/website/search/solr
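+    # returns (ok, total, count); keywords with more than 500 hits are skipped, and on failure `total` carries an error code: -1 no data, -2 non-200 status, -3 network error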
+    global _cookies, _headers, _proxies
+    response = None
+    try:
+        json_data = {
+            'keywords': keywords,
+            'timeType': 4,
+            'beginTime': '2024-09-01',
+            'endTime': '2024-09-30',
+            'filtermode': '8',
+            'searchMode': 1,
+            'currentPage': page,
+            'numPerPage': page_size,
+            'sortType': '1',
+            'allType': -1,
+            'beginAmount': '',
+            'endAmount': '',
+            'purchasingUnitIdList': '',
+            'threeClassifyTagStr': '',
+            'fourLevelCategoryIdListStr': '',
+            'threeLevelCategoryIdListStr': '',
+            'levelId': '',
+            'tab': 2,
+            'types': '-1',
+            'searchDataType': 1,
+            'showContent': 1,
+            'hasLinkName': '',
+            'newAreas': '',
+            'hasChooseSortType': 1,
+            'progIdAndNoticeSegmentTypeMaps': {
+                '3': [],
+                '4': [
+                    11,
+                    12,
+                ],
+                '5': [],
+            },
+            'summaryType': 1,
+        }
+
+        response = requests.post(
+            'https://search.vip.qianlima.com/rest/service/website/search/solr',
+            cookies=_cookies,
+            headers=_headers,
+            json=json_data,
+            proxies=_proxies,
+            timeout=60
+        )
+        assert response.status_code == 200
+        result = response.json()
+        try:
+            total = result['data']['rowCount']
+            if total > 500:
+                # more than 500 hits: treat as a fuzzy match and skip this keyword
+                return True, total, page_size
+
+        except TypeError:
+            return False, -1, 0
+
+        data = result['data']['data']
+
+        dedup_count = 0
+        count = 0
+        insert_lst = []
+        for item in data:
+            href = item.get('url')
+            if href is None or href in bf:
+                dedup_count += 1
+                # logger.debug(f'重复数据[{href}]')
+                continue
+
+            insert_lst.append(item)
+            if len(insert_lst) == page_size:
+                collection.insert_many(insert_lst, ordered=False)
+                count += len(insert_lst)
+                insert_lst = []
+
+            bf.add(href)
+
+        if len(insert_lst) > 0:
+            collection.insert_many(insert_lst, ordered=False)
+            count += len(insert_lst)
+
+        logger.info(f'自动翻页|第{page}页|{keywords}|入库{count}条|重复{dedup_count}条')
+        return True, total, len(data)
+
+    except AssertionError:
+        logger.error(f'{username}|账号异常|请求失败')
+        # send_wechat_warning(msg=response.content.decode())
+        return False, -2, 0
+
+    except requests.exceptions.RequestException as e:
+        logger.exception(f'网络请求错误, 原因:{e}')
+        return False, -3, 0
+
+
+def spider(username, tasks, bf, to_data_lst, coll):
+    setup_cfg(username)
+
+    while tasks:
+        page = 1
+        page_size = 100
+
+        # paginate
+        state = 1
+        retries = 0
+        isdownload = True
+        _id, keywords = tasks.pop()
+        while True:
+            ok, total, count = fetch(coll, username, page, page_size, keywords, bf)
+            if ok is False:
+                state = total
+                if state == -1:
+                    logger.info(f'{username}|请求参数错误|修改参数')
+                    return False
+                elif state == -2:
+                    logger.info(f'{username}|访问频繁|3秒后切换账号')
+                    time.sleep(3)
+                    return
+                else:
+                    logger.error(f'{username}|网络异常|准备重试~{retries}')
+                    if retries > 3:
+                        return
+                    else:
+                        retries += 1
+                        continue
+
+            # time.sleep(math.log(random.randint(100, 2400), 2))
+            time.sleep(.5)
+
+            if ok is True and total >= 500:
+                logger.error(f'采集完成|{keywords}|疑似模糊匹配|跳过采集')
+                isdownload = False
+                break
+
+            if ok is True and count < page_size:
+                logger.info(f'采集完成|{keywords}|保存{total}条')
+                break
+            else:
+                page += 1
+
+        # update the task status
+        if state >= 0:
+            to_data_lst.update_one(
+                {'_id': _id},
+                {
+                    '$set': {
+                        'b_isdownload': isdownload,
+                        'i_total': total,
+                        'i_pages': page,
+                        'i_state': state,
+                        'i_updatetime': int(time.time())
+                    }
+                }
+            )
+
+    return True
+
+
+def main():
+    f, bf = launch_filter()  # Bloom filter: ~1,000,000 urls, 0.1% error rate
+
+    client = MongoClient('192.168.3.182', 27017)
+    to_data_lst = client['31zg_poc']['keyword_company']
+    coll = client['31zg_poc']['qlm_data_lst']
+
+    try:
+        while True:
+            q = {'b_isdownload': None}
+            p = {'s_keyword': 1, '_id': 1}
+            with to_data_lst.find(q, projection=p, limit=50) as cursor:
+                tasks = [(item['_id'], item['s_keyword']) for item in cursor]
+
+            username, password = account_pool.pop(0)
+            auto_login(username, password, proxy=True, headless=True, auto_quit=True)
+            state = spider(username, tasks, bf, to_data_lst, coll)
+            if state is True:
+                account_pool.append((username, password))
+
+            if state is False:
+                break
+
+            if not to_data_lst.count_documents(q):
+                break
+
+    except KeyboardInterrupt:
+        pass
+
+    finally:
+        bf.tofile(f.open('wb'))  # persist the Bloom filter to disk
+        logger.info('采集结束')
+
+
+if __name__ == '__main__':
+    main()

+ 178 - 0
采集详情页.py

@@ -0,0 +1,178 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-10-10
+---------
+@summary: Qianlima detail-page spider
+---------
+@author: Dzr
+"""
+import json
+import time
+
+import bson
+from loguru import logger
+from pymongo import MongoClient
+from pymongo.operations import UpdateOne
+
+import net
+from clean_html import cleaner, drop_tree_by_lxml
+from pathlib import Path
+from login import auto_login, account_pool
+
+Int64 = bson.int64.Int64
+
+
+def setup_cfg(username):
+    file = (Path(__file__).parent / f'account/{username}.json').absolute()
+    with open(file, encoding='utf-8') as rp:
+        json_data = json.load(rp)
+        net.set_cookies(ck=json_data['cookies'])
+        net.set_headers(h=json_data['headers'])
+        net.set_proxies(p=json_data['proxies'])
+
+
+def bulk_update(collection, id_lst, update):
+    """
+    Bulk-update task status.
+
+    :param pymongo.collection.Collection collection:
+    :param id_lst: ids of the tasks to update
+    :param dict update: fields to $set
+    :return: number of modified documents
+    """
+    count = 0
+    update_lst = []
+    for id_ in id_lst:
+        update['updatetime'] = Int64(int(time.time()))  # refresh the task's update time
+        update_lst.append(UpdateOne({'_id': id_}, {'$set': update}))
+        if len(update_lst) == 50:
+            results = collection.bulk_write(update_lst, ordered=False)
+            count += results.modified_count
+            update_lst = []
+
+    if len(update_lst) > 0:
+        results = collection.bulk_write(update_lst, ordered=False)
+        count += results.modified_count
+
+    return count
+
+
+def finalize(insert_lst, update_dict, data_coll, lst_coll):
+    if len(insert_lst) > 0:
+        data_coll.insert_many(insert_lst, ordered=False)
+
+    success_ids = update_dict['success']
+    if bulk_update(lst_coll, success_ids, {'isdownload': True}):
+        logger.info(f'批量更新[采集成功{len(success_ids)}条]任务状态')
+
+    failed_ids = update_dict['failed']
+    if bulk_update(lst_coll, failed_ids, {'isdownload': True, 'isfailed': True}):
+        logger.info(f'批量更新[采集失败{len(failed_ids)}条]任务状态')
+
+
+def spider(username, password, task_lst, data_coll, lst_coll):
+    setup_cfg(username)
+
+    update_dict = {'success': [], 'failed': []}
+    insert_lst = []
+
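+    # build a bidding document from the detail JSON; the "企业信息" block is removed from the HTML before cleaning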
+    def handle_task(task, ret):
+        if not ret or isinstance(ret, int):  # download failed or returned only a status code
+            update_dict['failed'].append(task['_id'])
+            logger.error(f'下载失败|{href}')
+        else:
+            html = drop_tree_by_lxml(ret['content'], '//*[contains(text(), "企业信息")]')
+            insert_lst.append({
+                'site': task['site'],
+                'channel': task['channel'],
+                'spidercode': task['spidercode'],
+                'area': task['area'],
+                'city': task['city'],
+                'district': task['district'],
+                'href': '#',
+                'competehref': href,
+                'title': task['title'],
+                's_title': task['title'],
+                'contenthtml': html,
+                'detail': cleaner(html),
+                'publishtime': task['publishtime'],
+                'l_np_publishtime': task['l_np_publishtime'],
+                'comeintime': Int64(int(time.time())),
+                'T': 'bidding',
+                'infoformat': 1,
+                'sendflag': 'false',
+                'repeat': 'true',
+                'iscompete': True,
+                '_d': 'comeintime',
+                'publishdept': '',
+                'type': '',
+                'is_mixed': True
+            })
+            update_dict['success'].append(task['_id'])
+
+    for task in task_lst:
+        href = task['href']
+        ret = net.download_json(href, referer=False)
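+        # HTTP 429: reopen a visible browser on the offending URL so a human can clear it, refresh the saved session, then retry once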
+        if isinstance(ret, int) and ret == 429:
+            auto_login(username, password, proxy=True, headless=False, auto_quit=True, accident_url=href)
+            setup_cfg(username)
+            ret = net.download_json(href, referer=False)
+            if input('退出:0 继续:1\n') == '0':
+                finalize(insert_lst, update_dict, data_coll, lst_coll)
+                return False
+
+        if ret is False:
+            logger.error(f'账号失效|{username}')
+            finalize(insert_lst, update_dict, data_coll, lst_coll)
+            return False
+
+        handle_task(task, ret)
+
+        if len(insert_lst) == 50:
+            data_coll.insert_many(insert_lst, ordered=False)
+            insert_lst = []
+
+        time.sleep(.5)
+
+    finalize(insert_lst, update_dict, data_coll, lst_coll)
+    return True
+
+
+def main():
+    logger.info('**** 数据采集开始 ****')
+
+    client = MongoClient('192.168.3.182', 27017)
+    data_coll = client['zjb_poc']['jy_data_bak']
+    lst_coll = client['zjb_poc']['jy_data_lst']
+
+    try:
+        while True:
+            if len(account_pool) == 0:
+                logger.warning('账号数量已不足,请及时补充')
+                break
+
+            # q = {'isdownload': False, 'isuse': {'$in': [4]}}
+            # q = {'isdownload': False, 'isuse': {'$in': [2, 3]}}
+            q = {'isdownload': False, 'is_use': 0}
+            with lst_coll.find(q, limit=100) as cursor:
+                task_lst = [item for item in cursor]
+
+            username, password = account_pool.pop(0)
+            auto_login(username, password, proxy=True, headless=True, auto_quit=True)
+            ret = spider(username, password, task_lst, data_coll, lst_coll)
+            if ret is False:
+                logger.info('切换账号')
+                continue
+
+            if not lst_coll.count_documents(q):
+                break
+
+    except KeyboardInterrupt:
+        pass
+
+    finally:
+        logger.info('**** 数据采集结束 ****')
+
+
+if __name__ == '__main__':
+    main()