dzr 5 ヶ月 前
コミット
c3355914d8

+ 32 - 0
account/18530014520.json

@@ -0,0 +1,32 @@
+{
+    "cookies": {
+        "userInfo": "{%22userId%22:6174239%2C%22username%22:%2218530014520%22%2C%22userIcon%22:%22%22%2C%22linkName%22:%22%E5%BE%90%E5%BF%97%E6%81%92%22%2C%22companyName%22:%22%E7%99%BE%E7%82%89%E5%B1%AF%22%2C%22areaId%22:%222703%22%2C%22areaName%22:%22%E5%85%A8%E5%9B%BD%22%2C%22roleId%22:1%2C%22roleName%22:%22%E7%AE%A1%E7%90%86%E5%91%98%22%2C%22sex%22:%22m%22%2C%22expireDate%22:%22%E6%97%A0%22%2C%22isExpired%22:null%2C%22maxChildCount%22:0%2C%22isUsedCount%22:0%2C%22userStatus%22:1%2C%22memberLevel%22:5%2C%22memberLevelName%22:%22%E5%85%8D%E8%B4%B9%E4%BC%9A%E5%91%98%22%2C%22registerTime%22:%222019-12-14%22%2C%22isSuperSupplier%22:0%2C%22isNewUser%22:1%2C%22welcomeMsg%22:%22%E6%AC%A2%E8%BF%8E%E8%BF%9B%E5%85%A5%E5%8D%83%E9%87%8C%E9%A9%AC%E6%8B%9B%E6%A0%87%E7%BD%91%EF%BD%9E%22%2C%22customerServiceInfo%22:{%22id%22:42%2C%22customerServiceName%22:%22%E5%8D%83%E9%87%8C%E9%A9%AC%E5%AE%A2%E6%9C%8D%22%2C%22weChatIcon%22:%22https://gw-static.qianlima.com/gw/invoice/1721198286_1d8b871dc0.jpg%22%2C%22customerServicePhone%22:%22%20400-688-2000%22%2C%22customerServiceQQ%22:%22%22%2C%22customerServiceEmail%22:%22qianlima_service@qianlima.com%22%2C%22deptType%22:0}%2C%22shouji%22:%2218530014520%22%2C%22email%22:%221151584137@qq.com%22%2C%22dwmc%22:%22%E7%99%BE%E7%82%89%E5%B1%AF%22%2C%22zhiwu%22:%22%E5%90%88%E4%BC%99%E4%BA%BA%22%2C%22types%22:6%2C%22isPayBefore%22:0%2C%22memberOpenTime%22:null%2C%22showExpireDate%22:true%2C%22companyNature%22:null%2C%22companyArea%22:%22%22%2C%22companyType%22:null%2C%22industry%22:null%2C%22product%22:null%2C%22contacts%22:%22%E5%BE%90%E5%BF%97%E6%81%92%22%2C%22contactNumber%22:%2218530014520%22%2C%22contactAddress%22:null%2C%22mainCustomerGroups%22:null%2C%22informationTypePreferences%22:null%2C%22registerSource%22:%22miniwechat%22%2C%22businessUserType%22:null%2C%22businessCompanyName%22:null%2C%22isBusinessUser%22:null}",
+        "login_time": "1733897438",
+        "useragent_hash": "0845b309c7b9b957afd9ecf775a4c21f",
+        "source": "1",
+        "qlm_password": "3KUEoC37mfjCEmmCCBffUp77fEoCE3C8",
+        "HWWAFSESTIME": "1733897419511",
+        "Hm_lpvt_5dc1b78c0ab996bd6536c3a37f9ceda7": "1733897435",
+        "qlm_username": "18530014520",
+        "HMACCOUNT": "FC1B7E7FD8E10E3C",
+        "qlm_visitor_id": "383096361",
+        "Hm_lvt_5dc1b78c0ab996bd6536c3a37f9ceda7": "1733897435",
+        "xAuthToken": "2cae03be-8afa-4d3c-8f4e-cbe8926577e9",
+        "HWWAFSESID": "a918373c075b3fa3fc"
+    },
+    "headers": {
+        "X-Auth-Token": "2cae03be-8afa-4d3c-8f4e-cbe8926577e9",
+        "sec-ch-ua-platform": "\"macOS\"",
+        "Referer": "https://vip.qianlima.com/",
+        "Access-Captcha-Permission": "None",
+        "sec-ch-ua": "\"Google Chrome\";v=\"131\", \"Chromium\";v=\"131\", \"Not_A Brand\";v=\"24\"",
+        "sec-ch-ua-mobile": "?0",
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+        "Accept": "*/*",
+        "Content-Type": "application/json"
+    },
+    "proxies": {
+        "https": "socks5://58.221.59.179:8860",
+        "http": "socks5://58.221.59.179:8860"
+    }
+}

BIN
backup/bloomfilter.f


+ 41 - 0
bidding表查询数据保存情况.py

@@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-10-10
+---------
+@summary: Check whether data_bak records were uploaded into the bidding_nomal collection
+---------
+@author: Dzr
+"""
+import bson
+from pymongo import MongoClient
+from urllib.parse import quote_plus
+
+user = 'dataFx'
+password = 'data@fenxi'
+uri = "mongodb://%s:%s@%s" % (quote_plus(user), quote_plus(password), '127.0.0.1:27089')
+bidding_client = MongoClient(uri)
+bidding_db = bidding_client['qfw']
+bidding_nomal = bidding_db['bidding_nomal']
+
+client = MongoClient('192.168.3.182', 27080)
+coll = client['py_spider']['data_bak']
+
+# data_bak
+p = {'biddingid': 1, '_id': 0, 'biddingcoll': 1}
+q = {'spidercode': 'sdxzbiddingsjzypc', 'comeintime': 1733319761}
+with coll.find(q, projection=p) as cursor:
+    items = list(cursor)  # a pymongo cursor is exhausted after one pass, so materialize it first
+    nomal_ids = [item['biddingid'] for item in items if 'biddingid' in item]
+    not_nomal_ids = [1 for item in items if 'biddingid' not in item]
+
+print(f'未上传数据量:{sum(not_nomal_ids)}')
+
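+# Verify that every biddingid recorded in data_bak actually exists in the bidding_nomal collection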
+# bidding_nomal
+p = {'biddingid': 1, '_id': 0}
+not_bidding_ids = []
+for _id in nomal_ids:
+    ret = bidding_nomal.find_one({'_id': bson.ObjectId(_id)}, projection=p)
+    if not ret:
+        not_bidding_ids.append(_id)
+
+for _id in not_bidding_ids:
+    print('未入bidding库>>>', _id)

+ 189 - 0
clean_html.py

@@ -0,0 +1,189 @@
+# -*- coding: utf-8 -*-
+import re
+
+from lxml.html import fromstring, HtmlElement, tostring
+
+__all__ = ['cleaner', 'drop_tree_by_lxml']
+
+'''Stand-alone elements'''
+INDEPENDENT_TAGS = {
+    '<head>[\s\S]*?</head>': '',
+    '<html>|<html [^>]*>|</html>': '',
+    '<body>|<body [^>]*>|</body>': '',
+    '<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # metadata
+    '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # whitespace entities
+    '\\xa0|\\u3000': '',  # non-breaking / full-width spaces
+    '<!--[\s\S]*?-->': '',  # comments
+    '<style[^<>]*>[\s\S]*?</style>': '',  # styles
+    '<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
+    '<input>': '',  # input
+    '</input>': '',  # input
+    '<img[^>]*>': '<br>',  # images
+}
+'''Inline elements'''
+INLINE_TAGS = {
+    '<a>|<a [^>]*>|</a>': '',  # anchors
+    '<link>|<link [^>]*>|</link>': '',  # link tags
+    '<span>|<span [^>]*>|</span>': '',  # span
+    '<label>|<label [^>]*>|</label>': '<br>',  # label
+    '<font>|<font [^>]*>|</font>': '',  # font
+    'data:image(.*?) ': '',  # base64-encoded images
+}
+'''Block-level elements'''
+BLOCK_TAGS = {
+    '<div>\s*?</div>': '',
+    '<h[1-6][^>]*>|</h[1-6]>': '',  # headings
+    '<p>|<p [^>]*>': '<br>',  # paragraphs
+    '</p>': '',  # paragraphs
+    '<div>|<div [^>]*>': '<br>',  # divisions
+    '</div>': '',  # divisions
+    '<o:p>|<o:p [^>]*>|</o:p>': ''  # Microsoft Word paragraphs
+}
+'''Miscellaneous'''
+OTHER = {
+    '<?xml[^>]*>|<?xml [^>]*>|<?xml:.*?>': '',
+    '<epointform>': '',
+    '<!doctype html>|<!doctype html [^>]*>': '',
+    '【关闭】|关闭': '',
+    '【打印】|打印本页': '',
+    '【字体:[\s\S]*】': '',
+    '文章来源:[\u4e00-\u9fa5]+': '',
+    '浏览次数:.*[<]+': '',
+    '(责任编辑:.*?)': '',
+    '分享到[:]': '',
+}
+'''Style attributes'''
+CSS_STYLE = {
+    'style="[\s\S]*?"|style ="[\s\S]*?"': '',
+    'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
+    'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
+    'class="[\s\S]*?"|class ="[\s\S]*?"': '',
+    'align="[\s\S]*?"|align ="[\s\S]*?"': '',
+    'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
+}
+'''Whitespace'''
+BLANKS = {
+    '\n\s*\n': '\n',
+    '\s*\n\s*': '\n',
+    '[^\S\n]': ' ',
+    '\s+': ' ',
+}
+'''Tags considered for attribute repair'''
+TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
+'''Attributes considered for attribute repair'''
+ATTRS = {'id', 'class', 'style', 'width'}
+'''Tags with special styles'''
+SPECIAL_TAGS = {
+    re.compile('(?i)<[^>]+style="display: none".*[^>]+>'): '<br>',
+}
+
+
+def _repair_tag():
+    """异常的标签组合,用来替换非标准页面的标签"""
+    _repairs = {}
+    for tag in TAGS:
+        for attr in ATTRS:
+            key = '{}{}'.format(tag, attr)
+            val = '{} {}'.format(tag, attr)
+            _repairs[key] = val
+    return _repairs
+
+
+def _escape_character(html):
+    """转义字符"""
+    html = html.replace('&lt;', '<')
+    html = html.replace('&gt;', '>')
+    html = html.replace('&quot;', '"')
+    html = html.replace('&amp;', '&')
+    return html
+
+
+def _lowercase_tag(html):
+    """元素标签转成小写,不影响页面文本"""
+    tags = re.findall("<[^>]+>", html)
+    tag_sets = set(tags)
+
+    if len(tag_sets) > 10000:
+        from bs4 import BeautifulSoup
+        soup = BeautifulSoup(html, 'lxml')
+        html = str(soup.body.next_element)
+    else:
+        for tag in tag_sets:
+            html = html.replace(tag, str(tag).lower())
+
+    repair_tags = _repair_tag()  # tag repair
+    for err, right in repair_tags.items():
+        html = html.replace(err, right)
+
+    return html
+
+
+def _clear_special_tag(html):
+    """删除特殊元素标签"""
+    for tag, repl in SPECIAL_TAGS.items():
+        html = tag.sub(repl, html)
+    return html
+
+
+def _clear_input_tag(html, display=False):
+    """提取value值,替换input标签"""
+    if not display:
+        html = html.replace('<input', '<input style="border-color: transparent;"')  # 不显示输入框边框
+
+    tag = re.compile(r'<input .*?>', re.S)
+    value = re.compile(r'value=["\'](.*?)["\']')
+
+    lst = re.findall(tag, html) or []
+    for ipt in lst:
+        val = re.findall(value, ipt)
+        if val and 'hidden' not in ipt and 'hide' not in ipt and 'display: none' not in ipt:
+            html = html.replace(ipt, val[0])
+    return html
+
+
+def drop_tree_by_lxml(html, feature):
+    tree: HtmlElement = fromstring(html)
+    tag_lst = tree.xpath(feature)
+    for tag in tag_lst:
+        tag.drop_tree()
+
+    html = tostring(tree, encoding='utf8').decode('utf8')
+    return html
+
+
+def cleaner(html, special=None, completely=False, del_tag=False, **kwargs):
+    """
+    源码清洗
+
+    :param html: 清洗的页面
+    :param special: 额外指定页面清洗规则
+    :param completely: 是否完全清洗页面
+    :param del_tag: 删除标签
+    :return: 页面源码
+    """
+    special = {} if special is None else special
+    remove_tags = {
+        **INDEPENDENT_TAGS,
+        **INLINE_TAGS,
+        **BLOCK_TAGS,
+        **OTHER,
+        **special,  # per-call rules; the module-level OTHER dict is not mutated
+        **CSS_STYLE,
+        **BLANKS,
+    }
+
+    html = _lowercase_tag(html)
+    if del_tag:
+        html = _clear_special_tag(html)
+
+    for tag, repl in remove_tags.items():
+        html = re.sub(tag, repl, html)
+
+    if completely:
+        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # canvas
+        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # iframe
+        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)  # stray tags and font names
+
+    html = _escape_character(html)
+    html = _clear_input_tag(html, **kwargs)
+    return html
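+
+
+# Minimal usage sketch (raw_html and the xpath below are illustrative):
+#     html = drop_tree_by_lxml(raw_html, '//div[@class="ad"]')  # drop unwanted nodes by xpath
+#     text = cleaner(html, completely=True)  # strip tags and styles down to readable content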

+ 115 - 0
login.py

@@ -0,0 +1,115 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-10-10
+---------
+@summary: Qianlima auto-login; saves each account's session cookies/headers to account/<username>.json
+---------
+@author: Dzr
+"""
+
+import json
+from pathlib import Path
+
+from DrissionPage import ChromiumPage, ChromiumOptions
+from DrissionPage._functions.tools import PortFinder
+
+
+account_pool = [
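+# (username, password) pairs consumed by the list/detail spiders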
+    ('18530014520', 'qp!4LXH_'),
+]
+
+
+def auto_login(username, password, proxy=False, headless=False, auto_quit=False, accident_url=None):
+    co = ChromiumOptions()
+
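+    # give each account its own browser profile on a free debugging port so sessions don't collide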
+    port, _ = PortFinder(path='./').get_port()
+    co.set_paths(
+        local_port=port,
+        user_data_path=f'./chrome/{username}',
+        download_path=f'./download/{username}'
+    )
+    # suppress the password-save popup
+    co.set_argument('--disable-extensions')
+
+    if proxy:
+        proxies = {
+            'https': 'socks5://58.221.59.179:8860',
+            'http': 'socks5://58.221.59.179:8860'
+        }
+        co.set_argument('--proxy-server', value=proxies['https'])
+    else:
+        proxies = None
+
+    if headless:
+        co.set_argument('--headless', value='new')
+        co.set_user_agent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36')
+
+    page = ChromiumPage(addr_or_opts=co)
+    try:
+        targets = [
+            'website-seo/v2/cm/getcatid',
+            'rest/detail/alltypesdetail/detail',
+            'rest/account/companySpace/checkNewUser'
+        ]
+        page.listen.start(targets=targets, res_type=['Document', 'XHR'])  # start network listening
+
+        success = page.get('https://vip.qianlima.com/')  # open the user management page
+        if not success:
+            return
+
+        login = page.wait.ele_displayed('x://span[text()="登录"]', timeout=5)
+        if login:
+            page.ele('x://input[@name="username"]').input(username, clear=True)
+            page.ele('x://input[@name="password"]').input(password, clear=True)
+            page.ele('x://span[text()="登录"]/parent::*').click()
+
+        loaded = page.wait.ele_displayed(f'x://p[contains(text(), "{username}")]')  # wait for the logged-in page to finish loading
+        if not loaded:
+            print(f'登录失败>{username}')
+            return
+
+        # page.get('http://www.qianlima.com/zb/detail/20241016_454396207.html')
+
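+        # block until one of the target API requests fires, then persist its headers plus the session cookies and proxy for the requests-based spiders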
+        packet = page.listen.wait()
+        root = Path(__file__).parent
+        if not (root / 'account').exists():
+            (root / 'account').mkdir(exist_ok=True)
+
+        file = (root / f'account/{username}.json').absolute()
+        with open(file, 'w') as f:
+            print(packet.url)  # print the captured request url
+            # print(packet.response.body)
+            headers = dict(packet.request.headers)
+            print(f'** headers ** \n{json.dumps(headers, indent=4)}')
+            cookies = page.cookies(as_dict=True)
+            print(f'** cookies ** \n{json.dumps(cookies, indent=4)}')
+            user = {
+                'cookies': cookies,
+                'headers': headers,
+                'proxies': proxies
+            }
+            f.write(json.dumps(user, indent=4))
+
+            if accident_url is not None:
+                while True:
+                    page.get(accident_url)  # handle the unexpected situation manually
+                    if input('异常已处理?[Y|N]').upper() == 'Y':
+                        break
+
+            if not auto_quit:
+                f.flush()
+                while True:
+                    if input("退出? >>>"):
+                        break
+
+    except KeyboardInterrupt:
+        pass
+
+    finally:
+        page.quit()
+        print('关闭浏览器')
+
+
+if __name__ == '__main__':
+    for username, password in account_pool:
+        auto_login(username, password, proxy=True, auto_quit=False, headless=False)

+ 191 - 0
net.py

@@ -0,0 +1,191 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-10-10
+---------
+@summary: Dedicated downloader for Qianlima detail pages
+---------
+@author: Dzr
+"""
+import copy
+import functools
+import json
+
+import execjs
+import requests
+from loguru import logger
+
+
+# session data captured from rest/detail/alltypesdetail/detail
+_cookies = {}
+_headers = {}
+_proxies = None
+
+
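+# Wraps the public download helpers: account and network errors are logged and collapsed into a plain False return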
+def router(func):
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        try:
+            return func(*args, **kwargs)
+        except AssertionError:
+            logger.exception('账号异常')
+            return False  # same failure value as the wrapped helpers
+
+        except KeyboardInterrupt:
+            pass
+
+        except requests.exceptions.RequestException as e:
+            logger.exception(f'网络请求错误, 原因:{e}')
+            return False
+
+    return wrapper
+
+
+def set_cookies(ck):
+    global _cookies
+    _cookies = ck
+
+
+def set_headers(h):
+    global _headers
+    _headers = h
+
+
+def set_proxies(p):
+    global _proxies
+    _proxies = p
+
+
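+# Pull the numeric cid out of a detail-page URL; kept as JS (run via execjs), presumably to mirror the site's own parsing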
+def _extract_cid(href):
+    script = '''
+    function extractCid(url) {
+        if(url.indexOf('/zb/detail') != -1){
+            var cidArr = url.split('_');
+            if (cidArr.length > 1) {
+                var cid = cidArr[1].replace('.html', '');
+                if (cid.indexOf('-') != -1) {
+                    cid = cid.split("-")[1];
+                }
+                return cid
+            }
+        }
+        
+        if (url.indexOf('-') != -1) {
+            t = url.lastIndexOf("-")
+            n = url.substring(t + 1)
+            cid = n.split(".html")[0]
+            return cid
+        }
+        
+    }
+    '''
+    ctx = execjs.compile(script)
+    result = ctx.call('extractCid', href)
+    return result
+
+
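+# Query the seo getcatid endpoint for the category id, then build the VIP detail-page URL that is sent as the Referer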
+def _extract_referer(href, cid):
+    global _cookies, _proxies
+    href = str(href).replace('http:', 'https:')
+
+    url = 'https://www.qianlima.com/website-seo/v2/cm/getcatid/' + cid
+    headers = {
+        'Accept': '*/*',
+        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,sq;q=0.7',
+        'Cache-Control': 'no-cache',
+        'Connection': 'keep-alive',
+        'Pragma': 'no-cache',
+        'Referer': href,
+        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
+        'X-Requested-With': 'XMLHttpRequest',
+    }
+    requests_params = dict(headers=headers, cookies=_cookies, proxies=_proxies)
+    response = requests.get(url, timeout=10, **requests_params)
+    assert response.status_code == 200
+    text = response.content.decode()
+
+    script = '''
+    function extractDetailUrl(cid, dataStr) {
+        var data = JSON.parse(dataStr)
+        var catId = data.data;
+        var pageName;
+        switch (catId) {
+            case 301:
+            case 601:
+                pageName = "tenderDetail.html";
+                break;
+            case 202:
+                pageName = "projectDetail.html";
+                break;
+            case 201:
+                pageName = "tenderDetail.html";
+                break;
+            case 101:
+                pageName = "projectDetail.html";
+                break;
+            default:
+                pageName = "tenderDetail.html";
+                break;
+        }
+        return 'https://detail.vip.qianlima.com/' + pageName + '?id=' + cid;
+    }
+    '''
+    ctx = execjs.compile(script)
+    result = ctx.call('extractDetailUrl', cid, text)
+    return result
+
+
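+# Core fetch: POST the detail API with the saved session; optionally resolve and attach the proper Referer first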
+def _download_detail(href, referer=False):
+    global _cookies, _headers, _proxies
+    headers = copy.deepcopy(_headers)
+    cid = _extract_cid(href)
+    if not cid:
+        raise ValueError('cid is not exist')
+
+    url = 'https://detail.vip.qianlima.com/rest/detail/alltypesdetail/detail/' + cid
+    if referer:
+        referer = _extract_referer(href, cid)
+        headers['Referer'] = referer
+
+    requests_params = dict(headers=headers, cookies=_cookies, proxies=_proxies)
+    try:
+        response = requests.post(url, timeout=10, **requests_params)
+    except requests.exceptions.Timeout:
+        logger.error(f'采集失败|访问超时|{href}')
+        return False, None  # request timed out
+
+    username = _cookies['qlm_username']
+    status_code = response.status_code
+    if status_code != 200:
+        result = response.content.decode()
+        logger.error(f'采集失败|{username}|状态码|{status_code}|请求响应|{result}')
+        return False, status_code
+
+    result = response.json()
+    data = result['data']
+    if not data:
+        logger.warning(f'数据异常|{result}')
+        return False, data  # data is None when the account has run out of quota
+
+    logger.info(f'采集成功[{href}]')
+    return True, data
+
+
+@router
+def download_json(href, **kwargs):
+    _, result = _download_detail(href, **kwargs)
+    if not result:
+        return False
+
+    return result  # either the detail dict or an HTTP status code (e.g. 429)
+
+
+@router
+def download_html(href, **kwargs):
+    _, result = _download_detail(href, **kwargs)
+    if not result:
+        return False
+
+    return result['content']

+ 6 - 0
requirements.txt

@@ -0,0 +1,6 @@
+pymongo==3.12.0
+pybloom_live==4.0.0
+loguru==0.5.3
+DrissionPage==4.0.5.6
+beautifulsoup4==4.12.3
+PyExecJS==1.5.1

+ 70 - 0
数据处理.py

@@ -0,0 +1,70 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-10-19
+---------
+@summary: Convert qlm_data_lst rows into jy_data_lst download tasks
+---------
+@author: Dzr
+"""
+import time
+from datetime import datetime
+
+import bson
+from pymongo import MongoClient
+
+Int64 = bson.int64.Int64
+
+client = MongoClient('192.168.3.182', 27017)
+qlm_coll = client['zjb_poc']['qlm_data_lst']
+jy_coll = client['zjb_poc']['jy_data_lst']
+
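+# Map every qlm_data_lst row onto the jy_data_lst task schema and bulk-insert in batches of 100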
+count = 0
+insert_lst = []
+with qlm_coll.find() as cursor:
+    for item in cursor:
+        href = item['url']
+        title = item['popTitle'] if 'popTitle' in item else item['showTitle']
+        publishtime = item['updateTime']
+        l_np_publishtime = datetime.strptime(publishtime, '%Y-%m-%d').timestamp()
+
+        addr = str(item['areaName']).split('-')
+        area = addr[0] if len(addr) > 0 else ''
+        city = addr[1] if len(addr) > 1 else ''
+        if '国土' in item.get('progName', ''):
+            toptype = item['progName']
+        else:
+            toptype = (item['noticeSegmentTypeName'] or item['progName'])
+
+        data = {
+            'site': '千里马',
+            'channel': item['channel'],
+            'spidercode': 'sdxzbiddingsjzypc',
+            'area': area,
+            'city': city,
+            'district': '',
+            'comeintime': Int64(int(time.time())),
+            'isdownload': False,  # downloaded yet?
+            'isfailed': False,  # download failed?
+            'title': title,  # title
+            'href': href,  # detail link
+            'publishtime': publishtime,  # publish time (string)
+            'l_np_publishtime': Int64(l_np_publishtime),  # publish time (timestamp)
+            'buyer': item['tenderees'],  # purchasing unit
+            'toptype': toptype,  # notice type
+            'winner': item['bidder'] if item.get('bidder') is not None else '',  # winning bidder
+            'agency': item['agent'] if item.get('agent') is not None else '',  # agency
+        }
+
+        insert_lst.append(data)
+        if len(insert_lst) == 100:
+            jy_coll.insert_many(insert_lst, ordered=False)
+            count += len(insert_lst)
+            insert_lst = []
+            print('已处理{}条'.format(count))
+
+    if len(insert_lst) > 0:
+        jy_coll.insert_many(insert_lst, ordered=False)
+        count += len(insert_lst)
+        print('已处理{}条'.format(count))
+
+print('数据处理结束')

+ 50 - 0
采集任务清单.py

@@ -0,0 +1,50 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-10-18
+---------
+@summary: Build the keyword task list (company name x machinery suffix)
+---------
+@author: Dzr
+"""
+from pymongo import MongoClient
+import pandas as pd
+
+
+to_db = MongoClient('192.168.3.182', 27017)
+coll = to_db['31zg_poc']['keyword_company']
+
+suffix_lst = '重机,挖掘机,装载机,泵送,桥泵车,搅拌车,拖泵,搅拌站,车载泵,搅拌车,重起,汽车起重机,履带起重机,桩机,旋挖钻,大旋挖,中旋挖,小旋挖,路机,铣刨机,摊铺机,平地机,压路机,沥青站'.split(',')
+suffix_set = set(suffix_lst)
+
+f = '/Users/dongzhaorui/Desktop/qlm数据采集.xlsx'
+df = pd.read_excel(f)
+df.fillna('', inplace=True)
+
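+# Collect unique group and subsidiary names from the spreadsheet, then cross them with the machinery suffixes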
+company_lst = []
+for _, i in df.iterrows():
+    items = i.to_dict()
+    s_company = str(items['集团名称']).strip()
+    s_sub_company = str(items['二级局名称']).strip()
+
+    if s_company and s_company not in company_lst:
+        company_lst.append(s_company)
+
+    if s_sub_company and s_sub_company not in company_lst:
+        company_lst.append(s_sub_company)
+
+data = []
+for suffix in suffix_set:
+    print(suffix)
+
+    for company in company_lst:
+        data.append({'s_suffix': suffix, 's_company': company, 's_keyword': f'{company}+{suffix}'})
+
+    if len(data) >= 100:  # flush in batches of roughly 100
+        coll.insert_many(data, ordered=False)
+        data = []
+
+if len(data) > 0:
+    coll.insert_many(data, ordered=False)
+
+print('1234')
+

+ 217 - 0
采集列表页(关键词).py

@@ -0,0 +1,217 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-06-01
+---------
+@summary: Qianlima list-page spider (single keyword)
+---------
+@author: Dzr
+"""
+import json
+import math
+import random
+import time
+from pathlib import Path
+
+import requests
+from loguru import logger
+from pybloom_live import BloomFilter
+from pymongo import MongoClient
+from login import auto_login, account_pool
+
+
+_cookies = None
+_headers = None
+_proxies = None
+
+
+def send_wechat_warning(msg, send=True):
+    markdown = f'采集异常中断,请切换d模式处理。'
+    markdown += f'\n>异常详情:<font color=\"warning\">**{msg}**</font>'
+
+    if not send:
+        logger.info(markdown)
+        return
+
+    url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=079193d8-1856-443e-9f6d-ecc5c883bf11'
+    headers_ = {'Content-Type': 'application/json'}
+    json_data = {'msgtype': 'markdown', 'markdown': {'content': markdown, "mentioned_mobile_list":["17610673271"]}}
+    request_params = dict(headers=headers_, json=json_data, timeout=10)
+    response = requests.post(url, **request_params)
+    logger.info(response.json())
+
+
+def setup_cfg(username):
+    global _cookies, _headers, _proxies
+    file = (Path(__file__).parent / f'account/{username}.json').absolute()
+    with open(file, encoding='utf-8') as rp:
+        json_data = json.load(rp)
+        _cookies = json_data['cookies']
+        _headers = json_data['headers']
+        _proxies = json_data['proxies']
+
+
+def launch_filter():
+    """创建布隆过滤器"""
+    logger.debug('创建布隆过滤器...')
+    backup = (Path(__file__).parent / 'backup')
+    if not backup.exists():
+        backup.mkdir(exist_ok=True)
+
+    file = (backup / 'bloomfilter.f')
+    if not file.exists():
+        file.touch()  # create the backing file on first run
+        bf = BloomFilter(capacity=1000000, error_rate=0.001)  # sized for ~1,000,000 urls at a 0.1% error rate
+    else:
+        if file.stat().st_size == 0:
+            bf = BloomFilter(capacity=1000000, error_rate=0.001)
+        else:
+            bf = BloomFilter.fromfile(file.open('rb'))
+
+    return file, bf
+
+
+def fetch(collection, username, keywords, page, page_size, channel, bf):
+    # cookies captured for rest/service/website/search/solr
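+    # returns (ok, total, count); on failure `total` carries an error code: -1 no data in the response, -2 non-200 status, -3 network error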
+    global _cookies, _headers, _proxies
+    response = None
+    try:
+        json_data = {
+            'keywords': keywords,
+            'timeType': '4',
+            'beginTime': '2024-01-01',
+            'endTime': '2024-12-04',
+            'filtermode': 5,
+            'searchMode': 1,
+            'currentPage': page,
+            'numPerPage': page_size,
+            'sortType': 2,
+            'allType': -1,
+            'noticeSegmentTypeStr': '',
+            'beginAmount': '',
+            'endAmount': '',
+            'purchasingUnitIdList': '',
+            'threeClassifyTagStr': '',
+            'fourLevelCategoryIdListStr': '',
+            'threeLevelCategoryIdListStr': '',
+            'levelId': '',
+            'tab': 0,
+            'searchDataType': 0,
+            'types': '-1',
+            'showContent': 1,
+            'hasTenderTransferProject': 1,
+            'newAreas': '',
+            'hasChooseSortType': 1,
+            'summaryType': 0,
+        }
+
+        response = requests.post(
+            'https://search.vip.qianlima.com/rest/service/website/search/solr',
+            cookies=_cookies,
+            headers=_headers,
+            json=json_data,
+            proxies=_proxies,
+            timeout=60
+        )
+        assert response.status_code == 200
+        result = response.json()
+        try:
+            total = result['data']['rowCount']
+        except TypeError:
+            return False, -1, 0
+
+        dedup_count = 0
+        count = 0
+        insert_lst = []
+        data = result['data']['data']
+        for item in data:
+            href = item.get('url')
+            if href is None or href in bf:
+                dedup_count += 1
+                # logger.debug(f'重复数据[{href}]')
+                continue
+
+            item['channel'] = channel
+            insert_lst.append(item)
+            if len(insert_lst) == page_size:
+                collection.insert_many(insert_lst, ordered=False)
+                count += len(insert_lst)
+                insert_lst = []
+
+            bf.add(href)
+
+        if len(insert_lst) > 0:
+            collection.insert_many(insert_lst, ordered=False)
+            count += len(insert_lst)
+
+        logger.info(f'自动翻页|第{page}页|入库{count}条|重复{dedup_count}条')
+        return True, total, len(data)
+
+    except AssertionError:
+        logger.error(f'{username}|账号异常|请求失败')
+        # send_wechat_warning(msg=response.content.decode())
+        return False, -2, 0
+
+    except requests.exceptions.RequestException as e:
+        logger.exception(f'网络请求错误, 原因:{e}')
+        return False, -3, 0
+
+
+def spider(username, keywords, bf, coll, channel):
+    setup_cfg(username)
+
+    page = 1
+    page_size = 100
+
+    # paginate
+    retries = 0
+    while True:
+        ok, total, count = fetch(coll, username, keywords, page, page_size, channel, bf)
+        if ok is False:
+            state = total
+            if state == -1:
+                logger.info(f'{username}|请求参数错误|修改参数')
+                return False
+            elif state == -2:
+                logger.info(f'{username}|访问频繁|3秒后切换账号')
+                time.sleep(3)
+                return
+            else:
+                logger.error(f'{username}|网络异常|准备重试~{retries}')
+                if retries > 3:
+                    return
+                else:
+                    retries += 1
+                    continue
+
+        # time.sleep(math.log(random.randint(100, 2400), 2))
+        time.sleep(.5)
+        if ok is True and count < page_size:
+            logger.info(f'采集完成|保存{total}条')
+            break
+        else:
+            page += 1
+
+    return True
+
+
+def main():
+    f, bf = launch_filter()  # Bloom filter: ~1,000,000 urls, 0.1% error rate
+
+    client = MongoClient('192.168.3.182', 27017)
+    coll = client['zjb_poc']['qlm_data_lst']
+    channel = '综合'
+    keywords = '黑龙江省八目科技开发有限公司'
+    try:
+        username, password = account_pool.pop(0)
+        auto_login(username, password, proxy=True, headless=True, auto_quit=True)
+        spider(username, keywords, bf, coll, channel)
+    except KeyboardInterrupt:
+        pass
+
+    finally:
+        bf.tofile(f.open('wb'))  # persist the Bloom filter to disk
+        logger.info('采集结束')
+
+
+if __name__ == '__main__':
+    main()

+ 253 - 0
采集列表页(地域).py

@@ -0,0 +1,253 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-06-01
+---------
+@summary: Qianlima list-page spider (by region)
+---------
+@author: Dzr
+"""
+import json
+import math
+import random
+import time
+from pathlib import Path
+
+import requests
+from loguru import logger
+from pybloom_live import BloomFilter
+from pymongo import MongoClient
+from login import auto_login, account_pool
+
+
+_cookies = None
+_headers = None
+_proxies = None
+
+
+def send_wechat_warning(msg, send=True):
+    markdown = f'采集异常中断,请切换d模式处理。'
+    markdown += f'\n>异常详情:<font color=\"warning\">**{msg}**</font>'
+
+    if not send:
+        logger.info(markdown)
+        return
+
+    url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=079193d8-1856-443e-9f6d-ecc5c883bf11'
+    headers_ = {'Content-Type': 'application/json'}
+    json_data = {'msgtype': 'markdown', 'markdown': {'content': markdown, "mentioned_mobile_list":["17610673271"]}}
+    request_params = dict(headers=headers_, json=json_data, timeout=10)
+    response = requests.post(url, **request_params)
+    logger.info(response.json())
+
+
+def setup_cfg(username):
+    global _cookies, _headers, _proxies
+    file = (Path(__file__).parent / f'account/{username}.json').absolute()
+    with open(file, encoding='utf-8') as rp:
+        json_data = json.load(rp)
+        _cookies = json_data['cookies']
+        _headers = json_data['headers']
+        _proxies = json_data['proxies']
+
+
+def launch_filter():
+    """创建布隆过滤器"""
+    logger.debug('创建布隆过滤器...')
+    backup = (Path(__file__).parent / 'backup')
+    if not backup.exists():
+        backup.mkdir(exist_ok=True)
+
+    file = (backup / 'bloomfilter.f')
+    if not file.exists():
+        file.touch()  # create the backing file on first run
+        bf = BloomFilter(capacity=1000000, error_rate=0.001)  # sized for ~1,000,000 urls at a 0.1% error rate
+    else:
+        if file.stat().st_size == 0:
+            bf = BloomFilter(capacity=1000000, error_rate=0.001)
+        else:
+            bf = BloomFilter.fromfile(file.open('rb'))
+
+    return file, bf
+
+
+def fetch(collection, username, page, page_size, channel, bf):
+    # cookies captured for rest/service/website/search/solr
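+    # returns (ok, total, count); on failure `total` carries an error code: -1 no data in the response, -2 non-200 status, -3 network error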
+    global _cookies, _headers, _proxies
+    response = None
+    try:
+        if channel == '中标信息':
+            json_data = {
+                'keywords': '',
+                'timeType': 4,
+                'beginTime': '2024-09-01',
+                'endTime': '2024-09-30',
+                'filtermode': '8',
+                'searchMode': 1,
+                'currentPage': page,
+                'numPerPage': page_size,
+                'sortType': 1,
+                'allType': 3,
+                'beginAmount': '',
+                'endAmount': '',
+                'purchasingUnitIdList': '',
+                'threeClassifyTagStr': '',
+                'fourLevelCategoryIdListStr': '',
+                'threeLevelCategoryIdListStr': '',
+                'levelId': '',
+                'tab': 0,
+                'searchDataType': 1,
+                'types': 3,
+                'showContent': 1,
+                'newAreas': '1744',
+                'hasChooseSortType': 1,
+                'progIdAndNoticeSegmentTypeMaps': {
+                    '3': [],
+                },
+                'summaryType': 0,
+            }
+        elif channel == '招标信息':
+            json_data = {
+                'keywords': '',
+                'timeType': 4,
+                'beginTime': '2024-09-01',
+                'endTime': '2024-09-30',
+                'filtermode': '8',
+                'searchMode': 1,
+                'currentPage': page,
+                'numPerPage': page_size,
+                'sortType': 1,
+                'allType': 0,
+                'beginAmount': '',
+                'endAmount': '',
+                'purchasingUnitIdList': '',
+                'threeClassifyTagStr': '',
+                'fourLevelCategoryIdListStr': '',
+                'threeLevelCategoryIdListStr': '',
+                'levelId': '',
+                'tab': 0,
+                'searchDataType': 1,
+                'types': -1,
+                'showContent': 1,
+                'newAreas': '1744',
+                'hasChooseSortType': 1,
+                'progIdAndNoticeSegmentTypeMaps': {
+                    '0': [],
+                    '1': [],
+                },
+                'summaryType': 0,
+            }
+        else:
+            raise ValueError(f'unsupported channel: {channel}')
+
+        response = requests.post(
+            'https://search.vip.qianlima.com/rest/service/website/search/solr',
+            cookies=_cookies,
+            headers=_headers,
+            json=json_data,
+            proxies=_proxies,
+            timeout=60
+        )
+        assert response.status_code == 200
+        result = response.json()
+        try:
+            total = result['data']['rowCount']
+        except TypeError:
+            return False, -1, 0
+
+        dedup_count = 0
+        count = 0
+        insert_lst = []
+        data = result['data']['data']
+        for item in data:
+            href = item.get('url')
+            if href is None or href in bf:
+                dedup_count += 1
+                # logger.debug(f'重复数据[{href}]')
+                continue
+
+            item['channel'] = channel
+            insert_lst.append(item)
+            if len(insert_lst) == page_size:
+                collection.insert_many(insert_lst, ordered=False)
+                count += len(insert_lst)
+                insert_lst = []
+
+            bf.add(href)
+
+        if len(insert_lst) > 0:
+            collection.insert_many(insert_lst, ordered=False)
+            count += len(insert_lst)
+
+        logger.info(f'自动翻页|第{page}页|入库{count}条|重复{dedup_count}条')
+        return True, total, len(data)
+
+    except AssertionError:
+        logger.error(f'{username}|账号异常|请求失败')
+        # send_wechat_warning(msg=response.content.decode())
+        return False, -2, 0
+
+    except requests.exceptions.RequestException as e:
+        logger.exception(f'网络请求错误, 原因:{e}')
+        return False, -3, 0
+
+
+def spider(username, bf, coll, channel):
+    setup_cfg(username)
+
+    page = 1
+    page_size = 100
+
+    # paginate
+    retries = 0
+    while True:
+        ok, total, count = fetch(coll, username, page, page_size, channel, bf)
+        if ok is False:
+            state = total
+            if state == -1:
+                logger.info(f'{username}|请求参数错误|修改参数')
+                return False
+            elif state == -2:
+                logger.info(f'{username}|访问频繁|3秒后切换账号')
+                time.sleep(3)
+                return
+            else:
+                logger.error(f'{username}|网络异常|准备重试~{retries}')
+                if retries > 3:
+                    return
+                else:
+                    retries += 1
+                    continue
+
+        # time.sleep(math.log(random.randint(100, 2400), 2))
+        time.sleep(.5)
+        if ok is True and count < page_size:
+            logger.info(f'采集完成|保存{total}条')
+            break
+        else:
+            page += 1
+
+    return True
+
+
+def main():
+    f, bf = launch_filter()  # Bloom filter: ~1,000,000 urls, 0.1% error rate
+
+    client = MongoClient('192.168.3.182', 27017)
+    coll = client['sdlt_poc']['qlm_data_lst']
+    # channel = '招标信息'
+    channel = '中标信息'
+
+    try:
+        username, password = account_pool.pop(0)
+        auto_login(username, password, proxy=True, headless=True, auto_quit=True)
+        spider(username, bf, coll, channel)
+    except KeyboardInterrupt:
+        pass
+
+    finally:
+        bf.tofile(f.open('wb'))  # persist the Bloom filter to disk
+        logger.info('采集结束')
+
+
+if __name__ == '__main__':
+    main()

+ 268 - 0
采集列表页.py

@@ -0,0 +1,268 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-06-01
+---------
+@summary: Qianlima list-page spider (keyword tasks from MongoDB)
+---------
+@author: Dzr
+"""
+import json
+import math
+import random
+import time
+from pathlib import Path
+
+import requests
+from loguru import logger
+from pybloom_live import BloomFilter
+from pymongo import MongoClient
+from login import auto_login, account_pool
+
+
+_cookies = None
+_headers = None
+_proxies = None
+
+
+def send_wechat_warning(msg, send=True):
+    markdown = f'采集异常中断,请切换d模式处理。'
+    markdown += f'\n>异常详情:<font color=\"warning\">**{msg}**</font>'
+
+    if not send:
+        logger.info(markdown)
+        return
+
+    url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=079193d8-1856-443e-9f6d-ecc5c883bf11'
+    headers_ = {'Content-Type': 'application/json'}
+    json_data = {'msgtype': 'markdown', 'markdown': {'content': markdown, "mentioned_mobile_list":["17610673271"]}}
+    request_params = dict(headers=headers_, json=json_data, timeout=10)
+    response = requests.post(url, **request_params)
+    logger.info(response.json())
+
+
+def setup_cfg(username):
+    global _cookies, _headers, _proxies
+    file = (Path(__file__).parent / f'account/{username}.json').absolute()
+    with open(file, encoding='utf-8') as rp:
+        json_data = json.load(rp)
+        _cookies = json_data['cookies']
+        _headers = json_data['headers']
+        _proxies = json_data['proxies']
+
+
+def launch_filter():
+    """创建布隆过滤器"""
+    logger.debug('创建布隆过滤器...')
+    backup = (Path(__file__).parent / 'backup')
+    if not backup.exists():
+        backup.mkdir(exist_ok=True)
+
+    file = (backup / 'bloomfilter.f')
+    if not file.exists():
+        file.touch()  # create the backing file on first run
+        bf = BloomFilter(capacity=1000000, error_rate=0.001)  # sized for ~1,000,000 urls at a 0.1% error rate
+    else:
+        if file.stat().st_size == 0:
+            bf = BloomFilter(capacity=1000000, error_rate=0.001)
+        else:
+            bf = BloomFilter.fromfile(file.open('rb'))
+
+    return file, bf
+
+
+def fetch(collection, username, page, page_size, keywords, bf):
+    # cookies captured for rest/service/website/search/solr
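+    # returns (ok, total, count); keywords with more than 500 hits are skipped, and on failure `total` carries an error code: -1 no data, -2 non-200 status, -3 network error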
+    global _cookies, _headers, _proxies
+    response = None
+    try:
+        json_data = {
+            'keywords': keywords,
+            'timeType': 4,
+            'beginTime': '2024-09-01',
+            'endTime': '2024-09-30',
+            'filtermode': '8',
+            'searchMode': 1,
+            'currentPage': page,
+            'numPerPage': page_size,
+            'sortType': '1',
+            'allType': -1,
+            'beginAmount': '',
+            'endAmount': '',
+            'purchasingUnitIdList': '',
+            'threeClassifyTagStr': '',
+            'fourLevelCategoryIdListStr': '',
+            'threeLevelCategoryIdListStr': '',
+            'levelId': '',
+            'tab': 2,
+            'types': '-1',
+            'searchDataType': 1,
+            'showContent': 1,
+            'hasLinkName': '',
+            'newAreas': '',
+            'hasChooseSortType': 1,
+            'progIdAndNoticeSegmentTypeMaps': {
+                '3': [],
+                '4': [
+                    11,
+                    12,
+                ],
+                '5': [],
+            },
+            'summaryType': 1,
+        }
+
+        response = requests.post(
+            'https://search.vip.qianlima.com/rest/service/website/search/solr',
+            cookies=_cookies,
+            headers=_headers,
+            json=json_data,
+            proxies=_proxies,
+            timeout=60
+        )
+        assert response.status_code == 200
+        result = response.json()
+        try:
+            total = result['data']['rowCount']
+            if total > 500:
+                # more than 500 hits: treat as a fuzzy match and skip this keyword
+                return True, total, page_size
+
+        except TypeError:
+            return False, -1, 0
+
+        data = result['data']['data']
+
+        dedup_count = 0
+        count = 0
+        insert_lst = []
+        for item in data:
+            href = item.get('url')
+            if href is None or href in bf:
+                dedup_count += 1
+                # logger.debug(f'重复数据[{href}]')
+                continue
+
+            insert_lst.append(item)
+            if len(insert_lst) == page_size:
+                collection.insert_many(insert_lst, ordered=False)
+                count += len(insert_lst)
+                insert_lst = []
+
+            bf.add(href)
+
+        if len(insert_lst) > 0:
+            collection.insert_many(insert_lst, ordered=False)
+            count += len(insert_lst)
+
+        logger.info(f'自动翻页|第{page}页|{keywords}|入库{count}条|重复{dedup_count}条')
+        return True, total, len(data)
+
+    except AssertionError:
+        logger.error(f'{username}|账号异常|请求失败')
+        # send_wechat_warning(msg=response.content.decode())
+        return False, -2, 0
+
+    except requests.exceptions.RequestException as e:
+        logger.exception(f'网络请求错误, 原因:{e}')
+        return False, -3, 0
+
+
+def spider(username, tasks, bf, to_data_lst, coll):
+    setup_cfg(username)
+
+    while tasks:
+        page = 1
+        page_size = 100
+
+        # paginate
+        state = 1
+        retries = 0
+        isdownload = True
+        _id, keywords = tasks.pop()
+        while True:
+            ok, total, count = fetch(coll, username, page, page_size, keywords, bf)
+            if ok is False:
+                state = total
+                if state == -1:
+                    logger.info(f'{username}|请求参数错误|修改参数')
+                    return False
+                elif state == -2:
+                    logger.info(f'{username}|访问频繁|3秒后切换账号')
+                    time.sleep(3)
+                    return
+                else:
+                    logger.error(f'{username}|网络异常|准备重试~{retries}')
+                    if retries > 3:
+                        return
+                    else:
+                        retries += 1
+                        continue
+
+            # time.sleep(math.log(random.randint(100, 2400), 2))
+            time.sleep(.5)
+
+            if ok is True and total >= 500:
+                logger.error(f'采集完成|{keywords}|疑似模糊匹配|跳过采集')
+                isdownload = False
+                break
+
+            if ok is True and count < page_size:
+                logger.info(f'采集完成|{keywords}|保存{total}条')
+                break
+            else:
+                page += 1
+
+        # update the task status
+        if state >= 0:
+            to_data_lst.update_one(
+                {'_id': _id},
+                {
+                    '$set': {
+                        'b_isdownload': isdownload,
+                        'i_total': total,
+                        'i_pages': page,
+                        'i_state': state,
+                        'i_updatetime': int(time.time())
+                    }
+                }
+            )
+
+    return True
+
+
+def main():
+    f, bf = launch_filter()  # Bloom filter: ~1,000,000 urls, 0.1% error rate
+
+    client = MongoClient('192.168.3.182', 27017)
+    to_data_lst = client['31zg_poc']['keyword_company']
+    coll = client['31zg_poc']['qlm_data_lst']
+
+    try:
+        while True:
+            q = {'b_isdownload': None}
+            p = {'s_keyword': 1, '_id': 1}
+            with to_data_lst.find(q, projection=p, limit=50) as cursor:
+                tasks = [(item['_id'], item['s_keyword']) for item in cursor]
+
+            username, password = account_pool.pop(0)
+            auto_login(username, password, proxy=True, headless=True, auto_quit=True)
+            state = spider(username, tasks, bf, to_data_lst, coll)
+            if state is True:
+                account_pool.append((username, password))
+
+            if state is False:
+                break
+
+            if not to_data_lst.count_documents(q):
+                break
+
+    except KeyboardInterrupt:
+        pass
+
+    finally:
+        bf.tofile(f.open('wb'))  # persist the Bloom filter to disk
+        logger.info('采集结束')
+
+
+if __name__ == '__main__':
+    main()

+ 178 - 0
采集详情页.py

@@ -0,0 +1,178 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-10-10
+---------
+@summary: Qianlima detail-page spider
+---------
+@author: Dzr
+"""
+import json
+import time
+
+import bson
+from loguru import logger
+from pymongo import MongoClient
+from pymongo.operations import UpdateOne
+
+import net
+from clean_html import cleaner, drop_tree_by_lxml
+from pathlib import Path
+from login import auto_login, account_pool
+
+Int64 = bson.int64.Int64
+
+
+def setup_cfg(username):
+    file = (Path(__file__).parent / f'account/{username}.json').absolute()
+    with open(file, encoding='utf-8') as rp:
+        json_data = json.load(rp)
+        net.set_cookies(ck=json_data['cookies'])
+        net.set_headers(h=json_data['headers'])
+        net.set_proxies(p=json_data['proxies'])
+
+
+def bulk_update(collection, id_lst, update):
+    """
+    Bulk-update task status.
+
+    :param pymongo.collection.Collection collection:
+    :param id_lst: ids of the tasks to update
+    :param dict update: fields to $set
+    :return: number of modified documents
+    """
+    count = 0
+    update_lst = []
+    for id_ in id_lst:
+        update['updatetime'] = Int64(int(time.time()))  # refresh the task's update time
+        update_lst.append(UpdateOne({'_id': id_}, {'$set': update}))
+        if len(update_lst) == 50:
+            results = collection.bulk_write(update_lst, ordered=False)
+            count += results.modified_count
+            update_lst = []
+
+    if len(update_lst) > 0:
+        results = collection.bulk_write(update_lst, ordered=False)
+        count += results.modified_count
+
+    return count
+
+
+def finalize(insert_lst, update_dict, data_coll, lst_coll):
+    if len(insert_lst) > 0:
+        data_coll.insert_many(insert_lst, ordered=False)
+
+    success_ids = update_dict['success']
+    if bulk_update(lst_coll, success_ids, {'isdownload': True}):
+        logger.info(f'批量更新[采集成功{len(success_ids)}条]任务状态')
+
+    failed_ids = update_dict['failed']
+    if bulk_update(lst_coll, failed_ids, {'isdownload': True, 'isfailed': True}):
+        logger.info(f'批量更新[采集失败{len(failed_ids)}条]任务状态')
+
+
+def spider(username, password, task_lst, data_coll, lst_coll):
+    setup_cfg(username)
+
+    update_dict = {'success': [], 'failed': []}
+    insert_lst = []
+
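+    # build a bidding document from the detail JSON; the "企业信息" block is removed from the HTML before cleaning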
+    def handle_task(task, ret):
+        if not ret or isinstance(ret, int):  # download failed or returned only a status code
+            update_dict['failed'].append(task['_id'])
+            logger.error(f'下载失败|{href}')
+        else:
+            html = drop_tree_by_lxml(ret['content'], '//*[contains(text(), "企业信息")]')
+            insert_lst.append({
+                'site': task['site'],
+                'channel': task['channel'],
+                'spidercode': task['spidercode'],
+                'area': task['area'],
+                'city': task['city'],
+                'district': task['district'],
+                'href': '#',
+                'competehref': href,
+                'title': task['title'],
+                's_title': task['title'],
+                'contenthtml': html,
+                'detail': cleaner(html),
+                'publishtime': task['publishtime'],
+                'l_np_publishtime': task['l_np_publishtime'],
+                'comeintime': Int64(int(time.time())),
+                'T': 'bidding',
+                'infoformat': 1,
+                'sendflag': 'false',
+                'repeat': 'true',
+                'iscompete': True,
+                '_d': 'comeintime',
+                'publishdept': '',
+                'type': '',
+                'is_mixed': True
+            })
+            update_dict['success'].append(task['_id'])
+
+    for task in task_lst:
+        href = task['href']
+        ret = net.download_json(href, referer=False)
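+        # HTTP 429: reopen a visible browser on the offending URL so a human can clear it, refresh the saved session, then retry once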
+        if isinstance(ret, int) and ret == 429:
+            auto_login(username, password, proxy=True, headless=False, auto_quit=True, accident_url=href)
+            setup_cfg(username)
+            ret = net.download_json(href, referer=False)
+            if input('退出:0 继续:1\n') == '0':
+                finalize(insert_lst, update_dict, data_coll, lst_coll)
+                return False
+
+        if ret is False:
+            logger.error(f'账号失效|{username}')
+            finalize(insert_lst, update_dict, data_coll, lst_coll)
+            return False
+
+        handle_task(task, ret)
+
+        if len(insert_lst) == 50:
+            data_coll.insert_many(insert_lst, ordered=False)
+            insert_lst = []
+
+        time.sleep(.5)
+
+    finalize(insert_lst, update_dict, data_coll, lst_coll)
+    return True
+
+
+def main():
+    logger.info('**** 数据采集开始 ****')
+
+    client = MongoClient('192.168.3.182', 27017)
+    data_coll = client['zjb_poc']['jy_data_bak']
+    lst_coll = client['zjb_poc']['jy_data_lst']
+
+    try:
+        while True:
+            if len(account_pool) == 0:
+                logger.warning('账号数量已不足,请及时补充')
+                break
+
+            # q = {'isdownload': False, 'isuse': {'$in': [4]}}
+            # q = {'isdownload': False, 'isuse': {'$in': [2, 3]}}
+            q = {'isdownload': False, 'is_use': 0}
+            with lst_coll.find(q, limit=100) as cursor:
+                task_lst = [item for item in cursor]
+
+            username, password = account_pool.pop(0)
+            auto_login(username, password, proxy=True, headless=True, auto_quit=True)
+            ret = spider(username, password, task_lst, data_coll, lst_coll)
+            if ret is False:
+                logger.info('切换账号')
+                continue
+
+            if not lst_coll.count_documents(q):
+                break
+
+    except KeyboardInterrupt:
+        pass
+
+    finally:
+        logger.info('**** 数据采集结束 ****')
+
+
+if __name__ == '__main__':
+    main()