@@ -0,0 +1,268 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-06-01
+---------
+@summary: Qianlima (千里马) list-page scraper
+---------
+@author: Dzr
+"""
+import json
+import math
+import random
+import time
+from pathlib import Path
+
+import requests
+from loguru import logger
+from pybloom_live import BloomFilter
+from pymongo import MongoClient
+
+from login import auto_login, account_pool
+
+
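+# Per-account request configuration; setup_cfg() fills these in from account/<username>.json
+# before each crawl session.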
+_cookies = None
+_headers = None
+_proxies = None
+
+
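+# Push a markdown alert to the WeCom (企业微信) group-robot webhook; with send=False the
+# alert is only written to the local log.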
+def send_wechat_warning(msg, send=True):
+    markdown = '采集异常中断,请切换d模式处理。'
+    markdown += f'\n>异常详情:<font color=\"warning\">**{msg}**</font>'
+
+    if not send:
+        logger.info(markdown)
+        return
+
+    url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=079193d8-1856-443e-9f6d-ecc5c883bf11'
+    headers_ = {'Content-Type': 'application/json'}
+    json_data = {'msgtype': 'markdown', 'markdown': {'content': markdown, 'mentioned_mobile_list': ['17610673271']}}
+    request_params = dict(headers=headers_, json=json_data, timeout=10)
+    response = requests.post(url, **request_params)
+    logger.info(response.json())
+
+
+def setup_cfg(username):
+    global _cookies, _headers, _proxies
+    file = (Path(__file__).parent / f'account/{username}.json').absolute()
+    with open(file, encoding='utf-8') as rp:
+        json_data = json.load(rp)
+        _cookies = json_data['cookies']
+        _headers = json_data['headers']
+        _proxies = json_data['proxies']
+
+
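+# URL de-duplication state. The Bloom filter is persisted to backup/bloomfilter.f
+# (see main()), so records that were already collected stay filtered after a restart.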
+def launch_filter():
+    """Create (or restore) the Bloom filter."""
+    logger.debug('创建布隆过滤器...')
+    backup = (Path(__file__).parent / 'backup')
+    if not backup.exists():
+        backup.mkdir(exist_ok=True)
+
+    file = (backup / 'bloomfilter.f')
+    if not file.exists():
+        file.touch()  # create the backing file on first run
+        bf = BloomFilter(capacity=1000000, error_rate=0.001)  # sized for 1,000,000 entries at a 0.1% error rate
+    else:
+        if file.stat().st_size == 0:
+            bf = BloomFilter(capacity=1000000, error_rate=0.001)
+        else:
+            bf = BloomFilter.fromfile(file.open('rb'))
+
+    return file, bf
+
+
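+# Request one page of the Qianlima solr search API, drop records whose url is already in
+# the Bloom filter and bulk-insert the rest into MongoDB. Returns (ok, total, count); on
+# failure the second value encodes the cause: -1 unexpected payload, -2 request rejected
+# (account problem), -3 network error.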
+def fetch(collection, username, page, page_size, keywords, bf):
+    # rest/service/website/search/solr -> cookies
+    global _cookies, _headers, _proxies
+    response = None
+    try:
+        json_data = {
+            'keywords': keywords,
+            'timeType': 4,
+            'beginTime': '2024-09-01',
+            'endTime': '2024-09-30',
+            'filtermode': '8',
+            'searchMode': 1,
+            'currentPage': page,
+            'numPerPage': page_size,
+            'sortType': '1',
+            'allType': -1,
+            'beginAmount': '',
+            'endAmount': '',
+            'purchasingUnitIdList': '',
+            'threeClassifyTagStr': '',
+            'fourLevelCategoryIdListStr': '',
+            'threeLevelCategoryIdListStr': '',
+            'levelId': '',
+            'tab': 2,
+            'types': '-1',
+            'searchDataType': 1,
+            'showContent': 1,
+            'hasLinkName': '',
+            'newAreas': '',
+            'hasChooseSortType': 1,
+            'progIdAndNoticeSegmentTypeMaps': {
+                '3': [],
+                '4': [
+                    11,
+                    12,
+                ],
+                '5': [],
+            },
+            'summaryType': 1,
+        }
+
+        response = requests.post(
+            'https://search.vip.qianlima.com/rest/service/website/search/solr',
+            cookies=_cookies,
+            headers=_headers,
+            json=json_data,
+            proxies=_proxies,
+            timeout=60
+        )
+        assert response.status_code == 200
+        result = response.json()
+        try:
+            total = result['data']['rowCount']
+            if total > 500:
+                # discard, not wanted
+                return True, total, page_size
+
+        except TypeError:
+            return False, -1, 0
+
+        data = result['data']['data']
+
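+        # Buffer new records and flush them one page at a time; ordered=False lets the
+        # bulk insert continue past an individual failed document.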
+        dedup_count = 0
+        count = 0
+        insert_lst = []
+        for item in data:
+            href = item.get('url')
+            if href is None or href in bf:
+                dedup_count += 1
+                # logger.debug(f'重复数据[{href}]')
+                continue
+
+            insert_lst.append(item)
+            if len(insert_lst) == page_size:
+                collection.insert_many(insert_lst, ordered=False)
+                count += len(insert_lst)
+                insert_lst = []
+
+            bf.add(href)
+
+        if len(insert_lst) > 0:
+            collection.insert_many(insert_lst, ordered=False)
+            count += len(insert_lst)
+
+        logger.info(f'自动翻页|第{page}页|{keywords}|入库{count}条|重复{dedup_count}条')
+        return True, total, len(data)
+
+    except AssertionError:
+        logger.error(f'{username}|账号异常|请求失败')
+        # send_wechat_warning(msg=response.content.decode())
+        return False, -2, 0
+
+    except requests.exceptions.RequestException as e:
+        logger.exception(f'网络请求错误, 原因:{e}')
+        return False, -3, 0
+
+
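+# Work through the keyword tasks with one logged-in account: page through each keyword's
+# results, skip keywords that look like fuzzy matches (500+ hits), and write the progress
+# back to the task collection. Returning False aborts the run, None asks main() to switch
+# to another account, True means this batch finished normally.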
+def spider(username, tasks, bf, to_data_lst, coll):
+    setup_cfg(username)
+
+    while tasks:
+        page = 1
+        page_size = 100
+
+        # pagination state
+        state = 1
+        retries = 0
+        isdownload = True
+        _id, keywords = tasks.pop()
+        while True:
+            ok, total, count = fetch(coll, username, page, page_size, keywords, bf)
+            if ok is False:
+                state = total
+                if state == -1:
+                    logger.info(f'{username}|请求参数错误|修改参数')
+                    return False
+                elif state == -2:
+                    logger.info(f'{username}|访问频繁|3秒后切换账号')
+                    time.sleep(3)
+                    return
+                else:
+                    logger.error(f'{username}|网络异常|准备重试~{retries}')
+                    if retries > 3:
+                        return
+                    retries += 1
+                    continue
+
+            # time.sleep(math.log(random.randint(100, 2400), 2))
+            time.sleep(.5)
+
+            if ok is True and total >= 500:
+                logger.error(f'采集完成|{keywords}|疑似模糊匹配|跳过采集')
+                isdownload = False
+                break
+
+            if ok is True and count < page_size:
+                logger.info(f'采集完成|{keywords}|保存{total}条')
+                break
+            else:
+                page += 1
+
+        # update the task status
+        if state >= 0:
+            to_data_lst.update_one(
+                {'_id': _id},
+                {
+                    '$set': {
+                        'b_isdownload': isdownload,
+                        'i_total': total,
+                        'i_pages': page,
+                        'i_state': state,
+                        'i_updatetime': int(time.time())
+                    }
+                }
+            )
+
+    return True
+
+
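+# Entry point: pull pending keywords from MongoDB in batches of 50, rotate through
+# account_pool (accounts that finish a batch normally are re-queued), and always flush
+# the Bloom filter back to disk on exit.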
+def main():
+    f, bf = launch_filter()  # Bloom filter sized for 1,000,000 entries at a 0.1% error rate
+
+    client = MongoClient('192.168.3.182', 27017)
+    to_data_lst = client['31zg_poc']['keyword_company']
+    coll = client['31zg_poc']['qlm_data_lst']
+
+    try:
+        while True:
+            q = {'b_isdownload': None}
+            p = {'s_keyword': 1, '_id': 1}
+            with to_data_lst.find(q, projection=p, limit=50) as cursor:
+                tasks = [(item['_id'], item['s_keyword']) for item in cursor]
+
+            username, password = account_pool.pop(0)
+            auto_login(username, password, proxy=True, headless=True, auto_quit=True)
+            state = spider(username, tasks, bf, to_data_lst, coll)
+            if state is True:
+                account_pool.append((username, password))
+
+            if state is False:
+                break
+
+            if not to_data_lst.count_documents(q):
+                break
+
+    except KeyboardInterrupt:
+        pass
+
+    finally:
+        bf.tofile(f.open('wb'))  # persist the Bloom filter to disk
+        logger.info('采集结束')
+
+
+if __name__ == '__main__':
+    main()