
标找找

dongzhaorui 2 years ago
parent
commit
76bf869160
1 changed file with 443 additions and 0 deletions

+ 443 - 0
bzz/spider.py

@@ -0,0 +1,443 @@
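+"""Crawler for biaozhaozhao.com (标找找) tender announcements: collects search results into
+MongoDB, de-duplicates via a Redis hash, then downloads and cleans the detail pages."""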
+import json
+import re
+import time
+
+import redis
+import requests
+from bson.int64 import Int64
+from parsel import Selector
+from pymongo import MongoClient
+from urllib.parse import quote
+
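+# MongoDB clients: list staging lives on 192.168.20.248; cli1 (127.0.0.1:27001) is only used
+# by push_spider_dbs. Note that data_bak below is opened on `cli`, not `cli1`.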
+cli = MongoClient("192.168.20.248", 27017)
+bzz_list = cli['dzr']['bzz_zb_list']
+
+cli1 = MongoClient("127.0.0.1", 27001)
+data_bak = cli['dzr']['data_bak']
+
+
+_pool = redis.ConnectionPool(
+    host='192.168.20.248',
+    port=6379,
+    password='top@123',
+    db=2,
+    decode_responses=True  # must be set on the pool; Redis() ignores it when connection_pool is given
+)
+r = redis.Redis(connection_pool=_pool)
+redis_key = 'duplicate_bzz_list'
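+# channel name -> the site's ifbprogress filter code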
+type_maps = {
+    '中标': '42',
+    '成交': '43',
+    '单一来源': '410',
+    '合同及验收': '48',
+}
+
+# channel = "中标"
+# channel_sign = "42"
+
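+# headers/cookies captured from a logged-in browser session; the x-kzz-request-* values and the
+# QCCSESSID cookie are session-bound -- a 403 from the search API below means they have expired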
+headers = {
+    "Accept": "application/json, text/plain, */*",
+    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+    "Cache-Control": "no-cache",
+    "Connection": "keep-alive",
+    "Content-Type": "application/json",
+    "Origin": "https://www.biaozhaozhao.com",
+    "Pragma": "no-cache",
+    "Referer": "https://www.biaozhaozhao.com/search?keyword=%E4%B8%80%E4%BD%93%E6%9C%BA&restore=1",
+    "Sec-Fetch-Dest": "empty",
+    "Sec-Fetch-Mode": "cors",
+    "Sec-Fetch-Site": "same-origin",
+    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
+    "sec-ch-ua": "\"Google Chrome\";v=\"107\", \"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\"",
+    "sec-ch-ua-mobile": "?0",
+    "sec-ch-ua-platform": "\"macOS\"",
+    "sentry-trace": "aaa4d61b9e3243a5baa5cca1c1c88108-bd50500fc742e0e9-0",
+    "x-kzz-request-from": "qcc-tender-web",
+    "x-kzz-request-id": "27328858-5576-2734-188319849847",
+    "x-kzz-request-key": "E76F330BE1AA411B23876BC67C11750D",
+    "x-kzz-request-time": "1668845731606"
+}
+cookies = {
+    "QCCSESSID": "b410c26e4aa5895529a652519c",
+    "gid_87f036f882014cd8": "b410c26e4aa5895529a652519c",
+    "ls_371ef9f55b6b63dc": "42d6a4659d87930e"
+}
+
+# Standalone elements
+INDEPENDENT_TAGS = {
+    r'<head>[\s\S]*?</head>': '',
+    r'<html>|<html [^>]*>|</html>': '',
+    r'<body>|<body [^>]*>|</body>': '',
+    r'<meta[^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # metadata
+    r'&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # whitespace entities
+    r'\xa0|\u3000': '',  # non-breaking / full-width spaces
+    r'<!--[\s\S]*?-->': '',  # comments
+    r'<style[^<>]*>[\s\S]*?</style>': '',  # styles
+    r'<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
+    r'<input>': '',  # input fields
+    r'<img[^>]*>': '<br>',  # images
+}
+# Inline elements
+INLINE_TAGS = {
+    r'<a>|<a [^>]*>|</a>': '',  # hyperlinks
+    r'<span>|<span [^>]*>|</span>': '',  # span
+    r'<label>|<label [^>]*>|</label>': '<br>',  # label
+    r'<font>|<font [^>]*>|<font[\s\S][^>]*>|</font>': '',  # font
+}
+# Block-level elements
+BLOCK_TAGS = {
+    # r'<h[1-6][^>]*>[\s\S]*?</h[1-6]>': '',  # headings
+    # r'<h[1-6][^>]*>|</h[1-6]>': '',  # headings
+    r'<p>|<p [^>]*>|</p>': '<br>',  # paragraphs
+    r'<div>|<div [^>]*>|</div>': '<br>',  # divisions
+    r'<o:p>|<o:p [^>]*>|</o:p>': ''  # Microsoft Office Word paragraphs
+}
+# Miscellaneous
+OTHER = {
+    r'<\?xml[^>]*>|<\?xml:.*?>': '',  # '?' escaped so XML declarations are matched literally
+    r'<epointform>': '',
+    r'<!doctype html>|<!doctype html [^>]*>': '',
+    '【关闭】|关闭': '',
+    '【打印】|打印本页': '',
+    r'【字体:[\s\S]*】': '',
+    r'文章来源:[\u4e00-\u9fa5]+': '',
+    r'浏览次数:.*[<]+': '',
+    r'\(责任编辑:.*?\)': '',  # parentheses escaped: match the literal text, not an empty capture group
+    '分享到[:]': '',
+    r'相关链接:[\s\S]+': '',
+    r'阅读数[::]\d+': '',
+}
+# Styles
+CSS_STYLE = {
+    r'style="[\s\S]*?"|style ="[\s\S]*?"': '',
+    r'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
+    r'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
+    r'class="[\s\S]*?"|class ="[\s\S]*?"': '',
+    r'align="[\s\S]*?"|align ="[\s\S]*?"': '',
+    r'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
+}
+# Whitespace
+BLANKS = {
+    r'\n\s*\n': '\n',
+    r'\s*\n\s*': '\n',
+    r'[^\S\n]': ' ',
+    r'\s+': ' ',
+}
+# CSS tag set
+TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
+# CSS attribute set
+ATTRS = {'id', 'class', 'style', 'width'}
+
+
+def _repair_tag():
+    """异常的标签组合,用来替换非标准页面的标签"""
+    _repairs = {}
+    for tag in TAGS:
+        for attr in ATTRS:
+            key = '{}{}'.format(tag, attr)
+            val = '{} {}'.format(tag, attr)
+            _repairs[key] = val
+    return _repairs
+
+
+def _escape_character(html):
+    """转义字符"""
+    html = html.replace('&lt;', '<')
+    html = html.replace('&gt;', '>')
+    html = html.replace('&quot;', '"')
+    html = html.replace('&amp;', '&')
+    return html
+
+
+def _lowercase_tag(html):
+    """标签归一化处理(全部小写)"""
+    tags = re.findall("<[^>]+>", html)
+    for tag in tags:
+        html = html.replace(tag, str(tag).lower())
+
+    repair_tags = _repair_tag()
+    for err, right in repair_tags.items():
+        html = html.replace(err, right)
+
+    return html
+
+
+def cleaner(html, special=None, completely=False):
+    """
+    Clean page source
+
+    :param html: page to clean
+    :param special: extra page-specific cleaning rules
+    :param completely: whether to clean the page aggressively
+    :return: cleaned page source
+    """
+    if special is None:
+        special = {}
+    remove_tags = {
+        **INDEPENDENT_TAGS,
+        **INLINE_TAGS,
+        **BLOCK_TAGS,
+        **OTHER,
+        **special,  # merged per call instead of mutating the module-level OTHER dict
+        **CSS_STYLE,
+        **BLANKS,
+    }
+    html = _lowercase_tag(html)
+    for tag, repl in remove_tags.items():
+        html = re.sub(tag, repl, html)
+
+    if completely:
+        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # canvas
+        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # iframes
+        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)  # leftover tags containing no Chinese text
+
+    html = _escape_character(html)
+    return html
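+# illustrative example (assumed input):
+#   cleaner('<div style="color:red"><p>中标公告</p></div>') -> '<br><br>中标公告<br><br>'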
+
+
+def save_data(documents, col):
+    if not isinstance(documents, list):
+        documents = [documents]
+    col.insert_many(documents)
+
+
+def crawl_spider_zb(channel, progress, page, keyword):
+    url = "https://www.biaozhaozhao.com/qcc/tender/search"
+    data = {
+        "sortField": "publishdate",
+        "sortOrder": "DESC",
+        "searchType": "accurate",
+        "searchKeyList": [keyword],
+        "filter": {
+            "publishdate": [
+                {
+                    "currently": True,
+                    "flag": 5,
+                    "number": 1,
+                    "unit": "day",
+                    "min": "2022-07-31T16:00:00.000Z",
+                    "max": "2022-08-31T15:59:59.999Z"
+                }
+            ],
+            "ifbprogress": [
+                progress
+            ],
+            "region": [
+                {
+                    "pr": "GD"
+                }
+            ],
+            "isvalid": 1
+        },
+        "queryLink": "or",
+        "pageIndex": page,
+        "pageSize": 50,
+        "isHighlight": True,
+        "isDegrade": True
+    }
+    data = json.dumps(data)
+    response = requests.post(url, headers=headers, cookies=cookies, data=data, timeout=30)
+    if response.status_code == 403:
+        print(f"{keyword}>>>当前第{page}页, 账号cookie失效")
+        return True, 0
+
+    results = []
+    info_list = response.json()["Result"] or []  # guard: Result may come back null when nothing matches (assumption)
+    if len(info_list) == 0:
+        print(f"{keyword}>>>暂无数据")
+        return True, 0
+
+    for item in info_list:
+        href = 'https://www.biaozhaozhao.com/detail/{}?keywords=%5B"{}"%5D'.format(item["id"], quote(keyword))
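+        # skip detail URLs already recorded in the Redis de-dup hash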
+        if not r.hexists(redis_key, href):
+            title = item['title']
+            publish_time = item['publishdate']
+            time_array = time.strptime(publish_time, "%Y-%m-%d %H:%M:%S")
+            publishtime_ts = int(time.mktime(time_array))
+            data = {
+                'site': '标找找',
+                'channel': channel,
+                'spidercode': 'sdxzbiddingsjzypc',
+                'title': title,
+                'area': '广东',  # province
+                'city': item['city'],  # city
+                'district': item['district'],
+                'publishdept': '',
+                'type': '',
+                'T': 'bidding',  # target data table
+                'sendflag': 'false',
+                '_d': 'comeintime',
+                'iscompete': True,  # new-crawler flag
+                'crawl': False,
+                'href': '#',
+                'competehref': href,
+                'publishtime': publish_time,
+                'l_np_publishtime': Int64(publishtime_ts),
+            }
+            results.append(data)
+            r.hset(redis_key, href, '')
+
+    if len(results) > 0:
+        save_data(results, bzz_list)
+    print(f"{keyword}>>>第{page}页采集完成,收录{len(results)}条")
+    # report the raw page size, not the deduped count, so pagination doesn't stop early on duplicate-heavy pages
+    return False, len(info_list)
+
+
+def list_page(channel, progress, keyword, pages):
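+    # walk pages 1..pages, stopping early once a page returns fewer than pageSize (50) records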
+    for page in range(1, pages + 1):
+        stop, total = crawl_spider_zb(channel, progress, page, keyword)
+        if stop or total < 50:
+            return False, page
+        time.sleep(2)
+    return True, page
+
+
+def select_label(keyword, pages):
+    print(f"{keyword}>>>开始采集{pages}页")
+    for name, index in type_maps.items():
+        normal_stop, page = list_page(name, index, keyword, pages)
+        if not normal_stop:
+            return False
+        print(f"{name}>>>完成采集:{keyword} 第{page}页")
+    return True
+
+
+def handler_detail(items):
+    headers = {
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+        "Cache-Control": "no-cache",
+        "Connection": "keep-alive",
+        "Pragma": "no-cache",
+        "Sec-Fetch-Dest": "document",
+        "Sec-Fetch-Mode": "navigate",
+        "Sec-Fetch-Site": "none",
+        "Sec-Fetch-User": "?1",
+        "Upgrade-Insecure-Requests": "1",
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
+        "sec-ch-ua": "\"Google Chrome\";v=\"107\", \"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\"",
+        "sec-ch-ua-mobile": "?0",
+        "sec-ch-ua-platform": "\"macOS\""
+    }
+    channel = items['channel']
+    url = items['competehref']
+    try:
+        response = requests.get(url, headers=headers, cookies=cookies, timeout=30)
+        if '您当天的查看项目额度已用完!' in response.text:
+            print(f'{channel}>>>{items["title"]} 额度已用完!')
+            return False
+
+        response = Selector(response.text)
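+        # NB: these obfuscated class names (dfUzPvnO / _4nFsQzFd) look build-generated and may change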
+        rubbish = response.xpath('//div[@class="dfUzPvnO"]').extract_first()
+        html = response.xpath('//div[@class="_4nFsQzFd"]').extract_first()
+        if html is not None:
+            contenthtml = html.replace(rubbish, '') if rubbish else html  # the junk block may be absent
+            items['contenthtml'] = contenthtml
+            items['detail'] = cleaner(contenthtml)
+            items['comeintime'] = Int64(int(time.time()))
+
+            bzz_list.update_one(
+                {'_id': items['_id']},
+                {'$set': {'crawl': True}}
+            )
+
+            if '_id' in items:
+                del items['_id'], items['crawl']
+            data_bak.insert_one(items)
+            print(f'{channel}>>>{items["title"]} 下载成功')
+            return True
+    except requests.RequestException:
+        print(f'{channel}>>>{items["title"]} 下载失败')
+    return False  # also covers the no-content case, which previously fell through returning None
+
+
+def detail_page():
+    q = {'crawl': False}  # items not yet fetched; handler_detail flips the flag on success
+    tasks = list(bzz_list.find(q))
+    for item in tasks:
+        handler_detail(item)
+        time.sleep(1.5)
+    print("任务结束")
+
+
+def _count_zb_total(page, keyword, progress):
+    url = "https://www.biaozhaozhao.com/qcc/tender/search"
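+    # same search payload as crawl_spider_zb, but additionally narrowed to city code 4403
+    # (Shenzhen); only Paging.TotalRecords from the response is used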
+    data = {
+        "sortField": "publishdate",
+        "sortOrder": "DESC",
+        "searchType": "accurate",
+        "searchKeyList": [keyword],
+        "filter": {
+            "publishdate": [
+                {
+                    "currently": True,
+                    "flag": 5,
+                    "number": 1,
+                    "unit": "day",
+                    "min": "2022-07-31T16:00:00.000Z",
+                    "max": "2022-08-31T15:59:59.999Z"
+                }
+            ],
+            "ifbprogress": [
+                progress
+            ],
+            "region": [
+                {
+                    "pr": "GD",
+                    "ct": "4403"
+                }
+            ],
+            "isvalid": 1
+        },
+        "queryLink": "or",
+        "pageIndex": page,
+        "pageSize": 50,
+        "isHighlight": True,
+        "isDegrade": True
+    }
+    data = json.dumps(data)
+    response = requests.post(url, headers=headers, cookies=cookies, data=data, timeout=30)
+    # print(response.json())
+    total_records = response.json()['Paging']['TotalRecords']
+    return int(total_records)
+
+
+def count_total():
+    count = 0
+    for tp_name, tp_index in type_maps.items():
+        totals = 0
+        for kw in ['一体机', '白板', '黑板', '大屏', '智慧屏', '录播', '智能教师']:
+            totals += _count_zb_total(1, kw, tp_index)
+
+        print(tp_name, totals)
+        count += totals
+    print("总计 ", count)
+
+
+def push_spider_dbs():
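+    # copy staged documents into the second MongoDB instance (cli1), re-stamping comeintime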
+    dbk = cli1['py_spider']['data_bak']
+    cur1 = data_bak.find()
+    for item in cur1:
+        del item['_id']
+        print(item)
+        item['comeintime'] = Int64(int(time.time()))
+        dbk.insert_one(item)
+
+
+if __name__ == '__main__':
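+    # crawl list pages for the active keyword, then download details for pending items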
+    select_label('一体机', 13)
+    # select_label('白板', 2)
+    # select_label('黑板', 2)
+    # select_label('大屏', 3)
+    # select_label('智慧屏', 1)
+    # select_label('录播', 2)
+    # select_label('智能教师', 1)
+    detail_page()