import json
import re
import time

import redis
import requests
from bson.int64 import Int64
from parsel import Selector
from pymongo import MongoClient
from urllib.parse import quote

# Remote MongoDB: listing rows and the raw result store
cli = MongoClient("192.168.20.248", 27017)
bzz_list = cli['dzr']['bzz_zb_list']

# Local MongoDB: target of push_spider_dbs()
cli1 = MongoClient("127.0.0.1", 27001)
data_bak = cli['dzr']['data_bak']  # note: lives on the remote client (cli), not cli1

_pool = redis.ConnectionPool(
    host='192.168.20.248',
    port=6379,
    password='top@123',
    db=2,
    decode_responses=True  # must be set on the pool; redis.Redis() ignores it when a pool is passed
)
r = redis.Redis(connection_pool=_pool)
redis_key = 'duplicate_bzz_list'
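# How the de-duplication key is used (summarized from crawl_spider_zb below):
# the Redis hash acts as a seen-set keyed by detail URL, e.g.
#   if not r.hexists(redis_key, href):
#       ...queue the listing...
#       r.hset(redis_key, href, '')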
# Announcement category name -> site channel id
type_maps = {
    '中标': '42',       # award notices
    '成交': '43',       # transaction / deal results
    '单一来源': '410',  # single-source procurement
    '合同及验收': '48',  # contracts and acceptance
}

# channel = "中标"
# channel_sign = "42"

headers = {
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Content-Type": "application/json",
    "Origin": "https://www.biaozhaozhao.com",
    "Pragma": "no-cache",
    "Referer": "https://www.biaozhaozhao.com/search?keyword=%E4%B8%80%E4%BD%93%E6%9C%BA&restore=1",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
    "sec-ch-ua": "\"Google Chrome\";v=\"107\", \"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"macOS\"",
    "sentry-trace": "aaa4d61b9e3243a5baa5cca1c1c88108-bd50500fc742e0e9-0",
    "x-kzz-request-from": "qcc-tender-web",
    "x-kzz-request-id": "27328858-5576-2734-188319849847",
    "x-kzz-request-key": "E76F330BE1AA411B23876BC67C11750D",
    "x-kzz-request-time": "1668845731606"
}
cookies = {
    "QCCSESSID": "b410c26e4aa5895529a652519c",
    "gid_87f036f882014cd8": "b410c26e4aa5895529a652519c",
    "ls_371ef9f55b6b63dc": "42d6a4659d87930e"
}
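# Note: the QCCSESSID cookie and the x-kzz-request-* headers above are
# captured from a logged-in browser session and are expected to expire;
# crawl_spider_zb() treats an HTTP 403 response as a stale cookie and
# stops crawling the current keyword.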
# Standalone elements
INDEPENDENT_TAGS = {
    r'<head>[\s\S]*?</head>': '',
    r'<html>|<html [^>]*>|</html>': '',
    r'<body>|<body [^>]*>|</body>': '',
    r'<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # metadata
    r'&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # whitespace entities
    r'\xa0|\u3000': '',  # non-breaking / full-width spaces
    r'<!--[\s\S]*?-->': '',  # comments
    r'<style[^<>]*>[\s\S]*?</style>': '',  # stylesheets
    r'<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
    r'<input>': '',  # input boxes
    r'<img[^>]*>': '<br>',  # images
}
# Inline elements
INLINE_TAGS = {
    r'<a>|<a [^>]*>|</a>': '',  # hyperlinks
    r'<span>|<span [^>]*>|</span>': '',  # span
    r'<label>|<label [^>]*>|</label>': '<br>',  # label
    r'<font>|<font [^>]*>|<font[\s\S][^>]*>|</font>': '',  # font
}
# Block-level elements
BLOCK_TAGS = {
    # r'<h[1-6][^>]*>[\s\S]*?</h[1-6]>': '',  # headings
    # r'<h[1-6][^>]*>|</h[1-6]>': '',  # headings
    r'<p>|<p [^>]*>|</p>': '<br>',  # paragraphs
    r'<div>|<div [^>]*>|</div>': '<br>',  # divisions
    r'<o:p>|<o:p [^>]*>|</o:p>': ''  # Microsoft Word (Office) paragraph tags
}
# Miscellaneous site boilerplate
OTHER = {
    r'<\?xml[^>]*>|<\?xml [^>]*>|<\?xml:.*?>': '',
    r'<epointform>': '',
    r'<!doctype html>|<!doctype html [^>]*>': '',
    '【关闭】|关闭': '',
    '【打印】|打印本页': '',
    r'【字体:[\s\S]*】': '',
    r'文章来源:[\u4e00-\u9fa5]+': '',
    r'浏览次数:.*[<]+': '',
    '(责任编辑:.*?)': '',
    '分享到[:]': '',
    r'相关链接:[\s\S]+': '',
    r'阅读数[::]\d+': '',
}
# Inline CSS attributes
CSS_STYLE = {
    r'style="[\s\S]*?"|style ="[\s\S]*?"': '',
    r'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
    r'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
    r'class="[\s\S]*?"|class ="[\s\S]*?"': '',
    r'align="[\s\S]*?"|align ="[\s\S]*?"': '',
    r'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
}
# Whitespace normalization
BLANKS = {
    r'\n\s*\n': '\n',
    r'\s*\n\s*': '\n',
    r'[^\S\n]': ' ',
    r'\s+': ' ',
}
# HTML tag set used when repairing malformed markup
TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
# Attribute set used when repairing malformed markup
ATTRS = {'id', 'class', 'style', 'width'}


def _repair_tag():
    """Build replacements for malformed tag/attribute pairs found on non-standard pages."""
    _repairs = {}
    for tag in TAGS:
        for attr in ATTRS:
            key = '{}{}'.format(tag, attr)
            val = '{} {}'.format(tag, attr)
            _repairs[key] = val
    return _repairs


def _escape_character(html):
    """Unescape common HTML entities."""
    html = html.replace('&lt;', '<')
    html = html.replace('&gt;', '>')
    html = html.replace('&quot;', '"')
    html = html.replace('&amp;', '&')
    return html


def _lowercase_tag(html):
    """Normalize tags to lowercase and repair malformed tag/attribute pairs."""
    tags = re.findall("<[^>]+>", html)
    for tag in tags:
        html = html.replace(tag, str(tag).lower())

    repair_tags = _repair_tag()
    for err, right in repair_tags.items():
        html = html.replace(err, right)

    return html


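# _repair_tag() pairs every entry of TAGS with every entry of ATTRS,
# producing a mapping such as {'divclass': 'div class', 'tablewidth':
# 'table width', ...}; _lowercase_tag() then uses it to re-insert the
# missing space in malformed markup such as '<divclass="content">'
# (hypothetical example).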
def cleaner(html, special=None, completely=False):
    """
    Clean a crawled page.

    :param html: page source to clean
    :param special: extra page-specific cleaning rules
    :param completely: whether to strip the page aggressively
    :return: cleaned page source
    """
    if special is None:
        special = {}
    remove_tags = {
        **INDEPENDENT_TAGS,
        **INLINE_TAGS,
        **BLOCK_TAGS,
        **OTHER,
        **special,  # merged here instead of mutating the module-level OTHER
        **CSS_STYLE,
        **BLANKS,
    }
    html = _lowercase_tag(html)
    for tag, repl in remove_tags.items():
        html = re.sub(tag, repl, html)

    if completely:
        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # canvas
        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # iframes
        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)

    html = _escape_character(html)
    return html


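# A minimal usage sketch of cleaner() on a made-up snippet (the input and
# the approximate output below are illustrative, not taken from the site):
#
#   cleaner('<div style="color:red"><p>公告内容</p></div>')
#   # -> roughly '<br><br>公告内容<br><br>': block tags collapse to <br>,
#   #    inline style attributes and extra whitespace are stripped.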
def save_data(documents, col):
    if isinstance(documents, list):
        col.insert_many(documents)
    else:
        col.insert_one(documents)


def crawl_spider_zb(channel, progress, page, keyword):
    url = "https://www.biaozhaozhao.com/qcc/tender/search"
    data = {
        "sortField": "publishdate",
        "sortOrder": "DESC",
        "searchType": "accurate",
        "searchKeyList": [keyword],
        "filter": {
            "publishdate": [
                {
                    "currently": True,
                    "flag": 5,
                    "number": 1,
                    "unit": "day",
                    "min": "2022-07-31T16:00:00.000Z",
                    "max": "2022-08-31T15:59:59.999Z"
                }
            ],
            "ifbprogress": [
                progress
            ],
            "region": [
                {
                    "pr": "GD"
                }
            ],
            "isvalid": 1
        },
        "queryLink": "or",
        "pageIndex": page,
        "pageSize": 50,
        "isHighlight": True,
        "isDegrade": True
    }
    data = json.dumps(data)
    response = requests.post(url, headers=headers, cookies=cookies, data=data, timeout=30)
    if response.status_code == 403:
        print(f"{keyword}>>> page {page}: account cookie is no longer valid")
        return True, 0

    results = []
    info_list = response.json()["Result"]
    if len(info_list) == 0:
        print(f"{keyword}>>> no data")
        return True, 0

    for item in info_list:
        href = 'https://www.biaozhaozhao.com/detail/{}?keywords=%5B"{}"%5D'.format(item["id"], quote(keyword))
        if not r.hexists(redis_key, href):
            title = item['title']
            publish_time = item['publishdate']
            time_array = time.strptime(publish_time, "%Y-%m-%d %H:%M:%S")
            publishtime_ts = int(time.mktime(time_array))
            data = {
                'site': '标找找',
                'channel': channel,
                'spidercode': 'sdxzbiddingsjzypc',
                'title': title,
                'area': '广东',  # province
                'city': item['city'],  # city
                'district': item['district'],
                'publishdept': '',
                'type': '',
                'T': 'bidding',  # destination table name
                'sendflag': 'false',
                '_d': 'comeintime',
                'iscompete': True,  # new-style spider flag
                'crawl': False,
                'href': '#',
                'competehref': href,
                'publishtime': publish_time,
                'l_np_publishtime': Int64(publishtime_ts),
            }
            results.append(data)
            r.hset(redis_key, href, '')

    if len(results) > 0:
        save_data(results, bzz_list)
    print(f"{keyword}>>> page {page} done, {len(results)} new records")
    return False, len(results)


def list_page(channel, progress, keyword, pages):
    for page in range(1, pages + 1):
        stop, total = crawl_spider_zb(channel, progress, page, keyword)
        if stop or total < 50:
            return False, page
        time.sleep(2)
    return True, page


def select_label(keyword, pages):
    print(f"{keyword}>>> start crawling, {pages} page(s)")
    for name, index in type_maps.items():
        normal_stop, page = list_page(name, index, keyword, pages)
        if not normal_stop:
            return False
        print(f"{name}>>> finished crawling: {keyword}, page {page}")
    return True


def handler_detail(items):
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
        "sec-ch-ua": "\"Google Chrome\";v=\"107\", \"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"macOS\""
    }
    channel = items['channel']
    url = items['competehref']
    try:
        response = requests.get(url, headers=headers, cookies=cookies, timeout=30)
        if '您当天的查看项目额度已用完!' in response.text:
            print(f'{channel}>>>{items["title"]} daily view quota exhausted!')
            return False

        selector = Selector(response.text)
        rubbish = selector.xpath('//div[@class="dfUzPvnO"]').extract_first()
        html = selector.xpath('//div[@class="_4nFsQzFd"]').extract_first()
        if html is not None:
            # guard against a missing "rubbish" block: str.replace(None, ...) would raise
            contenthtml = html.replace(rubbish, '') if rubbish else html
            items['contenthtml'] = contenthtml
            items['detail'] = cleaner(contenthtml)
            items['comeintime'] = Int64(int(time.time()))

        bzz_list.update_one(
            {'_id': items['_id']},
            {'$set': {'crawl': True}}
        )

        if '_id' in items:
            del items['_id'], items['crawl']
        data_bak.insert_one(items)
        print(f'{channel}>>>{items["title"]} downloaded')
        return True
    except requests.RequestException:
        print(f'{channel}>>>{items["title"]} download failed')
        return False


def detail_page():
    # q = {'crawl': False}
    q = {'crawl': True}
    amount_of_project = 0
    # total = bzz_list.count_documents(q)
    # while total > 0:
    #     print(f"{channel}>>> {total} tasks remaining")
    tasks = list(bzz_list.find(q))
    for item in tasks:
        handler_detail(item)
        # if stop_crawl:
        #     amount_of_project += 1
        time.sleep(1.5)
    print("Task finished")


def _count_zb_total(page, keyword, progress):
    url = "https://www.biaozhaozhao.com/qcc/tender/search"
    data = {
        "sortField": "publishdate",
        "sortOrder": "DESC",
        "searchType": "accurate",
        "searchKeyList": [keyword],
        "filter": {
            "publishdate": [
                {
                    "currently": True,
                    "flag": 5,
                    "number": 1,
                    "unit": "day",
                    "min": "2022-07-31T16:00:00.000Z",
                    "max": "2022-08-31T15:59:59.999Z"
                }
            ],
            "ifbprogress": [
                progress
            ],
            "region": [
                {
                    "pr": "GD",
                    "ct": "4403"
                }
            ],
            "isvalid": 1
        },
        "queryLink": "or",
        "pageIndex": page,
        "pageSize": 50,
        "isHighlight": True,
        "isDegrade": True
    }
    data = json.dumps(data)
    response = requests.post(url, headers=headers, cookies=cookies, data=data, timeout=30)
    # print(response.json())
    total_records = response.json()['Paging']['TotalRecords']
    return int(total_records)


def count_total():
    count = 0
    for tp_name, tp_index in type_maps.items():
        totals = 0
        for kw in ['一体机', '白板', '黑板', '大屏', '智慧屏', '录播', '智能教师']:
            totals += _count_zb_total(1, kw, tp_index)

        print(tp_name, totals)
        count += totals
    print("Total:", count)


def push_spider_dbs():
    # Copy everything collected in the remote data_bak into the local py_spider.data_bak
    dbk = cli1['py_spider']['data_bak']
    cur1 = data_bak.find()
    for item in cur1:
        del item['_id']
        print(item)
        item['comeintime'] = Int64(int(time.time()))
        dbk.insert_one(item)


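# Typical run order, as wired in the __main__ block below: select_label()
# fills bzz_zb_list with listing rows per keyword, then detail_page()
# downloads each detail page into data_bak. count_total() and
# push_spider_dbs() are standalone helpers and appear to be run manually.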
if __name__ == '__main__':
    select_label('一体机', 13)
    # select_label('白板', 2)
    # select_label('黑板', 2)
    # select_label('大屏', 3)
    # select_label('智慧屏', 1)
    # select_label('录播', 2)
    # select_label('智能教师', 1)
    detail_page()