
标找找

dongzhaorui 2 years ago
parent
commit
76bf869160
1 changed file with 443 additions and 0 deletions

+ 443 - 0
bzz/spider.py

@@ -0,0 +1,443 @@
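+"""Crawler for biaozhaozhao.com (标找找) tender announcements: collects search results into
+MongoDB, de-duplicates via a Redis hash, then downloads and cleans the detail pages."""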
+import json
+import re
+import time
+
+import redis
+import requests
+from bson.int64 import Int64
+from parsel import Selector
+from pymongo import MongoClient
+from urllib.parse import quote
+
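+# MongoDB clients: list staging lives on 192.168.20.248; cli1 (127.0.0.1:27001) is only used
+# by push_spider_dbs. Note that data_bak below is opened on `cli`, not `cli1`.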
+cli = MongoClient("192.168.20.248", 27017)
+bzz_list = cli['dzr']['bzz_zb_list']
+
+cli1 = MongoClient("127.0.0.1", 27001)
+data_bak = cli['dzr']['data_bak']
+
+
+_pool = redis.ConnectionPool(
+    host='192.168.20.248',
+    port=6379,
+    password='top@123',
+    db=2,
+    decode_responses=True  # must be set on the pool; Redis() ignores it when connection_pool is given
+)
+r = redis.Redis(connection_pool=_pool)
+redis_key = 'duplicate_bzz_list'
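+# channel name -> the site's ifbprogress filter code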
+type_maps = {
+    '中标': '42',
+    '成交': '43',
+    '单一来源': '410',
+    '合同及验收': '48',
+}
+
+# channel = "中标"
+# channel_sign = "42"
+
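+# headers/cookies captured from a logged-in browser session; the x-kzz-request-* values and the
+# QCCSESSID cookie are session-bound -- a 403 from the search API below means they have expired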
+headers = {
+    "Accept": "application/json, text/plain, */*",
+    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+    "Cache-Control": "no-cache",
+    "Connection": "keep-alive",
+    "Content-Type": "application/json",
+    "Origin": "https://www.biaozhaozhao.com",
+    "Pragma": "no-cache",
+    "Referer": "https://www.biaozhaozhao.com/search?keyword=%E4%B8%80%E4%BD%93%E6%9C%BA&restore=1",
+    "Sec-Fetch-Dest": "empty",
+    "Sec-Fetch-Mode": "cors",
+    "Sec-Fetch-Site": "same-origin",
+    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
+    "sec-ch-ua": "\"Google Chrome\";v=\"107\", \"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\"",
+    "sec-ch-ua-mobile": "?0",
+    "sec-ch-ua-platform": "\"macOS\"",
+    "sentry-trace": "aaa4d61b9e3243a5baa5cca1c1c88108-bd50500fc742e0e9-0",
+    "x-kzz-request-from": "qcc-tender-web",
+    "x-kzz-request-id": "27328858-5576-2734-188319849847",
+    "x-kzz-request-key": "E76F330BE1AA411B23876BC67C11750D",
+    "x-kzz-request-time": "1668845731606"
+}
+cookies = {
+    "QCCSESSID": "b410c26e4aa5895529a652519c",
+    "gid_87f036f882014cd8": "b410c26e4aa5895529a652519c",
+    "ls_371ef9f55b6b63dc": "42d6a4659d87930e"
+}
+
+# Standalone elements
+INDEPENDENT_TAGS = {
+    r'<head>[\s\S]*?</head>': '',
+    r'<html>|<html [^>]*>|</html>': '',
+    r'<body>|<body [^>]*>|</body>': '',
+    r'<meta[^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # metadata
+    r'&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # whitespace entities
+    r'\xa0|\u3000': '',  # non-breaking / full-width spaces
+    r'<!--[\s\S]*?-->': '',  # comments
+    r'<style[^<>]*>[\s\S]*?</style>': '',  # styles
+    r'<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
+    r'<input>': '',  # input fields
+    r'<img[^>]*>': '<br>',  # images
+}
+# Inline elements
+INLINE_TAGS = {
+    r'<a>|<a [^>]*>|</a>': '',  # hyperlinks
+    r'<span>|<span [^>]*>|</span>': '',  # span
+    r'<label>|<label [^>]*>|</label>': '<br>',  # label
+    r'<font>|<font [^>]*>|<font[\s\S][^>]*>|</font>': '',  # font
+}
+# Block-level elements
+BLOCK_TAGS = {
+    # r'<h[1-6][^>]*>[\s\S]*?</h[1-6]>': '',  # headings
+    # r'<h[1-6][^>]*>|</h[1-6]>': '',  # headings
+    r'<p>|<p [^>]*>|</p>': '<br>',  # paragraphs
+    r'<div>|<div [^>]*>|</div>': '<br>',  # divisions
+    r'<o:p>|<o:p [^>]*>|</o:p>': ''  # Microsoft Office Word paragraphs
+}
+# Miscellaneous
+OTHER = {
+    r'<\?xml[^>]*>|<\?xml:.*?>': '',  # '?' escaped so XML declarations are matched literally
+    r'<epointform>': '',
+    r'<!doctype html>|<!doctype html [^>]*>': '',
+    '【关闭】|关闭': '',
+    '【打印】|打印本页': '',
+    r'【字体:[\s\S]*】': '',
+    r'文章来源:[\u4e00-\u9fa5]+': '',
+    r'浏览次数:.*[<]+': '',
+    r'\(责任编辑:.*?\)': '',  # parentheses escaped: match the literal text, not an empty capture group
+    '分享到[:]': '',
+    r'相关链接:[\s\S]+': '',
+    r'阅读数[::]\d+': '',
+}
+# Styles
+CSS_STYLE = {
+    r'style="[\s\S]*?"|style ="[\s\S]*?"': '',
+    r'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
+    r'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
+    r'class="[\s\S]*?"|class ="[\s\S]*?"': '',
+    r'align="[\s\S]*?"|align ="[\s\S]*?"': '',
+    r'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
+}
+# Whitespace
+BLANKS = {
+    r'\n\s*\n': '\n',
+    r'\s*\n\s*': '\n',
+    r'[^\S\n]': ' ',
+    r'\s+': ' ',
+}
+# CSS tag set
+TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
+# CSS attribute set
+ATTRS = {'id', 'class', 'style', 'width'}
+
+
+def _repair_tag():
+    """异常的标签组合,用来替换非标准页面的标签"""
+    _repairs = {}
+    for tag in TAGS:
+        for attr in ATTRS:
+            key = '{}{}'.format(tag, attr)
+            val = '{} {}'.format(tag, attr)
+            _repairs[key] = val
+    return _repairs
+
+
+def _escape_character(html):
+    """转义字符"""
+    html = html.replace('&lt;', '<')
+    html = html.replace('&gt;', '>')
+    html = html.replace('&quot;', '"')
+    html = html.replace('&amp;', '&')
+    return html
+
+
+def _lowercase_tag(html):
+    """标签归一化处理(全部小写)"""
+    tags = re.findall("<[^>]+>", html)
+    for tag in tags:
+        html = html.replace(tag, str(tag).lower())
+
+    repair_tags = _repair_tag()
+    for err, right in repair_tags.items():
+        html = html.replace(err, right)
+
+    return html
+
+
+def cleaner(html, special=None, completely=False):
+    """
+    Clean page source
+
+    :param html: page to clean
+    :param special: extra page-specific cleaning rules
+    :param completely: whether to clean the page aggressively
+    :return: cleaned page source
+    """
+    if special is None:
+        special = {}
+    remove_tags = {
+        **INDEPENDENT_TAGS,
+        **INLINE_TAGS,
+        **BLOCK_TAGS,
+        **OTHER,
+        **special,  # merged per call instead of mutating the module-level OTHER dict
+        **CSS_STYLE,
+        **BLANKS,
+    }
+    html = _lowercase_tag(html)
+    for tag, repl in remove_tags.items():
+        html = re.sub(tag, repl, html)
+
+    if completely:
+        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # canvas
+        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # iframes
+        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)  # leftover tags containing no Chinese text
+
+    html = _escape_character(html)
+    return html
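+# illustrative example (assumed input):
+#   cleaner('<div style="color:red"><p>中标公告</p></div>') -> '<br><br>中标公告<br><br>'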
+
+
+def save_data(documents, col):
+    if not isinstance(documents, list):
+        documents = [documents]
+    col.insert_many(documents)
+
+
+def crawl_spider_zb(channel, progress, page, keyword):
+    url = "https://www.biaozhaozhao.com/qcc/tender/search"
+    data = {
+        "sortField": "publishdate",
+        "sortOrder": "DESC",
+        "searchType": "accurate",
+        "searchKeyList": [keyword],
+        "filter": {
+            "publishdate": [
+                {
+                    "currently": True,
+                    "flag": 5,
+                    "number": 1,
+                    "unit": "day",
+                    "min": "2022-07-31T16:00:00.000Z",
+                    "max": "2022-08-31T15:59:59.999Z"
+                }
+            ],
+            "ifbprogress": [
+                progress
+            ],
+            "region": [
+                {
+                    "pr": "GD"
+                }
+            ],
+            "isvalid": 1
+        },
+        "queryLink": "or",
+        "pageIndex": page,
+        "pageSize": 50,
+        "isHighlight": True,
+        "isDegrade": True
+    }
+    data = json.dumps(data)
+    response = requests.post(url, headers=headers, cookies=cookies, data=data, timeout=30)
+    if response.status_code == 403:
+        print(f"{keyword}>>>当前第{page}页, 账号cookie失效")
+        return True, 0
+
+    results = []
+    info_list = response.json()["Result"] or []  # guard: Result may come back null when nothing matches (assumption)
+    if len(info_list) == 0:
+        print(f"{keyword}>>>暂无数据")
+        return True, 0
+
+    for item in info_list:
+        href = 'https://www.biaozhaozhao.com/detail/{}?keywords=%5B"{}"%5D'.format(item["id"], quote(keyword))
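+        # skip detail URLs already recorded in the Redis de-dup hash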
+        if not r.hexists(redis_key, href):
+            title = item['title']
+            publish_time = item['publishdate']
+            time_array = time.strptime(publish_time, "%Y-%m-%d %H:%M:%S")
+            publishtime_ts = int(time.mktime(time_array))
+            data = {
+                'site': '标找找',
+                'channel': channel,
+                'spidercode': 'sdxzbiddingsjzypc',
+                'title': title,
+                'area': '广东',  # province
+                'city': item['city'],  # city
+                'district': item['district'],
+                'publishdept': '',
+                'type': '',
+                'T': 'bidding',  # target data table
+                'sendflag': 'false',
+                '_d': 'comeintime',
+                'iscompete': True,  # new-crawler flag
+                'crawl': False,
+                'href': '#',
+                'competehref': href,
+                'publishtime': publish_time,
+                'l_np_publishtime': Int64(publishtime_ts),
+            }
+            results.append(data)
+            r.hset(redis_key, href, '')
+
+    if len(results) > 0:
+        save_data(results, bzz_list)
+    print(f"{keyword}>>>第{page}页采集完成,收录{len(results)}条")
+    # report the raw page size, not the deduped count, so pagination doesn't stop early on duplicate-heavy pages
+    return False, len(info_list)
+
+
+def list_page(channel, progress, keyword, pages):
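+    # walk pages 1..pages, stopping early once a page returns fewer than pageSize (50) records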
+    for page in range(1, pages + 1):
+        stop, total = crawl_spider_zb(channel, progress, page, keyword)
+        if stop or total < 50:
+            return False, page
+        time.sleep(2)
+    return True, page
+
+
+def select_label(keyword, pages):
+    print(f"{keyword}>>>开始采集{pages}页")
+    for name, index in type_maps.items():
+        normal_stop, page = list_page(name, index, keyword, pages)
+        if not normal_stop:
+            return False
+        print(f"{name}>>>完成采集:{keyword} 第{page}页")
+    return True
+
+
+def handler_detail(items):
+    headers = {
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+        "Cache-Control": "no-cache",
+        "Connection": "keep-alive",
+        "Pragma": "no-cache",
+        "Sec-Fetch-Dest": "document",
+        "Sec-Fetch-Mode": "navigate",
+        "Sec-Fetch-Site": "none",
+        "Sec-Fetch-User": "?1",
+        "Upgrade-Insecure-Requests": "1",
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
+        "sec-ch-ua": "\"Google Chrome\";v=\"107\", \"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\"",
+        "sec-ch-ua-mobile": "?0",
+        "sec-ch-ua-platform": "\"macOS\""
+    }
+    channel = items['channel']
+    url = items['competehref']
+    try:
+        response = requests.get(url, headers=headers, cookies=cookies, timeout=30)
+        if '您当天的查看项目额度已用完!' in response.text:
+            print(f'{channel}>>>{items["title"]} 额度已用完!')
+            return False
+
+        response = Selector(response.text)
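+        # NB: these obfuscated class names (dfUzPvnO / _4nFsQzFd) look build-generated and may change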
+        rubbish = response.xpath('//div[@class="dfUzPvnO"]').extract_first()
+        html = response.xpath('//div[@class="_4nFsQzFd"]').extract_first()
+        if html is not None:
+            contenthtml = html.replace(rubbish, '') if rubbish else html  # the junk block may be absent
+            items['contenthtml'] = contenthtml
+            items['detail'] = cleaner(contenthtml)
+            items['comeintime'] = Int64(int(time.time()))
+
+            bzz_list.update_one(
+                {'_id': items['_id']},
+                {'$set': {'crawl': True}}
+            )
+
+            if '_id' in items:
+                del items['_id'], items['crawl']
+            data_bak.insert_one(items)
+            print(f'{channel}>>>{items["title"]} 下载成功')
+            return True
+    except requests.RequestException:
+        print(f'{channel}>>>{items["title"]} 下载失败')
+    return False  # also covers the no-content case, which previously fell through returning None
+
+
+def detail_page():
+    q = {'crawl': False}  # items not yet fetched; handler_detail flips the flag on success
+    tasks = list(bzz_list.find(q))
+    for item in tasks:
+        handler_detail(item)
+        time.sleep(1.5)
+    print("任务结束")
+
+
+def _count_zb_total(page, keyword, progress):
+    url = "https://www.biaozhaozhao.com/qcc/tender/search"
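+    # same search payload as crawl_spider_zb, but additionally narrowed to city code 4403
+    # (Shenzhen); only Paging.TotalRecords from the response is used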
+    data = {
+        "sortField": "publishdate",
+        "sortOrder": "DESC",
+        "searchType": "accurate",
+        "searchKeyList": [keyword],
+        "filter": {
+            "publishdate": [
+                {
+                    "currently": True,
+                    "flag": 5,
+                    "number": 1,
+                    "unit": "day",
+                    "min": "2022-07-31T16:00:00.000Z",
+                    "max": "2022-08-31T15:59:59.999Z"
+                }
+            ],
+            "ifbprogress": [
+                progress
+            ],
+            "region": [
+                {
+                    "pr": "GD",
+                    "ct": "4403"
+                }
+            ],
+            "isvalid": 1
+        },
+        "queryLink": "or",
+        "pageIndex": page,
+        "pageSize": 50,
+        "isHighlight": True,
+        "isDegrade": True
+    }
+    data = json.dumps(data)
+    response = requests.post(url, headers=headers, cookies=cookies, data=data, timeout=30)
+    # print(response.json())
+    total_records = response.json()['Paging']['TotalRecords']
+    return int(total_records)
+
+
+def count_total():
+    count = 0
+    for tp_name, tp_index in type_maps.items():
+        totals = 0
+        for kw in ['一体机', '白板', '黑板', '大屏', '智慧屏', '录播', '智能教师']:
+            totals += _count_zb_total(1, kw, tp_index)
+
+        print(tp_name, totals)
+        count += totals
+    print("总计 ", count)
+
+
+def push_spider_dbs():
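+    # copy staged documents into the second MongoDB instance (cli1), re-stamping comeintime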
+    dbk = cli1['py_spider']['data_bak']
+    cur1 = data_bak.find()
+    for item in cur1:
+        del item['_id']
+        print(item)
+        item['comeintime'] = Int64(int(time.time()))
+        dbk.insert_one(item)
+
+
+if __name__ == '__main__':
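+    # crawl list pages for the active keyword, then download details for pending items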
+    select_label('一体机', 13)
+    # select_label('白板', 2)
+    # select_label('黑板', 2)
+    # select_label('大屏', 3)
+    # select_label('智慧屏', 1)
+    # select_label('录播', 2)
+    # select_label('智能教师', 1)
+    detail_page()