spider.py

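# Overview: list/detail spider for tender announcements on www.biaozhaozhao.com.
# select_label() walks the search API for each channel in type_maps and stages new rows
# in the MongoDB collection bzz_zb_list, de-duplicating detail URLs in a Redis hash;
# detail_page() then downloads each pending detail page, strips boilerplate with cleaner(),
# and archives the record into data_bak. Hosts, credentials, cookies and request keys below
# are the original hard-coded values.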
import json
import re
import time
import redis
import requests
from bson.int64 import Int64
from parsel import Selector
from pymongo import MongoClient
from urllib.parse import quote

# MongoDB: remote store for list rows / archived details, local store for pushed copies
cli = MongoClient("192.168.20.248", 27017)
bzz_list = cli['dzr']['bzz_zb_list']
cli1 = MongoClient("127.0.0.1", 27001)
data_bak = cli['dzr']['data_bak']

# Redis hash used to de-duplicate detail URLs.
# decode_responses must be set on the pool: Redis() ignores the kwarg when a pool is supplied.
_pool = redis.ConnectionPool(
    host='192.168.20.248',
    port=6379,
    password='top@123',
    db=2,
    decode_responses=True
)
r = redis.Redis(connection_pool=_pool)
redis_key = 'duplicate_bzz_list'

# channel name -> ifbprogress code used in the search filter
type_maps = {
    '中标': '42',
    '成交': '43',
    '单一来源': '410',
    '合同及验收': '48',
}
# channel = "中标"
# channel_sign = "42"
headers = {
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Content-Type": "application/json",
    "Origin": "https://www.biaozhaozhao.com",
    "Pragma": "no-cache",
    "Referer": "https://www.biaozhaozhao.com/search?keyword=%E4%B8%80%E4%BD%93%E6%9C%BA&restore=1",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
    "sec-ch-ua": "\"Google Chrome\";v=\"107\", \"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"macOS\"",
    "sentry-trace": "aaa4d61b9e3243a5baa5cca1c1c88108-bd50500fc742e0e9-0",
    "x-kzz-request-from": "qcc-tender-web",
    "x-kzz-request-id": "27328858-5576-2734-188319849847",
    "x-kzz-request-key": "E76F330BE1AA411B23876BC67C11750D",
    "x-kzz-request-time": "1668845731606"
}
cookies = {
    "QCCSESSID": "b410c26e4aa5895529a652519c",
    "gid_87f036f882014cd8": "b410c26e4aa5895529a652519c",
    "ls_371ef9f55b6b63dc": "42d6a4659d87930e"
}

# Standalone elements
INDEPENDENT_TAGS = {
    r'<head>[\s\S]*?</head>': '',
    '<html>|<html [^>]*>|</html>': '',
    '<body>|<body [^>]*>|</body>': '',
    r'<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # metadata
    '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # whitespace entities
    '\\xa0|\\u3000': '',  # non-breaking / full-width spaces
    r'<!--[\s\S]*?-->': '',  # comments
    r'<style[^<>]*>[\s\S]*?</style>': '',  # styles
    r'<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
    '<input>': '',  # input fields
    '<img[^>]*>': '<br>',  # images
}
# Inline elements
INLINE_TAGS = {
    '<a>|<a [^>]*>|</a>': '',  # hyperlinks
    '<span>|<span [^>]*>|</span>': '',  # span
    '<label>|<label [^>]*>|</label>': '<br>',  # label
    r'<font>|<font [^>]*>|<font[\s\S][^>]*>|</font>': '',  # font
}
# Block-level elements
BLOCK_TAGS = {
    # '<h[1-6][^>]*>[\s\S]*?</h[1-6]>': '',  # headings
    # '<h[1-6][^>]*>|</h[1-6]>': '',  # headings
    '<p>|<p [^>]*>|</p>': '<br>',  # paragraphs
    '<div>|<div [^>]*>|</div>': '<br>',  # division
    '<o:p>|<o:p [^>]*>|</o:p>': ''  # Microsoft Word (Office) paragraphs
}
# Miscellaneous
OTHER = {
    '<?xml[^>]*>|<?xml [^>]*>|<?xml:.*?>': '',
    '<epointform>': '',
    '<!doctype html>|<!doctype html [^>]*>': '',
    '【关闭】|关闭': '',
    '【打印】|打印本页': '',
    r'【字体:[\s\S]*】': '',
    r'文章来源:[\u4e00-\u9fa5]+': '',
    '浏览次数:.*[<]+': '',
    '(责任编辑:.*?)': '',
    '分享到[:]': '',
    r'相关链接:[\s\S]+': '',
    r'阅读数[::]\d+': '',
}
# Styles
CSS_STYLE = {
    r'style="[\s\S]*?"|style ="[\s\S]*?"': '',
    r'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
    r'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
    r'class="[\s\S]*?"|class ="[\s\S]*?"': '',
    r'align="[\s\S]*?"|align ="[\s\S]*?"': '',
    r'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
}
# Whitespace
BLANKS = {
    r'\n\s*\n': '\n',
    r'\s*\n\s*': '\n',
    r'[^\S\n]': ' ',
    r'\s+': ' ',
}
# Tag set used for tag repairs
TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
# Attribute set used for tag repairs
ATTRS = {'id', 'class', 'style', 'width'}
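# The tables above are merged and applied in order by cleaner() below:
# INDEPENDENT_TAGS -> INLINE_TAGS -> BLOCK_TAGS -> OTHER -> CSS_STYLE -> BLANKS.
# Keys are regexes, values their replacements. TAGS/ATTRS feed _repair_tag(), which fixes
# tags whose attribute got glued to the tag name (e.g. "divclass" -> "div class").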
def _repair_tag():
    """Abnormal tag combinations, used to repair tags on non-standard pages"""
    _repairs = {}
    for tag in TAGS:
        for attr in ATTRS:
            key = '{}{}'.format(tag, attr)
            val = '{} {}'.format(tag, attr)
            _repairs[key] = val
    return _repairs


def _escape_character(html):
    """Unescape common HTML entities"""
    html = html.replace('&lt;', '<')
    html = html.replace('&gt;', '>')
    html = html.replace('&quot;', '"')
    html = html.replace('&amp;', '&')
    return html


def _lowercase_tag(html):
    """Normalize tags (convert them all to lowercase), then repair glued tag/attribute pairs"""
    tags = re.findall("<[^>]+>", html)
    for tag in tags:
        html = html.replace(tag, str(tag).lower())
    repair_tags = _repair_tag()
    for err, right in repair_tags.items():
        html = html.replace(err, right)
    return html

def cleaner(html, special=None, completely=False):
    """
    Clean page source.
    :param html: page to clean
    :param special: extra, page-specific cleaning rules
    :param completely: whether to clean the page aggressively
    :return: cleaned page source
    """
    if special is None:
        special = {}
    OTHER.update(special)
    remove_tags = {
        **INDEPENDENT_TAGS,
        **INLINE_TAGS,
        **BLOCK_TAGS,
        **OTHER,
        **CSS_STYLE,
        **BLANKS,
    }
    html = _lowercase_tag(html)
    for tag, repl in remove_tags.items():
        html = re.sub(tag, repl, html)
    if completely:
        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # canvas
        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # iframes
        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
    html = _escape_character(html)
    return html

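# Rough usage sketch (hypothetical input): cleaner('<DIV class="x"><p>正文&nbsp;</p></DIV>')
# lowercases the tags, turns the <div>/<p> wrappers into <br>, drops the class attribute and
# the &nbsp; entity, and collapses the remaining whitespace.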
def save_data(documents, col):
    if isinstance(documents, list):
        col.insert_many(documents)
    else:
        col.insert_one(documents)

def crawl_spider_zb(channel, progress, page, keyword):
    """Fetch one page of search results and stage new rows into bzz_zb_list.

    Returns (stop, count): stop=True when the cookie is rejected or the page is empty.
    """
    url = "https://www.biaozhaozhao.com/qcc/tender/search"
    data = {
        "sortField": "publishdate",
        "sortOrder": "DESC",
        "searchType": "accurate",
        "searchKeyList": [keyword],
        "filter": {
            "publishdate": [
                {
                    "currently": True,
                    "flag": 5,
                    "number": 1,
                    "unit": "day",
                    "min": "2022-07-31T16:00:00.000Z",
                    "max": "2022-08-31T15:59:59.999Z"
                }
            ],
            "ifbprogress": [
                progress
            ],
            "region": [
                {
                    "pr": "GD"
                }
            ],
            "isvalid": 1
        },
        "queryLink": "or",
        "pageIndex": page,
        "pageSize": 50,
        "isHighlight": True,
        "isDegrade": True
    }
    data = json.dumps(data)
    response = requests.post(url, headers=headers, cookies=cookies, data=data, timeout=30)
    if response.status_code == 403:
        print(f"{keyword}>>>当前第{page}页, 账号cookie失效")
        return True, 0
    results = []
    info_list = response.json()["Result"]
    if len(info_list) == 0:
        print(f"{keyword}>>>暂无数据")
        return True, 0
    for item in info_list:
        href = 'https://www.biaozhaozhao.com/detail/{}?keywords=%5B"{}"%5D'.format(item["id"], quote(keyword))
        if not r.hexists(redis_key, href):
            title = item['title']
            publish_time = item['publishdate']
            time_array = time.strptime(publish_time, "%Y-%m-%d %H:%M:%S")
            publishtime_ts = int(time.mktime(time_array))
            data = {
                'site': '标找找',
                'channel': channel,
                'spidercode': 'sdxzbiddingsjzypc',
                'title': title,
                'area': '广东',  # province
                'city': item['city'],  # city
                'district': item['district'],
                'publishdept': '',
                'type': '',
                'T': 'bidding',  # destination table name
                'sendflag': 'false',
                '_d': 'comeintime',
                'iscompete': True,  # new spider
                'crawl': False,
                'href': '#',
                'competehref': href,
                'publishtime': publish_time,
                'l_np_publishtime': Int64(publishtime_ts),
            }
            results.append(data)
            r.hset(redis_key, href, '')
    if len(results) > 0:
        save_data(results, bzz_list)
    print(f"{keyword}>>>第{page}页采集完成,收录{len(results)}条")
    return False, len(results)

def list_page(channel, progress, keyword, pages):
    for page in range(1, pages + 1):
        stop, total = crawl_spider_zb(channel, progress, page, keyword)
        if stop or total < 50:
            return False, page
        time.sleep(2)
    return True, page


def select_label(keyword, pages):
    print(f"{keyword}>>>开始采集{pages}页")
    for name, index in type_maps.items():
        normal_stop, page = list_page(name, index, keyword, pages)
        if not normal_stop:
            return False
        print(f"{name}>>>完成采集:{keyword} 第{page}页")
    return True

def handler_detail(items):
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
        "sec-ch-ua": "\"Google Chrome\";v=\"107\", \"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"macOS\""
    }
    channel = items['channel']
    url = items['competehref']
    try:
        response = requests.get(url, headers=headers, cookies=cookies, timeout=30)
        if '您当天的查看项目额度已用完!' in response.text:
            print(f'{channel}>>>{items["title"]} 额度已用完!')
            return False
        response = Selector(response.text)
        rubbish = response.xpath('//div[@class="dfUzPvnO"]').extract_first()
        html = response.xpath('//div[@class="_4nFsQzFd"]').extract_first()
        if html is not None:
            # guard: the "rubbish" block may be missing; str.replace(None, ...) would raise
            contenthtml = html.replace(rubbish, '') if rubbish else html
            items['contenthtml'] = contenthtml
            items['detail'] = cleaner(contenthtml)
            items['comeintime'] = Int64(int(time.time()))
            bzz_list.update_one(
                {'_id': items['_id']},
                {'$set': {'crawl': True}}
            )
            if '_id' in items:
                del items['_id'], items['crawl']
            data_bak.insert_one(items)
            print(f'{channel}>>>{items["title"]} 下载成功')
        return True
    except requests.RequestException:
        print(f'{channel}>>>{items["title"]} 下载失败')
        return False

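# detail_page() drains the rows staged by crawl_spider_zb(): every document with crawl=False
# is fetched through handler_detail(), which flags it crawl=True in bzz_zb_list and copies
# the cleaned record into data_bak.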
def detail_page():
    # only rows not yet downloaded (crawl_spider_zb stages them with crawl=False)
    q = {'crawl': False}
    amount_of_project = 0
    # total = bzz_list.count_documents(q)
    # while total > 0:
    #     print(f"{channel}>>>剩余任务{total}条")
    tasks = list(bzz_list.find(q))
    for item in tasks:
        handler_detail(item)
        # if stop_crawl:
        #     amount_of_project += 1
        time.sleep(1.5)
    print("任务结束")

def _count_zb_total(page, keyword, progress):
    """Return TotalRecords reported by the search API for one keyword/channel."""
    url = "https://www.biaozhaozhao.com/qcc/tender/search"
    data = {
        "sortField": "publishdate",
        "sortOrder": "DESC",
        "searchType": "accurate",
        "searchKeyList": [keyword],
        "filter": {
            "publishdate": [
                {
                    "currently": True,
                    "flag": 5,
                    "number": 1,
                    "unit": "day",
                    "min": "2022-07-31T16:00:00.000Z",
                    "max": "2022-08-31T15:59:59.999Z"
                }
            ],
            "ifbprogress": [
                progress
            ],
            "region": [
                {
                    "pr": "GD",
                    "ct": "4403"
                }
            ],
            "isvalid": 1
        },
        "queryLink": "or",
        "pageIndex": page,
        "pageSize": 50,
        "isHighlight": True,
        "isDegrade": True
    }
    data = json.dumps(data)
    response = requests.post(url, headers=headers, cookies=cookies, data=data, timeout=30)
    # print(response.json())
    total_records = response.json()['Paging']['TotalRecords']
    return int(total_records)

def count_total():
    count = 0
    for tp_name, tp_index in type_maps.items():
        totals = 0
        for kw in ['一体机', '白板', '黑板', '大屏', '智慧屏', '录播', '智能教师']:
            totals += _count_zb_total(1, kw, tp_index)
        print(tp_name, totals)
        count += totals
    print("总计 ", count)

def push_spider_dbs():
    """Copy archived documents from the remote data_bak into the local py_spider.data_bak."""
    dbk = cli1['py_spider']['data_bak']
    cur1 = data_bak.find()
    for item in cur1:
        del item['_id']
        print(item)
        item['comeintime'] = Int64(int(time.time()))
        dbk.insert_one(item)

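# Entry point: collect list pages for one keyword, then download the pending detail pages.
# count_total() and push_spider_dbs() are helper routines not invoked from this entry point.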
if __name__ == '__main__':
    select_label('一体机', 13)
    # select_label('白板', 2)
    # select_label('黑板', 2)
    # select_label('大屏', 3)
    # select_label('智慧屏', 1)
    # select_label('录播', 2)
    # select_label('智能教师', 1)
    detail_page()