- import json
- import re
- import time
- import redis
- import requests
- from bson.int64 import Int64
- from parsel import Selector
- from pymongo import MongoClient
- from urllib.parse import quote
# MongoDB clients: the remote instance (192.168.20.248) holds the crawl list
# and the result backup; the local instance (cli1) is the push target used by
# push_spider_dbs.
cli = MongoClient("192.168.20.248", 27017)
bzz_list = cli['dzr']['bzz_zb_list']
cli1 = MongoClient("127.0.0.1", 27001)
# NOTE(review): data_bak is created on the remote client (cli), not cli1 —
# looks intentional (push_spider_dbs copies remote -> local), but confirm.
data_bak = cli['dzr']['data_bak']

# Redis hash used as a URL de-duplication set (see redis_key below).
# Fix: `decode_responses` must be configured on the ConnectionPool; passing it
# to redis.Redis() is silently ignored when an explicit connection_pool is
# supplied, so the old code never actually decoded responses.
_pool = redis.ConnectionPool(
    host='192.168.20.248',
    port=6379,
    password='top@123',
    db=2,
    decode_responses=True,
)
r = redis.Redis(connection_pool=_pool)
redis_key = 'duplicate_bzz_list'
# Announcement channel name -> site "ifbprogress" filter code.
# Keys are the human-readable channel labels stored on each saved row;
# values are the codes sent in the search payload (see crawl_spider_zb).
type_maps = {
    '中标': '42',        # winning bid
    '成交': '43',        # transaction concluded
    '单一来源': '410',   # single-source procurement
    '合同及验收': '48',  # contract & acceptance
}
# channel = "中标"
# channel_sign = "42"
# Request headers for the JSON search API, captured from a real browser
# session. NOTE(review): the x-kzz-request-* values (id/key/time) appear to be
# per-session anti-bot tokens — presumably they expire and need re-capturing;
# confirm against a fresh browser session when requests start failing.
headers = {
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Content-Type": "application/json",
    "Origin": "https://www.biaozhaozhao.com",
    "Pragma": "no-cache",
    "Referer": "https://www.biaozhaozhao.com/search?keyword=%E4%B8%80%E4%BD%93%E6%9C%BA&restore=1",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
    "sec-ch-ua": "\"Google Chrome\";v=\"107\", \"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"macOS\"",
    "sentry-trace": "aaa4d61b9e3243a5baa5cca1c1c88108-bd50500fc742e0e9-0",
    "x-kzz-request-from": "qcc-tender-web",
    "x-kzz-request-id": "27328858-5576-2734-188319849847",
    "x-kzz-request-key": "E76F330BE1AA411B23876BC67C11750D",
    "x-kzz-request-time": "1668845731606"
}
# Session cookies captured from a logged-in browser session.
# NOTE(review): QCCSESSID is a login session — it expires; crawl_spider_zb
# treats HTTP 403 as "cookie expired", at which point these must be refreshed.
cookies = {
    "QCCSESSID": "b410c26e4aa5895529a652519c",
    "gid_87f036f882014cd8": "b410c26e4aa5895529a652519c",
    "ls_371ef9f55b6b63dc": "42d6a4659d87930e"
}
# Regex -> replacement rule tables consumed by cleaner(); each key is applied
# with re.sub over the page source, in merge order (see cleaner()).

# Independent elements
INDEPENDENT_TAGS = {
    '<head>[\s\S]*?</head>': '',
    '<html>|<html [^>]*>|</html>': '',
    '<body>|<body [^>]*>|</body>': '',
    '<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # metadata
    '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # whitespace entities
    '\\xa0|\\u3000': '',  # non-breaking / ideographic spaces
    '<!--[\s\S]*?-->': '',  # comments
    '<style[^<>]*>[\s\S]*?</style>': '',  # stylesheets
    '<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
    '<input>': '',  # input boxes
    '<img[^>]*>': '<br>',  # images
}
# Inline elements
INLINE_TAGS = {
    '<a>|<a [^>]*>|</a>': '',  # hyperlinks
    '<span>|<span [^>]*>|</span>': '',  # span
    '<label>|<label [^>]*>|</label>': '<br>',  # label
    '<font>|<font [^>]*>|<font[\s\S][^>]*>|</font>': '',  # font
}
# Block-level elements
BLOCK_TAGS = {
    # '<h[1-6][^>]*>[\s\S]*?</h[1-6]>': '',  # headings
    # '<h[1-6][^>]*>|</h[1-6]>': '',  # headings
    '<p>|<p [^>]*>|</p>': '<br>',  # paragraphs
    '<div>|<div [^>]*>|</div>': '<br>',  # division
    '<o:p>|<o:p [^>]*>|</o:p>': ''  # MS Office WORD paragraph
}
# Miscellaneous boilerplate
OTHER = {
    # Fix: '?' must be escaped — the old '<?xml' made the '<' optional, so the
    # pattern stripped any bare "xml...>" run from page text.
    '<\?xml[^>]*>|<\?xml [^>]*>|<\?xml:.*?>': '',
    '<epointform>': '',
    '<!doctype html>|<!doctype html [^>]*>': '',
    '【关闭】|关闭': '',
    '【打印】|打印本页': '',
    '【字体:[\s\S]*】': '',
    '文章来源:[\u4e00-\u9fa5]+': '',
    '浏览次数:.*[<]+': '',
    # Fix: parentheses must be escaped — the old '(责任编辑:.*?)' was a regex
    # group whose lazy body matched empty, so only '责任编辑:' was removed and
    # the literal parentheses were left in the output.
    '\(责任编辑:.*?\)': '',
    '分享到[:]': '',
    '相关链接:[\s\S]+': '',
    '阅读数[::]\d+': '',
}
# CSS style attributes
CSS_STYLE = {
    'style="[\s\S]*?"|style ="[\s\S]*?"': '',
    'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
    'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
    'class="[\s\S]*?"|class ="[\s\S]*?"': '',
    'align="[\s\S]*?"|align ="[\s\S]*?"': '',
    'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
}
# Whitespace normalization
BLANKS = {
    '\n\s*\n': '\n',
    '\s*\n\s*': '\n',
    '[^\S\n]': ' ',
    '\s+': ' ',
}
# Tag names eligible for repair (see _repair_tag)
TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
# Attribute names eligible for repair
ATTRS = {'id', 'class', 'style', 'width'}
def _repair_tag():
    """Build the replacement map for broken tag/attribute pairs.

    Non-standard pages sometimes fuse a tag name and its first attribute
    (e.g. 'divstyle'); map each such fusion back to 'div style'.
    """
    return {
        '{}{}'.format(t, a): '{} {}'.format(t, a)
        for t in TAGS
        for a in ATTRS
    }
- def _escape_character(html):
- """转义字符"""
- html = html.replace('<', '<')
- html = html.replace('>', '>')
- html = html.replace('"', '"')
- html = html.replace('&', '&')
- return html
def _lowercase_tag(html):
    """Normalize markup: lowercase every tag, then repair fused tag/attr pairs.

    :param html: raw page source
    :return: source with all <...> spans lowercased and _repair_tag fixes applied
    """
    # Lowercase each tag-shaped substring in place.
    for match in re.findall("<[^>]+>", html):
        html = html.replace(match, match.lower())
    # Un-fuse tag+attribute combinations produced by malformed pages.
    for broken, fixed in _repair_tag().items():
        html = html.replace(broken, fixed)
    return html
def cleaner(html, special=None, completely=False):
    """Clean a detail-page HTML fragment down to readable text/markup.

    :param html: page source to clean
    :param special: extra {pattern: replacement} rules for this page only
    :param completely: when True also strip canvas/iframe and residual
                       attribute-like tags
    :return: cleaned page source
    """
    if special is None:
        special = {}
    # Fix: merge `special` locally instead of calling OTHER.update(special) —
    # the old code mutated the module-level OTHER dict, so one call's extra
    # rules leaked into every subsequent call.
    remove_tags = {
        **INDEPENDENT_TAGS,
        **INLINE_TAGS,
        **BLOCK_TAGS,
        **OTHER,
        **special,
        **CSS_STYLE,
        **BLANKS,
    }
    html = _lowercase_tag(html)
    for pattern, repl in remove_tags.items():
        html = re.sub(pattern, repl, html)
    if completely:
        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # canvas
        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # inline frame
        # Strip leftover pseudo-tags (attribute runs, font names) that the
        # rules above reduced to <...> husks.
        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
    html = _escape_character(html)
    return html
def save_data(documents, col):
    """Insert one document or a list of documents into collection `col`."""
    batch = documents if isinstance(documents, list) else [documents]
    col.insert_many(batch)
def crawl_spider_zb(channel, progress, page, keyword):
    """Fetch one page of tender search results and save the new rows.

    :param channel: human-readable channel label (e.g. '中标'), stored per row
    :param progress: site "ifbprogress" filter code for the channel (type_maps)
    :param page: 1-based page index to request
    :param keyword: search keyword
    :return: (stop, saved) — stop is True when crawling should halt
             (cookie rejected or no data); saved is the count of new rows
    """
    url = "https://www.biaozhaozhao.com/qcc/tender/search"
    data = {
        "sortField": "publishdate",
        "sortOrder": "DESC",
        "searchType": "accurate",
        "searchKeyList": [keyword],
        "filter": {
            # NOTE(review): hard-coded publish window (Aug 2022) — presumably
            # adjusted per crawl run; confirm before reuse.
            "publishdate": [
                {
                    "currently": True,
                    "flag": 5,
                    "number": 1,
                    "unit": "day",
                    "min": "2022-07-31T16:00:00.000Z",
                    "max": "2022-08-31T15:59:59.999Z"
                }
            ],
            "ifbprogress": [
                progress
            ],
            "region": [
                {
                    "pr": "GD"
                }
            ],
            "isvalid": 1
        },
        "queryLink": "or",
        "pageIndex": page,
        "pageSize": 50,
        "isHighlight": True,
        "isDegrade": True
    }
    data = json.dumps(data)
    response = requests.post(url, headers=headers, cookies=cookies, data=data, timeout=30)
    if response.status_code == 403:
        # 403 means the login cookie was rejected — stop the whole run.
        print(f"{keyword}>>>当前第{page}页, 账号cookie失效")
        return True, 0
    results = []
    info_list = response.json()["Result"]
    if len(info_list) == 0:
        print(f"{keyword}>>>暂无数据")
        return True, 0
    for item in info_list:
        href = 'https://www.biaozhaozhao.com/detail/{}?keywords=%5B"{}"%5D'.format(item["id"], quote(keyword))
        # Redis hash acts as the de-duplication set keyed by detail URL.
        if not r.hexists(redis_key, href):
            title = item['title']
            publish_time = item['publishdate']
            time_array = time.strptime(publish_time, "%Y-%m-%d %H:%M:%S")
            publishtime_ts = int(time.mktime(time_array))
            data = {
                'site': '标找找',
                'channel': channel,
                'spidercode': 'sdxzbiddingsjzypc',
                'title': title,
                'area': '广东',  # province
                'city': item['city'],  # city
                'district': item['district'],
                'publishdept': '',
                'type': '',
                'T': 'bidding',  # destination table name
                'sendflag': 'false',
                '_d': 'comeintime',
                'iscompete': True,  # new-style spider flag
                'crawl': False,  # flipped to True by handler_detail after download
                'href': '#',
                'competehref': href,
                'publishtime': publish_time,
                'l_np_publishtime': Int64(publishtime_ts),
            }
            results.append(data)
            # Mark as seen only after queueing, so a crash doesn't lose rows.
            r.hset(redis_key, href, '')
    if len(results) > 0:
        save_data(results, bzz_list)
        print(f"{keyword}>>>第{page}页采集完成,收录{len(results)}条")
    return False, len(results)
def list_page(channel, progress, keyword, pages):
    """Crawl up to `pages` list pages for one channel/keyword pair.

    :param channel: channel label passed through to crawl_spider_zb
    :param progress: site filter code for the channel
    :param keyword: search keyword
    :param pages: maximum number of pages to fetch
    :return: (finished_normally, last_page) — False when the crawl stopped
             early (auth failure, empty page, or a short (<50 row) final page)
    """
    page = 0  # fix: the final return raised NameError when pages < 1
    for page in range(1, pages + 1):
        stop, total = crawl_spider_zb(channel, progress, page, keyword)
        if stop or total < 50:
            # A short page marks the end of results; `stop` marks a hard stop.
            return False, page
        time.sleep(2)  # polite delay between page requests
    return True, page
def select_label(keyword, pages):
    """Crawl `pages` list pages of every channel in type_maps for `keyword`.

    :return: True when every channel completed normally, False on early stop.
    """
    print(f"{keyword}>>>开始采集{pages}页")
    for name in type_maps:
        normal_stop, page = list_page(name, type_maps[name], keyword, pages)
        if not normal_stop:
            # Propagate the early stop so the caller can abort remaining work.
            return False
        print(f"{name}>>>完成采集:{keyword} 第{page}页")
    return True
def handler_detail(items):
    """Download one detail page, clean it, and move the record to data_bak.

    :param items: a bzz_list document (must contain 'channel', 'competehref',
                  'title', '_id')
    :return: True on success, False on quota/network failure; None when the
             content div was not found on the page
    """
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
        "sec-ch-ua": "\"Google Chrome\";v=\"107\", \"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"macOS\""
    }
    channel = items['channel']
    url = items['competehref']
    try:
        response = requests.get(url, headers=headers, cookies=cookies, timeout=30)
        if '您当天的查看项目额度已用完!' in response.text:
            # Daily view quota exhausted — nothing more can be downloaded today.
            print(f'{channel}>>>{items["title"]} 额度已用完!')
            return False
        response = Selector(response.text)
        rubbish = response.xpath('//div[@class="dfUzPvnO"]').extract_first()
        html = response.xpath('//div[@class="_4nFsQzFd"]').extract_first()
        if html is not None:
            # Fix: extract_first() returns None when the rubbish div is absent;
            # html.replace(None, '') raised TypeError, which the except clause
            # below does not catch.
            contenthtml = html.replace(rubbish, '') if rubbish else html
            items['contenthtml'] = contenthtml
            items['detail'] = cleaner(contenthtml)
            items['comeintime'] = Int64(int(time.time()))
            # Mark the list record as crawled before moving it to data_bak.
            bzz_list.update_one(
                {'_id': items['_id']},
                {'$set': {'crawl': True}}
            )
            if '_id' in items:
                del items['_id'], items['crawl']
            data_bak.insert_one(items)
            print(f'{channel}>>>{items["title"]} 下载成功')
            return True
    except requests.RequestException:
        print(f'{channel}>>>{items["title"]} 下载失败')
        return False
def detail_page():
    """Download the detail page of every pending (not-yet-crawled) record."""
    # Fix: query records NOT yet crawled. `crawl` is created as False by
    # crawl_spider_zb and set to True by handler_detail after a successful
    # download; the previous filter {'crawl': True} re-processed finished rows
    # and duplicated them into data_bak. (The original commented-out line
    # showed the intended filter.)
    q = {'crawl': False}
    # Materialize the task list first so slow downloads don't hold a cursor.
    with bzz_list.find(q) as cursor:
        tasks = [item for item in cursor]
    for item in tasks:
        handler_detail(item)
        time.sleep(1.5)  # polite delay between detail requests
    print("任务结束")
def _count_zb_total(page, keyword, progress):
    """Return the total record count the site reports for one search.

    Same payload shape as crawl_spider_zb, but note the extra region filter
    "ct": "4403" (a city code — presumably Shenzhen; confirm) which
    crawl_spider_zb does not apply, so counts here cover a narrower scope.

    :param page: page index to request (only Paging totals are read)
    :param keyword: search keyword
    :param progress: site "ifbprogress" filter code
    :return: total matching records as int
    """
    url = "https://www.biaozhaozhao.com/qcc/tender/search"
    data = {
        "sortField": "publishdate",
        "sortOrder": "DESC",
        "searchType": "accurate",
        "searchKeyList": [keyword],
        "filter": {
            # NOTE(review): hard-coded publish window (Aug 2022), same as
            # crawl_spider_zb — keep the two in sync.
            "publishdate": [
                {
                    "currently": True,
                    "flag": 5,
                    "number": 1,
                    "unit": "day",
                    "min": "2022-07-31T16:00:00.000Z",
                    "max": "2022-08-31T15:59:59.999Z"
                }
            ],
            "ifbprogress": [
                progress
            ],
            "region": [
                {
                    "pr": "GD",
                    "ct": "4403"
                }
            ],
            "isvalid": 1
        },
        "queryLink": "or",
        "pageIndex": page,
        "pageSize": 50,
        "isHighlight": True,
        "isDegrade": True
    }
    data = json.dumps(data)
    response = requests.post(url, headers=headers, cookies=cookies, data=data, timeout=30)
    # print(response.json())
    total_records = response.json()['Paging']['TotalRecords']
    return int(total_records)
def count_total():
    """Print the per-channel and overall record counts for all keywords."""
    count = 0
    keywords = ['一体机', '白板', '黑板', '大屏', '智慧屏', '录播', '智能教师']
    for tp_name, tp_index in type_maps.items():
        # Sum this channel's totals across every keyword.
        totals = sum(_count_zb_total(1, kw, tp_index) for kw in keywords)
        print(tp_name, totals)
        count += totals
    print("总计 ", count)
def push_spider_dbs():
    """Copy every document from the remote data_bak into the local py_spider DB."""
    target = cli1['py_spider']['data_bak']
    for doc in data_bak.find():
        # Drop the source _id so the local insert generates its own.
        del doc['_id']
        print(doc)
        doc['comeintime'] = Int64(int(time.time()))
        target.insert_one(doc)
if __name__ == '__main__':
    # Stage 1: crawl list pages for the keyword across every channel in
    # type_maps (page counts were apparently sized per keyword by hand).
    select_label('一体机', 13)
    # select_label('白板', 2)
    # select_label('黑板', 2)
    # select_label('大屏', 3)
    # select_label('智慧屏', 1)
    # select_label('录播', 2)
    # select_label('智能教师', 1)
    # Stage 2: download and store the detail page of every collected record.
    detail_page()
|