import json
import re
import time

import redis
import requests
from bson.int64 import Int64
from parsel import Selector
from pymongo import MongoClient
from urllib.parse import quote

# Remote MongoDB: listing rows and the raw result store
cli = MongoClient("192.168.20.248", 27017)
bzz_list = cli['dzr']['bzz_zb_list']

# Local MongoDB: target of push_spider_dbs()
cli1 = MongoClient("127.0.0.1", 27001)
data_bak = cli['dzr']['data_bak']  # note: lives on the remote client (cli), not cli1

_pool = redis.ConnectionPool(
    host='192.168.20.248',
    port=6379,
    password='top@123',
    db=2,
    decode_responses=True  # must be set on the pool; redis.Redis() ignores it when a pool is passed
)
r = redis.Redis(connection_pool=_pool)
redis_key = 'duplicate_bzz_list'
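# How the de-duplication key is used (summarized from crawl_spider_zb below):
# the Redis hash acts as a seen-set keyed by detail URL, e.g.
#   if not r.hexists(redis_key, href):
#       ...queue the listing...
#       r.hset(redis_key, href, '')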
# Announcement category name -> site channel id
type_maps = {
    '中标': '42',       # award notices
    '成交': '43',       # transaction / deal results
    '单一来源': '410',  # single-source procurement
    '合同及验收': '48',  # contracts and acceptance
}

# channel = "中标"
# channel_sign = "42"

headers = {
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Content-Type": "application/json",
    "Origin": "https://www.biaozhaozhao.com",
    "Pragma": "no-cache",
    "Referer": "https://www.biaozhaozhao.com/search?keyword=%E4%B8%80%E4%BD%93%E6%9C%BA&restore=1",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
    "sec-ch-ua": "\"Google Chrome\";v=\"107\", \"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"macOS\"",
    "sentry-trace": "aaa4d61b9e3243a5baa5cca1c1c88108-bd50500fc742e0e9-0",
    "x-kzz-request-from": "qcc-tender-web",
    "x-kzz-request-id": "27328858-5576-2734-188319849847",
    "x-kzz-request-key": "E76F330BE1AA411B23876BC67C11750D",
    "x-kzz-request-time": "1668845731606"
}
cookies = {
    "QCCSESSID": "b410c26e4aa5895529a652519c",
    "gid_87f036f882014cd8": "b410c26e4aa5895529a652519c",
    "ls_371ef9f55b6b63dc": "42d6a4659d87930e"
}
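# Note: the QCCSESSID cookie and the x-kzz-request-* headers above are
# captured from a logged-in browser session and are expected to expire;
# crawl_spider_zb() treats an HTTP 403 response as a stale cookie and
# stops crawling the current keyword.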
# Standalone elements
INDEPENDENT_TAGS = {
    r'<head>[\s\S]*?</head>': '',
    r'<html>|<html [^>]*>|</html>': '',
    r'<body>|<body [^>]*>|</body>': '',
    r'<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # metadata
    r'&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # whitespace entities
    r'\xa0|\u3000': '',  # non-breaking / full-width spaces
    r'<!--[\s\S]*?-->': '',  # comments
    r'<style[^<>]*>[\s\S]*?</style>': '',  # stylesheets
    r'<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
    r'<input>': '',  # input boxes
    r'<img[^>]*>': '<br>',  # images
}
# Inline elements
INLINE_TAGS = {
    r'<a>|<a [^>]*>|</a>': '',  # hyperlinks
    r'<span>|<span [^>]*>|</span>': '',  # span
    r'<label>|<label [^>]*>|</label>': '<br>',  # label
    r'<font>|<font [^>]*>|<font[\s\S][^>]*>|</font>': '',  # font
}
# Block-level elements
BLOCK_TAGS = {
    # r'<h[1-6][^>]*>[\s\S]*?</h[1-6]>': '',  # headings
    # r'<h[1-6][^>]*>|</h[1-6]>': '',  # headings
    r'<p>|<p [^>]*>|</p>': '<br>',  # paragraphs
    r'<div>|<div [^>]*>|</div>': '<br>',  # divisions
    r'<o:p>|<o:p [^>]*>|</o:p>': ''  # Microsoft Word (Office) paragraph tags
}
# Miscellaneous site boilerplate
OTHER = {
    r'<\?xml[^>]*>|<\?xml [^>]*>|<\?xml:.*?>': '',
    r'<epointform>': '',
    r'<!doctype html>|<!doctype html [^>]*>': '',
    '【关闭】|关闭': '',
    '【打印】|打印本页': '',
    r'【字体:[\s\S]*】': '',
    r'文章来源:[\u4e00-\u9fa5]+': '',
    r'浏览次数:.*[<]+': '',
    '(责任编辑:.*?)': '',
    '分享到[:]': '',
    r'相关链接:[\s\S]+': '',
    r'阅读数[::]\d+': '',
}
# Inline CSS attributes
CSS_STYLE = {
    r'style="[\s\S]*?"|style ="[\s\S]*?"': '',
    r'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
    r'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
    r'class="[\s\S]*?"|class ="[\s\S]*?"': '',
    r'align="[\s\S]*?"|align ="[\s\S]*?"': '',
    r'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
}
# Whitespace normalization
BLANKS = {
    r'\n\s*\n': '\n',
    r'\s*\n\s*': '\n',
    r'[^\S\n]': ' ',
    r'\s+': ' ',
}
# HTML tag set used when repairing malformed markup
TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
# Attribute set used when repairing malformed markup
ATTRS = {'id', 'class', 'style', 'width'}


def _repair_tag():
    """Build replacements for malformed tag/attribute pairs found on non-standard pages."""
    _repairs = {}
    for tag in TAGS:
        for attr in ATTRS:
            key = '{}{}'.format(tag, attr)
            val = '{} {}'.format(tag, attr)
            _repairs[key] = val
    return _repairs


def _escape_character(html):
    """Unescape common HTML entities."""
    html = html.replace('&lt;', '<')
    html = html.replace('&gt;', '>')
    html = html.replace('&quot;', '"')
    html = html.replace('&amp;', '&')
    return html


def _lowercase_tag(html):
    """Normalize tags to lowercase and repair malformed tag/attribute pairs."""
    tags = re.findall("<[^>]+>", html)
    for tag in tags:
        html = html.replace(tag, str(tag).lower())

    repair_tags = _repair_tag()
    for err, right in repair_tags.items():
        html = html.replace(err, right)

    return html


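# _repair_tag() pairs every entry of TAGS with every entry of ATTRS,
# producing a mapping such as {'divclass': 'div class', 'tablewidth':
# 'table width', ...}; _lowercase_tag() then uses it to re-insert the
# missing space in malformed markup such as '<divclass="content">'
# (hypothetical example).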
def cleaner(html, special=None, completely=False):
    """
    Clean a crawled page.

    :param html: page source to clean
    :param special: extra page-specific cleaning rules
    :param completely: whether to strip the page aggressively
    :return: cleaned page source
    """
    if special is None:
        special = {}
    remove_tags = {
        **INDEPENDENT_TAGS,
        **INLINE_TAGS,
        **BLOCK_TAGS,
        **OTHER,
        **special,  # merged here instead of mutating the module-level OTHER
        **CSS_STYLE,
        **BLANKS,
    }
    html = _lowercase_tag(html)
    for tag, repl in remove_tags.items():
        html = re.sub(tag, repl, html)

    if completely:
        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # canvas
        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # iframes
        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)

    html = _escape_character(html)
    return html


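# A minimal usage sketch of cleaner() on a made-up snippet (the input and
# the approximate output below are illustrative, not taken from the site):
#
#   cleaner('<div style="color:red"><p>公告内容</p></div>')
#   # -> roughly '<br><br>公告内容<br><br>': block tags collapse to <br>,
#   #    inline style attributes and extra whitespace are stripped.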
def save_data(documents, col):
    if isinstance(documents, list):
        col.insert_many(documents)
    else:
        col.insert_one(documents)


def crawl_spider_zb(channel, progress, page, keyword):
    url = "https://www.biaozhaozhao.com/qcc/tender/search"
    data = {
        "sortField": "publishdate",
        "sortOrder": "DESC",
        "searchType": "accurate",
        "searchKeyList": [keyword],
        "filter": {
            "publishdate": [
                {
                    "currently": True,
                    "flag": 5,
                    "number": 1,
                    "unit": "day",
                    "min": "2022-07-31T16:00:00.000Z",
                    "max": "2022-08-31T15:59:59.999Z"
                }
            ],
            "ifbprogress": [
                progress
            ],
            "region": [
                {
                    "pr": "GD"
                }
            ],
            "isvalid": 1
        },
        "queryLink": "or",
        "pageIndex": page,
        "pageSize": 50,
        "isHighlight": True,
        "isDegrade": True
    }
    data = json.dumps(data)
    response = requests.post(url, headers=headers, cookies=cookies, data=data, timeout=30)
    if response.status_code == 403:
        print(f"{keyword}>>> page {page}: account cookie is no longer valid")
        return True, 0

    results = []
    info_list = response.json()["Result"]
    if len(info_list) == 0:
        print(f"{keyword}>>> no data")
        return True, 0

    for item in info_list:
        href = 'https://www.biaozhaozhao.com/detail/{}?keywords=%5B"{}"%5D'.format(item["id"], quote(keyword))
        if not r.hexists(redis_key, href):
            title = item['title']
            publish_time = item['publishdate']
            time_array = time.strptime(publish_time, "%Y-%m-%d %H:%M:%S")
            publishtime_ts = int(time.mktime(time_array))
            data = {
                'site': '标找找',
                'channel': channel,
                'spidercode': 'sdxzbiddingsjzypc',
                'title': title,
                'area': '广东',  # province
                'city': item['city'],  # city
                'district': item['district'],
                'publishdept': '',
                'type': '',
                'T': 'bidding',  # destination table name
                'sendflag': 'false',
                '_d': 'comeintime',
                'iscompete': True,  # new-style spider flag
                'crawl': False,
                'href': '#',
                'competehref': href,
                'publishtime': publish_time,
                'l_np_publishtime': Int64(publishtime_ts),
            }
            results.append(data)
            r.hset(redis_key, href, '')

    if len(results) > 0:
        save_data(results, bzz_list)
    print(f"{keyword}>>> page {page} done, {len(results)} new records")
    return False, len(results)


def list_page(channel, progress, keyword, pages):
    for page in range(1, pages + 1):
        stop, total = crawl_spider_zb(channel, progress, page, keyword)
        if stop or total < 50:
            return False, page
        time.sleep(2)
    return True, page


def select_label(keyword, pages):
    print(f"{keyword}>>> start crawling, {pages} page(s)")
    for name, index in type_maps.items():
        normal_stop, page = list_page(name, index, keyword, pages)
        if not normal_stop:
            return False
        print(f"{name}>>> finished crawling: {keyword}, page {page}")
    return True


def handler_detail(items):
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
        "sec-ch-ua": "\"Google Chrome\";v=\"107\", \"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"macOS\""
    }
    channel = items['channel']
    url = items['competehref']
    try:
        response = requests.get(url, headers=headers, cookies=cookies, timeout=30)
        if '您当天的查看项目额度已用完!' in response.text:
            print(f'{channel}>>>{items["title"]} daily view quota exhausted!')
            return False

        selector = Selector(response.text)
        rubbish = selector.xpath('//div[@class="dfUzPvnO"]').extract_first()
        html = selector.xpath('//div[@class="_4nFsQzFd"]').extract_first()
        if html is not None:
            # guard against a missing "rubbish" block: str.replace(None, ...) would raise
            contenthtml = html.replace(rubbish, '') if rubbish else html
            items['contenthtml'] = contenthtml
            items['detail'] = cleaner(contenthtml)
            items['comeintime'] = Int64(int(time.time()))

        bzz_list.update_one(
            {'_id': items['_id']},
            {'$set': {'crawl': True}}
        )

        if '_id' in items:
            del items['_id'], items['crawl']
        data_bak.insert_one(items)
        print(f'{channel}>>>{items["title"]} downloaded')
        return True
    except requests.RequestException:
        print(f'{channel}>>>{items["title"]} download failed')
        return False


def detail_page():
    # q = {'crawl': False}
    q = {'crawl': True}
    amount_of_project = 0
    # total = bzz_list.count_documents(q)
    # while total > 0:
    #     print(f"{channel}>>> {total} tasks remaining")
    tasks = list(bzz_list.find(q))
    for item in tasks:
        handler_detail(item)
        # if stop_crawl:
        #     amount_of_project += 1
        time.sleep(1.5)
    print("Task finished")


def _count_zb_total(page, keyword, progress):
    url = "https://www.biaozhaozhao.com/qcc/tender/search"
    data = {
        "sortField": "publishdate",
        "sortOrder": "DESC",
        "searchType": "accurate",
        "searchKeyList": [keyword],
        "filter": {
            "publishdate": [
                {
                    "currently": True,
                    "flag": 5,
                    "number": 1,
                    "unit": "day",
                    "min": "2022-07-31T16:00:00.000Z",
                    "max": "2022-08-31T15:59:59.999Z"
                }
            ],
            "ifbprogress": [
                progress
            ],
            "region": [
                {
                    "pr": "GD",
                    "ct": "4403"
                }
            ],
            "isvalid": 1
        },
        "queryLink": "or",
        "pageIndex": page,
        "pageSize": 50,
        "isHighlight": True,
        "isDegrade": True
    }
    data = json.dumps(data)
    response = requests.post(url, headers=headers, cookies=cookies, data=data, timeout=30)
    # print(response.json())
    total_records = response.json()['Paging']['TotalRecords']
    return int(total_records)


def count_total():
    count = 0
    for tp_name, tp_index in type_maps.items():
        totals = 0
        for kw in ['一体机', '白板', '黑板', '大屏', '智慧屏', '录播', '智能教师']:
            totals += _count_zb_total(1, kw, tp_index)

        print(tp_name, totals)
        count += totals
    print("Total:", count)


def push_spider_dbs():
    # Copy everything collected in the remote data_bak into the local py_spider.data_bak
    dbk = cli1['py_spider']['data_bak']
    cur1 = data_bak.find()
    for item in cur1:
        del item['_id']
        print(item)
        item['comeintime'] = Int64(int(time.time()))
        dbk.insert_one(item)


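# Typical run order, as wired in the __main__ block below: select_label()
# fills bzz_zb_list with listing rows per keyword, then detail_page()
# downloads each detail page into data_bak. count_total() and
# push_spider_dbs() are standalone helpers and appear to be run manually.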
if __name__ == '__main__':
    select_label('一体机', 13)
    # select_label('白板', 2)
    # select_label('黑板', 2)
    # select_label('大屏', 3)
    # select_label('智慧屏', 1)
    # select_label('录播', 2)
    # select_label('智能教师', 1)
    detail_page()