import json
import re
import time
import redis
import requests
from bson.int64 import Int64
from parsel import Selector
from pymongo import MongoClient
from urllib.parse import quote
# MongoDB clients. Both collections below are taken from the "dzr" database of
# the primary client `cli`.
cli = MongoClient(host="192.168.20.248", port=27017)
# NOTE(review): `cli1` is not referenced by the collections defined here, and
# `data_bak` is read from `cli`, not `cli1` -- confirm that this is intended.
cli1 = MongoClient(host="127.0.0.1", port=27001)
bzz_list = cli.dzr.bzz_zb_list
data_bak = cli.dzr.data_bak
# Shared Redis connection pool (database 2).
#
# decode_responses is configured on the pool on purpose: redis.Redis only uses
# its own decode_responses argument when it has to build a pool itself, so
# passing it next to an explicit connection_pool had no effect and replies
# were still returned as bytes.
# NOTE(review): the password is hard-coded; consider loading it from
# configuration or the environment.
_pool = redis.ConnectionPool(
    host='192.168.20.248',
    port=6379,
    password='top@123',
    db=2,
    decode_responses=True,
)
r = redis.Redis(connection_pool=_pool)
# Redis key used by this script; presumably the de-duplication set for list
# entries (inferred from the name -- verify against its usage).
redis_key = 'duplicate_bzz_list'
# Maps an announcement category label (Chinese) to its numeric code, kept as a
# string. Presumably label -> "channel sign" as sent to the remote API -- confirm
# against the request-building code.
type_maps = {
    '中标': '42',  # winning bid
    '成交': '43',  # deal concluded
    '单一来源': '410',  # single-source procurement
    '合同及验收': '48',  # contract and acceptance
}
# channel = "中标"
# channel_sign = "42"
# Static HTTP request headers for www.biaozhaozhao.com.
# NOTE(review): the sentry-trace and x-kzz-request-* values are fixed literals
# (presumably copied from a browser session); confirm the server still accepts
# them when they are not regenerated per request.
headers = {
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Content-Type": "application/json",
    "Origin": "https://www.biaozhaozhao.com",
    "Pragma": "no-cache",
    "Referer": "https://www.biaozhaozhao.com/search?keyword=%E4%B8%80%E4%BD%93%E6%9C%BA&restore=1",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/107.0.0.0 Safari/537.36"
    ),
    # Single-quoted literals so the embedded double quotes need no escaping.
    "sec-ch-ua": '"Google Chrome";v="107", "Chromium";v="107", "Not=A?Brand";v="24"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"macOS"',
    "sentry-trace": "aaa4d61b9e3243a5baa5cca1c1c88108-bd50500fc742e0e9-0",
    "x-kzz-request-from": "qcc-tender-web",
    "x-kzz-request-id": "27328858-5576-2734-188319849847",
    "x-kzz-request-key": "E76F330BE1AA411B23876BC67C11750D",
    "x-kzz-request-time": "1668845731606",
}
# Static cookies for the site requests. The first two entries carry the same
# value. NOTE(review): these look like session-bound values -- confirm how they
# are refreshed.
cookies = {
    "QCCSESSID": "b410c26e4aa5895529a652519c",
    "gid_87f036f882014cd8": "b410c26e4aa5895529a652519c",
    "ls_371ef9f55b6b63dc": "42d6a4659d87930e",
}
# Standalone elements: regex pattern -> replacement text.
#
# NOTE(review): the HTML tag text inside these patterns had been stripped from
# the file, leaving string literals that spanned lines (a syntax error) and
# several keys collapsed to ''. The patterns below are reconstructed from the
# leftover fragments and the trailing comments. The metadata pattern in
# particular may originally have covered more tags -- confirm every entry
# against the original definition before relying on it.
#
# Raw strings are used so that sequences such as \s reach the regex engine
# without being treated as (invalid) string escapes.
INDEPENDENT_TAGS = {
    r'<head>[\s\S]*?</head>': '',
    r'<html>|<html [^>]*>|</html>': '',
    r'<body>|<body [^>]*>|</body>': '',
    r'<meta[^<>]*>|</meta>': '',  # metadata
    r'&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # whitespace entities
    r'\xa0|\u3000': '',  # non-breaking / ideographic spaces
    r'<!--[\s\S]*?-->': '',  # comments
    r'<style[^<>]*>[\s\S]*?</style>': '',  # styles
    r'<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
    r'<input>': '',  # input box
    r'<img[^>]*>': '<br>',  # images become a line break
}
# 行内元素
INLINE_TAGS = {
'|]*>|': '', # 超链接
'|]*>|': '', # span
'