import json import re import time import redis import requests from bson.int64 import Int64 from parsel import Selector from pymongo import MongoClient from urllib.parse import quote cli = MongoClient("192.168.20.248", 27017) bzz_list = cli['dzr']['bzz_zb_list'] cli1 = MongoClient("127.0.0.1", 27001) data_bak = cli['dzr']['data_bak'] _pool = redis.ConnectionPool( host='192.168.20.248', port=6379, password='top@123', db=2 ) r = redis.Redis(connection_pool=_pool, decode_responses=True) redis_key = 'duplicate_bzz_list' type_maps = { '中标': '42', '成交': '43', '单一来源': '410', '合同及验收': '48', } # channel = "中标" # channel_sign = "42" headers = { "Accept": "application/json, text/plain, */*", "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", "Cache-Control": "no-cache", "Connection": "keep-alive", "Content-Type": "application/json", "Origin": "https://www.biaozhaozhao.com", "Pragma": "no-cache", "Referer": "https://www.biaozhaozhao.com/search?keyword=%E4%B8%80%E4%BD%93%E6%9C%BA&restore=1", "Sec-Fetch-Dest": "empty", "Sec-Fetch-Mode": "cors", "Sec-Fetch-Site": "same-origin", "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36", "sec-ch-ua": "\"Google Chrome\";v=\"107\", \"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\"", "sec-ch-ua-mobile": "?0", "sec-ch-ua-platform": "\"macOS\"", "sentry-trace": "aaa4d61b9e3243a5baa5cca1c1c88108-bd50500fc742e0e9-0", "x-kzz-request-from": "qcc-tender-web", "x-kzz-request-id": "27328858-5576-2734-188319849847", "x-kzz-request-key": "E76F330BE1AA411B23876BC67C11750D", "x-kzz-request-time": "1668845731606" } cookies = { "QCCSESSID": "b410c26e4aa5895529a652519c", "gid_87f036f882014cd8": "b410c26e4aa5895529a652519c", "ls_371ef9f55b6b63dc": "42d6a4659d87930e" } # 独立元素 INDEPENDENT_TAGS = { '[\s\S]*?': '', '|]*>|': '', '|]*>|': '', ']*>|]*>|]*>[\s\S]*?|': '', # 元数据 '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '', # 空格 '\\xa0|\\u3000': '', # 空格 '': '', # 注释 ']*>[\s\S]*?': '', # 样式 ']*>[\s\S]*?': '', # JavaScript '': '', # 输入框 ']*>': '
', # 图片 } # 行内元素 INLINE_TAGS = { '|]*>|': '', # 超链接 '|]*>|': '', # span '