# -*- coding: utf-8 -*-
"""
Created on 2024-06-17
---------
@summary: 元博网 - list-page search crawler
---------
@author: Lzz
"""
import math
import random
import time
import warnings
from collections import namedtuple

import requests
from pymongo import MongoClient

import setting
import utils.tools as tool
from dbs.RedisDB import RedisFilter
from log import logger

warnings.filterwarnings('ignore')


class Spider:

    def __init__(self):
        _mgo = MongoClient(setting.MONGO_IP, setting.MONGO_PORT)
        self.ybw_list = _mgo[setting.MONGO_DB]["ybw_list"]
        self.dedup = RedisFilter()
        self.total = 0
        self.crawl_page = 1
        self.areas_dict = {
            1: '北京', 2: '上海', 3: '天津', 4: '重庆', 5: '河北', 6: '山西',
            7: '内蒙古', 8: '辽宁', 9: '吉林', 10: '黑龙江', 11: '江苏', 12: '浙江',
            13: '安徽', 14: '福建', 15: '江西', 16: '山东', 17: '河南', 18: '湖北',
            19: '湖南', 20: '广东', 21: '广西', 22: '海南', 23: '贵州', 24: '云南',
            25: '西藏', 26: '陕西', 27: '四川', 28: '甘肃', 29: '青海', 30: '新疆',
            31: '宁夏'
        }

    def fetch_request(self, page, key, proxies=False):
        """Request one page of search results for the given keyword."""
        url = "https://www.chinabidding.cn/302e302e7379675f73736f/datax/json/gj_zbcg_daylimit"
        headers = {
            "accept": "application/json, text/javascript, */*; q=0.01",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "priority": "u=1, i",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
            "x-requested-with": "XMLHttpRequest"
        }
        params = {
            "device": "es",
            "cpcode": "es001",
            "keywords": f"{key}",
            "table_type": "4,",
            "search_type": "CONTEXT",
            "areaid": "17,",
            "categoryid": "",
            "b_date": "week",
            "time_start": "",
            "time_end": "",
            "page": f"{page}",
            "rp": "30",
            "usrecord_id": "",
        }
        request_params = dict(
            headers=headers,
            params=params,
            proxies=proxies,
            timeout=60,
            verify=False
        )
        return requests.get(url, **request_params)

    def parse(self, response, query_date):
        """Parse one result page and store new items whose publish date matches query_date."""
        result = response.json().get('result') or {}
        total = result.get('total', 0)
        self.crawl_page = math.ceil(total / 30)  # 30 results per page (the "rp" parameter)
        results = []
        info_list = result.get('list', [])
        for info in info_list:
            fields = info.get('fields', {})
            publish_time = fields.get('publish_date', '')
            title = tool.clean_title((fields.get('title') or '').strip())
            competehref = fields.get('url', '')
            if "chinabidding" not in competehref:
                competehref = 'https://www.chinabidding.cn{}'.format(competehref)
            area = self.areas_dict.get(int(fields.get('area_id', 0))) or "全国"
            if not title:
                logger.error(f"[empty title] {competehref}")
                continue  # skip this record but keep processing the rest of the page

            if not self.dedup.get(competehref) and query_date in publish_time:
                item = {
                    "site": "元博网(采购与招标网)",
                    "channel": "政府采购",
                    "area": area if area != '跨省' else '全国',
                    "_d": "comeintime",
                    "comeintime": tool.int2long(int(time.time())),
                    "T": "bidding",
                    "sendflag": "false",
                    "spidercode": "a_ybwcgyzbw_zfcg",
                    "city": "",
                    "infoformat": 1,
                    "type": "",
                    "publishdept": "",
                    "title": title,
                    "competehref": competehref,
                    "href": "#",
                    "publishtime": publish_time,
                    "l_np_publishtime": tool.int2long(tool.date_to_timestamp(publish_time)),
                }
                self.ybw_list.insert_one(item)
                self.dedup.add(competehref)
                results.append(item)
                self.total += 1

        logger.info(
            f' *** page done: {len(info_list) - len(results)} skipped (duplicate/other date) - {len(results)} saved *** <{self.total}>')

    def crawl_list_spider(self, page, key, query_date):
        """Fetch one page for a keyword with up to 3 retries, then parse it."""
        retry_times = 0
        while retry_times < 3:
            proxies = tool.get_proxy()
            try:
                response = self.fetch_request(page=page, key=key, proxies=proxies)
                response.raise_for_status()  # raise on 4xx/5xx responses
                if response.status_code == 200:
                    self.parse(response, query_date)
                    logger.debug(f"[search done] {key}")
                    time.sleep(random.random())
                    return
                else:
                    retry_times += 1
                    time.sleep(1)
            except Exception as e:
                logger.error(f"crawl exception: {e}")
                retry_times += 1
                time.sleep(2)

        logger.warning(f"[search failed] {key}")

    def start(self, query_date):
        logger.info("********** search started **********")
        data_sets = {
            "中国移动河南分公司",
            "中国移动通信集团",
            "中移建设有限公司",
            "中移铁通有限公司",
            "中移系统集成有限公司",
            "中移信息系统集成有限公司",
            "中移在线服务有限公司",
            "联通(河南)产业互联网有限公司",
            "联通数字科技有限公司",
            "中国联合网络通信",
            "中国联合网络通信有限公司",
            "中讯邮电咨询设计院有限公司",
            "天翼云科技有限公司",
            "中电信数智科技有限公司",
            "中国电信股份有限公司",
            "中国电信集团有限公司",
            "中国电信数智科技有限公司",
            "新疆天富天源燃气有限公司2025年八师居民及商服入户工程材料采购"
        }
        for key in data_sets:
            self.crawl_page = 1  # reset pagination so a failed first page cannot reuse the previous keyword's page count
            self.crawl_list_spider(1, key, query_date)
            if self.crawl_page != 1:
                for page in range(2, self.crawl_page + 1):
                    self.crawl_list_spider(page, key, query_date)

        logger.info("********** search finished **********")


if __name__ == '__main__':
    Menu = namedtuple(
        'Menu',
        ['channel', 'code', 'types', 'rout', 'query_date', 'crawl_page']
    )
    query_date = tool.get_today_of_day(-1)
    Spider().start(query_date)