# -*- coding: utf-8 -*-
"""
Created on 2024-06-17
---------
@summary: 元博网 - list-page search spider
---------
@author: Lzz
"""
import math
import random
import time
import warnings
from collections import namedtuple

import requests
from pymongo import MongoClient

import setting
import utils.tools as tool
from dbs.RedisDB import RedisFilter
from log import logger

warnings.filterwarnings('ignore')


class Spider:

    def __init__(self):
        _mgo = MongoClient(setting.MONGO_IP, setting.MONGO_PORT)
        self.ybw_list = _mgo[setting.MONGO_DB]["ybw_list"]  # MongoDB collection for list-page items
        self.dedup = RedisFilter()   # Redis-backed dedup filter keyed by detail-page URL
        self.total = 0               # running count of inserted documents
        self.crawl_page = 1          # total pages for the current search, refreshed by parse()
        # area_id -> province name mapping used by the search API
        self.areas_dict = {
            1: '北京', 2: '上海', 3: '天津', 4: '重庆', 5: '河北', 6: '山西', 7: '内蒙古',
            8: '辽宁', 9: '吉林', 10: '黑龙江', 11: '江苏', 12: '浙江', 13: '安徽', 14: '福建',
            15: '江西', 16: '山东', 17: '河南', 18: '湖北', 19: '湖南', 20: '广东', 21: '广西',
            22: '海南', 23: '贵州', 24: '云南', 25: '西藏', 26: '陕西', 27: '四川', 28: '甘肃',
            29: '青海', 30: '新疆', 31: '宁夏'
        }

    def fetch_request(self, page, key, proxies=False):
        url = "https://www.chinabidding.cn/302e302e7379675f73736f/datax/json/gj_zbcg_daylimit"
        headers = {
            "accept": "application/json, text/javascript, */*; q=0.01",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "priority": "u=1, i",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
            "x-requested-with": "XMLHttpRequest"
        }
        params = {
            "device": "es",
            "cpcode": "es001",
            "keywords": f"{key}",
            "table_type": "4,",
            "search_type": "CONTEXT",
            "areaid": "17,",  # province filter; 17 maps to 河南 in self.areas_dict
            "categoryid": "",
            "b_date": "week",
            "time_start": "",
            "time_end": "",
            "page": f"{page}",
            "rp": "30",  # page size; parse() uses the same value to compute the page count
            "usrecord_id": "",
        }
        request_params = dict(
            headers=headers,
            params=params,
            proxies=proxies,
            timeout=60,
            verify=False
        )
        return requests.get(url, **request_params)

    def parse(self, response, query_date):
        result = response.json().get('result', {})
        total = result.get('total', 0)
        self.crawl_page = math.ceil(total / 30)  # 30 results per page, matching the "rp" param
        results = []
        info_list = result.get('list', [])
        for info in info_list:
            fields = info.get('fields', {})
            publish_time = fields.get('publish_date')
            title = tool.clean_title((fields.get('title') or '').strip())
            competehref = fields.get('url')
            if "chinabidding" not in competehref:
                competehref = 'https://www.chinabidding.cn{}'.format(competehref)
            area = self.areas_dict.get(int(fields.get('area_id'))) or "全国"
            if not title:
                logger.error(f"[标题为空]{competehref}")
                continue
            # Only keep unseen links published on the target date
            if not self.dedup.get(competehref) and query_date in publish_time:
                item = {
                    "site": "元博网(采购与招标网)",
                    "channel": "政府采购",
                    "area": area if area != '跨省' else '全国',
                    "_d": "comeintime",
                    "comeintime": tool.int2long(int(time.time())),
                    "T": "bidding",
                    "sendflag": "false",
                    "spidercode": "a_ybwcgyzbw_zfcg",
                    "city": "",
                    "infoformat": 1,
                    "type": "",
                    "publishdept": "",
                    "title": title,
                    "competehref": competehref,
                    "href": "#",
                    "publishtime": publish_time,
                    "l_np_publishtime": tool.int2long(tool.date_to_timestamp(publish_time)),
                }
                self.ybw_list.insert_one(item)
                self.dedup.add(competehref)
                results.append(item)
                self.total += 1
        logger.info(
            f' *** 检索完成:去重 {len(info_list) - len(results)} 条 - 入库 {len(results)} 条 *** <{self.total}>')
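
    # Sketch of the JSON payload parse() expects. The field names are inferred from the
    # lookups above; the values shown are illustrative placeholders, not taken from a
    # real response:
    #
    #   {
    #     "result": {
    #       "total": 60,
    #       "list": [
    #         {"fields": {"title": "...", "url": "/zbcg/....html",
    #                     "publish_date": "2024-06-16 09:00:00", "area_id": "17"}}
    #       ]
    #     }
    #   }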

    def crawl_list_spider(self, page, key, query_date):
        retry_times = 0
        while retry_times < 3:
            proxies = tool.get_proxy()
            try:
                response = self.fetch_request(page=page, key=key, proxies=proxies)
                response.raise_for_status()  # raise on 4xx/5xx responses
                if response.status_code == 200:
                    self.parse(response, query_date)
                    logger.debug(f"[检索完成] {key}")
                    time.sleep(random.random())
                    return
                else:
                    retry_times += 1
                    time.sleep(1)
            except Exception as e:
                logger.error(f"采集异常:{e}")
                retry_times += 1
                time.sleep(2)
        logger.warning(f"[检索失败] {key}")

    def start(self, query_date):
        logger.info("********** 检索开始 **********")
        # Keywords to search for on the list page
        data_sets = {
            "中国移动河南分公司",
            "中国移动通信集团",
            "中移建设有限公司",
            "中移铁通有限公司",
            "中移系统集成有限公司",
            "中移信息系统集成有限公司",
            "中移在线服务有限公司",
            "联通(河南)产业互联网有限公司",
            "联通数字科技有限公司",
            "中国联合网络通信",
            "中国联合网络通信有限公司",
            "中讯邮电咨询设计院有限公司",
            "天翼云科技有限公司",
            "中电信数智科技有限公司",
            "中国电信股份有限公司",
            "中国电信集团有限公司",
            "中国电信数智科技有限公司",
            "新疆天富天源燃气有限公司2025年八师居民及商服入户工程材料采购"
        }
        for key in data_sets:
            self.crawl_page = 1  # reset so a failed request does not reuse the previous keyword's page count
            self.crawl_list_spider(1, key, query_date)  # page 1 also refreshes self.crawl_page via parse()
            if self.crawl_page != 1:
                # fetch the remaining pages for this keyword
                for page in range(2, self.crawl_page + 1):
                    self.crawl_list_spider(page, key, query_date)
        logger.info("********** 检索结束 **********")


if __name__ == '__main__':
    # Menu is declared for configuration-style runs but is not used by this entry point
    Menu = namedtuple(
        'Menu',
        ['channel', 'code', 'types', 'rout', 'query_date', 'crawl_page']
    )

    query_date = tool.get_today_of_day(-1)  # presumably yesterday's date string (offset of -1 day)
    Spider().start(query_date)
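
# Project-internal dependencies assumed by this script (signatures inferred from the
# call sites above, not from the actual setting/utils modules):
#   setting.MONGO_IP / setting.MONGO_PORT / setting.MONGO_DB   -> MongoDB connection info
#   tool.clean_title(title)            -> cleaned title string
#   tool.int2long(value)               -> 64-bit integer suitable for MongoDB storage
#   tool.date_to_timestamp(date_str)   -> unix timestamp parsed from a date string
#   tool.get_proxy()                   -> proxies mapping usable by requests
#   tool.get_today_of_day(offset)      -> date string offset by `offset` days from today
#   RedisFilter().get(url) / .add(url) -> Redis-backed URL dedup lookup and insert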