# -*- coding: utf-8 -*-
"""
ybw_query_list.py
Created on 2024-06-17
---------
@summary: 元博网 (chinabidding.cn) - list-page search
---------
@author: Lzz
"""
import math
import random
import time
import warnings
from collections import namedtuple

import requests
from pymongo import MongoClient

import setting
import utils.tools as tool
from dbs.RedisDB import RedisFilter
from log import logger

warnings.filterwarnings('ignore')
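
# The project-local modules above are not part of this file. Judging only
# from how they are used below, the assumed interfaces are roughly (an
# assumption, not a verified API):
#   setting                      - MONGO_IP / MONGO_PORT / MONGO_DB constants
#   tool.clean_title(s)          - normalize a title string
#   tool.int2long(n)             - wrap an int for MongoDB int64 storage
#   tool.date_to_timestamp(s)    - date string -> unix timestamp
#   tool.get_proxy()             - requests-style proxies dict
#   tool.get_today_of_day(-1)    - yesterday's date as a string
#   RedisFilter().get(k)/.add(k) - dedup membership test / insert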


class Spider:

    def __init__(self):
        _mgo = MongoClient(setting.MONGO_IP, setting.MONGO_PORT)
        self.ybw_list = _mgo[setting.MONGO_DB]["ybw_list"]  # list-page collection
        self.dedup = RedisFilter()  # Redis-backed URL dedup filter
        self.total = 0
        self.crawl_page = 1  # page count for the current keyword, set by parse()
        # area_id (as returned by the search API) -> province name
        self.areas_dict = {
            1: '北京', 2: '上海', 3: '天津', 4: '重庆', 5: '河北', 6: '山西',
            7: '内蒙古', 8: '辽宁', 9: '吉林', 10: '黑龙江', 11: '江苏',
            12: '浙江', 13: '安徽', 14: '福建', 15: '江西', 16: '山东',
            17: '河南', 18: '湖北', 19: '湖南', 20: '广东', 21: '广西',
            22: '海南', 23: '贵州', 24: '云南', 25: '西藏', 26: '陕西',
            27: '四川', 28: '甘肃', 29: '青海', 30: '新疆', 31: '宁夏',
        }

    def fetch_request(self, page, key, proxies=None):
        """Request one page of list results for a search keyword."""
        url = "https://www.chinabidding.cn/302e302e7379675f73736f/datax/json/gj_zbcg_daylimit"
        headers = {
            "accept": "application/json, text/javascript, */*; q=0.01",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "priority": "u=1, i",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
            "x-requested-with": "XMLHttpRequest"
        }
        params = {
            "device": "es",
            "cpcode": "es001",
            "keywords": f"{key}",
            "table_type": "4,",
            "search_type": "CONTEXT",
            "areaid": "17,",  # 17 = 河南 (see self.areas_dict)
            "categoryid": "",
            "b_date": "week",  # restrict results to the past week
            "time_start": "",
            "time_end": "",
            "page": f"{page}",
            "rp": "30",  # results per page; parse() assumes this value
            "usrecord_id": "",
        }
        return requests.get(url, headers=headers, params=params,
                            proxies=proxies, timeout=60, verify=False)
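
    # Response shape inferred from parse() below (a sketch, not a documented
    # schema):
    #   {"result": {"total": <int>,
    #               "list": [{"fields": {"publish_date": "...", "title": "...",
    #                                    "url": "...", "area_id": "..."}}, ...]}}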

    def parse(self, response, query_date):
        """Parse one result page and store new records published on query_date."""
        result = response.json().get('result') or {}
        total = result.get('total', 0)
        self.crawl_page = math.ceil(total / 30)
        results = []
        info_list = result.get('list', [])
        for info in info_list:
            fields = info.get('fields') or {}
            publish_time = fields.get('publish_date')
            title = fields.get('title')
            competehref = fields.get('url')
            if "chinabidding" not in competehref:
                competehref = 'https://www.chinabidding.cn{}'.format(competehref)
            if not title:
                logger.error(f"[empty title] {competehref}")
                continue  # skip records without a title
            title = tool.clean_title(title.strip())
            # unknown area ids fall back to "全国" (nationwide)
            area = self.areas_dict.get(int(fields.get('area_id')), "全国")
            if not self.dedup.get(competehref) and query_date in publish_time:
                item = {
                    "site": "元博网(采购与招标网)",
                    "channel": "政府采购",
                    "area": area if area != '跨省' else '全国',
                    "_d": "comeintime",
                    "comeintime": tool.int2long(int(time.time())),
                    "T": "bidding",
                    "sendflag": "false",
                    "spidercode": "a_ybwcgyzbw_zfcg",
                    "city": "",
                    "infoformat": 1,
                    "type": "",
                    "publishdept": "",
                    "title": title,
                    "competehref": competehref,
                    "href": "#",
                    "publishtime": publish_time,
                    "l_np_publishtime": tool.int2long(tool.date_to_timestamp(publish_time)),
                }
                self.ybw_list.insert_one(item)
                self.dedup.add(competehref)
                results.append(item)
                self.total += 1
        logger.info(
            f' *** page done: {len(info_list) - len(results)} skipped (dup/off-date) - {len(results)} inserted *** <{self.total}>')
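
    # Fields such as "T", "sendflag", "spidercode", "_d"/"comeintime" and
    # "l_np_publishtime" in the stored item appear to be downstream pipeline
    # conventions (an assumption; this file only writes them and never reads
    # them back).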

    def crawl_list_spider(self, page, key, query_date):
        """Fetch and parse one page, retrying up to 3 times with a fresh proxy."""
        retry_times = 0
        while retry_times < 3:
            proxies = tool.get_proxy()
            try:
                response = self.fetch_request(page=page, key=key, proxies=proxies)
                response.raise_for_status()  # raise on 4xx/5xx responses
                self.parse(response, query_date)
                logger.debug(f"[search done] {key}")
                time.sleep(random.random())
                return
            except Exception as e:
                logger.error(f"crawl error: {e}")
                retry_times += 1
                time.sleep(2)
        logger.warning(f"[search failed] {key}")

    def start(self, query_date):
        logger.info("********** search started **********")
        # search keywords: telecom-operator entities plus one project title
        data_sets = {
            "中国移动河南分公司",
            "中国移动通信集团",
            "中移建设有限公司",
            "中移铁通有限公司",
            "中移系统集成有限公司",
            "中移信息系统集成有限公司",
            "中移在线服务有限公司",
            "联通(河南)产业互联网有限公司",
            "联通数字科技有限公司",
            "中国联合网络通信",
            "中国联合网络通信有限公司",
            "中讯邮电咨询设计院有限公司",
            "天翼云科技有限公司",
            "中电信数智科技有限公司",
            "中国电信股份有限公司",
            "中国电信集团有限公司",
            "中国电信数智科技有限公司",
  148. "中国联合网络通信有限公司",
  149. "新疆天富天源燃气有限公司2025年八师居民及商服入户工程材料采购"
  150. }
  151. for key in data_sets:
  152. self.crawl_list_spider(1, key, query_date)
  153. if self.crawl_page != 1:
  154. for page in range(2, self.crawl_page + 1):
  155. self.crawl_list_spider(page, key, query_date)
  156. logger.info("********** 检索结束 **********")


if __name__ == '__main__':
    # note: Menu is defined here but not used below
    Menu = namedtuple(
        'Menu',
        ['channel', 'code', 'types', 'rout', 'query_date', 'crawl_page']
    )
    query_date = tool.get_today_of_day(-1)
    Spider().start(query_date)
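
# A minimal usage sketch for a backfill run (assumption: query_date is a date
# string such as "2024-06-16" that substring-matches the API's publish_date,
# since parse() checks `query_date in publish_time`):
#
#   spider = Spider()
#   spider.start("2024-06-16")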