source_qianlima_history.py

# coding: utf-8
import datetime
import json
import math
import random
import time

import requests

from utils.config_parms import *
from utils.databases import mongo_table, redis_client
from utils.log import logger
from utils.sessions_521 import http_session_521
from utils.tools import sha1

qlm = mongo_table('qlm', 'data_merge')
r = redis_client()
redis_key = "qianlima_2024"
session = requests.session()

'''
https://search.vip.qianlima.com/index.html#?sortType=6&isSearchWord=1&tab_index=0
Search 2.0 - type_ codes:
1 = tender notices
2 = award notices
3 = proposed construction projects
4 = approval projects
'''
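
# Note: names such as REQUEST_DATA_MAP, headers1, cookies1, city_dict, channel_dict,
# province_dict and area_dict are not defined in this module; they are assumed to be
# supplied by the star import from utils.config_parms (request payload templates, the
# logged-in headers/cookies, and the area/channel code lookup tables used below).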


def delay_by_day(days, fmt="%Y-%m-%d"):
    """Offset the current date by `days` days and return it formatted as `fmt`."""
    _days = int(days)
    _current_now = datetime.datetime.now()
    return (_current_now + datetime.timedelta(days=_days)).strftime(fmt)
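
# Example: delay_by_day(0) yields today's date (e.g. "2023-09-25"), delay_by_day(-7)
# the date a week ago - the same "YYYY-MM-DD" form the search API's beginTime/endTime
# fields expect below.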


def crawl_request(url, data, retries=5):
    global session, cookies1
    resp = None
    usages, usages_521 = 0, 1
    while usages < retries:
        request_params = {}
        request_params.setdefault('data', data)
        request_params.setdefault('headers', headers1)
        request_params.setdefault('cookies', cookies1)
        request_params.setdefault('timeout', 60)
        try:
            resp = session.post(url, **request_params)
            if resp.status_code == 521:
                # 521 indicates the anti-bot challenge must be solved again to refresh cookies
                while usages_521 < retries:
                    success, _, cookies1 = http_session_521(session, url, headers1, cookies1, data=data)
                    if success:
                        break
                    logger.warning(f"Anti-bot challenge bypass failed, attempt: {usages_521}")
                    time.sleep(1)
                    usages_521 += 1
                usages += 1
            elif resp.status_code in [401, 403, 404]:
                logger.error(f"Account session expired or account blocked, status code: {resp.status_code}")
                break
            else:
                break
        except requests.RequestException as e:
            logger.error(f"Request failed, reason: {e.__class__.__name__}")
            usages += 1
    return resp
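
# crawl_request() returns the last requests.Response obtained (or None if every attempt
# raised), so callers below check both `response is not None` and the status code.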


def crawl_spider(area: str, type_: int, page: int, **kwargs):
    results = []
    request_status = 'failure'  # request outcome: success / failure / stop / disable (account blocked)
    curr_date = delay_by_day(0)
    begin_time = kwargs.pop('begin_time', curr_date)
    end_time = kwargs.pop('end_time', curr_date)
    max_per_page = kwargs.pop('max_per_page', 20)
    data = REQUEST_DATA_MAP[type_]
    data['newAreas'] = area  # region code
    data['currentPage'] = page  # page number
    data['numPerPage'] = max_per_page  # items per page
    data['timeType'] = 4  # 4 = custom time range
    data['beginTime'] = begin_time  # start date, format: xxxx-xx-xx
    data['endTime'] = end_time  # end date, format: xxxx-xx-xx
    data = json.dumps(data)
    url = "https://search.vip.qianlima.com/rest/service/website/search/solr"
    response = crawl_request(url, data)
    row_count = 0
    if response is not None and response.status_code == 200:
        resp_json = response.json()
        if resp_json['code'] == 200:
            row_count = resp_json["data"]["rowCount"]
            items = resp_json["data"]["data"]
            for item in items:
                cid = sha1(str(item["contentid"]))
                if not r.hexists(redis_key, cid):
                    r.hset(redis_key, cid, '')
                    if "popTitle" in item:
                        item["title"] = item["popTitle"]
                    else:
                        item["title"] = item["showTitle"]
                    addr = str(item["areaName"]).split('-')
                    _area = addr[0] if len(addr) > 0 else ''
                    _city = addr[1] if len(addr) > 1 else ''
                    channel = (item['noticeSegmentTypeName'] or item['progName'])
                    res = {
                        'site': '千里马',
                        'channel': channel,
                        'area': _area,
                        'city': _city,
                        'title': item["title"],
                        'publishtime': item['updateTime'],
                        'href': item.get('url', '')
                    }
                    results.append(res)
            request_status = 'success'
            if len(items) < max_per_page:
                request_status = 'stop'
        else:
            # Example of a rate-limit response body:
            # {
            #     "code": 200520,
            #     "msg": "Sorry, you have reached the search limit for this period.
            #             Please contact customer service to purchase a membership. Hotline: 400-688-2000",
            #     "data": null
            # }
            logger.info(resp_json['msg'])
    elif response is not None and response.status_code in [401, 403, 404]:
        request_status = 'disable'
    elif response is not None and response.status_code == 405:
        request_status = 'method_not_allowed'
    if len(results) > 0:
        qlm.insert_many(results)
    if request_status in ['stop', 'success']:
        logger.info("{}-{}-{}-{} items in total-page {}, uploaded {} records".format(
            begin_time,
            city_dict.get(int(area)),
            channel_dict.get(type_),
            row_count,
            page,
            len(results))
        )
    return request_status
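
# De-duplication note: each record's contentid is hashed with sha1 and tracked in the
# redis hash "qianlima_2024", so re-running the same date range only uploads records
# that have not been stored before.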


def by_area_crawl_data(area="", type_=0, **kwargs):
    close_spider = False
    disable_page, max_disable_page = 0, 3
    pages = list(range(1, 101))  # the search currently only exposes the first 10,000 results
    while len(pages) > 0:
        if close_spider:
            break
        elif disable_page > max_disable_page:
            # A notification hook (e-mail or WeCom bot) could be added here to report crawl failures
            break
        page = pages.pop(0)
        logger.info(f"Fetching {city_dict.get(int(area))}-{channel_dict.get(type_)}-page {page}")
        while True:
            success = crawl_spider(area, type_, page, **kwargs)
            if success == 'failure':
                interval = math.log(random.randint(100, 2400), 2)
                logger.debug(f'Retrying after failure, waiting {interval}s')
                time.sleep(interval)
                continue
            elif success == 'disable':
                logger.warning(f"Account blocked - {city_dict.get(int(area))} - page {page}")
                disable_page += 1
            elif success == 'method_not_allowed':
                logger.warning("The server rejected the current HTTP method")
                disable_page += 1
            elif success == 'stop':
                close_spider = True
            else:
                logger.info(f"{city_dict.get(int(area))}-{channel_dict.get(type_)}-page {page} collected successfully")
                time.sleep(math.log(random.randint(100, 2400), 2))
            break
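
# Pagination stops early once crawl_spider() returns 'stop' (a page with fewer items
# than max_per_page), and the area is abandoned after more than three 'disable' or
# 'method_not_allowed' results; the log2(randint(100, 2400)) interval keeps the pause
# between requests at roughly 7-11 seconds.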


def select_types(date: str, area: str, prov: str):
    for type_ in [1, 2, 3, 4]:
        by_area_crawl_data(
            area=area,
            type_=type_,
            begin_time=date,
            end_time=date,
            max_per_page=100
        )
        logger.info(f"{date} - {province_dict.get(int(prov))} - {channel_dict.get(type_)} collection finished")


def select_area(date: str):
    for province in range(1, 32):
        for city_ in area_dict.get(province):
            select_types(date, area=str(city_), prov=str(province))
    logger.info("All tasks finished")


def history(date_lst: list):
    for date in date_lst:
        select_area(date)


def start():
    date_str = "2023-09-25"
    select_area(date_str)


if __name__ == '__main__':
    start()
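
# A sketch of how a historical back-fill over a date range could be driven through
# history(); the 30-day window below is an assumption, not part of the original job:
#
#     if __name__ == '__main__':
#         dates = [delay_by_day(-offset) for offset in range(1, 31)]  # previous 30 days
#         history(dates)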