source_qianlima.py

# -*- coding: utf-8 -*-
import datetime
import json
import math
import random
import time

import requests

# config_parms provides (via star import): headers, cookies, REQUEST_DATA_MAP,
# channel_dict, area_dict, province_dict and city_dict, all used below.
from utils.config_parms import *
from utils.databases import mongo_table, redis_client
from utils.log import logger
from utils.sessions_521 import http_session_521
from utils.tools import sha1

qlm = mongo_table('qlm', 'data_merge')
r = redis_client()
redis_key = 'qianlima_2024'
session = requests.session()
proxies = {
    'http': 'socks5://119.3.159.234:8860',
    'https': 'socks5://119.3.159.234:8860',
}

'''
https://search.vip.qianlima.com/index.html#?sortType=6&isSearchWord=1&tab_index=0
Search v2.0 category codes:
1 = tender notices (招标信息)
2 = award notices (中标信息)
3 = proposed construction projects (拟在建项目)
4 = approval projects (审批项目)
'''
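
# A minimal sketch (field values are assumptions, for illustration only) of
# the JSON payload that downloader() below builds on top of the
# REQUEST_DATA_MAP[category] template before posting it to the search API:
#   {
#       'newAreas': '2',          # area code, stringified
#       'timeType': 4,            # 4 = custom date range
#       'beginTime': '2024-05-19',
#       'endTime': '2024-05-19',
#       'currentPage': 1,
#       'numPerPage': 20,
#       # ...plus whatever defaults REQUEST_DATA_MAP carries for the category
#   }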


class AccountViolationRiskError(Exception):
    pass


def send_wechat_warning(msg, send=True):
    # "Qianlima list-page crawl error, colleagues please take note."
    markdown = '千里马列表页采集异常,请相关同事注意。'
    # Append the error details, highlighted with WeChat's markdown warning color.
    markdown += f'\n>异常详情:<font color="warning">**{msg}**</font>'
    if not send:
        logger.info(markdown)
        return
    url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=079193d8-1856-443e-9f6d-ecc5c883bf11'
    headers_ = {'Content-Type': 'application/json'}
    json_data = {'msgtype': 'markdown', 'markdown': {'content': markdown}}
    request_params = dict(headers=headers_, json=json_data, timeout=10)
    response = requests.post(url, **request_params)
    logger.info(response.json())


def get_today_of_day(offset, fmt='%Y-%m-%d'):
    date = datetime.datetime.now() + datetime.timedelta(days=int(offset))
    return date.strftime(fmt)
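
# For example (dates shown are illustrative), get_today_of_day(-1) returns
# yesterday as '2024-05-19', and get_today_of_day(0, '%Y%m%d') returns today
# as '20240520'.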


def request(url, data, retries=5):
    global session, cookies, proxies
    resp, msg = None, ''
    usages, usages_521 = 0, 1
    while usages < retries:
        request_params = {}
        request_params.setdefault('data', data)
        request_params.setdefault('headers', headers)
        request_params.setdefault('cookies', cookies)
        request_params.setdefault('proxies', proxies)
        request_params.setdefault('timeout', 60)
        try:
            resp = session.post(url, **request_params)
            if resp.status_code == 521:
                # 521: JS anti-crawl challenge; refresh cookies, then retry.
                while usages_521 < retries:
                    success, _, cookies = http_session_521(session, url, headers, cookies, data=data, proxies=proxies)
                    if success:
                        break
                    msg = f'反爬破解失败,次数:{usages_521}'  # challenge bypass failed, attempt N
                    logger.warning(msg)
                    time.sleep(1)
                    usages_521 += 1
                usages += 1
            elif resp.status_code in [401, 403, 404]:
                msg = f'账号登录已失效或封停,异常状态码:{resp.status_code}'  # login expired or account banned
                logger.error(msg)
                break
            elif resp.status_code in [429]:
                msg = f'图形验证,异常状态码:{resp.status_code}'  # image captcha required
                logger.error(msg)
                logger.warning(resp.content.decode())
                break
            elif str(resp.status_code).startswith('4'):
                msg = f'公网IP被封禁,异常状态码:{resp.status_code}'  # public IP banned
                logger.error(msg)
                break
            else:
                break
        except requests.RequestException as e:
            msg = f'访问失败,原因:{e.__class__.__name__}'  # request failed
            logger.error(msg)
            usages += 1
    return resp, msg
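
# Usage sketch: request() returns (response, error_message), and the response
# is None when every retry failed, so callers must check it before use, e.g.
#   resp, err = request(url, json.dumps(payload))  # payload is illustrative
#   if resp is not None and resp.status_code == 200:
#       ...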


def downloader(begin_date, end_date, category, address, page, page_size):
    """
    :param str begin_date: start date, format: xxxx-xx-xx
    :param str end_date: end date, format: xxxx-xx-xx
    :param int category: category code
    :param int address: area code
    :param int page: page number
    :param int page_size: number of records per page
    """
    url = 'https://search.vip.qianlima.com/rest/service/website/search/solr'
    data = REQUEST_DATA_MAP[category]  # template payload for this category
    data['newAreas'] = str(address)  # target area
    data['timeType'] = 4  # 4 = custom date range
    data['beginTime'] = begin_date
    data['endTime'] = end_date
    data['currentPage'] = page
    data['numPerPage'] = page_size
    data = json.dumps(data)
    response, err = request(url, data)
    # Request outcome: success / failure / stop (no more pages) / disable
    # (account banned), plus the server_error / method_not_allowed /
    # captcha_required / client_ip_disable states set below.
    request_status = 'failure'
    if response is None:
        request_status = 'server_error'
        return request_status, err
    results = []
    row_count = 0
    if response.status_code == 200:
        resp_json = response.json()
        if resp_json['code'] == 200:
            row_count = resp_json['data']['rowCount']
            items = resp_json['data']['data']
            for item in items:
                cid = sha1(str(item['contentid']))
                # Deduplicate against Redis before storing.
                if not r.hexists(redis_key, cid):
                    r.hset(redis_key, cid, '')
                    if 'popTitle' in item:
                        item['title'] = item['popTitle']
                    else:
                        item['title'] = item['showTitle']
                    addr = str(item['areaName']).split('-')
                    area_ = addr[0] if len(addr) > 0 else ''
                    city_ = addr[1] if len(addr) > 1 else ''
                    # Land-resources (国土) records keep the program name as channel.
                    if '国土' in item.get('progName', ''):
                        channel = item['progName']
                    else:
                        channel = (item['noticeSegmentTypeName'] or item['progName'])
                    results.append({
                        'site': '千里马',
                        'channel': channel,
                        'area': area_,
                        'city': city_,
                        'title': item['title'],
                        'publishtime': item['updateTime'],
                        'href': item.get('url', '')
                    })
            if len(results) > 0:
                qlm.insert_many(results, ordered=False)
                request_status = 'success'
            if len(items) < page_size or len(results) == 0:
                request_status = 'stop'
        else:
            '''
            Sample throttle response:
            {
                "code": 200520,
                "msg": "抱歉,您在单位时间内的搜索次数已达上限,请联系客服购买会员!咨询电话:400-688-2000",
                "data": null
            }
            (msg: searches in this period have hit the cap; contact customer
            service to purchase a membership.)
            '''
            err = resp_json['msg']
            logger.info(err)
    elif response.status_code in [401, 403, 404]:
        request_status = 'disable'
    elif response.status_code in [405]:
        request_status = 'method_not_allowed'
    elif response.status_code in [429]:
        request_status = 'captcha_required'
    elif str(response.status_code).startswith('4'):
        request_status = 'client_ip_disable'
    if request_status in ['stop', 'success']:
        if page == 1:
            logger.info(f'千里马 {begin_date} 发布 {row_count} 条数据')  # records published that day
        logger.info(f'入库 {len(results)} 条')  # records stored
    return request_status, err
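
# Example call (the area code 2 is assumed purely for illustration): fetch
# page 1 of the tender-notice category (1) for a single day, 20 records per
# page:
#   status, err = downloader('2024-05-19', '2024-05-19', 1, 2, 1, 20)
# status is one of: success / failure / stop / disable / server_error /
# method_not_allowed / captcha_required / client_ip_disable.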


def automatic_pagination(**kwargs):
    reason = ''  # reason recorded when a request fails
    close_spider = False
    send_warning = False
    retry_times, max_retries = 0, 3
    # Qianlima currently exposes only the first 10,000 records, i.e. at most
    # 100 pages.
    pages = list(range(1, 101))
    while len(pages) > 0:
        if close_spider:
            if send_warning:
                send_wechat_warning(reason)
            break
        if send_warning and retry_times > max_retries:
            send_wechat_warning(reason)
            break
        page = pages.pop(0)
        logger.info(f'下载第{page}页数据')  # downloading page {page}
        while True:
            status, reason = downloader(page=page, **kwargs)
            if status in ['server_error', 'client_ip_disable', 'captcha_required']:
                close_spider = True
                send_warning = True
            elif status == 'failure':
                # Back off for log2(randint(100, 2400)) seconds, roughly 6.6-11.2s.
                interval = math.log(random.randint(100, 2400), 2)
                logger.debug(f'等待{interval}s,异常重试...')  # wait, then retry
                time.sleep(interval)
                continue
            elif status == 'disable':
                logger.warning('账号被禁止访问')  # account access forbidden
                retry_times += 1
                send_warning = True
            elif status == 'method_not_allowed':
                logger.warning('服务器禁止使用当前 HTTP 方法的请求')  # HTTP method rejected by server
                retry_times += 1
                send_warning = True
            elif status == 'stop':
                close_spider = True
            else:
                # Success: pause briefly before the next page.
                time.sleep(math.log(random.randint(100, 2400), 2))
            break
    if send_warning:
        raise AccountViolationRiskError
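
# A sketch of one crawl unit (category and area codes chosen for illustration
# only):
#   automatic_pagination(begin_date='2024-05-19', end_date='2024-05-19',
#                        category=1, address=2, page_size=20)
# It raises AccountViolationRiskError once a warning has been sent; core()
# below converts that into a False return value.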


def core(date: str, category: int, address: int, page_size=20):
    try:
        automatic_pagination(
            begin_date=date,
            end_date=date,
            category=category,
            address=address,
            page_size=page_size  # maximum number of records per page
        )
        return True
    except AccountViolationRiskError:
        return False


def spider(date, page_size=40):
    logger.info('+++ 采集开始 +++')  # crawl started
    dates = [date] if not isinstance(date, list) else date
    try:
        for date in dates:
            for category, category_name in channel_dict.items():
                for area, cities in area_dict.items():
                    for city in cities:
                        logger.info(' && '.join([
                            date,
                            category_name,
                            province_dict[area],
                            city_dict[city]
                        ]))
                        if len(cities) == 1:
                            # Qianlima dropped district-level splits for
                            # municipalities; crawl the province-level area
                            # directly.
                            city = area
                        yield core(date, category, city, page_size=page_size)
    except Exception as e:
        logger.error(e)
    except KeyboardInterrupt:
        pass
    finally:
        logger.info('+++ 采集结束 +++')  # crawl finished


def history(date_lst: list):
    for result in spider(date_lst):
        if not result:
            break
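
# Example backfill over explicit dates (the dates are illustrative):
#   history(['2024-05-18', '2024-05-19'])
# Iteration stops at the first False result from core(), so a banned or
# throttled account is not hammered further.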


def start():
    date = get_today_of_day(-1)
    for result in spider(date, page_size=100):
        if not result:
            break


if __name__ == '__main__':
    start()