# source_qianlima.py
  1. # coding: utf-8
  2. import datetime
  3. import json
  4. import math
  5. import random
  6. import time
  7. import requests
  8. from utils.config_parms import *
  9. from utils.databases import mongo_table, redis_client
  10. from utils.log import logger
  11. from utils.sessions_521 import http_session_521
  12. from utils.tools import sha1
# MongoDB collection that scraped list-page rows are written to.
qlm = mongo_table('qlm', 'data_merge')
# Redis connection; the hash below de-duplicates records by sha1(contentid).
r = redis_client()
redis_key = "qianlima_2024"
# Shared HTTP session, reused (and re-cookied) across all requests.
session = requests.session()
# proxies = {
#     'http': 'socks5://123.101.64.83:8861',
#     'https': 'socks5://123.101.64.83:8861'
# }
proxies = None  # set to a requests-style proxies dict to route traffic through a proxy
'''
https://search.vip.qianlima.com/index.html#?sortType=6&isSearchWord=1&tab_index=0
Search 2.0 — `type_` category codes used throughout this module:
1 = 招标信息 (tender notices)
2 = 中标信息 (award notices)
3 = 拟在建项目 (proposed / pre-construction projects)
4 = 审批项目 (approval projects)
'''
class AccountLoginExpirationError(Exception):
    """Raised when the qianlima account login has expired or been blocked.

    Caught by ``select_area`` to abort the whole crawl for the day.
    """
    pass
  32. def send_wechat_warning(msg, send=True):
  33. markdown = f'千里马列表页采集异常,请相关同事注意。'
  34. markdown += f'\n>异常详情:<font color=\"warning\">**{msg}**</font>'
  35. if not send:
  36. logger.info(markdown)
  37. return
  38. url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=079193d8-1856-443e-9f6d-ecc5c883bf11'
  39. headers_ = {'Content-Type': 'application/json'}
  40. json_data = {'msgtype': 'markdown', 'markdown': {'content': markdown}}
  41. request_params = dict(headers=headers_, json=json_data, timeout=10)
  42. response = requests.post(url, **request_params)
  43. logger.info(response.json())
  44. def get_today_of_day(offset, fmt="%Y-%m-%d"):
  45. date = datetime.datetime.now() + datetime.timedelta(days=int(offset))
  46. return date.strftime(fmt)
def crawl_request(url, data, retries=5):
    """POST *data* to *url* with retry and anti-bot (HTTP 521) handling.

    Uses the module-level ``session``/``cookies``/``proxies`` and the
    ``headers`` imported from config.  On a 521 response, calls
    ``http_session_521`` to re-solve the JS challenge and refresh the
    global ``cookies`` before retrying the request.

    Args:
        url: endpoint to POST to.
        data: request body (already JSON-encoded by the caller).
        retries: max attempts for both the outer request loop and the
            inner 521-challenge loop.

    Returns:
        ``(resp, msg)`` — the last ``requests.Response`` (or ``None`` if
        every attempt raised) and the last error message ('' on success).
    """
    global session, cookies, proxies
    resp, msg = None, ''
    # usages counts request attempts; usages_521 counts challenge attempts.
    usages, usages_521 = 0, 1
    while usages < retries:
        request_params = {}
        request_params.setdefault('data', data)
        request_params.setdefault('headers', headers)
        request_params.setdefault('cookies', cookies)
        request_params.setdefault('proxies', proxies)
        request_params.setdefault('timeout', 60)
        try:
            resp = session.post(url, **request_params)
            if resp.status_code == 521:
                # Anti-crawl challenge: keep solving until it succeeds or
                # the challenge budget is exhausted; rebinds global cookies.
                while usages_521 < retries:
                    success, _, cookies = http_session_521(session, url, headers, cookies, data=data, proxies=proxies)
                    if success:
                        break
                    msg = f"反爬破解失败,次数:{usages_521}"
                    logger.warning(msg)
                    time.sleep(1)
                    usages_521 += 1
                # NOTE(review): counted as one outer attempt; the loop then
                # re-posts with the refreshed cookies — confirm intent.
                usages += 1
            elif resp.status_code in [401, 403, 404]:
                msg = f"账号登录已失效或封停,异常状态码:{resp.status_code}"
                logger.error(msg)
                break
            elif str(resp.status_code).startswith('4'):
                msg = f"公网IP被封禁,异常状态码:{resp.status_code}"
                logger.error(msg)
                break
            else:
                # Success (or a non-4xx status worth returning to the caller).
                break
        except requests.RequestException as e:
            msg = f"访问失败,原因:{e.__class__.__name__}"
            logger.error(msg)
            usages += 1
    return resp, msg
  85. def spider(area: str, type_: int, page: int, **kwargs):
  86. request_status = 'failure' # 资源请求结果状态, 成功=success 失败=failure 停止=stop 封停=disable
  87. results = []
  88. begin_time = kwargs.pop('begin_time')
  89. end_time = kwargs.pop('end_time')
  90. max_per_page = kwargs.pop('max_per_page', 20)
  91. url = "https://search.vip.qianlima.com/rest/service/website/search/solr"
  92. data = REQUEST_DATA_MAP[type_]
  93. data['newAreas'] = area # 设置地区
  94. data['currentPage'] = page # 页码
  95. data['numPerPage'] = max_per_page # 每页的条目数
  96. data['timeType'] = 4 # 自定义时间参数
  97. data['beginTime'] = begin_time # 开始时间,格式:xxxx-xx-xxx
  98. data['endTime'] = end_time # 结束时间,格式:xxxx-xx-xxx
  99. data = json.dumps(data)
  100. response, err = crawl_request(url, data)
  101. if response is None:
  102. request_status = 'server_error'
  103. return request_status, err
  104. row_count = 0
  105. if response.status_code == 200:
  106. resp_json = response.json()
  107. if resp_json['code'] == 200:
  108. row_count = resp_json["data"]["rowCount"]
  109. items = resp_json["data"]["data"]
  110. for item in items:
  111. cid = sha1(str(item["contentid"]))
  112. if not r.hexists(redis_key, cid):
  113. r.hset(redis_key, cid, '')
  114. if "popTitle" in item:
  115. item["title"] = item["popTitle"]
  116. else:
  117. item["title"] = item["showTitle"]
  118. addr = str(item["areaName"]).split('-')
  119. _area = addr[0] if len(addr) > 0 else ''
  120. _city = addr[1] if len(addr) > 1 else ''
  121. if "国土" in item.get('progName', ''):
  122. channel = item['progName']
  123. else:
  124. channel = (item['noticeSegmentTypeName'] or item['progName'])
  125. results.append({
  126. 'site': '千里马',
  127. 'channel': channel,
  128. 'area': _area,
  129. 'city': _city,
  130. 'title': item["title"],
  131. 'publishtime': item['updateTime'],
  132. 'href': item.get('url', '')
  133. })
  134. request_status = 'success'
  135. if len(items) < max_per_page:
  136. request_status = 'stop'
  137. else:
  138. '''
  139. {
  140. "code": 200520,
  141. "msg": "抱歉,您在单位时间内的搜索次数已达上限,请联系客服购买会员!咨询电话:400-688-2000",
  142. "data": null
  143. }
  144. '''
  145. err = resp_json['msg']
  146. logger.info(err)
  147. elif response.status_code in [401, 403, 404]:
  148. request_status = 'disable'
  149. elif response.status_code == 405:
  150. request_status = 'method_not_allowed'
  151. elif str(response.status_code).startswith('4'):
  152. request_status = 'client_ip_disable'
  153. if len(results) > 0:
  154. qlm.insert_many(results)
  155. if request_status in ['stop', 'success']:
  156. logger.info("{}-{}-{}-共{}条-第{}页,成功上传{}条数据".format(
  157. begin_time,
  158. city_dict.get(int(area)),
  159. channel_dict.get(type_),
  160. row_count,
  161. page,
  162. len(results))
  163. )
  164. return request_status, err
def downloader(area="", type_=0, **kwargs):
    """Page through one (city, category) search, persisting via ``spider``.

    Walks pages 1..100, retrying transient failures indefinitely and
    aborting on fatal conditions.  Raises ``AccountLoginExpirationError``
    after the loop if any warning-worthy condition was seen, so the
    caller (``select_area``) can stop the whole crawl.

    Args:
        area: city code, forwarded to ``spider``.
        type_: notice category 1–4.
        **kwargs: passed through to ``spider`` (begin_time/end_time/...).
    """
    close_spider = False   # fatal condition: stop paging entirely
    send_warning = False   # an alert should be pushed to WeChat Work
    reason = ''
    retry_times, max_retries = 0, 3
    pages = list(range(1, 101))  # qlm only exposes the first 10,000 rows (100 pages)
    while len(pages) > 0:
        if close_spider:
            if send_warning:
                send_wechat_warning(reason)
            break
        if send_warning and retry_times > max_retries:
            # Notification hook (email / WeChat Work bot) for crawl failures.
            send_wechat_warning(reason)
            break
        page = pages.pop(0)
        logger.info(f"访问-{city_dict.get(int(area))}-{channel_dict.get(type_)}-第{page}页数据")
        while True:
            err, reason = spider(area, type_, page, **kwargs)
            if err in ['server_error', 'client_ip_disable']:
                # Unrecoverable for this run: stop and alert.
                close_spider = True
                send_warning = True
            elif err == 'failure':
                # Transient failure: back off a random ~6.6–11.2s and retry
                # the same page (no retry cap on this path).
                interval = math.log(random.randint(100, 2400), 2)
                logger.debug(f'异常重试,等待{interval}s')
                time.sleep(interval)
                continue
            elif err == 'disable':
                logger.warning(f"账号被禁止访问-{city_dict.get(int(area))}-第{page}页数据")
                retry_times += 1
                send_warning = True
            elif err == 'method_not_allowed':
                logger.warning("服务器禁止使用当前 HTTP 方法的请求")
                retry_times += 1
                send_warning = True
            elif err == 'stop':
                # Last (short) page reached: finish this (city, category).
                close_spider = True
            else:
                logger.info(f"{city_dict.get(int(area))}-{channel_dict.get(type_)}-第{page}页数据采集成功")
                time.sleep(math.log(random.randint(100, 2400), 2))
            break
    # NOTE(review): raising here propagates to select_area, which treats it
    # as "abort the whole day's crawl" — confirm this is the intended scope.
    if send_warning:
        raise AccountLoginExpirationError
  208. def select_types(date: str, area: str, prov: str):
  209. for type_ in [1, 2, 3, 4]:
  210. downloader(
  211. area=area,
  212. type_=type_,
  213. begin_time=date,
  214. end_time=date,
  215. max_per_page=100
  216. )
  217. tips = [
  218. f'{date}',
  219. f'{channel_dict.get(type_)}',
  220. f'{province_dict.get(int(prov))}',
  221. '完成采集'
  222. ]
  223. logger.info(f"+++ {' && '.join(tips)} +++ ")
  224. def select_area(date: str):
  225. logger.info(f" +++ 开始采集 - {date} +++")
  226. try:
  227. for province in range(1, 32):
  228. for city_ in area_dict.get(province):
  229. select_types(date, area=str(city_), prov=str(province))
  230. return True
  231. except AccountLoginExpirationError:
  232. return False
  233. finally:
  234. logger.info(f" +++ 采集结束 - {date} +++")
  235. def history(date_lst: list):
  236. for date in date_lst:
  237. success = select_area(date)
  238. if not success:
  239. break
  240. def start():
  241. date = get_today_of_day(-1)
  242. select_area(date)
# Script entry point: run the daily (yesterday) crawl.
if __name__ == '__main__':
    start()