# source_qianlima.py -- qianlima tender-search crawler
# coding: utf-8
import datetime
import json
import math
import random
import time

import requests

# Project-local helpers: DB handles, logger, 521 anti-bot bypass, sha1 hashing.
from utils.databases import mongo_table, redis_client
from utils.log import logger
from utils.sessions_521 import http_session_521
from utils.tools import sha1
# MongoDB collection receiving the scraped rows, plus a Redis hash used to
# de-duplicate announcements by sha1(contentid).
qlm = mongo_table('qlm', 'data_merge')
r = redis_client()
redis_key = "qianlima_2022"  # name of the Redis de-dup hash
# Static headers for the qianlima VIP search API.
# NOTE(review): X-Auth-Token is a hard-coded session credential and will
# expire/rotate — it should be loaded from configuration, not source code.
headers = {
    "Accept": "*/*",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Content-Type": "application/json",
    "Origin": "http://search.vip.qianlima.com",
    "Pragma": "no-cache",
    "Referer": "http://search.vip.qianlima.com/index.html",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36",
    "X-Auth-Token": "7f15af45-30c3-4bee-8a89-1b2813100aaf"
}
# Captured login cookies; rebound by crawl_request() when the 521 anti-bot
# helper hands back a refreshed cookie jar.
# NOTE(review): this embeds a live username/password and session tokens in
# source control — move these secrets to configuration.
cookies = {
    "BAIDU_SSP_lcr": "https://www.google.com/",
    "guest_id": "0124edcd-1edc-4434-ae60-5d44662dced0",
    "Hm_lvt_5dc1b78c0ab996bd6536c3a37f9ceda7": "1653874874",
    "seo_curUrl": "www.qianlima.com",
    "source": "1",
    "login_ip": "1.192.62.141",
    "__jsluid_h": "7d0d080a30094eb57be38e4c09dd4a3b",
    "Hm_lpvt_5dc1b78c0ab996bd6536c3a37f9ceda7": "1654132585",
    "seo_refUrl": "http%3A//www.qianlima.com/zbgg/p2",
    "15637008265fmgj83E8oBfKRmKCBRuUR83pCB8pCCfE": "3e048e44-2e33-4936-b2d1-00784cb48e60",
    "useragent_hash": "32bd84bdee2cfe920dda80f92fa20070",
    "qlm_rem_login": "1",
    "qlm_username": "17610673271",
    "qlm_password": "fmgj83E8oBfKRmKCBRuUR83pCB8pCCfE",
    "17610673271fmgj83E8oBfKRmKCBRuUR83pCB8pCCfE": "b2b29454-a5f4-4fb0-ad76-cc2be246cac9",
    "xAuthToken": "7f15af45-30c3-4bee-8a89-1b2813100aaf",
    "login_time": "1654498455",
    "userInfo": "{%22userId%22:10609848%2C%22username%22:%2217610673271%22%2C%22userIcon%22:%22%22%2C%22linkName%22:%22%E8%91%A3%E5%85%88%E7%94%9F%22%2C%22companyName%22:%22%E5%90%88%E8%82%A5%E6%8B%93%E6%99%AE%E7%BD%91%E7%BB%9C%E7%B3%BB%E7%BB%9F%E5%B7%A5%E7%A8%8B%E6%9C%89%E9%99%90%E8%B4%A3%E4%BB%BB%E5%85%AC%E5%8F%B8%22%2C%22areaId%22:%222703%22%2C%22areaName%22:%22%E5%85%A8%E5%9B%BD%22%2C%22roleId%22:1%2C%22roleName%22:%22%E7%AE%A1%E7%90%86%E5%91%98%22%2C%22sex%22:%22m%22%2C%22expireDate%22:%22%E6%97%A0%22%2C%22isExpired%22:null%2C%22maxChildCount%22:0%2C%22isUsedCount%22:0%2C%22userStatus%22:1%2C%22memberLevel%22:5%2C%22memberLevelName%22:%22%E5%85%8D%E8%B4%B9%E6%B3%A8%E5%86%8C%E4%BC%9A%E5%91%98%22%2C%22registerTime%22:%222022-05-30%22%2C%22isSuperSupplier%22:0%2C%22isNewUser%22:1%2C%22welcomeMsg%22:%22%E6%AC%A2%E8%BF%8E%E8%BF%9B%E5%85%A5%E5%8D%83%E9%87%8C%E9%A9%AC%E6%8B%9B%E6%A0%87%E7%BD%91%EF%BD%9E%22%2C%22customerServiceInfo%22:{%22id%22:42%2C%22customerServiceName%22:%22%E5%8D%83%E9%87%8C%E9%A9%AC%E5%AE%A2%E6%9C%8D%22%2C%22weChatIcon%22:%22http://img_al.qianlima.com/invoice/1588986761_8ebeade70a.jpg%22%2C%22customerServicePhone%22:%2217718573953%22%2C%22customerServiceQQ%22:%22%22%2C%22customerServiceEmail%22:%22qianlima_service@qianlima.com%22%2C%22deptType%22:0}%2C%22shouji%22:%2217610673271%22%2C%22email%22:%22%22%2C%22dwmc%22:%22%E5%90%88%E8%82%A5%E6%8B%93%E6%99%AE%E7%BD%91%E7%BB%9C%E7%B3%BB%E7%BB%9F%E5%B7%A5%E7%A8%8B%E6%9C%89%E9%99%90%E8%B4%A3%E4%BB%BB%E5%85%AC%E5%8F%B8%22%2C%22zhiwu%22:%22%E4%BA%A7%E5%93%81%E7%BB%8F%E7%90%86%22%2C%22types%22:1%2C%22isPayBefore%22:0%2C%22memberOpenTime%22:null%2C%22businessUserType%22:null%2C%22businessCompanyName%22:null%2C%22isBusinessUser%22:null}"
}
# Shared HTTP session so cookies / connections persist across requests
# (rebound together with ``cookies`` by the 521 bypass in crawl_request).
session = requests.session()
# Tender phase legend (keys of REQUEST_DATA_MAP below):
#   0 = all, 1 = tender notices, 2 = award notices, 3 = procurement intents
'''
招标阶段
0 = 全部
1 = 招标信息
2 = 中标信息
3 = 采购意向
'''
# Search payload templates keyed by tender phase (see legend above).
# The crawler fills in newAreas / currentPage / numPerPage / beginTime /
# endTime before serializing a template to JSON for each request.
REQUEST_DATA_MAP = {
    0: {"keywords": "", "timeType": 4, "beginTime": "2022-12-07", "endTime": "2022-12-07", "filtermode": "8", "searchMode": 0, "currentPage": 1, "numPerPage": 20, "sortType": 1, "allType": -1, "noticeSegmentTypeStr": "", "beginAmount": "", "endAmount": "", "purchasingUnitIdList": "", "threeClassifyTagStr": "", "fourLevelCategoryIdListStr": "", "threeLevelCategoryIdListStr": "", "levelId": "", "searchDataType": 0, "types": "-1", "showContent": 1, "hasTenderTransferProject": 1, "newAreas": "1", "hasChooseSortType": 1, "summaryType": 0},
    1: {"keywords": "", "timeType": 4, "beginTime": "2022-12-07", "endTime": "2022-12-07", "filtermode": "8", "searchMode": 0, "currentPage": 1, "numPerPage": 20, "sortType": 1, "allType": "0", "beginAmount": "", "endAmount": "", "purchasingUnitIdList": "", "threeClassifyTagStr": "", "fourLevelCategoryIdListStr": "", "threeLevelCategoryIdListStr": "", "levelId": "", "searchDataType": 1, "types": -1, "showContent": 1, "newAreas": "", "hasChooseSortType": 1, "progIdAndNoticeSegmentTypeMaps": {"0": [], "1": []}, "summaryType": 0},
    2: {"keywords": "", "timeType": 4, "beginTime": "2022-12-07", "endTime": "2022-12-07", "filtermode": "8", "searchMode": 0, "currentPage": 1, "numPerPage": 20, "sortType": 1, "allType": 3, "beginAmount": "", "endAmount": "", "purchasingUnitIdList": "", "threeClassifyTagStr": "", "fourLevelCategoryIdListStr": "", "threeLevelCategoryIdListStr": "", "levelId": "", "searchDataType": 1, "types": 3, "showContent": 1, "newAreas": "", "hasChooseSortType": 1, "progIdAndNoticeSegmentTypeMaps": {"3": []}, "summaryType": 0},
    3: {"keywords": "", "timeType": 4, "beginTime": "2022-12-07", "endTime": "2022-12-07", "filtermode": "8", "searchMode": 0, "currentPage": 1, "numPerPage": 20, "sortType": 1, "allType": 99, "beginAmount": "", "endAmount": "", "purchasingUnitIdList": "", "threeClassifyTagStr": "", "fourLevelCategoryIdListStr": "", "threeLevelCategoryIdListStr": "", "levelId": "", "searchDataType": 1, "types": 99, "showContent": 1, "newAreas": "", "hasChooseSortType": 1, "progIdAndNoticeSegmentTypeMaps": {"99": []}, "summaryType": 0}
}
  61. def delay_by_day(days, fmt="%Y-%m-%d"):
  62. """按天延时"""
  63. _days = int(days)
  64. _current_now = datetime.datetime.now()
  65. return (_current_now + datetime.timedelta(days=_days)).strftime(fmt)
def crawl_request(url, data, retries=5):
    """POST *data* to *url*, retrying on errors and 521 anti-bot challenges.

    :param url: endpoint URL.
    :param data: JSON-encoded request body (already serialized by the caller).
    :param retries: attempt budget for both the outer request loop and the
        inner 521-bypass loop.
    :return: the last ``requests.Response`` received, or ``None`` when every
        attempt raised a network error.
    """
    # ``cookies`` is rebound when the 521 helper returns a fresh cookie jar,
    # hence the ``global`` declaration.
    global session, cookies
    resp = None
    usages, usages_521 = 0, 1
    while usages < retries:
        request_params = {}
        request_params.setdefault('data', data)
        request_params.setdefault('headers', headers)
        request_params.setdefault('cookies', cookies)
        request_params.setdefault('timeout', 60)
        try:
            resp = session.post(url, **request_params)
            if resp.status_code == 521:
                # JavaScript anti-bot challenge: let the 521 helper refresh
                # the session cookies, then retry the original request.
                while usages_521 < retries:
                    success, _, cookies = http_session_521(session, url, headers, cookies, data=data)
                    if success:
                        break
                    logger.warning(f"反爬破解失败,次数:{usages_521}")
                    time.sleep(1)
                    usages_521 += 1
                usages += 1
            elif resp.status_code in [401, 403, 404]:
                # Account expired or banned - retrying will not help.
                logger.error(f"账号登录已失效或封停,异常状态码:{resp.status_code}")
                break
            else:
                break
        except requests.RequestException as e:
            logger.error(f"访问失败,失败原因:{e.__class__.__name__}")
            usages += 1
    return resp
  97. def crawl_spider(area: str, type_: int, page: int, **kwargs):
  98. results = []
  99. request_status = 'failure' # 资源请求结果, 成功=success 失败=failure 停止=stop 账号封停=disable
  100. curr_date = delay_by_day(0)
  101. begin_time = kwargs.pop('begin_time', curr_date)
  102. end_time = kwargs.pop('end_time', curr_date)
  103. max_per_page = kwargs.pop('max_page', 20)
  104. data = REQUEST_DATA_MAP[type_]
  105. data['newAreas'] = area # 设置地区
  106. data['currentPage'] = page # 页码
  107. data['numPerPage'] = max_per_page # 每页的条目数
  108. data['timeType'] = 4 # 自定义时间参数
  109. data['beginTime'] = begin_time # 开始时间,格式:xxxx-xx-xxx
  110. data['endTime'] = end_time # 结束时间,格式:xxxx-xx-xxx
  111. data = json.dumps(data)
  112. url = "https://search.vip.qianlima.com/rest/service/website/search/solr"
  113. response = crawl_request(url, data)
  114. row_count = 0
  115. if response is not None and response.status_code == 200:
  116. resp_json = response.json()
  117. if resp_json['code'] == 200:
  118. row_count = resp_json["data"]["rowCount"]
  119. # print(row_count)
  120. items = resp_json["data"]["data"]
  121. for item in items:
  122. cid = sha1(str(item["contentid"]))
  123. if not r.hexists(redis_key, cid):
  124. r.hset(redis_key, cid, '')
  125. if "popTitle" in item:
  126. item["title"] = item["popTitle"]
  127. else:
  128. item["title"] = item["showTitle"]
  129. addr = str(item["areaName"]).split('-')
  130. _area = addr[0] if len(addr) > 0 else ''
  131. _city = addr[1] if len(addr) > 1 else ''
  132. channel = (item['noticeSegmentTypeName'] or item['progName'])
  133. res = {
  134. 'site': '千里马',
  135. 'channel': channel,
  136. 'area': _area,
  137. 'city': _city,
  138. 'title': item["title"],
  139. 'publishtime': item['updateTime'],
  140. 'href': item.get('url', '')
  141. }
  142. results.append(res)
  143. request_status = 'success'
  144. if len(items) < max_per_page:
  145. request_status = 'stop'
  146. else:
  147. '''
  148. {
  149. "code": 200520,
  150. "msg": "抱歉,您在单位时间内的搜索次数已达上限,请联系客服购买会员!咨询电话:400-688-2000",
  151. "data": null
  152. }
  153. '''
  154. logger.info(resp_json['msg'])
  155. elif response is not None and response.status_code in [401, 403, 404]:
  156. request_status = 'disable'
  157. elif response is not None and response.status_code == 405:
  158. request_status = 'method_not_allowed'
  159. if len(results) > 0:
  160. qlm.insert_many(results)
  161. if request_status in ['stop', 'success']:
  162. logger.info("{}-第{}区-第{}类{}条-第{}页,成功上传{}条数据".format(
  163. begin_time,
  164. area,
  165. type_,
  166. page,
  167. row_count,
  168. len(results))
  169. )
  170. return request_status
def by_area_crawl_data(area="", type_=0, **kwargs):
    """Crawl every result page for one area / tender-phase combination.

    Pages 1-100 are fetched in order; a 'failure' status is retried with a
    randomized back-off, 'stop' (short page) ends the run, and accumulated
    'disable'/'method_not_allowed' statuses abort after ``max_disable_page``
    occurrences.

    :param area: area id (stringified int) forwarded to ``crawl_spider``.
    :param type_: tender-phase key into ``REQUEST_DATA_MAP``.
    :param kwargs: extra keywords forwarded to ``crawl_spider``
        (``begin_time``, ``end_time``, ``max_page``).
    """
    close_spider = False
    disable_page, max_disable_page = 0, 3
    pages = list(range(1, 101))  # search only exposes the first 10000 records
    while len(pages) > 0:
        if close_spider:
            break
        elif disable_page > max_disable_page:
            # Hook point: add an e-mail / chat-bot notification about the abort.
            break
        page = pages.pop(0)
        logger.info(f"访问第{area}区-第{type_}类-第{page}页数据")
        while True:
            success = crawl_spider(area, type_, page, **kwargs)
            if success == 'failure':
                # Randomized back-off (log2 of 100..2400, i.e. ~6.6-11.2s)
                # before retrying the same page.
                interval = math.log(random.randint(100, 2400), 2)
                logger.debug(f'异常重试,等待{interval}s')
                time.sleep(interval)
                continue
            elif success == 'disable':
                logger.warning(f"账号被禁止访问第{area}区-第{page}页数据")
                disable_page += 1
            elif success == 'method_not_allowed':
                logger.warning("服务器禁止使用当前 HTTP 方法的请求")
                disable_page += 1
            elif success == 'stop':
                close_spider = True
            else:
                logger.info(f"第{area}区-第{page}页数据采集成功")
            break
  201. def select_types(date: str, area: str):
  202. for type_ in [1, 2, 3]:
  203. by_area_crawl_data(
  204. area=area,
  205. type_=type_,
  206. begin_time=date,
  207. end_time=date,
  208. max_page=100
  209. )
  210. logger.info(f"{date}-第{area}区-第{type_}类采集结束")
  211. def select_area(date: str):
  212. for area in range(1, 32):
  213. select_types(date, str(area))
  214. logger.info(f"任务结束")
  215. def history(date_lst: list):
  216. for date in date_lst:
  217. select_area(date)
  218. def start():
  219. date_str = delay_by_day(-1)
  220. select_area(date_str)
# Script entry point: crawl yesterday's data across all areas and phases.
if __name__ == '__main__':
    start()