# coding: utf-8
"""Qianlima (千里马) tender-announcement crawler.

Pages through the search API at search.vip.qianlima.com, de-duplicates
records through a Redis hash and stores new items in MongoDB.
"""
import datetime
import json
import random
import time

import requests

from utils.databases import mongo_table, redis_client
from utils.log import logger
from utils.sessions_521 import http_session_521
from utils.tools import sha1

qlm = mongo_table('qlm', 'data_merge')  # MongoDB collection for merged results
r = redis_client()
redis_key = "qianlima_2022"  # Redis hash used as a seen-content-id set

# NOTE(review/security): the headers and cookies below embed a live account
# token, username and password hash in plain text. They should be loaded from
# configuration/secrets storage instead of being committed to source.
headers = {
    "Accept": "*/*",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Content-Type": "application/json",
    "Origin": "http://search.vip.qianlima.com",
    "Pragma": "no-cache",
    "Referer": "http://search.vip.qianlima.com/index.html",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36",
    "X-Auth-Token": "6831ba66-f264-45b7-acaf-c0f7a85a12ce"
}
cookies = {
    "BAIDU_SSP_lcr": "https://www.google.com/",
    "guest_id": "0124edcd-1edc-4434-ae60-5d44662dced0",
    "Hm_lvt_5dc1b78c0ab996bd6536c3a37f9ceda7": "1653874874",
    "Hm_lpvt_5dc1b78c0ab996bd6536c3a37f9ceda7": "1653874874",
    "seo_refUrl": "https%3A//www.google.com/",
    "seo_curUrl": "www.qianlima.com",
    "qlm_referrer": "https://www.google.com/",
    "qlm_rem_login": "1",
    "qlm_username": "17610673271",
    "qlm_password": "fmgj83E8oBfKRmKCBRuUR83pCB8pCCfE",
    "17610673271fmgj83E8oBfKRmKCBRuUR83pCB8pCCfE": "bc62b056-43d8-4f22-bfa6-606f59c2bbde",
    "source": "1",
    "useragent_hash": "1af3d6d4fbb7947d107b0170a309f510",
    "xAuthToken": "6831ba66-f264-45b7-acaf-c0f7a85a12ce",
    "login_time": "1653876714",
    "login_ip": "1.192.62.141",
    "__jsluid_h": "7d0d080a30094eb57be38e4c09dd4a3b",
    "userInfo": "{%22userId%22:10609848%2C%22username%22:%2217610673271%22%2C%22userIcon%22:%22%22%2C%22linkName%22:%22%E8%91%A3%E5%85%88%E7%94%9F%22%2C%22companyName%22:%22%E5%90%88%E8%82%A5%E6%8B%93%E6%99%AE%E7%BD%91%E7%BB%9C%E7%B3%BB%E7%BB%9F%E5%B7%A5%E7%A8%8B%E6%9C%89%E9%99%90%E8%B4%A3%E4%BB%BB%E5%85%AC%E5%8F%B8%22%2C%22areaId%22:%222703%22%2C%22areaName%22:%22%E5%85%A8%E5%9B%BD%22%2C%22roleId%22:1%2C%22roleName%22:%22%E7%AE%A1%E7%90%86%E5%91%98%22%2C%22sex%22:%22m%22%2C%22expireDate%22:%22%E6%97%A0%22%2C%22isExpired%22:null%2C%22maxChildCount%22:0%2C%22isUsedCount%22:0%2C%22userStatus%22:1%2C%22memberLevel%22:5%2C%22memberLevelName%22:%22%E5%85%8D%E8%B4%B9%E6%B3%A8%E5%86%8C%E4%BC%9A%E5%91%98%22%2C%22registerTime%22:%222022-05-30%22%2C%22isSuperSupplier%22:0%2C%22isNewUser%22:1%2C%22welcomeMsg%22:%22%E6%AC%A2%E8%BF%8E%E8%BF%9B%E5%85%A5%E5%8D%83%E9%87%8C%E9%A9%AC%E6%8B%9B%E6%A0%87%E7%BD%91%EF%BD%9E%22%2C%22customerServiceInfo%22:{%22id%22:42%2C%22customerServiceName%22:%22%E5%8D%83%E9%87%8C%E9%A9%AC%E5%AE%A2%E6%9C%8D%22%2C%22weChatIcon%22:%22http://img_al.qianlima.com/invoice/1588986761_8ebeade70a.jpg%22%2C%22customerServicePhone%22:%2217718573953%22%2C%22customerServiceQQ%22:%22%22%2C%22customerServiceEmail%22:%22qianlima_service@qianlima.com%22%2C%22deptType%22:0}%2C%22shouji%22:%2217610673271%22%2C%22email%22:%22%22%2C%22dwmc%22:%22%E5%90%88%E8%82%A5%E6%8B%93%E6%99%AE%E7%BD%91%E7%BB%9C%E7%B3%BB%E7%BB%9F%E5%B7%A5%E7%A8%8B%E6%9C%89%E9%99%90%E8%B4%A3%E4%BB%BB%E5%85%AC%E5%8F%B8%22%2C%22zhiwu%22:%22%E4%BA%A7%E5%93%81%E7%BB%8F%E7%90%86%22%2C%22types%22:1%2C%22isPayBefore%22:0%2C%22memberOpenTime%22:null%2C%22businessUserType%22:null%2C%22businessCompanyName%22:null%2C%22isBusinessUser%22:null}",
    "__jsl_clearance": "1653887986.259|0|v4obplHHBSE1zJOt5WFKuV0TdIM%3D"
}

session = requests.session()


def delay_by_day(days, fmt="%Y-%m-%d"):
    """Return the current local date shifted by ``days`` days, as a string.

    :param days: offset in days (coerced with ``int``; may be negative)
    :param fmt: ``strftime`` format for the returned date string
    :return: formatted date string
    """
    _days = int(days)
    _current_now = datetime.datetime.now()
    return (_current_now + datetime.timedelta(days=_days)).strftime(fmt)


def crawl_request(url, data, retries=5):
    """POST ``data`` to ``url`` through the shared session, with retries.

    Handles the site's HTTP 521 anti-bot challenge by delegating to
    ``http_session_521`` (which may replace the module-level ``cookies``).
    Stops immediately on 401/403/404 (account invalid or banned).

    :param url: endpoint URL
    :param data: JSON-encoded request body
    :param retries: maximum attempts for both the outer request loop and
        the inner 521-challenge loop
    :return: the last ``requests.Response`` obtained, or ``None`` if no
        request ever completed
    """
    global session, cookies
    resp = None
    usages, usages_521 = 0, 1
    while usages < retries:
        request_params = {}
        request_params.setdefault('data', data)
        request_params.setdefault('headers', headers)
        request_params.setdefault('cookies', cookies)
        request_params.setdefault('timeout', 60)
        try:
            resp = session.post(url, **request_params)
            if resp.status_code == 521:
                # Anti-bot challenge: try to solve it and refresh cookies,
                # then fall through and re-issue the original request.
                while usages_521 < retries:
                    success, _, cookies = http_session_521(session, url, headers, cookies, data=data)
                    if success:
                        break
                    logger.info(f"反爬破解失败,次数:{usages_521}")
                    time.sleep(1)
                    usages_521 += 1
                usages += 1
            elif resp.status_code in [401, 403, 404]:
                logger.info(f"账号登录已失效或封停,异常状态码:{resp.status_code}")
                break
            else:
                # Any other status (including 200) is returned to the caller.
                break
        except requests.RequestException as e:
            logger.error(f"访问失败,失败原因:{e.__class__.__name__}")
            usages += 1
    return resp


def crawl_spider(area: str, page: int, **kwargs):
    """Fetch one search-result page and store previously unseen items.

    :param area: area filter for the search ("" means all areas)
    :param page: 1-based page number to request
    :param kwargs: optional ``begin_time`` / ``end_time`` (``YYYY-MM-DD``,
        default today) and ``max_page`` (items per page, default 20)
    :return: request status string — 'success' (page stored), 'stop' (short
        page, i.e. last page reached), 'disable' (account blocked) or
        'failure' (request/quota error)
    """
    curr_date = delay_by_day(0)
    begin_time = kwargs.pop('begin_time', curr_date)
    end_time = kwargs.pop('end_time', curr_date)
    max_page = kwargs.pop('max_page', 20)
    results = []
    # Status values: success / failure / stop (last page) / disable (banned)
    request_status = 'failure'
    data = {
        "keywords": "",
        "timeType": "4",  # 4 = custom time range
        "beginTime": begin_time,  # format: YYYY-MM-DD
        "endTime": end_time,  # format: YYYY-MM-DD
        "filtermode": 8,
        "searchMode": 0,
        "currentPage": page,  # page number
        "numPerPage": max_page,  # items per page
        "sortType": "1",
        "allType": -1,
        "noticeSegmentTypeStr": "",
        "beginAmount": "",
        "endAmount": "",
        "purchasingUnitIdList": "",
        "threeClassifyTagStr": "",
        "fourLevelCategoryIdListStr": "",
        "threeLevelCategoryIdListStr": "",
        "levelId": "",
        "searchDataType": 0,
        "types": "-1",
        "showContent": 1,
        "hasTenderTransferProject": 1,
        "newAreas": area,  # area filter
        "hasChooseSortType": 1,
        "summaryType": 0
    }
    data = json.dumps(data)
    url = "http://search.vip.qianlima.com/rest/service/website/search/solr"
    response = crawl_request(url, data)
    if response is not None and response.status_code == 200:
        resp_json = response.json()
        if resp_json['code'] == 200:
            items = resp_json["data"]["data"]
            for item in items:
                tmid = sha1(str(item["contentid"]))
                # Redis hash acts as a seen-set: skip already-collected ids.
                if not r.hexists(redis_key, tmid):
                    r.hset(redis_key, tmid, '')
                    if "popTitle" in item:
                        item["title"] = item["popTitle"]
                    else:
                        item["title"] = item["showTitle"]
                    # areaName looks like "省-市"; split into province/city.
                    addr = str(item["areaName"]).split('-')
                    _area = addr[0] if len(addr) > 0 else ''
                    _city = addr[1] if len(addr) > 1 else ''
                    channel = (item['noticeSegmentTypeName'] or item['progName'])
                    res = {
                        'site': '千里马',
                        'channel': channel,
                        'area': _area,
                        'city': _city,
                        'title': item["title"],
                        'publishtime': item['updateTime'],
                        'competehref': item.get('url', '')
                    }
                    results.append(res)
            request_status = 'success'
            # A short page means we have reached the last page of results.
            if len(items) < max_page:
                request_status = 'stop'
        else:
            # Example quota-exceeded response:
            # { "code": 200520,
            #   "msg": "抱歉,您在单位时间内的搜索次数已达上限,请联系客服购买会员!咨询电话:400-688-2000",
            #   "data": null }
            logger.info(resp_json['msg'])
    elif response is not None and response.status_code in [401, 403, 404]:
        request_status = 'disable'
    if len(results) > 0:
        qlm.insert_many(results)
    if request_status in ['stop', 'success']:
        # BUGFIX: arguments were (area, len(results), page) — the page number
        # landed in the record-count slot and vice versa.
        logger.info("第{}区第{}页成功上传{}条数据".format(area, page, len(results)))
    return request_status


def start(**kwargs):
    """Drive the crawl over pages 1..1063, with backoff and abort logic.

    Retries a failed page with a growing (capped) random sleep; aborts the
    whole run after ``max_disable_page`` account-disabled pages or when a
    page signals 'stop' (last page reached).

    :param kwargs: forwarded to ``crawl_spider`` (``begin_time``,
        ``end_time``, ``max_page``)
    """
    pages = list(range(1, 1064))
    close_spider = False
    disable_page, max_disable_page = 0, 3
    while len(pages) > 0:
        if close_spider:
            break
        elif disable_page > max_disable_page:
            # TODO: hook in email / WeChat-Work alerting here so a human can
            # investigate repeated account-disabled responses.
            break
        page = pages.pop(0)
        interval, max_interval = 2, 60
        logger.info(f"访问第{page}页数据")
        while True:
            success = crawl_spider("", page, **kwargs)
            if success == 'failure':
                # Backoff: random sleep up to `interval`, growing by 3s per
                # failure and capped at `max_interval`.
                interval = max_interval if interval > max_interval else interval
                time.sleep(random.randint(1, interval))
                interval += 3
                continue
            elif success == 'disable':
                logger.info(f"当前账号在第{page}页数据被禁止访问")
                disable_page += 1
                break
            elif success == 'stop':
                close_spider = True
            logger.info(f"第{page}页数据采集成功")
            break
    logger.info("采集任务结束")


if __name__ == '__main__':
    start(begin_time="2022-05-21", end_time="2022-05-28", max_page=1000)