|
@@ -1,99 +1,216 @@
|
|
|
# coding: utf-8
|
|
|
import datetime
import json
import os
import random
import time

import requests

from utils.databases import mongo_table, redis_client
from utils.log import logger
from utils.sessions_521 import http_session_521
from utils.tools import sha1
|
|
|
|
|
|
-qlm = mongo_table('qlm', 'qlm_2021')
|
|
|
# MongoDB collection that receives the normalized records.
qlm = mongo_table('qlm', 'data_merge')
# Redis client; the hash below remembers which content ids were already stored.
r = redis_client()
redis_key = "qianlima_2022"

# SECURITY: the login name, password hash, auth token and session cookies of a
# real account used to be hard-coded here. Credentials must not live in source
# control, so they are read from the environment instead:
#   QLM_AUTH_TOKEN - value sent as the X-Auth-Token header / xAuthToken cookie
#   QLM_COOKIES    - JSON object mapping cookie names to values for the
#                    logged-in browser session
# Any credential that was ever committed should be treated as leaked and rotated.
_AUTH_TOKEN = os.environ.get("QLM_AUTH_TOKEN", "")

headers = {
    "Accept": "*/*",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Content-Type": "application/json",
    "Origin": "http://search.vip.qianlima.com",
    "Pragma": "no-cache",
    "Referer": "http://search.vip.qianlima.com/index.html",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36",
    "X-Auth-Token": _AUTH_TOKEN,
}


def _load_cookies(raw):
    """Build the cookie jar: static defaults overlaid with the JSON in ``raw``.

    Args:
        raw: JSON text of an object mapping cookie names to values, or an
            empty string when no session cookies are configured.

    Returns:
        A ``dict`` of cookie name to string value.

    Raises:
        ValueError: If ``raw`` is not valid JSON or is not a JSON object.
    """
    # Non-identifying cookies that do not depend on the logged-in account.
    jar = {
        "BAIDU_SSP_lcr": "https://www.google.com/",
        "seo_refUrl": "https%3A//www.google.com/",
        "seo_curUrl": "www.qianlima.com",
        "qlm_referrer": "https://www.google.com/",
        "qlm_rem_login": "1",
        "source": "1",
    }
    if raw:
        parsed = json.loads(raw)
        if not isinstance(parsed, dict):
            raise ValueError("QLM_COOKIES must be a JSON object of cookie names to values")
        jar.update({str(name): str(value) for name, value in parsed.items()})
    return jar


cookies = _load_cookies(os.environ.get("QLM_COOKIES", ""))
if _AUTH_TOKEN:
    # The token is sent both as a header and as a cookie.
    cookies.setdefault("xAuthToken", _AUTH_TOKEN)

# One shared HTTP session so connections are reused across requests.
session = requests.session()
|
|
|
+
|
|
|
+
|
|
|
def delay_by_day(days, fmt="%Y-%m-%d", now=None):
    """Return the moment ``days`` days from a reference time, formatted with ``fmt``.

    Args:
        days: Offset in whole days; any value ``int()`` accepts. Negative
            values move backwards in time.
        fmt: ``strftime`` format used to render the result.
        now: Reference ``datetime.datetime``. Defaults to the current local
            time, which is what the function always used before this
            parameter existed.

    Returns:
        The shifted moment rendered as a string.
    """
    reference = datetime.datetime.now() if now is None else now
    return (reference + datetime.timedelta(days=int(days))).strftime(fmt)
|
|
|
+
|
|
|
+
|
|
|
def crawl_request(url, data, retries=5):
    """POST ``data`` to ``url`` through the shared session with bounded retries.

    Every pass of the loop consumes one attempt, so the function returns after
    at most ``retries`` requests. Previously a run of 521 responses could keep
    the loop spinning without ever advancing the attempt counter.

    A 521 response is handed to ``http_session_521`` (presumably an anti-bot
    cookie challenge; see that helper), the module-level ``cookies`` are
    replaced with whatever it returns, and the request is sent again.

    Args:
        url: Endpoint to POST to.
        data: Request body, passed through unchanged as ``data=``.
        retries: Maximum number of POST attempts. Each 521 response also gets
            up to ``retries - 1`` challenge-solving attempts.

    Returns:
        The last response received, or ``None`` if no attempt produced one.
        The status code is not validated here; callers must check it.
    """
    global cookies  # rebound with the cookies returned by the 521 helper

    resp = None
    for _attempt in range(retries):
        try:
            resp = session.post(
                url, data=data, headers=headers, cookies=cookies, timeout=60,
            )
            if resp.status_code != 521:
                if resp.status_code in (401, 403, 404):
                    logger.info(f"账号登录已失效或封停,异常状态码:{resp.status_code}")
                # Any non-521 answer is final; the caller inspects the status.
                break

            # Fresh, bounded budget for every 521 so a later challenge is
            # still attempted instead of being silently skipped.
            for crack_no in range(1, retries):
                success, _challenge_resp, cookies = http_session_521(
                    session, url, headers, cookies, data=data,
                )
                if success:
                    break
                logger.info(f"反爬破解失败,次数:{crack_no}")
                time.sleep(1)
        except requests.RequestException as e:
            logger.error(f"访问失败,失败原因:{e.__class__.__name__}")
    return resp
|
|
|
+
|
|
|
|
|
|
def _build_search_payload(area, page, begin_time, end_time, page_size):
    """Serialize the JSON request body for one page of the search endpoint."""
    return json.dumps({
        "keywords": "",
        "timeType": "4",  # custom time range
        "beginTime": begin_time,  # format: xxxx-xx-xx
        "endTime": end_time,  # format: xxxx-xx-xx
        "filtermode": 8,
        "searchMode": 0,
        "currentPage": page,  # page number
        "numPerPage": page_size,  # entries per page
        "sortType": "1",
        "allType": -1,
        "noticeSegmentTypeStr": "",
        "beginAmount": "",
        "endAmount": "",
        "purchasingUnitIdList": "",
        "threeClassifyTagStr": "",
        "fourLevelCategoryIdListStr": "",
        "threeLevelCategoryIdListStr": "",
        "levelId": "",
        "searchDataType": 0,
        "types": "-1",
        "showContent": 1,
        "hasTenderTransferProject": 1,
        "newAreas": area,  # region filter
        "hasChooseSortType": 1,
        "summaryType": 0
    })


def _to_record(item):
    """Map one raw search hit onto the document stored in MongoDB."""
    title = item["popTitle"] if "popTitle" in item else item["showTitle"]
    # areaName is split on '-'; the first part is stored as area, the second as city.
    addr = str(item["areaName"]).split('-')
    return {
        'site': '千里马',
        'channel': item['noticeSegmentTypeName'] or item['progName'],
        'area': addr[0],
        'city': addr[1] if len(addr) > 1 else '',
        'title': title,
        'publishtime': item['updateTime'],
        'competehref': item.get('url', '')
    }


def crawl_spider(area: str, page: int, **kwargs):
    """Fetch one page of search results and store the hits not seen before.

    Args:
        area: Value sent as the ``newAreas`` filter.
        page: 1-based page number to request.
        **kwargs: Optional settings; unknown keys are ignored.
            begin_time: Start date, defaults to today.
            end_time: End date, defaults to today.
            max_page: Number of entries requested per page (sent as
                ``numPerPage``), defaults to 20.

    Returns:
        One of four status strings:
        ``'success'`` - the page was processed and was full.
        ``'stop'`` - the page was processed but held fewer entries than
        requested.
        ``'disable'`` - the server answered 401/403/404.
        ``'failure'`` - anything else (no response, other status, unparsable
        body, or a non-200 ``code`` in the body).
    """
    curr_date = delay_by_day(0)
    begin_time = kwargs.pop('begin_time', curr_date)
    end_time = kwargs.pop('end_time', curr_date)
    page_size = kwargs.pop('max_page', 20)

    url = "http://search.vip.qianlima.com/rest/service/website/search/solr"
    payload = _build_search_payload(area, page, begin_time, end_time, page_size)
    response = crawl_request(url, payload)

    if response is None:
        return 'failure'
    if response.status_code in (401, 403, 404):
        return 'disable'
    if response.status_code != 200:
        return 'failure'

    try:
        resp_json = response.json()
    except ValueError:
        # A 200 with a non-JSON body (e.g. an HTML challenge page) must not
        # crash the crawl; report it as a retryable failure instead.
        logger.error("search response body is not valid JSON")
        return 'failure'

    if resp_json['code'] != 200:
        # Example body seen here: code 200520 with "data": null and a message
        # saying the search quota for the current time window is exhausted.
        logger.info(resp_json['msg'])
        return 'failure'

    items = resp_json["data"]["data"]
    results = []
    for item in items:
        tmid = sha1(str(item["contentid"]))
        if r.hexists(redis_key, tmid):
            continue  # already collected earlier
        # NOTE(review): the id is marked as seen before the Mongo insert, so a
        # failed insert_many would drop these records for good - confirm this
        # ordering is acceptable.
        r.hset(redis_key, tmid, '')
        results.append(_to_record(item))

    if results:
        qlm.insert_many(results)

    # A short page is reported as 'stop'; the count is of raw hits, not of new records.
    request_status = 'stop' if len(items) < page_size else 'success'
    # Argument order fixed: the template reads area, page, record count.
    logger.info("第{}区第{}页成功上传{}条数据".format(area, page, len(results)))
    return request_status
|
|
|
+
|
|
|
+
|
|
|
def start(**kwargs):
    """Crawl result pages 1 through 1063 in order until told to stop.

    All keyword arguments are forwarded untouched to ``crawl_spider``.

    For each page the request is repeated while ``crawl_spider`` reports
    ``'failure'``, sleeping a random number of seconds between tries; the
    upper bound of that sleep starts at 2, grows by 3 per try and is capped
    at 60. A ``'disable'`` result skips the page, and the run ends once more
    than three pages have been skipped this way. A ``'stop'`` result ends the
    run after the current page.
    """
    first_page, last_page = 1, 1063
    max_disable_page = 3
    disable_page = 0

    for page in range(first_page, last_page + 1):
        if disable_page > max_disable_page:
            # An e-mail or WeCom notification could be sent here so a human
            # can investigate the collection failure.
            break

        max_interval = 60
        interval = 2
        reached_end = False
        logger.info(f"访问第{page}页数据")

        while True:
            status = crawl_spider("", page, **kwargs)

            if status == 'failure':
                interval = min(interval, max_interval)
                time.sleep(random.randint(1, interval))
                interval += 3
                continue

            if status == 'disable':
                logger.info(f"当前账号在第{page}页数据被禁止访问")
                disable_page += 1
                break

            reached_end = status == 'stop'
            logger.info(f"第{page}页数据采集成功")
            break

        if reached_end:
            break

    logger.info("采集任务结束")
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: crawl a fixed date window. max_page is forwarded to
    # crawl_spider, where it is the number of entries requested per page.
    start(
        begin_time="2022-05-21",
        end_time="2022-05-28",
        max_page=1000,
    )
|