|
@@ -0,0 +1,100 @@
|
|
|
|
+# coding: utf-8
|
|
|
|
+import time
|
|
|
|
+
|
|
|
|
+import requests
|
|
|
|
+
|
|
|
|
+from utils.databases import mongo_table, redis_client
|
|
|
|
+from utils.log import logger
|
|
|
|
+from utils.tools import sha1
|
|
|
|
+
|
|
|
|
# Storage handle returned by mongo_table('qlm', 'qlm_2021'); crawl_spider
# bulk-inserts the newly seen records into it with insert_many().
qlm = mongo_table('qlm', 'qlm_2021')
# Client returned by redis_client(); crawl_spider uses its hget()/hset()
# to remember which records have already been stored.
r = redis_client()
# Name of the Redis hash that maps sha1(contentid) -> contentid.
redis_key = "qianlima_2021"

# Query parameters of the search API used by crawl_spider:
#   areas        region code
#   currentPage  page number
#   numPerPage   number of items per page
#   types        category code:
#                  all            (no code listed)
#                  announcement   0
#                  advance notice 1
#                  change         2
#                  bid award      3
#                  other          5

# Proxy configuration placeholder; not referenced by the code visible here.
PROXIES = None
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def crawl_request(url, headers, timeout=(5, 60), retry_delay=3, max_retries=None):
    """Send a GET request and keep retrying until a 200 response arrives.

    Any non-200 status and any ``requests`` error counts as a failed
    attempt.  Between attempts the function sleeps ``retry_delay`` seconds
    so that a persistent failure does not turn into a busy loop against
    the server.

    :param url: URL to request.
    :param headers: HTTP request headers to send with the request.
    :param timeout: ``requests`` timeout in **seconds**, either a single
        number or a ``(connect, read)`` tuple.  The previous hard-coded
        value of 5000 was ~83 minutes, which made timeouts ineffective.
    :param retry_delay: seconds to sleep before the next attempt.
    :param max_retries: number of retries allowed after the first attempt;
        ``None`` (the default) retries forever.
    :return: the ``requests.Response`` whose status code is 200, with its
        ``encoding`` set to the detected ``apparent_encoding``.
    :raises requests.exceptions.RetryError: if ``max_retries`` is not
        ``None`` and that many retries have been used up.
    """
    failures = 0
    while True:
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.exceptions.ConnectTimeout:
            logger.error("Reacquire proxy")
        except requests.RequestException as exc:
            logger.error("request failed: {!r}".format(exc))
        else:
            logger.info(response.status_code)
            if response.status_code == 200:
                # Detect the character encoding from the body; only done for
                # the response that is actually returned to the caller.
                response.encoding = response.apparent_encoding
                return response

        failures += 1
        if max_retries is not None and failures > max_retries:
            raise requests.exceptions.RetryError(
                "giving up on {} after {} failed attempts".format(url, failures)
            )
        time.sleep(retry_delay)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
# Browser-like request headers sent with every search request.
# NOTE(review): the Cookie value is a hard-coded browser session captured at
# some point in time; it presumably expires and should come from
# configuration rather than source code -- confirm with the owner.
_QLM_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "DNT": "1",
    "Host": "search.qianlima.com",
    "Pragma": "no-cache",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.68",
    "Cookie": 'UM_distinctid=178af0c6f6f2f3-0e81be36d60604-7166786d-144000-178af0c6f70294; BAIDU_SSP_lcr=https://cn.bing.com/; guest_id=ac5769d7-b906-499d-ab85-47809ee9bc56; gr_user_id=d2cc35f6-ffa2-441b-a9ff-f836345e6f75; Hm_lvt_5dc1b78c0ab996bd6536c3a37f9ceda7=1617844534; seo_refUrl=https%3A//cn.bing.com/; seo_curUrl=www.qianlima.com; qlm_referrer=https://cn.bing.com/; delClose200811=firstGoIn; __jsluid_h=5f702d3c66f33654fc8d1f109062bb23; __jsl_clearance=1617844553.848|0|oACHKEqjLj1O5rc480L59DWlTO4%3D; CNZZDATA1277608403=736687752-1617840159-http%253A%252F%252Fsearch.qianlima.com%252F%7C1617840159; nts_login_tip=1; fromWhereUrl="http://www.qianlima.com/mfzb/"; Hm_lpvt_5dc1b78c0ab996bd6536c3a37f9ceda7=1617844615',
}

# Search endpoint template; placeholders are, in order:
# areas, types, currentPage, numPerPage.
_QLM_SEARCH_URL = (
    "http://search.qianlima.com/api/v1/website/search"
    "?filtermode=1&timeType=-1&areas={}&types={}&isfirst=false"
    "&searchMode=2&keywords=%20&currentPage={}&numPerPage={}"
)


def crawl_spider(area, _type, i, num_per_page=1000):
    """Fetch one page of search results and store the records not seen before.

    A record counts as already seen when the Redis hash ``redis_key`` has
    an entry under ``sha1(str(contentid))``.  New records are registered in
    that hash, given a ``title`` field (``popTitle`` when present,
    otherwise ``showTitle``) and bulk-inserted into ``qlm``.

    :param area: value for the ``areas`` query parameter.
    :param _type: value for the ``types`` query parameter.
    :param i: value for the ``currentPage`` query parameter.
    :param num_per_page: value for the ``numPerPage`` query parameter.
    :return: None
    """
    # The page parameter used to be spelled "¤tPage" (an HTML-entity-decoded
    # "&curren"), so the requested page number never reached the server.
    list_url = _QLM_SEARCH_URL.format(area, _type, i, num_per_page)
    logger.info(list_url)

    response = crawl_request(list_url, _QLM_HEADERS)
    try:
        payload = response.json()
    except ValueError:
        logger.error("{}--{} page {}: response body is not valid JSON".format(area, _type, i))
        return

    # The records are expected under payload["data"]["data"]; skip this page
    # instead of aborting the whole crawl when that structure is missing.
    outer = payload.get("data") if isinstance(payload, dict) else None
    info_list = outer.get("data") if isinstance(outer, dict) else None
    if not isinstance(info_list, list):
        logger.error("{}--{} page {}: no record list in response".format(area, _type, i))
        return

    item_list = []
    for info in info_list:
        content_id = str(info["contentid"])
        tmid = sha1(content_id)
        if r.hget(redis_key, tmid) is not None:
            # Already registered by an earlier crawl.
            continue
        r.hset(redis_key, tmid, content_id)
        info["title"] = info["popTitle"] if "popTitle" in info else info["showTitle"]
        item_list.append(info)

    # insert_many is only called when there is something to insert.
    if item_list:
        qlm.insert_many(item_list)
    logger.info("{}--{}抓取第{}页数据,共{}条".format(area, _type, i, len(item_list)))
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def start():
    """Crawl page 1 of the search results for every area / type pair."""
    # Area codes in the order they are visited (11 comes after 15).
    area_codes = list(range(1, 11)) + [12, 13, 14, 15, 11] + list(range(16, 32))
    # Category codes passed as the ``types`` query parameter.
    type_codes = (0, 1, 2, 3, 5)
    # Only the first page of each combination is fetched.
    first_page = 1

    combinations = (
        (area_code, type_code)
        for area_code in area_codes
        for type_code in type_codes
    )
    for area_code, type_code in combinations:
        crawl_spider(area_code, type_code, first_page)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    start()
|