Quellcode durchsuchen

千里马采集二级列表页(市级)

lizongze vor 2 Jahren
Ursprung
Commit
e8dab89563
3 geänderte Dateien mit 413 neuen und 58 gelöschten Zeilen
  1. 20 58
      qlm/source_qianlima.py
  2. 251 0
      qlm/source_qianlima_bak1.py
  3. 142 0
      qlm/utils/config_parms.py

+ 20 - 58
qlm/source_qianlima.py

@@ -6,7 +6,7 @@ import random
 import time
 
 import requests
-
+from utils.config_parms import *
 from utils.databases import mongo_table, redis_client
 from utils.log import logger
 from utils.sessions_521 import http_session_521
@@ -16,53 +16,16 @@ qlm = mongo_table('qlm', 'data_merge')
 r = redis_client()
 redis_key = "qianlima_2022"
 
-headers = {
-    "Accept": "*/*",
-    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
-    "Cache-Control": "no-cache",
-    "Connection": "keep-alive",
-    "Content-Type": "application/json",
-    "Origin": "http://search.vip.qianlima.com",
-    "Pragma": "no-cache",
-    "Referer": "http://search.vip.qianlima.com/index.html",
-    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36",
-    "X-Auth-Token": "7f15af45-30c3-4bee-8a89-1b2813100aaf"
-}
-cookies = {
-    "BAIDU_SSP_lcr": "https://www.google.com/",
-    "guest_id": "0124edcd-1edc-4434-ae60-5d44662dced0",
-    "Hm_lvt_5dc1b78c0ab996bd6536c3a37f9ceda7": "1653874874",
-    "seo_curUrl": "www.qianlima.com",
-    "source": "1",
-    "login_ip": "1.192.62.141",
-    "__jsluid_h": "7d0d080a30094eb57be38e4c09dd4a3b",
-    "Hm_lpvt_5dc1b78c0ab996bd6536c3a37f9ceda7": "1654132585",
-    "seo_refUrl": "http%3A//www.qianlima.com/zbgg/p2",
-    "15637008265fmgj83E8oBfKRmKCBRuUR83pCB8pCCfE": "3e048e44-2e33-4936-b2d1-00784cb48e60",
-    "useragent_hash": "32bd84bdee2cfe920dda80f92fa20070",
-    "qlm_rem_login": "1",
-    "qlm_username": "17610673271",
-    "qlm_password": "fmgj83E8oBfKRmKCBRuUR83pCB8pCCfE",
-    "17610673271fmgj83E8oBfKRmKCBRuUR83pCB8pCCfE": "b2b29454-a5f4-4fb0-ad76-cc2be246cac9",
-    "xAuthToken": "7f15af45-30c3-4bee-8a89-1b2813100aaf",
-    "login_time": "1654498455",
-    "userInfo": "{%22userId%22:10609848%2C%22username%22:%2217610673271%22%2C%22userIcon%22:%22%22%2C%22linkName%22:%22%E8%91%A3%E5%85%88%E7%94%9F%22%2C%22companyName%22:%22%E5%90%88%E8%82%A5%E6%8B%93%E6%99%AE%E7%BD%91%E7%BB%9C%E7%B3%BB%E7%BB%9F%E5%B7%A5%E7%A8%8B%E6%9C%89%E9%99%90%E8%B4%A3%E4%BB%BB%E5%85%AC%E5%8F%B8%22%2C%22areaId%22:%222703%22%2C%22areaName%22:%22%E5%85%A8%E5%9B%BD%22%2C%22roleId%22:1%2C%22roleName%22:%22%E7%AE%A1%E7%90%86%E5%91%98%22%2C%22sex%22:%22m%22%2C%22expireDate%22:%22%E6%97%A0%22%2C%22isExpired%22:null%2C%22maxChildCount%22:0%2C%22isUsedCount%22:0%2C%22userStatus%22:1%2C%22memberLevel%22:5%2C%22memberLevelName%22:%22%E5%85%8D%E8%B4%B9%E6%B3%A8%E5%86%8C%E4%BC%9A%E5%91%98%22%2C%22registerTime%22:%222022-05-30%22%2C%22isSuperSupplier%22:0%2C%22isNewUser%22:1%2C%22welcomeMsg%22:%22%E6%AC%A2%E8%BF%8E%E8%BF%9B%E5%85%A5%E5%8D%83%E9%87%8C%E9%A9%AC%E6%8B%9B%E6%A0%87%E7%BD%91%EF%BD%9E%22%2C%22customerServiceInfo%22:{%22id%22:42%2C%22customerServiceName%22:%22%E5%8D%83%E9%87%8C%E9%A9%AC%E5%AE%A2%E6%9C%8D%22%2C%22weChatIcon%22:%22http://img_al.qianlima.com/invoice/1588986761_8ebeade70a.jpg%22%2C%22customerServicePhone%22:%2217718573953%22%2C%22customerServiceQQ%22:%22%22%2C%22customerServiceEmail%22:%22qianlima_service@qianlima.com%22%2C%22deptType%22:0}%2C%22shouji%22:%2217610673271%22%2C%22email%22:%22%22%2C%22dwmc%22:%22%E5%90%88%E8%82%A5%E6%8B%93%E6%99%AE%E7%BD%91%E7%BB%9C%E7%B3%BB%E7%BB%9F%E5%B7%A5%E7%A8%8B%E6%9C%89%E9%99%90%E8%B4%A3%E4%BB%BB%E5%85%AC%E5%8F%B8%22%2C%22zhiwu%22:%22%E4%BA%A7%E5%93%81%E7%BB%8F%E7%90%86%22%2C%22types%22:1%2C%22isPayBefore%22:0%2C%22memberOpenTime%22:null%2C%22businessUserType%22:null%2C%22businessCompanyName%22:null%2C%22isBusinessUser%22:null}"
-}
 session = requests.session()
 
 '''
-招标阶段
-0 = 全部
+https://search.vip.qianlima.com/index.html#?sortType=6&isSearchWord=1&tab_index=0
+搜索-2.0
 1 = 招标信息
 2 = 中标信息
-3 = 采购意向
+3 = 拟在建项目
+4 = 审批项目
 '''
-REQUEST_DATA_MAP = {
-    0: {"keywords": "", "timeType": 4, "beginTime": "2022-12-07", "endTime": "2022-12-07", "filtermode": "8", "searchMode": 0, "currentPage": 1, "numPerPage": 20, "sortType": 1, "allType": -1, "noticeSegmentTypeStr": "", "beginAmount": "", "endAmount": "", "purchasingUnitIdList": "", "threeClassifyTagStr": "", "fourLevelCategoryIdListStr": "", "threeLevelCategoryIdListStr": "", "levelId": "", "searchDataType": 0, "types": "-1", "showContent": 1, "hasTenderTransferProject": 1, "newAreas": "1", "hasChooseSortType": 1, "summaryType": 0},
-    1: {"keywords": "", "timeType": 4, "beginTime": "2022-12-07", "endTime": "2022-12-07", "filtermode": "8", "searchMode": 0, "currentPage": 1, "numPerPage": 20, "sortType": 1, "allType": "0", "beginAmount": "", "endAmount": "", "purchasingUnitIdList": "", "threeClassifyTagStr": "", "fourLevelCategoryIdListStr": "", "threeLevelCategoryIdListStr": "", "levelId": "", "searchDataType": 1, "types": -1, "showContent": 1, "newAreas": "", "hasChooseSortType": 1, "progIdAndNoticeSegmentTypeMaps": {"0": [], "1": []}, "summaryType": 0},
-    2: {"keywords": "", "timeType": 4, "beginTime": "2022-12-07", "endTime": "2022-12-07", "filtermode": "8", "searchMode": 0, "currentPage": 1, "numPerPage": 20, "sortType": 1, "allType": 3, "beginAmount": "", "endAmount": "", "purchasingUnitIdList": "", "threeClassifyTagStr": "", "fourLevelCategoryIdListStr": "", "threeLevelCategoryIdListStr": "", "levelId": "", "searchDataType": 1, "types": 3, "showContent": 1, "newAreas": "", "hasChooseSortType": 1, "progIdAndNoticeSegmentTypeMaps": {"3": []}, "summaryType": 0},
-    3: {"keywords": "", "timeType": 4, "beginTime": "2022-12-07", "endTime": "2022-12-07", "filtermode": "8", "searchMode": 0, "currentPage": 1, "numPerPage": 20, "sortType": 1, "allType": 99, "beginAmount": "", "endAmount": "", "purchasingUnitIdList": "", "threeClassifyTagStr": "", "fourLevelCategoryIdListStr": "", "threeLevelCategoryIdListStr": "", "levelId": "", "searchDataType": 1, "types": 99, "showContent": 1, "newAreas": "", "hasChooseSortType": 1, "progIdAndNoticeSegmentTypeMaps": {"99": []}, "summaryType": 0}
-}
 
 
 def delay_by_day(days, fmt="%Y-%m-%d"):
@@ -101,7 +64,6 @@ def crawl_request(url, data, retries=5):
         except requests.RequestException as e:
             logger.error(f"访问失败,失败原因:{e.__class__.__name__}")
             usages += 1
-    # print(resp)
     return resp
 
 
@@ -112,7 +74,7 @@ def crawl_spider(area: str, type_: int, page: int, **kwargs):
     curr_date = delay_by_day(0)
     begin_time = kwargs.pop('begin_time', curr_date)
     end_time = kwargs.pop('end_time', curr_date)
-    max_per_page = kwargs.pop('max_page', 20)
+    max_per_page = kwargs.pop('max_per_page', 20)
     data = REQUEST_DATA_MAP[type_]
     data['newAreas'] = area  # 设置地区
     data['currentPage'] = page  # 页码
@@ -128,7 +90,6 @@ def crawl_spider(area: str, type_: int, page: int, **kwargs):
         resp_json = response.json()
         if resp_json['code'] == 200:
             row_count = resp_json["data"]["rowCount"]
-            # print(row_count)
             items = resp_json["data"]["data"]
             for item in items:
                 cid = sha1(str(item["contentid"]))
@@ -175,12 +136,12 @@ def crawl_spider(area: str, type_: int, page: int, **kwargs):
         qlm.insert_many(results)
 
     if request_status in ['stop', 'success']:
-        logger.info("{}-第{}区-第{}类{}条-第{}页,成功上传{}条数据".format(
+        logger.info("{}-{}-{}-共{}条-第{}页,成功上传{}条数据".format(
             begin_time,
-            area,
-            type_,
-            page,
+            city_dict.get(int(area)),
+            channel_dict.get(type_),
             row_count,
+            page,
             len(results))
         )
     return request_status
@@ -198,7 +159,7 @@ def by_area_crawl_data(area="", type_=0, **kwargs):
             break
 
         page = pages.pop(0)
-        logger.info(f"访问第{area}区-第{type_}类-第{page}页数据")
+        logger.info(f"访问-{city_dict.get(int(area))}-{channel_dict.get(type_)}-第{page}页数据")
         while True:
             success = crawl_spider(area, type_, page, **kwargs)
             if success == 'failure':
@@ -207,7 +168,7 @@ def by_area_crawl_data(area="", type_=0, **kwargs):
                 time.sleep(interval)
                 continue
             elif success == 'disable':
-                logger.warning(f"账号被禁止访问第{area}区-第{page}页数据")
+                logger.warning(f"账号被禁止访问-{city_dict.get(int(area))}-第{page}页数据")
                 disable_page += 1
             elif success == 'method_not_allowed':
                 logger.warning("服务器禁止使用当前 HTTP 方法的请求")
@@ -215,25 +176,26 @@ def by_area_crawl_data(area="", type_=0, **kwargs):
             elif success == 'stop':
                 close_spider = True
             else:
-                logger.info(f"第{area}区-第{page}页数据采集成功")
+                logger.info(f"{city_dict.get(int(area))}-{channel_dict.get(type_)}-第{page}页数据采集成功")
             break
 
 
-def select_types(date: str, area: str):
-    for type_ in [1, 2, 3]:
+def select_types(date: str, area: str, prov: str):
+    for type_ in [1, 2, 3, 4]:
         by_area_crawl_data(
             area=area,
             type_=type_,
             begin_time=date,
             end_time=date,
-            max_page=100
+            max_per_page=100
         )
-        logger.info(f"{date}-第{area}区-第{type_}类采集结束")
+    logger.info(f"{date}-第{province_dict.get(int(prov))}地区-{channel_dict.get(type_)}采集结束")
 
 
 def select_area(date: str):
-    for area in range(1, 32):
-        select_types(date, str(area))
+    for province in range(1, 32):
+        for city_ in area_dict.get(province):
+            select_types(date, area=str(city_), prov=str(province))
     logger.info(f"任务结束")
 
 

+ 251 - 0
qlm/source_qianlima_bak1.py

@@ -0,0 +1,251 @@
+# coding: utf-8
+import datetime
+import json
+import math
+import random
+import time
+
+import requests
+
+from utils.databases import mongo_table, redis_client
+from utils.log import logger
+from utils.sessions_521 import http_session_521
+from utils.tools import sha1
+
+qlm = mongo_table('qlm', 'data_merge')
+r = redis_client()
+redis_key = "qianlima_2022"
+
+headers = {
+    "Accept": "*/*",
+    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+    "Cache-Control": "no-cache",
+    "Connection": "keep-alive",
+    "Content-Type": "application/json",
+    "Origin": "http://search.vip.qianlima.com",
+    "Pragma": "no-cache",
+    "Referer": "http://search.vip.qianlima.com/index.html",
+    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36",
+    "X-Auth-Token": "7f15af45-30c3-4bee-8a89-1b2813100aaf"
+}
+cookies = {
+    "BAIDU_SSP_lcr": "https://www.google.com/",
+    "guest_id": "0124edcd-1edc-4434-ae60-5d44662dced0",
+    "Hm_lvt_5dc1b78c0ab996bd6536c3a37f9ceda7": "1653874874",
+    "seo_curUrl": "www.qianlima.com",
+    "source": "1",
+    "login_ip": "1.192.62.141",
+    "__jsluid_h": "7d0d080a30094eb57be38e4c09dd4a3b",
+    "Hm_lpvt_5dc1b78c0ab996bd6536c3a37f9ceda7": "1654132585",
+    "seo_refUrl": "http%3A//www.qianlima.com/zbgg/p2",
+    "15637008265fmgj83E8oBfKRmKCBRuUR83pCB8pCCfE": "3e048e44-2e33-4936-b2d1-00784cb48e60",
+    "useragent_hash": "32bd84bdee2cfe920dda80f92fa20070",
+    "qlm_rem_login": "1",
+    "qlm_username": "17610673271",
+    "qlm_password": "fmgj83E8oBfKRmKCBRuUR83pCB8pCCfE",
+    "17610673271fmgj83E8oBfKRmKCBRuUR83pCB8pCCfE": "b2b29454-a5f4-4fb0-ad76-cc2be246cac9",
+    "xAuthToken": "7f15af45-30c3-4bee-8a89-1b2813100aaf",
+    "login_time": "1654498455",
+    "userInfo": "{%22userId%22:10609848%2C%22username%22:%2217610673271%22%2C%22userIcon%22:%22%22%2C%22linkName%22:%22%E8%91%A3%E5%85%88%E7%94%9F%22%2C%22companyName%22:%22%E5%90%88%E8%82%A5%E6%8B%93%E6%99%AE%E7%BD%91%E7%BB%9C%E7%B3%BB%E7%BB%9F%E5%B7%A5%E7%A8%8B%E6%9C%89%E9%99%90%E8%B4%A3%E4%BB%BB%E5%85%AC%E5%8F%B8%22%2C%22areaId%22:%222703%22%2C%22areaName%22:%22%E5%85%A8%E5%9B%BD%22%2C%22roleId%22:1%2C%22roleName%22:%22%E7%AE%A1%E7%90%86%E5%91%98%22%2C%22sex%22:%22m%22%2C%22expireDate%22:%22%E6%97%A0%22%2C%22isExpired%22:null%2C%22maxChildCount%22:0%2C%22isUsedCount%22:0%2C%22userStatus%22:1%2C%22memberLevel%22:5%2C%22memberLevelName%22:%22%E5%85%8D%E8%B4%B9%E6%B3%A8%E5%86%8C%E4%BC%9A%E5%91%98%22%2C%22registerTime%22:%222022-05-30%22%2C%22isSuperSupplier%22:0%2C%22isNewUser%22:1%2C%22welcomeMsg%22:%22%E6%AC%A2%E8%BF%8E%E8%BF%9B%E5%85%A5%E5%8D%83%E9%87%8C%E9%A9%AC%E6%8B%9B%E6%A0%87%E7%BD%91%EF%BD%9E%22%2C%22customerServiceInfo%22:{%22id%22:42%2C%22customerServiceName%22:%22%E5%8D%83%E9%87%8C%E9%A9%AC%E5%AE%A2%E6%9C%8D%22%2C%22weChatIcon%22:%22http://img_al.qianlima.com/invoice/1588986761_8ebeade70a.jpg%22%2C%22customerServicePhone%22:%2217718573953%22%2C%22customerServiceQQ%22:%22%22%2C%22customerServiceEmail%22:%22qianlima_service@qianlima.com%22%2C%22deptType%22:0}%2C%22shouji%22:%2217610673271%22%2C%22email%22:%22%22%2C%22dwmc%22:%22%E5%90%88%E8%82%A5%E6%8B%93%E6%99%AE%E7%BD%91%E7%BB%9C%E7%B3%BB%E7%BB%9F%E5%B7%A5%E7%A8%8B%E6%9C%89%E9%99%90%E8%B4%A3%E4%BB%BB%E5%85%AC%E5%8F%B8%22%2C%22zhiwu%22:%22%E4%BA%A7%E5%93%81%E7%BB%8F%E7%90%86%22%2C%22types%22:1%2C%22isPayBefore%22:0%2C%22memberOpenTime%22:null%2C%22businessUserType%22:null%2C%22businessCompanyName%22:null%2C%22isBusinessUser%22:null}"
+}
+session = requests.session()
+
+'''
+招标阶段
+0 = 全部
+1 = 招标信息
+2 = 中标信息
+3 = 采购意向
+'''
+REQUEST_DATA_MAP = {
+    0: {"keywords": "", "timeType": 4, "beginTime": "2022-12-07", "endTime": "2022-12-07", "filtermode": "8", "searchMode": 0, "currentPage": 1, "numPerPage": 20, "sortType": 1, "allType": -1, "noticeSegmentTypeStr": "", "beginAmount": "", "endAmount": "", "purchasingUnitIdList": "", "threeClassifyTagStr": "", "fourLevelCategoryIdListStr": "", "threeLevelCategoryIdListStr": "", "levelId": "", "searchDataType": 0, "types": "-1", "showContent": 1, "hasTenderTransferProject": 1, "newAreas": "1", "hasChooseSortType": 1, "summaryType": 0},
+    1: {"keywords": "", "timeType": 4, "beginTime": "2022-12-07", "endTime": "2022-12-07", "filtermode": "8", "searchMode": 0, "currentPage": 1, "numPerPage": 20, "sortType": 1, "allType": "0", "beginAmount": "", "endAmount": "", "purchasingUnitIdList": "", "threeClassifyTagStr": "", "fourLevelCategoryIdListStr": "", "threeLevelCategoryIdListStr": "", "levelId": "", "searchDataType": 1, "types": -1, "showContent": 1, "newAreas": "", "hasChooseSortType": 1, "progIdAndNoticeSegmentTypeMaps": {"0": [], "1": []}, "summaryType": 0},
+    2: {"keywords": "", "timeType": 4, "beginTime": "2022-12-07", "endTime": "2022-12-07", "filtermode": "8", "searchMode": 0, "currentPage": 1, "numPerPage": 20, "sortType": 1, "allType": 3, "beginAmount": "", "endAmount": "", "purchasingUnitIdList": "", "threeClassifyTagStr": "", "fourLevelCategoryIdListStr": "", "threeLevelCategoryIdListStr": "", "levelId": "", "searchDataType": 1, "types": 3, "showContent": 1, "newAreas": "", "hasChooseSortType": 1, "progIdAndNoticeSegmentTypeMaps": {"3": []}, "summaryType": 0},
+    3: {"keywords": "", "timeType": 4, "beginTime": "2022-12-07", "endTime": "2022-12-07", "filtermode": "8", "searchMode": 0, "currentPage": 1, "numPerPage": 20, "sortType": 1, "allType": 99, "beginAmount": "", "endAmount": "", "purchasingUnitIdList": "", "threeClassifyTagStr": "", "fourLevelCategoryIdListStr": "", "threeLevelCategoryIdListStr": "", "levelId": "", "searchDataType": 1, "types": 99, "showContent": 1, "newAreas": "", "hasChooseSortType": 1, "progIdAndNoticeSegmentTypeMaps": {"99": []}, "summaryType": 0}
+}
+
+
+def delay_by_day(days, fmt="%Y-%m-%d"):
+    """按天延时"""
+    _days = int(days)
+    _current_now = datetime.datetime.now()
+    return (_current_now + datetime.timedelta(days=_days)).strftime(fmt)
+
+
+def crawl_request(url, data, retries=5):
+    global session, cookies
+    resp = None
+    usages, usages_521 = 0, 1
+    while usages < retries:
+        request_params = {}
+        request_params.setdefault('data', data)
+        request_params.setdefault('headers', headers)
+        request_params.setdefault('cookies', cookies)
+        request_params.setdefault('timeout', 60)
+        try:
+            resp = session.post(url, **request_params)
+            if resp.status_code == 521:
+                while usages_521 < retries:
+                    success, _, cookies = http_session_521(session, url, headers, cookies, data=data)
+                    if success:
+                        break
+                    logger.warning(f"反爬破解失败,次数:{usages_521}")
+                    time.sleep(1)
+                    usages_521 += 1
+                usages += 1
+            elif resp.status_code in [401, 403, 404]:
+                logger.error(f"账号登录已失效或封停,异常状态码:{resp.status_code}")
+                break
+            else:
+                break
+        except requests.RequestException as e:
+            logger.error(f"访问失败,失败原因:{e.__class__.__name__}")
+            usages += 1
+    # print(resp)
+    return resp
+
+
+def crawl_spider(area: str, type_: int, page: int, **kwargs):
+    results = []
+    request_status = 'failure'  # 资源请求结果, 成功=success 失败=failure 停止=stop 账号封停=disable 请求方法不被允许=method_not_allowed
+
+    curr_date = delay_by_day(0)
+    begin_time = kwargs.pop('begin_time', curr_date)
+    end_time = kwargs.pop('end_time', curr_date)
+    max_per_page = kwargs.pop('max_page', 20)
+    data = REQUEST_DATA_MAP[type_]
+    data['newAreas'] = area  # 设置地区
+    data['currentPage'] = page  # 页码
+    data['numPerPage'] = max_per_page  # 每页的条目数
+    data['timeType'] = 4  # 自定义时间参数
+    data['beginTime'] = begin_time  # 开始时间,格式:xxxx-xx-xx
+    data['endTime'] = end_time  # 结束时间,格式:xxxx-xx-xx
+    data = json.dumps(data)
+    url = "https://search.vip.qianlima.com/rest/service/website/search/solr"
+    response = crawl_request(url, data)
+    row_count = 0
+    if response is not None and response.status_code == 200:
+        resp_json = response.json()
+        if resp_json['code'] == 200:
+            row_count = resp_json["data"]["rowCount"]
+            # print(row_count)
+            items = resp_json["data"]["data"]
+            for item in items:
+                cid = sha1(str(item["contentid"]))
+                if not r.hexists(redis_key, cid):
+                    r.hset(redis_key, cid, '')
+                    if "popTitle" in item:
+                        item["title"] = item["popTitle"]
+                    else:
+                        item["title"] = item["showTitle"]
+
+                    addr = str(item["areaName"]).split('-')
+                    _area = addr[0] if len(addr) > 0 else ''
+                    _city = addr[1] if len(addr) > 1 else ''
+                    channel = (item['noticeSegmentTypeName'] or item['progName'])
+                    res = {
+                        'site': '千里马',
+                        'channel': channel,
+                        'area': _area,
+                        'city': _city,
+                        'title': item["title"],
+                        'publishtime': item['updateTime'],
+                        'href': item.get('url', '')
+                    }
+                    results.append(res)
+            request_status = 'success'
+
+            if len(items) < max_per_page:
+                request_status = 'stop'
+        else:
+            '''
+            {
+                "code": 200520,
+                "msg": "抱歉,您在单位时间内的搜索次数已达上限,请联系客服购买会员!咨询电话:400-688-2000",
+                "data": null
+            }
+            '''
+            logger.info(resp_json['msg'])
+    elif response is not None and response.status_code in [401, 403, 404]:
+        request_status = 'disable'
+    elif response is not None and response.status_code == 405:
+        request_status = 'method_not_allowed'
+
+    if len(results) > 0:
+        qlm.insert_many(results)
+
+    if request_status in ['stop', 'success']:
+        logger.info("{}-第{}区-第{}类{}条-第{}页,成功上传{}条数据".format(
+            begin_time,
+            area,
+            type_,
+            page,
+            row_count,
+            len(results))
+        )
+    return request_status
+
+
+def by_area_crawl_data(area="", type_=0, **kwargs):
+    close_spider = False
+    disable_page, max_disable_page = 0, 3
+    pages = list(range(1, 101))  # 目前仅支持前10000数据的搜索
+    while len(pages) > 0:
+        if close_spider:
+            break
+        elif disable_page > max_disable_page:
+            # 此处可以添加通知邮件或者企业微信机器人接口,通知采集异常信息
+            break
+
+        page = pages.pop(0)
+        logger.info(f"访问第{area}区-第{type_}类-第{page}页数据")
+        while True:
+            success = crawl_spider(area, type_, page, **kwargs)
+            if success == 'failure':
+                interval = math.log(random.randint(100, 2400), 2)
+                logger.debug(f'异常重试,等待{interval}s')
+                time.sleep(interval)
+                continue
+            elif success == 'disable':
+                logger.warning(f"账号被禁止访问第{area}区-第{page}页数据")
+                disable_page += 1
+            elif success == 'method_not_allowed':
+                logger.warning("服务器禁止使用当前 HTTP 方法的请求")
+                disable_page += 1
+            elif success == 'stop':
+                close_spider = True
+            else:
+                logger.info(f"第{area}区-第{page}页数据采集成功")
+            break
+
+
+def select_types(date: str, area: str):
+    for type_ in [1, 2, 3]:
+        by_area_crawl_data(
+            area=area,
+            type_=type_,
+            begin_time=date,
+            end_time=date,
+            max_page=100
+        )
+        logger.info(f"{date}-第{area}区-第{type_}类采集结束")
+
+
+def select_area(date: str):
+    for area in range(1, 32):
+        select_types(date, str(area))
+    logger.info(f"任务结束")
+
+
+def history(date_lst: list):
+    for date in date_lst:
+        select_area(date)
+
+
+def start():
+    date_str = delay_by_day(-1)
+    select_area(date_str)
+
+
+if __name__ == '__main__':
+    start()

+ 142 - 0
qlm/utils/config_parms.py

@@ -0,0 +1,142 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2023-06-16 
+---------
+@summary: 千里马
+---------
+@author: Lzz
+"""
+
+REQUEST_DATA_MAP = {
+    1: {"keywords":"","timeType":"4","beginTime":"2023-06-15","endTime":"2023-06-15","filtermode":"8","searchMode":0,"currentPage":1,"numPerPage":20,"sortType":"6","allType":-1,"beginAmount":"","endAmount":"","purchasingUnitIdList":"","threeClassifyTagStr":"","fourLevelCategoryIdListStr":"","threeLevelCategoryIdListStr":"","levelId":"","types":"-1","showContent":1,"hasLinkName":"","searchDataType":1,"newAreas":"1","hasChooseSortType":1,"progIdAndNoticeSegmentTypeMaps":{"0":[],"1":[],"2":[],"4":[2,3,5],"5":[],"99":[]},"summaryType":0},
+    2: {"keywords":"","timeType":"4","beginTime":"2023-06-15","endTime":"2023-06-15","filtermode":"8","searchMode":0,"currentPage":1,"numPerPage":20,"sortType":"6","allType":-1,"beginAmount":"","endAmount":"","purchasingUnitIdList":"","threeClassifyTagStr":"","fourLevelCategoryIdListStr":"","threeLevelCategoryIdListStr":"","levelId":"","types":"-1","searchDataType":1,"showContent":1,"hasLinkName":"","newAreas":"1","hasChooseSortType":1,"progIdAndNoticeSegmentTypeMaps":{"3":[],"4":[11,12],"5":[]},"summaryType":0},
+    3: {"keywords":"","timeType":"4","beginTime":"2023-06-15","endTime":"2023-06-15","filtermode":"8","searchMode":0,"currentPage":1,"numPerPage":20,"sortType":"6","allType":-1,"noticeSegmentTypeStr":"","beginAmount":"","endAmount":"","purchasingUnitIdList":"","threeClassifyTagStr":"","fourLevelCategoryIdListStr":"","threeLevelCategoryIdListStr":"","levelId":"","searchDataType":2,"types":"101","jzjd":"","starNums":"5,4,3,3.5","hasLinkName":"","hasTenderTransferProject":1,"newAreas":"1","hasChooseSortType":1},
+    4: {"keywords":"","timeType":"4","beginTime":"2023-06-15","endTime":"2023-06-15","filtermode":"8","searchMode":0,"currentPage":1,"numPerPage":20,"sortType":"6","allType":-1,"beginAmount":"","endAmount":"","purchasingUnitIdList":"","threeClassifyTagStr":"","fourLevelCategoryIdListStr":"","threeLevelCategoryIdListStr":"","levelId":"","searchDataType":3,"types":"301","hasLinkName":"","newAreas":"1","hasChooseSortType":1,"progIdAndNoticeSegmentTypeMaps":{"31":[],"32":[],"33":[],"34":[],"35":[],"36":[],"37":[],"61":[]}},
+}
+
+
+city_dict = {32: '安庆', 33: '蚌埠', 35: '池州', 36: '滁州', 37: '阜阳', 38: '合肥', 39: '淮北', 40: '淮南', 41: '黄山',
+ 43: '马鞍山', 44: '宿州', 45: '铜陵', 46: '芜湖', 47: '宣城', 48: '亳州', 126: '北京', 130: '福州', 131: '龙岩', 42: '六安',
+ 132: '南平', 133: '宁德', 134: '莆田', 135: '泉州', 136: '三明', 137: '厦门', 138: '漳州', 207: '白银', 208: '定西',
+ 209: '甘南藏族自治州', 210: '嘉峪关', 211: '金昌', 212: '酒泉', 213: '兰州', 214: '临夏回族自治州', 215: '陇南', 216: '平凉',
+ 217: '庆阳', 218: '天水', 219: '武威', 220: '张掖', 302: '潮州', 303: '东莞', 304: '佛山', 305: '广州', 306: '河源',
+ 307: '惠州', 308: '江门', 309: '揭阳', 310: '茂名', 311: '梅州', 312: '清远', 313: '汕头', 314: '汕尾',
+ 315: '韶关', 316: '深圳', 317: '阳江', 318: '云浮', 319: '湛江', 320: '肇庆', 321: '中山', 322: '珠海', 415: '百色',
+ 416: '北海', 417: '崇左', 418: '防城港', 419: '桂林', 420: '贵港', 421: '河池', 422: '贺州', 423: '来宾',
+ 424: '柳州', 425: '南宁', 426: '钦州', 427: '梧州', 428: '玉林', 519: '安顺', 520: '毕节', 521: '贵阳',
+ 522: '六盘水', 523: '黔东南苗族侗族自治州', 524: '黔南布依族苗族自治州', 525: '黔西南布依族苗族自治州', 526: '铜仁',
+ 527: '遵义', 610: '白沙黎族自治县', 611: '保亭黎族苗族自治县', 612: '昌江黎族自治县', 613: '澄迈县', 614:
+     '定安县', 615: '东方', 616: '海口', 617: '乐东黎族自治县', 618: '临高县', 619: '陵水黎族自治县', 620: '琼海',
+ 621: '琼中黎族苗族自治县', 622: '三亚', 623: '屯昌县', 624: '万宁', 625: '文昌', 626: '五指山', 627: '儋州',
+ 3272: '三沙市', 646: '保定', 647: '沧州', 648: '承德', 649: '邯郸', 650: '衡水', 651: '廊坊', 652: '秦皇岛',
+ 653: '石家庄', 654: '唐山', 655: '邢台', 656: '张家口', 804: '安阳', 805: '鹤壁', 806: '济源', 807: '焦作',
+ 808: '开封', 809: '洛阳', 810: '南阳', 811: '平顶山', 812: '三门峡', 813: '商丘', 814: '新乡', 815: '信阳',
+ 816: '许昌', 817: '郑州', 818: '周口', 819: '驻马店', 820: '漯河', 821: '濮阳', 950: '大庆', 951: '大兴安岭',
+ 952: '哈尔滨', 953: '鹤岗', 954: '黑河', 955: '鸡西', 956: '佳木斯', 957: '牡丹江', 958: '七台河',
+ 959: '齐齐哈尔', 960: '双鸭山', 961: '绥化', 962: '伊春', 1041: '鄂州', 1042: '恩施土家族苗族自治州',
+ 1043: '黄冈', 1044: '黄石', 1045: '荆门', 1046: '荆州', 1047: '潜江', 1048: '神农架林区', 1049: '十堰',
+ 1050: '随州', 1051: '天门', 1052: '武汉', 1053: '仙桃', 1054: '咸宁', 1055: '襄阳', 1056: '孝感', 1057: '宜昌',
+ 1134: '常德', 1135: '长沙', 1136: '郴州', 1137: '衡阳', 1138: '怀化', 1139: '娄底', 1140: '邵阳', 1141: '湘潭',
+ 1142: '湘西土家族苗族自治州', 1143: '益阳', 1144: '永州', 1145: '岳阳', 1146: '张家界', 1147: '株洲', 1249: '白城',
+ 1250: '白山', 1251: '长春', 1252: '吉林', 1253: '辽源', 1254: '四平', 1255: '松原', 1256: '通化',
+ 1257: '延边朝鲜族自治州', 1307: '常州', 1308: '淮安', 1309: '连云港', 1310: '南京', 1311: '南通', 1312: '苏州',
+ 1313: '宿迁', 1314: '泰州', 1315: '无锡', 1316: '徐州', 1317: '盐城', 1318: '扬州', 1319: '镇江', 1387: '抚州',
+ 1388: '赣州', 1389: '吉安', 1390: '景德镇', 1391: '九江', 1392: '南昌', 1393: '萍乡', 1394: '上饶', 1395: '新余',
+ 1396: '宜春', 1397: '鹰潭', 1489: '鞍山', 1490: '本溪', 1491: '朝阳', 1492: '大连', 1493: '丹东', 1494: '抚顺',
+ 1495: '阜新', 1496: '葫芦岛', 1497: '锦州', 1498: '辽阳', 1499: '盘锦', 1500: '沈阳', 1501: '铁岭', 1502: '营口',
+ 1561: '阿拉善盟', 1562: '巴彦淖尔市', 1563: '包头', 1564: '赤峰', 1565: '鄂尔多斯', 1566: '呼和浩特',
+ 1567: '呼伦贝尔', 1568: '通辽', 1569: '乌海', 1570: '乌兰察布', 1571: '锡林郭勒盟', 1572: '兴安盟', 1662: '固原',
+ 1663: '石嘴山', 1664: '吴忠', 1665: '银川', 1681: '中卫', 1686: '果洛藏族自治州', 1687: '海北藏族自治州',
+ 1688: '海东', 1689: '海南藏族自治州', 1690: '海西蒙古族藏族自治州', 1691: '黄南藏族自治州', 1692: '西宁',
+ 1693: '玉树藏族自治州', 1734: '滨州', 1735: '德州', 1736: '东营', 1737: '菏泽', 1738: '济南', 1739: '济宁',
+ 1740: '莱芜', 1741: '聊城', 1742: '临沂', 1743: '青岛', 1744: '日照', 1745: '泰安', 1746: '威海', 1747: '潍坊',
+ 1748: '烟台', 1749: '枣庄', 1750: '淄博', 1859: '长治', 1860: '大同', 1861: '晋城', 1862: '晋中', 1863: '临汾',
+ 1864: '吕梁', 1865: '朔州', 1866: '太原', 1867: '忻州', 1868: '阳泉', 1869: '运城', 1977: '安康', 1978: '宝鸡',
+ 1979: '汉中', 1980: '商洛', 1981: '铜川', 1982: '渭南', 1983: '西安', 1984: '咸阳', 1985: '延安', 1986: '榆林',
+ 2081: '上海', 2085: '阿坝藏族羌族自治州', 2086: '巴中', 2087: '成都', 2088: '达州', 2089: '德阳',
+ 2090: '甘孜藏族自治州', 2091: '广安', 2092: '广元', 2093: '乐山', 2094: '凉山彝族自治州', 2095: '眉山',
+ 2096: '绵阳', 2097: '南充', 2098: '内江', 2099: '攀枝花', 2100: '遂宁', 2101: '雅安', 2102: '宜宾', 2103: '资阳',
+ 2104: '自贡', 2105: '泸州', 2262: '天津', 2268: '阿里', 2269: '昌都', 2270: '拉萨', 2271: '林芝', 2272: '那曲',
+ 2273: '日喀则', 2274: '山南', 2348: '阿克苏', 2349: '阿拉尔', 2350: '巴音郭楞蒙古自治州', 2351: '博尔塔拉蒙古自治州',
+ 2352: '昌吉回族自治州', 2353: '哈密', 2354: '和田', 2355: '喀什', 2356: '克拉玛依', 2357: '克孜勒苏柯尔克孜自治州',
+ 2358: '石河子', 2359: '图木舒克', 2360: '吐鲁番', 2361: '乌鲁木齐', 2362: '五家渠', 2363: '伊犁哈萨克自治州',
+ 3135: '可克达拉市', 3136: '昆玉市', 3547: '塔城地区', 3548: '铁门关市', 3549: '双河市', 3550: '阿勒泰地区',
+ 3551: '北屯市', 2454: '保山', 2455: '楚雄彝族自治州', 2456: '大理白族自治州', 2457: '德宏傣族景颇族自治州',
+ 2458: '迪庆藏族自治州', 2459: '红河哈尼族彝族自治州', 2460: '昆明', 2461: '丽江', 2462: '临沧', 2463: '怒江傈僳族自治州',
+ 2464: '曲靖', 2465: '普洱市', 2466: '文山壮族苗族自治州', 2467: '西双版纳傣族自治州', 2468: '玉溪', 2469: '昭通', 2595: '杭州',
+ 2596: '湖州', 2597: '嘉兴', 2598: '金华', 2599: '丽水', 2600: '宁波', 2601: '绍兴', 2602: '台州', 2603: '温州', 2604: '舟山',
+ 2605: '衢州', 2675: '重庆'}
+
+area_dict = {
+    1: [32, 33, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48],
+     2: [126],
+     3: [130, 131, 132, 133, 134, 135, 136, 137, 138],
+     4: [207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220],
+     5: [302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322],
+     6: [415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428],
+     7: [519, 520, 521, 522, 523, 524, 525, 526, 527],
+     8: [610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 3272],
+     9: [646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656],
+     10: [804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821],
+     11: [950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962],
+     12: [1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 1056, 1057],
+     13: [1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147],
+     14: [1249, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257],
+     15: [1307, 1308, 1309, 1310, 1311, 1312, 1313, 1314, 1315, 1316, 1317, 1318, 1319],
+     16: [1387, 1388, 1389, 1390, 1391, 1392, 1393, 1394, 1395, 1396, 1397],
+     17: [1489, 1490, 1491, 1492, 1493, 1494, 1495, 1496, 1497, 1498, 1499, 1500, 1501, 1502],
+     18: [1561, 1562, 1563, 1564, 1565, 1566, 1567, 1568, 1569, 1570, 1571, 1572],
+     19: [1662, 1663, 1664, 1665, 1681],
+     20: [1686, 1687, 1688, 1689, 1690, 1691, 1692, 1693],
+     21: [1734, 1735, 1736, 1737, 1738, 1739, 1740, 1741, 1742, 1743, 1744, 1745, 1746, 1747, 1748, 1749, 1750],
+     22: [1859, 1860, 1861, 1862, 1863, 1864, 1865, 1866, 1867, 1868, 1869],
+     23: [1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986],
+     24: [2081],
+     25: [2085, 2086, 2087, 2088, 2089, 2090, 2091, 2092, 2093, 2094, 2095, 2096, 2097, 2098, 2099, 2100, 2101, 2102, 2103, 2104, 2105],
+     26: [2262],
+     27: [2268, 2269, 2270, 2271, 2272, 2273, 2274],
+     28: [2348, 2349, 2350, 2351, 2352, 2353, 2354, 2355, 2356, 2357, 2358, 2359, 2360, 2361, 2362, 2363, 3135, 3136, 3547, 3548, 3549, 3550, 3551],
+     29: [2454, 2455, 2456, 2457, 2458, 2459, 2460, 2461, 2462, 2463, 2464, 2465, 2466, 2467, 2468, 2469],
+     30: [2595, 2596, 2597, 2598, 2599, 2600, 2601, 2602, 2603, 2604, 2605],
+     31: [2675]
+}
+
+channel_dict = {1:"招标信息",2:"中标信息",3:"拟在建项目",4:"审批项目"}
+
+province_dict = {1: '安徽', 2: '北京', 3: '福建', 4: '甘肃', 5: '广东', 6: '广西', 7: '贵州', 8: '海南', 9: '河北',
+                 10: '河南', 11: '黑龙江', 12: '湖北', 13: '湖南', 14: '吉林', 15: '江苏', 16: '江西', 17: '辽宁',
+                 18: '内蒙古', 19: '宁夏', 20: '青海', 21: '山东', 22: '山西', 23: '陕西', 24: '上海', 25: '四川',
+                 26: '天津', 27: '西藏', 28: '新疆', 29: '云南', 30: '浙江', 31: '重庆'}
+
+headers = {
+    "Accept": "*/*",
+    "Accept-Language": "zh-CN,zh;q=0.9",
+    "Cache-Control": "no-cache",
+    "Connection": "keep-alive",
+    "Content-Type": "application/json",
+    "Origin": "https://search.vip.qianlima.com",
+    "Pragma": "no-cache",
+    "Referer": "https://search.vip.qianlima.com/index.html",
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
+    "X-Auth-Token": "7da2f8b3-7034-4774-b7d1-1b96ba572e63",
+}
+cookies = {
+    "guest_id": "b1c483bd-2170-4322-b3d8-9637644938c4",
+    "seo_curUrl": "www.qianlima.com",
+    "qlm_referrer": "https://www.google.com.hk/",
+    "delClose200811": "firstGoIn",
+    "HWWAFSESID": "cf4ca73bda48cfb3bd",
+    "HWWAFSESTIME": "1686896094447",
+    "backUrl": "https://search.vip.qianlima.com/index.html#?sortType=6&isSearchWord=1&tab_index=0",
+    "accessCaptchaPermission": "9AC523BE37D9BD163FDFB01EDB25D97ABC7F1FF2F188B48661BF7484B3580BD6",
+    "qlm_rem_login": "1",
+    "qlm_username": "16637019281",
+    "qlm_password": "fm7UBUpf83uECp33f88KuE3RppECupom",
+    "seo_refUrl": "",
+    "xAuthToken": "7da2f8b3-7034-4774-b7d1-1b96ba572e63",
+    "userInfo": "{%22userId%22:11970045%2C%22username%22:%2216637019281%22%2C%22userIcon%22:%22%22%2C%22linkName%22:%22%E7%8E%8B%E5%BC%BA%22%2C%22companyName%22:%22%E5%8C%97%E4%BA%AC%E8%B5%9E%E5%8D%9A%E6%81%92%E5%AE%89%22%2C%22areaId%22:%222703%22%2C%22areaName%22:%22%E5%85%A8%E5%9B%BD%22%2C%22roleId%22:1%2C%22roleName%22:%22%E7%AE%A1%E7%90%86%E5%91%98%22%2C%22sex%22:%22m%22%2C%22expireDate%22:%22%E6%97%A0%22%2C%22isExpired%22:null%2C%22maxChildCount%22:0%2C%22isUsedCount%22:0%2C%22userStatus%22:1%2C%22memberLevel%22:5%2C%22memberLevelName%22:%22%E5%85%8D%E8%B4%B9%E4%BC%9A%E5%91%98%22%2C%22registerTime%22:%222023-06-16%22%2C%22isSuperSupplier%22:0%2C%22isNewUser%22:1%2C%22welcomeMsg%22:%22%E6%AC%A2%E8%BF%8E%E8%BF%9B%E5%85%A5%E5%8D%83%E9%87%8C%E9%A9%AC%E6%8B%9B%E6%A0%87%E7%BD%91%EF%BD%9E%22%2C%22customerServiceInfo%22:{%22id%22:42%2C%22customerServiceName%22:%22%E5%8D%83%E9%87%8C%E9%A9%AC%E5%AE%A2%E6%9C%8D%22%2C%22weChatIcon%22:%22https://gw-static.qianlima.com/gw/invoice/1681907621_553c9cbd28.jpeg%22%2C%22customerServicePhone%22:%22%20400-688-2000%22%2C%22customerServiceQQ%22:%22%22%2C%22customerServiceEmail%22:%22qianlima_service@qianlima.com%22%2C%22deptType%22:0}%2C%22shouji%22:%2216637019281%22%2C%22email%22:%22%22%2C%22dwmc%22:%22%E5%8C%97%E4%BA%AC%E8%B5%9E%E5%8D%9A%E6%81%92%E5%AE%89%22%2C%22zhiwu%22:%22%E5%91%98%E5%B7%A5%22%2C%22types%22:1%2C%22isPayBefore%22:0%2C%22memberOpenTime%22:null%2C%22showExpireDate%22:true%2C%22companyNature%22:null%2C%22companyArea%22:%22%22%2C%22companyType%22:null%2C%22industry%22:null%2C%22product%22:null%2C%22contacts%22:%22%E7%8E%8B%E5%BC%BA%22%2C%22contactNumber%22:%2216637019281%22%2C%22contactAddress%22:null%2C%22mainCustomerGroups%22:null%2C%22informationTypePreferences%22:null%2C%22businessUserType%22:null%2C%22businessCompanyName%22:null%2C%22isBusinessUser%22:null}"
+}
+
+
+
+