lizongze 1 year ago
parent
commit
dc77ca48e4
6 changed files with 290 additions and 403 deletions
  1. qlm/config/conf.yaml (+4 -0)
  2. qlm/source_qianlima_bak.py (+0 -100)
  3. qlm/source_qianlima_bak1.py (+0 -251)
  4. qlm/source_qianlima_history.py (+209 -12)
  5. ybw/config/conf.yaml (+2 -0)
  6. ybw/detail_spider.py (+75 -40)

+ 4 - 0
qlm/config/conf.yaml

@@ -1,12 +1,16 @@
 mongo:
   host: 172.17.4.87
   port: !!int 27080
+#  host: 127.0.0.1
+#  port: !!int 27017
 
 
 redis:
   host: 172.17.4.232
   port: !!int 7361
   pwd: "k5ZJR5KV4q7DRZ92DQ"
+#  host: 127.0.0.1
+#  port: !!int 6379
   db: !!int 1
 
 

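The commented-out host/port pairs keep a local fallback next to the production endpoints, so switching environments is a matter of toggling the comments. As a minimal sketch of how such a file could be consumed, assuming PyYAML and the path shown above (the repo's actual utils.databases helper may wire this differently):

import yaml  # PyYAML; assumed dependency for this sketch

with open("qlm/config/conf.yaml") as fp:
    conf = yaml.safe_load(fp)  # the !!int tags come back as real integers

mongo_cfg = conf["mongo"]  # e.g. {"host": "172.17.4.87", "port": 27080}
redis_cfg = conf["redis"]  # host/port/pwd/db for the Redis client
print(mongo_cfg["host"], mongo_cfg["port"])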
+ 0 - 100
qlm/source_qianlima_bak.py

@@ -1,100 +0,0 @@
-# coding: utf-8
-import time
-
-import requests
-
-from utils.databases import mongo_table, redis_client
-from utils.log import logger
-from utils.tools import sha1
-
-qlm = mongo_table('qlm', 'qlm_2021')
-r = redis_client()
-redis_key = "qianlima_2021"
-
-'''
-# areas  province/region code
-# currentPage  page number
-# numPerPage  items per page
-# types  notice type:
-All
-Announcement 0
-Advance notice 1
-Change 2
-Award 3
-Other 5
-'''
-PROXIES = None
-
-
-def crawl_request(url, headers):
-    """
-    Shared helper: GET the given url and parse the JSON response.
-
-    :param url: the URL to request
-    :param headers: request headers to send
-    :return: the response object once HTTP 200 is received
-    """
-    while True:
-        try:
-            get_html = requests.get(url, headers=headers, timeout=5000)
-            # auto-detect and apply the response character encoding
-            get_html.encoding = get_html.apparent_encoding
-            logger.info(get_html.status_code)
-            if get_html.status_code in [403, 404, 400, 502, 302]:
-                continue
-            elif get_html.status_code in [200]:
-                return get_html
-        except requests.exceptions.ConnectTimeout:
-            logger.error("Reacquire proxy")
-        except requests.RequestException:
-            time.sleep(3)
-            continue
-
-
-def crawl_spider(area, _type, i):
-    headers = {
-        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
-        "Accept-Encoding": "gzip, deflate",
-        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
-        "Cache-Control": "no-cache",
-        "Connection": "keep-alive",
-        "DNT": "1",
-        "Host": "search.qianlima.com",
-        "Pragma": "no-cache",
-        "Upgrade-Insecure-Requests": "1",
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.68",
-        "Cookie": 'UM_distinctid=178af0c6f6f2f3-0e81be36d60604-7166786d-144000-178af0c6f70294; BAIDU_SSP_lcr=https://cn.bing.com/; guest_id=ac5769d7-b906-499d-ab85-47809ee9bc56; gr_user_id=d2cc35f6-ffa2-441b-a9ff-f836345e6f75; Hm_lvt_5dc1b78c0ab996bd6536c3a37f9ceda7=1617844534; seo_refUrl=https%3A//cn.bing.com/; seo_curUrl=www.qianlima.com; qlm_referrer=https://cn.bing.com/; delClose200811=firstGoIn; __jsluid_h=5f702d3c66f33654fc8d1f109062bb23; __jsl_clearance=1617844553.848|0|oACHKEqjLj1O5rc480L59DWlTO4%3D; CNZZDATA1277608403=736687752-1617840159-http%253A%252F%252Fsearch.qianlima.com%252F%7C1617840159; nts_login_tip=1; fromWhereUrl="http://www.qianlima.com/mfzb/"; Hm_lpvt_5dc1b78c0ab996bd6536c3a37f9ceda7=1617844615'
-    }
-    url = "http://search.qianlima.com/api/v1/website/search?filtermode=1&timeType=-1&areas={}&types={}&isfirst=false&searchMode=2&keywords=%20&currentPage={}&numPerPage=1000"
-    list_url = url.format(area, _type, i)
-    print(list_url)
-    req = crawl_request(list_url, headers)
-    info_list = req.json()["data"]["data"]
-    item_list = []
-    for info in info_list:
-        tmid = sha1(str(info["contentid"]))
-        if r.hget(redis_key, tmid) is None:
-            r.hset(redis_key, tmid, str(info["contentid"]))
-            if "popTitle" in info:
-                info["title"] = info["popTitle"]
-            else:
-                info["title"] = info["showTitle"]
-            item_list.append(info)
-    if item_list:
-        qlm.insert_many(item_list)
-    logger.info("{}--{}抓取第{}页数据,共{}条".format(area, _type, i, len(item_list)))
-
-
-def start():
-    # iterate over provinces
-    for area in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 11, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]:
-        # iterate over notice types
-        # [0,1,2,3,5]
-        for _type in [0, 1, 2, 3, 5]:
-            # iterate over page numbers
-            # for i in range(1, 11):
-            crawl_spider(area, _type, 1)
-
-
-if __name__ == '__main__':
-    start()

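Both of the deleted scripts rely on one pattern that survives into the rewrite: fingerprint each record with a SHA-1 of its contentid, use a Redis hash as the seen-set, and bulk-insert only unseen rows into Mongo. A standalone sketch of that dedup step, with the client wiring assumed rather than taken from the repo (utils.tools.sha1 is assumed to behave like the helper below):

import hashlib

def sha1(text: str) -> str:
    # assumed equivalent of utils.tools.sha1
    return hashlib.sha1(text.encode("utf-8")).hexdigest()

def dedup_insert(r, collection, redis_key, items):
    """Insert only records whose contentid fingerprint is not yet in Redis."""
    fresh = []
    for info in items:
        fingerprint = sha1(str(info["contentid"]))
        if r.hget(redis_key, fingerprint) is None:  # unseen record
            r.hset(redis_key, fingerprint, str(info["contentid"]))
            fresh.append(info)
    if fresh:
        collection.insert_many(fresh)  # pymongo bulk insert
    return len(fresh)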
+ 0 - 251
qlm/source_qianlima_bak1.py

@@ -1,251 +0,0 @@
-# coding: utf-8
-import datetime
-import json
-import math
-import random
-import time
-
-import requests
-
-from utils.databases import mongo_table, redis_client
-from utils.log import logger
-from utils.sessions_521 import http_session_521
-from utils.tools import sha1
-
-qlm = mongo_table('qlm', 'data_merge')
-r = redis_client()
-redis_key = "qianlima_2022"
-
-headers = {
-    "Accept": "*/*",
-    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
-    "Cache-Control": "no-cache",
-    "Connection": "keep-alive",
-    "Content-Type": "application/json",
-    "Origin": "http://search.vip.qianlima.com",
-    "Pragma": "no-cache",
-    "Referer": "http://search.vip.qianlima.com/index.html",
-    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36",
-    "X-Auth-Token": "7f15af45-30c3-4bee-8a89-1b2813100aaf"
-}
-cookies = {
-    "BAIDU_SSP_lcr": "https://www.google.com/",
-    "guest_id": "0124edcd-1edc-4434-ae60-5d44662dced0",
-    "Hm_lvt_5dc1b78c0ab996bd6536c3a37f9ceda7": "1653874874",
-    "seo_curUrl": "www.qianlima.com",
-    "source": "1",
-    "login_ip": "1.192.62.141",
-    "__jsluid_h": "7d0d080a30094eb57be38e4c09dd4a3b",
-    "Hm_lpvt_5dc1b78c0ab996bd6536c3a37f9ceda7": "1654132585",
-    "seo_refUrl": "http%3A//www.qianlima.com/zbgg/p2",
-    "15637008265fmgj83E8oBfKRmKCBRuUR83pCB8pCCfE": "3e048e44-2e33-4936-b2d1-00784cb48e60",
-    "useragent_hash": "32bd84bdee2cfe920dda80f92fa20070",
-    "qlm_rem_login": "1",
-    "qlm_username": "17610673271",
-    "qlm_password": "fmgj83E8oBfKRmKCBRuUR83pCB8pCCfE",
-    "17610673271fmgj83E8oBfKRmKCBRuUR83pCB8pCCfE": "b2b29454-a5f4-4fb0-ad76-cc2be246cac9",
-    "xAuthToken": "7f15af45-30c3-4bee-8a89-1b2813100aaf",
-    "login_time": "1654498455",
-    "userInfo": "{%22userId%22:10609848%2C%22username%22:%2217610673271%22%2C%22userIcon%22:%22%22%2C%22linkName%22:%22%E8%91%A3%E5%85%88%E7%94%9F%22%2C%22companyName%22:%22%E5%90%88%E8%82%A5%E6%8B%93%E6%99%AE%E7%BD%91%E7%BB%9C%E7%B3%BB%E7%BB%9F%E5%B7%A5%E7%A8%8B%E6%9C%89%E9%99%90%E8%B4%A3%E4%BB%BB%E5%85%AC%E5%8F%B8%22%2C%22areaId%22:%222703%22%2C%22areaName%22:%22%E5%85%A8%E5%9B%BD%22%2C%22roleId%22:1%2C%22roleName%22:%22%E7%AE%A1%E7%90%86%E5%91%98%22%2C%22sex%22:%22m%22%2C%22expireDate%22:%22%E6%97%A0%22%2C%22isExpired%22:null%2C%22maxChildCount%22:0%2C%22isUsedCount%22:0%2C%22userStatus%22:1%2C%22memberLevel%22:5%2C%22memberLevelName%22:%22%E5%85%8D%E8%B4%B9%E6%B3%A8%E5%86%8C%E4%BC%9A%E5%91%98%22%2C%22registerTime%22:%222022-05-30%22%2C%22isSuperSupplier%22:0%2C%22isNewUser%22:1%2C%22welcomeMsg%22:%22%E6%AC%A2%E8%BF%8E%E8%BF%9B%E5%85%A5%E5%8D%83%E9%87%8C%E9%A9%AC%E6%8B%9B%E6%A0%87%E7%BD%91%EF%BD%9E%22%2C%22customerServiceInfo%22:{%22id%22:42%2C%22customerServiceName%22:%22%E5%8D%83%E9%87%8C%E9%A9%AC%E5%AE%A2%E6%9C%8D%22%2C%22weChatIcon%22:%22http://img_al.qianlima.com/invoice/1588986761_8ebeade70a.jpg%22%2C%22customerServicePhone%22:%2217718573953%22%2C%22customerServiceQQ%22:%22%22%2C%22customerServiceEmail%22:%22qianlima_service@qianlima.com%22%2C%22deptType%22:0}%2C%22shouji%22:%2217610673271%22%2C%22email%22:%22%22%2C%22dwmc%22:%22%E5%90%88%E8%82%A5%E6%8B%93%E6%99%AE%E7%BD%91%E7%BB%9C%E7%B3%BB%E7%BB%9F%E5%B7%A5%E7%A8%8B%E6%9C%89%E9%99%90%E8%B4%A3%E4%BB%BB%E5%85%AC%E5%8F%B8%22%2C%22zhiwu%22:%22%E4%BA%A7%E5%93%81%E7%BB%8F%E7%90%86%22%2C%22types%22:1%2C%22isPayBefore%22:0%2C%22memberOpenTime%22:null%2C%22businessUserType%22:null%2C%22businessCompanyName%22:null%2C%22isBusinessUser%22:null}"
-}
-session = requests.session()
-
-'''
-Tender stage
-0 = All
-1 = Tender notices
-2 = Award notices
-3 = Procurement intentions
-'''
-REQUEST_DATA_MAP = {
-    0: {"keywords": "", "timeType": 4, "beginTime": "2022-12-07", "endTime": "2022-12-07", "filtermode": "8", "searchMode": 0, "currentPage": 1, "numPerPage": 20, "sortType": 1, "allType": -1, "noticeSegmentTypeStr": "", "beginAmount": "", "endAmount": "", "purchasingUnitIdList": "", "threeClassifyTagStr": "", "fourLevelCategoryIdListStr": "", "threeLevelCategoryIdListStr": "", "levelId": "", "searchDataType": 0, "types": "-1", "showContent": 1, "hasTenderTransferProject": 1, "newAreas": "1", "hasChooseSortType": 1, "summaryType": 0},
-    1: {"keywords": "", "timeType": 4, "beginTime": "2022-12-07", "endTime": "2022-12-07", "filtermode": "8", "searchMode": 0, "currentPage": 1, "numPerPage": 20, "sortType": 1, "allType": "0", "beginAmount": "", "endAmount": "", "purchasingUnitIdList": "", "threeClassifyTagStr": "", "fourLevelCategoryIdListStr": "", "threeLevelCategoryIdListStr": "", "levelId": "", "searchDataType": 1, "types": -1, "showContent": 1, "newAreas": "", "hasChooseSortType": 1, "progIdAndNoticeSegmentTypeMaps": {"0": [], "1": []}, "summaryType": 0},
-    2: {"keywords": "", "timeType": 4, "beginTime": "2022-12-07", "endTime": "2022-12-07", "filtermode": "8", "searchMode": 0, "currentPage": 1, "numPerPage": 20, "sortType": 1, "allType": 3, "beginAmount": "", "endAmount": "", "purchasingUnitIdList": "", "threeClassifyTagStr": "", "fourLevelCategoryIdListStr": "", "threeLevelCategoryIdListStr": "", "levelId": "", "searchDataType": 1, "types": 3, "showContent": 1, "newAreas": "", "hasChooseSortType": 1, "progIdAndNoticeSegmentTypeMaps": {"3": []}, "summaryType": 0},
-    3: {"keywords": "", "timeType": 4, "beginTime": "2022-12-07", "endTime": "2022-12-07", "filtermode": "8", "searchMode": 0, "currentPage": 1, "numPerPage": 20, "sortType": 1, "allType": 99, "beginAmount": "", "endAmount": "", "purchasingUnitIdList": "", "threeClassifyTagStr": "", "fourLevelCategoryIdListStr": "", "threeLevelCategoryIdListStr": "", "levelId": "", "searchDataType": 1, "types": 99, "showContent": 1, "newAreas": "", "hasChooseSortType": 1, "progIdAndNoticeSegmentTypeMaps": {"99": []}, "summaryType": 0}
-}
-
-
-def delay_by_day(days, fmt="%Y-%m-%d"):
-    """Shift today's date by the given number of days and format it."""
-    _days = int(days)
-    _current_now = datetime.datetime.now()
-    return (_current_now + datetime.timedelta(days=_days)).strftime(fmt)
-
-
-def crawl_request(url, data, retries=5):
-    global session, cookies
-    resp = None
-    usages, usages_521 = 0, 1
-    while usages < retries:
-        request_params = {}
-        request_params.setdefault('data', data)
-        request_params.setdefault('headers', headers)
-        request_params.setdefault('cookies', cookies)
-        request_params.setdefault('timeout', 60)
-        try:
-            resp = session.post(url, **request_params)
-            if resp.status_code == 521:
-                while usages_521 < retries:
-                    success, _, cookies = http_session_521(session, url, headers, cookies, data=data)
-                    if success:
-                        break
-                    logger.warning(f"反爬破解失败,次数:{usages_521}")
-                    time.sleep(1)
-                    usages_521 += 1
-                usages += 1
-            elif resp.status_code in [401, 403, 404]:
-                logger.error(f"账号登录已失效或封停,异常状态码:{resp.status_code}")
-                break
-            else:
-                break
-        except requests.RequestException as e:
-            logger.error(f"访问失败,失败原因:{e.__class__.__name__}")
-            usages += 1
-    # print(resp)
-    return resp
-
-
-def crawl_spider(area: str, type_: int, page: int, **kwargs):
-    results = []
-    request_status = 'failure'  # request outcome: success / failure / stop / disable (account suspended)
-
-    curr_date = delay_by_day(0)
-    begin_time = kwargs.pop('begin_time', curr_date)
-    end_time = kwargs.pop('end_time', curr_date)
-    max_per_page = kwargs.pop('max_page', 20)
-    data = REQUEST_DATA_MAP[type_]
-    data['newAreas'] = area  # region code
-    data['currentPage'] = page  # page number
-    data['numPerPage'] = max_per_page  # items per page
-    data['timeType'] = 4  # custom time-range mode
-    data['beginTime'] = begin_time  # start date, format: YYYY-MM-DD
-    data['endTime'] = end_time  # end date, format: YYYY-MM-DD
-    data = json.dumps(data)
-    url = "https://search.vip.qianlima.com/rest/service/website/search/solr"
-    response = crawl_request(url, data)
-    row_count = 0
-    if response is not None and response.status_code == 200:
-        resp_json = response.json()
-        if resp_json['code'] == 200:
-            row_count = resp_json["data"]["rowCount"]
-            # print(row_count)
-            items = resp_json["data"]["data"]
-            for item in items:
-                cid = sha1(str(item["contentid"]))
-                if not r.hexists(redis_key, cid):
-                    r.hset(redis_key, cid, '')
-                    if "popTitle" in item:
-                        item["title"] = item["popTitle"]
-                    else:
-                        item["title"] = item["showTitle"]
-
-                    addr = str(item["areaName"]).split('-')
-                    _area = addr[0] if len(addr) > 0 else ''
-                    _city = addr[1] if len(addr) > 1 else ''
-                    channel = (item['noticeSegmentTypeName'] or item['progName'])
-                    res = {
-                        'site': '千里马',
-                        'channel': channel,
-                        'area': _area,
-                        'city': _city,
-                        'title': item["title"],
-                        'publishtime': item['updateTime'],
-                        'href': item.get('url', '')
-                    }
-                    results.append(res)
-            request_status = 'success'
-
-            if len(items) < max_per_page:
-                request_status = 'stop'
-        else:
-            '''
-            {
-                "code": 200520,
-                "msg": "抱歉,您在单位时间内的搜索次数已达上限,请联系客服购买会员!咨询电话:400-688-2000",
-                "data": null
-            }
-            '''
-            logger.info(resp_json['msg'])
-    elif response is not None and response.status_code in [401, 403, 404]:
-        request_status = 'disable'
-    elif response is not None and response.status_code == 405:
-        request_status = 'method_not_allowed'
-
-    if len(results) > 0:
-        qlm.insert_many(results)
-
-    if request_status in ['stop', 'success']:
-        logger.info("{}-第{}区-第{}类{}条-第{}页,成功上传{}条数据".format(
-            begin_time,
-            area,
-            type_,
-            page,
-            row_count,
-            len(results))
-        )
-    return request_status
-
-
-def by_area_crawl_data(area="", type_=0, **kwargs):
-    close_spider = False
-    disable_page, max_disable_page = 0, 3
-    pages = list(range(1, 101))  # the search API only exposes the first 10,000 results
-    while len(pages) > 0:
-        if close_spider:
-            break
-        elif disable_page > max_disable_page:
-            # hook point for email or WeCom-bot alerts about abnormal crawls
-            break
-
-        page = pages.pop(0)
-        logger.info(f"访问第{area}区-第{type_}类-第{page}页数据")
-        while True:
-            success = crawl_spider(area, type_, page, **kwargs)
-            if success == 'failure':
-                interval = math.log(random.randint(100, 2400), 2)
-                logger.debug(f'异常重试,等待{interval}s')
-                time.sleep(interval)
-                continue
-            elif success == 'disable':
-                logger.warning(f"账号被禁止访问第{area}区-第{page}页数据")
-                disable_page += 1
-            elif success == 'method_not_allowed':
-                logger.warning("服务器禁止使用当前 HTTP 方法的请求")
-                disable_page += 1
-            elif success == 'stop':
-                close_spider = True
-            else:
-                logger.info(f"第{area}区-第{page}页数据采集成功")
-            break
-
-
-def select_types(date: str, area: str):
-    for type_ in [1, 2, 3]:
-        by_area_crawl_data(
-            area=area,
-            type_=type_,
-            begin_time=date,
-            end_time=date,
-            max_page=100
-        )
-        logger.info(f"{date}-第{area}区-第{type_}类采集结束")
-
-
-def select_area(date: str):
-    for area in range(1, 32):
-        select_types(date, str(area))
-    logger.info(f"任务结束")
-
-
-def history(date_lst: list):
-    for date in date_lst:
-        select_area(date)
-
-
-def start():
-    date_str = delay_by_day(-1)
-    select_area(date_str)
-
-
-if __name__ == '__main__':
-    start()

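When a request fails, by_area_crawl_data sleeps for math.log(random.randint(100, 2400), 2) seconds before retrying, i.e. a randomized wait between log2(100) ≈ 6.6 s and log2(2400) ≈ 11.2 s. The same backoff in isolation, as a sketch:

import math
import random
import time

def backoff_sleep() -> float:
    # log2 squeezes the 100..2400 draw into roughly 6.6..11.2 seconds
    interval = math.log(random.randint(100, 2400), 2)
    time.sleep(interval)
    return interval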
+ 209 - 12
qlm/source_qianlima_history.py

@@ -1,17 +1,214 @@
 # coding: utf-8
+import datetime
+import json
+import math
+import random
+import time
 
-from source_qianlima import history
+import requests
+from utils.config_parms import *
+from utils.databases import mongo_table, redis_client
+from utils.log import logger
+from utils.sessions_521 import http_session_521
+from utils.tools import sha1
+
+qlm = mongo_table('qlm', 'data_merge')
+r = redis_client()
+redis_key = "qianlima_2022"
+
+session = requests.session()
+
+'''
+https://search.vip.qianlima.com/index.html#?sortType=6&isSearchWord=1&tab_index=0
+Search 2.0 categories:
+1 = Tender notices
+2 = Award notices
+3 = Proposed construction projects
+4 = Approval projects
+'''
+
+
+def delay_by_day(days, fmt="%Y-%m-%d"):
+    """Shift today's date by the given number of days and format it."""
+    _days = int(days)
+    _current_now = datetime.datetime.now()
+    return (_current_now + datetime.timedelta(days=_days)).strftime(fmt)
+
+
+def crawl_request(url, data, retries=5):
+    global session, cookies1
+    resp = None
+    usages, usages_521 = 0, 1
+    while usages < retries:
+        request_params = {}
+        request_params.setdefault('data', data)
+        request_params.setdefault('headers', headers1)
+        request_params.setdefault('cookies', cookies1)
+        request_params.setdefault('timeout', 60)
+        try:
+            resp = session.post(url, **request_params)
+            if resp.status_code == 521:
+                while usages_521 < retries:
+                    success, _, cookies1 = http_session_521(session, url, headers1, cookies1, data=data)
+                    if success:
+                        break
+                    logger.warning(f"反爬破解失败,次数:{usages_521}")
+                    time.sleep(1)
+                    usages_521 += 1
+                usages += 1
+            elif resp.status_code in [401, 403, 404]:
+                logger.error(f"账号登录已失效或封停,异常状态码:{resp.status_code}")
+                break
+            else:
+                break
+        except requests.RequestException as e:
+            logger.error(f"访问失败,失败原因:{e.__class__.__name__}")
+            usages += 1
+    return resp
+
+
+def crawl_spider(area: str, type_: int, page: int, **kwargs):
+    results = []
+    request_status = 'failure'  # request outcome: success / failure / stop / disable (account suspended)
+
+    curr_date = delay_by_day(0)
+    begin_time = kwargs.pop('begin_time', curr_date)
+    end_time = kwargs.pop('end_time', curr_date)
+    max_per_page = kwargs.pop('max_per_page', 20)
+    data = REQUEST_DATA_MAP[type_]
+    data['newAreas'] = area  # region code
+    data['currentPage'] = page  # page number
+    data['numPerPage'] = max_per_page  # items per page
+    data['timeType'] = 4  # custom time-range mode
+    data['beginTime'] = begin_time  # start date, format: YYYY-MM-DD
+    data['endTime'] = end_time  # end date, format: YYYY-MM-DD
+    data = json.dumps(data)
+    url = "https://search.vip.qianlima.com/rest/service/website/search/solr"
+    response = crawl_request(url, data)
+    row_count = 0
+    if response is not None and response.status_code == 200:
+        resp_json = response.json()
+        if resp_json['code'] == 200:
+            row_count = resp_json["data"]["rowCount"]
+            items = resp_json["data"]["data"]
+            for item in items:
+                cid = sha1(str(item["contentid"]))
+                if not r.hexists(redis_key, cid):
+                    r.hset(redis_key, cid, '')
+                    if "popTitle" in item:
+                        item["title"] = item["popTitle"]
+                    else:
+                        item["title"] = item["showTitle"]
+
+                    addr = str(item["areaName"]).split('-')
+                    _area = addr[0] if len(addr) > 0 else ''
+                    _city = addr[1] if len(addr) > 1 else ''
+                    channel = (item['noticeSegmentTypeName'] or item['progName'])
+                    res = {
+                        'site': '千里马',
+                        'channel': channel,
+                        'area': _area,
+                        'city': _city,
+                        'title': item["title"],
+                        'publishtime': item['updateTime'],
+                        'href': item.get('url', '')
+                    }
+                    results.append(res)
+            request_status = 'success'
+
+            if len(items) < max_per_page:
+                request_status = 'stop'
+        else:
+            '''
+            {
+                "code": 200520,
+                "msg": "抱歉,您在单位时间内的搜索次数已达上限,请联系客服购买会员!咨询电话:400-688-2000",
+                "data": null
+            }
+            '''
+            logger.info(resp_json['msg'])
+    elif response is not None and response.status_code in [401, 403, 404]:
+        request_status = 'disable'
+    elif response is not None and response.status_code == 405:
+        request_status = 'method_not_allowed'
+
+    if len(results) > 0:
+        qlm.insert_many(results)
+
+    if request_status in ['stop', 'success']:
+        logger.info("{}-{}-{}-共{}条-第{}页,成功上传{}条数据".format(
+            begin_time,
+            city_dict.get(int(area)),
+            channel_dict.get(type_),
+            row_count,
+            page,
+            len(results))
+        )
+    return request_status
+
+
+def by_area_crawl_data(area="", type_=0, **kwargs):
+    close_spider = False
+    disable_page, max_disable_page = 0, 3
+    pages = list(range(1, 101))  # the search API only exposes the first 10,000 results
+    while len(pages) > 0:
+        if close_spider:
+            break
+        elif disable_page > max_disable_page:
+            # hook point for email or WeCom-bot alerts about abnormal crawls
+            break
+
+        page = pages.pop(0)
+        logger.info(f"访问-{city_dict.get(int(area))}-{channel_dict.get(type_)}-第{page}页数据")
+        while True:
+            success = crawl_spider(area, type_, page, **kwargs)
+            if success == 'failure':
+                interval = math.log(random.randint(100, 2400), 2)
+                logger.debug(f'异常重试,等待{interval}s')
+                time.sleep(interval)
+                continue
+            elif success == 'disable':
+                logger.warning(f"账号被禁止访问-{city_dict.get(int(area))}-第{page}页数据")
+                disable_page += 1
+            elif success == 'method_not_allowed':
+                logger.warning("服务器禁止使用当前 HTTP 方法的请求")
+                disable_page += 1
+            elif success == 'stop':
+                close_spider = True
+            else:
+                logger.info(f"{city_dict.get(int(area))}-{channel_dict.get(type_)}-第{page}页数据采集成功")
+                time.sleep(math.log(random.randint(100, 2400), 2))
+            break
+
+
+def select_types(date: str, area: str, prov: str):
+    for type_ in [1, 2, 3, 4]:
+        by_area_crawl_data(
+            area=area,
+            type_=type_,
+            begin_time=date,
+            end_time=date,
+            max_per_page=100
+        )
+        logger.info(f"{date}-{province_dict.get(int(prov))}地区-{channel_dict.get(type_)}采集结束")
+
+
+def select_area(date: str):
+    for province in range(1, 32):
+        for city_ in area_dict.get(province):
+            select_types(date, area=str(city_), prov=str(province))
+    logger.info(f"任务结束")
+
+
+def history(date_lst: list):
+    for date in date_lst:
+        select_area(date)
+
+
+def start():
+    date_str = "2023-09-25"
+    select_area(date_str)
 
 
 if __name__ == '__main__':
-    lst = [
-        '2022-05-29',
-        '2022-05-30',
-        '2022-05-31',
-        '2022-06-01',
-        '2022-06-02',
-        '2022-06-03',
-        '2022-06-04',
-        '2022-06-05',
-    ]
-    history(lst)
+    start()

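The rewritten history module walks province, city, and type for every requested date, so backfills are driven purely by a list of date strings. A hedged usage sketch that builds such a list for history(); the date range is an example and the import assumes the file is on the path:

import datetime

def date_range(start: str, end: str, fmt: str = "%Y-%m-%d") -> list:
    """Inclusive list of dates from start to end, matching delay_by_day's format."""
    begin = datetime.datetime.strptime(start, fmt)
    stop = datetime.datetime.strptime(end, fmt)
    return [(begin + datetime.timedelta(days=i)).strftime(fmt)
            for i in range((stop - begin).days + 1)]

# from source_qianlima_history import history
# history(date_range("2023-09-20", "2023-09-25"))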
+ 2 - 0
ybw/config/conf.yaml

@@ -11,6 +11,8 @@ redis:
   host: 172.17.162.28
   port: !!int 7361
   pwd: "k5ZJR5KV4q7DRZ92DQ"
+#  host: 127.0.0.1
+#  port: !!int 6379
   db: !!int 1
 
 

+ 75 - 40
ybw/detail_spider.py

@@ -1,6 +1,6 @@
 import random
 import time
-
+import re
 import requests.exceptions
 from lxml.html import fromstring, HtmlElement, tostring
 from lxml.html.clean import Cleaner
@@ -88,6 +88,17 @@ class DetailSpider:
         update = {'crawl': False}
         self._update_crawl_task(task['_id'], **update)
 
+    def json_request(self, fid, request_params):
+        """Fetch the detail page body as JSON for URLs that carry an fid query parameter."""
+        url = "https://www.chinabidding.cn/agency.info.Detail/show"
+        params = {"fid": fid}
+        res = requests.get(url, params=params, **request_params)
+        return res
+
+
     def crawl_request(self, item: dict):
         url = item['competehref']
         headers = {
@@ -126,53 +137,77 @@ class DetailSpider:
             else:
                 request_params.update({'cookies': login_cookies})
 
-            try:
-                r = requests.get(url, **request_params)
-                # check account login status
-                retry_login = login_check(self.user.phone, url, False)
-                if retry_login:
-                    logger.info(f"[重新登录]{self.user.phone}")
-                    _, code = login(*self.user, proxies=proxies)
-                    if code == 200:
-                        retries += 1
-                    else:
-                        time.sleep(1800)
-                        retries += 1
+            fid = "".join(re.findall(r'\?fid=(.*)', url))
+            if fid:
+                try:
+                    r = self.json_request(fid, request_params)
+                    # check account login status
+                    retry_login = login_check(self.user.phone, url, False)
+                    if retry_login:
+                        logger.info(f"[重新登录]{self.user.phone}")
+                        _, code = login(*self.user, proxies=proxies)
+                        if code == 200:
+                            retries += 1
+                        else:
+                            time.sleep(1800)
+                            retries += 1
+                        continue
+                    logger.info(f'[采集正文] fid_{fid}')
+                    return r
+                except requests.RequestException:
+                    retries += 1
                     continue
-                element = fromstring(r.text)
-                nodes = element.xpath('//*[@id="main_dom"]/div[1]')
-                if len(nodes) != 1:
-                    retries_502 += 1
-                    logger.debug(f'"main_dom"属性匹配个数:{len(nodes)}, {r.status_code} - {url}')
+            else:
+                try:
+                    r = requests.get(url, **request_params)
+                    # check account login status
+                    retry_login = login_check(self.user.phone, url, False)
+                    if retry_login:
+                        logger.info(f"[重新登录]{self.user.phone}")
+                        _, code = login(*self.user, proxies=proxies)
+                        if code == 200:
+                            retries += 1
+                        else:
+                            time.sleep(1800)
+                            retries += 1
+                        continue
+                    element = fromstring(r.text)
+                    nodes = element.xpath('//*[@id="main_dom"]/div[1]')
+                    if len(nodes) != 1:
+                        retries_502 += 1
+                        logger.debug(f'"main_dom"属性匹配个数:{len(nodes)}, {r.status_code} - {url}')
+                        continue
+                    else:
+                        node = nodes[0]
+                        logger.info(f'[采集正文] id={node.attrib.get("id")}')
+                        return r
+                except requests.RequestException:
+                    retries += 1
                     continue
-                else:
-                    node = nodes[0]
-                    logger.info(f'[采集正文] id={node.attrib.get("id")}')
-                    return r
-            except requests.RequestException:
-                retries += 1
-                continue
 
         return None
 
     def crawl_response(self, response, item):
-        element: HtmlElement = fromstring(response.text)
-        node = element.xpath('//*[@id="infoDescription"]')[0]
-        node = pre_parse(node)
-        features = {
-            './div[@class="ckgys_cont"]',
-            './/div[@class="detail-title ng-scope"]',
-            './/table[@class="detail_Table"]',
-        }
-        for feature in features:
-            extract_node = node.xpath(feature)
-            if len(extract_node) > 0:
-                valid_node = extract_node[0]
-                break
+        if re.match(r'^\{', response.text):
+            html = response.json().get('c_info', {}).get('content')
         else:
-            valid_node = node
+            element: HtmlElement = fromstring(response.text)
+            node = element.xpath('//*[@id="infoDescription"]')[0]
+            node = pre_parse(node)
+            features = {
+                './div[@class="ckgys_cont"]',
+                './/div[@class="detail-title ng-scope"]',
+                './/table[@class="detail_Table"]',
+            }
+            for feature in features:
+                extract_node = node.xpath(feature)
+                if len(extract_node) > 0:
+                    valid_node = extract_node[0]
+                    break
+            else:
+                valid_node = node
 
-        html = page_source(valid_node)
+            html = page_source(valid_node)
         '''validate the raw page content'''
         CheckText(html)
         item["contenthtml"] = html
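crawl_response now has to handle two body shapes: JSON from the fid detail endpoint and HTML from the regular pages. A minimal sketch of that dispatch on its own; the c_info/content keys come from the code above, everything else is illustrative:

import json
import re

def extract_html(body: str) -> str:
    # fid detail endpoints answer with a JSON object whose c_info.content holds the HTML
    if re.match(r'^\{', body.lstrip()):
        payload = json.loads(body)
        return (payload.get('c_info') or {}).get('content', '')
    return body  # otherwise the body is already HTML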