dongzhaorui 3 年之前
父節點
當前提交
c96ae88aa5
共有 1 個文件被更改,包括 1 次插入245 次删除
  1. qlm/source_qianlima_history.py (+1, −245)

+ 1 - 245
qlm/source_qianlima_history.py

@@ -1,250 +1,6 @@
 # coding: utf-8
-import datetime
-import json
-import math
-import random
-import time
 
-import requests
-
-from utils.databases import mongo_table, redis_client
-from utils.log import logger
-from utils.sessions_521 import http_session_521
-from utils.tools import sha1
-
-qlm = mongo_table('qlm', 'data_merge')
-r = redis_client()
-redis_key = "qianlima_2022"
-
-headers = {
-    "Accept": "*/*",
-    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
-    "Cache-Control": "no-cache",
-    "Connection": "keep-alive",
-    "Content-Type": "application/json",
-    "Origin": "http://search.vip.qianlima.com",
-    "Pragma": "no-cache",
-    "Referer": "http://search.vip.qianlima.com/index.html",
-    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36",
-    "X-Auth-Token": "4c21c1bc-ef21-44bb-b466-677a5804e3a0"
-}
-cookies = {
-    "BAIDU_SSP_lcr": "https://www.google.com/",
-    "guest_id": "0124edcd-1edc-4434-ae60-5d44662dced0",
-    "Hm_lvt_5dc1b78c0ab996bd6536c3a37f9ceda7": "1653874874",
-    "seo_curUrl": "www.qianlima.com",
-    "source": "1",
-    "login_ip": "1.192.62.141",
-    "__jsluid_h": "7d0d080a30094eb57be38e4c09dd4a3b",
-    "delClose200811": "firstGoIn",
-    "Hm_lpvt_5dc1b78c0ab996bd6536c3a37f9ceda7": "1654132585",
-    "seo_refUrl": "http%3A//www.qianlima.com/zbgg/p2",
-    "15637008265fmgj83E8oBfKRmKCBRuUR83pCB8pCCfE": "3e048e44-2e33-4936-b2d1-00784cb48e60",
-    "__jsl_clearance": "1654146807.154|0|B%2FzLZ4GWjKEYGnqJezy56PLtb1A%3D",
-    "qlm_rem_login": "1",
-    "qlm_username": "17610673271",
-    "qlm_password": "fmgj83E8oBfKRmKCBRuUR83pCB8pCCfE",
-    "17610673271fmgj83E8oBfKRmKCBRuUR83pCB8pCCfE": "10188605-7361-475f-998c-61beb43adc56",
-    "useragent_hash": "32bd84bdee2cfe920dda80f92fa20070",
-    "xAuthToken": "4c21c1bc-ef21-44bb-b466-677a5804e3a0",
-    "login_time": "1654148916",
-    "userInfo": "{%22userId%22:10609848%2C%22username%22:%2217610673271%22%2C%22userIcon%22:%22%22%2C%22linkName%22:%22%E8%91%A3%E5%85%88%E7%94%9F%22%2C%22companyName%22:%22%E5%90%88%E8%82%A5%E6%8B%93%E6%99%AE%E7%BD%91%E7%BB%9C%E7%B3%BB%E7%BB%9F%E5%B7%A5%E7%A8%8B%E6%9C%89%E9%99%90%E8%B4%A3%E4%BB%BB%E5%85%AC%E5%8F%B8%22%2C%22areaId%22:%222703%22%2C%22areaName%22:%22%E5%85%A8%E5%9B%BD%22%2C%22roleId%22:1%2C%22roleName%22:%22%E7%AE%A1%E7%90%86%E5%91%98%22%2C%22sex%22:%22m%22%2C%22expireDate%22:%22%E6%97%A0%22%2C%22isExpired%22:null%2C%22maxChildCount%22:0%2C%22isUsedCount%22:0%2C%22userStatus%22:1%2C%22memberLevel%22:5%2C%22memberLevelName%22:%22%E5%85%8D%E8%B4%B9%E6%B3%A8%E5%86%8C%E4%BC%9A%E5%91%98%22%2C%22registerTime%22:%222022-05-30%22%2C%22isSuperSupplier%22:0%2C%22isNewUser%22:1%2C%22welcomeMsg%22:%22%E6%AC%A2%E8%BF%8E%E8%BF%9B%E5%85%A5%E5%8D%83%E9%87%8C%E9%A9%AC%E6%8B%9B%E6%A0%87%E7%BD%91%EF%BD%9E%22%2C%22customerServiceInfo%22:{%22id%22:42%2C%22customerServiceName%22:%22%E5%8D%83%E9%87%8C%E9%A9%AC%E5%AE%A2%E6%9C%8D%22%2C%22weChatIcon%22:%22http://img_al.qianlima.com/invoice/1588986761_8ebeade70a.jpg%22%2C%22customerServicePhone%22:%2217718573953%22%2C%22customerServiceQQ%22:%22%22%2C%22customerServiceEmail%22:%22qianlima_service@qianlima.com%22%2C%22deptType%22:0}%2C%22shouji%22:%2217610673271%22%2C%22email%22:%22%22%2C%22dwmc%22:%22%E5%90%88%E8%82%A5%E6%8B%93%E6%99%AE%E7%BD%91%E7%BB%9C%E7%B3%BB%E7%BB%9F%E5%B7%A5%E7%A8%8B%E6%9C%89%E9%99%90%E8%B4%A3%E4%BB%BB%E5%85%AC%E5%8F%B8%22%2C%22zhiwu%22:%22%E4%BA%A7%E5%93%81%E7%BB%8F%E7%90%86%22%2C%22types%22:1%2C%22isPayBefore%22:0%2C%22memberOpenTime%22:null%2C%22businessUserType%22:null%2C%22businessCompanyName%22:null%2C%22isBusinessUser%22:null}"
-}
-session = requests.session()
-INFO_TYPES_MAPS = {
-    0: {"allType": -1, "types": "-1", "hasTenderTransferProject": 1, "searchDataType": 0},  # 全部
-    1: {"allType": 0, "progIdList": [0, 1], "types": -1, "searchDataType": 1},  # 招标信息
-    2: {"allType": 3, "progIdList": [3], "types": 3, "searchDataType": 1},  # 中标信息
-    3: {"allType": 99, "progIdList": [99], "types": 99, "searchDataType": 1}  # 采购意向
-}
-
-
-def delay_by_day(days, fmt="%Y-%m-%d"):
-    """按天延时"""
-    _days = int(days)
-    _current_now = datetime.datetime.now()
-    return (_current_now + datetime.timedelta(days=_days)).strftime(fmt)
-
-
-def crawl_request(url, data, retries=5):
-    global session, cookies
-    resp = None
-    usages, usages_521 = 0, 1
-    while usages < retries:
-        request_params = {}
-        request_params.setdefault('data', data)
-        request_params.setdefault('headers', headers)
-        request_params.setdefault('cookies', cookies)
-        request_params.setdefault('timeout', 60)
-        try:
-            resp = session.post(url, **request_params)
-            if resp.status_code == 521:
-                while usages_521 < retries:
-                    success, _, cookies = http_session_521(session, url, headers, cookies, data=data)
-                    if success:
-                        break
-                    logger.warning(f"反爬破解失败,次数:{usages_521}")
-                    time.sleep(1)
-                    usages_521 += 1
-                usages += 1
-            elif resp.status_code in [401, 403, 404]:
-                logger.error(f"账号登录已失效或封停,异常状态码:{resp.status_code}")
-                break
-            else:
-                break
-        except requests.RequestException as e:
-            logger.error(f"访问失败,失败原因:{e.__class__.__name__}")
-            usages += 1
-    # print(resp)
-    return resp
-
-
-def crawl_spider(area: str, types: int, page: int, **kwargs):
-    results = []
-    request_status = 'failure'  # 资源请求结果, 成功=success 失败=failure 停止=stop 账号封停=disable
-
-    curr_date = delay_by_day(0)
-    begin_time = kwargs.pop('begin_time', curr_date)
-    end_time = kwargs.pop('end_time', curr_date)
-    max_per_page = kwargs.pop('max_page', 20)
-    types_map = INFO_TYPES_MAPS[types]
-    data = {
-        **types_map,
-        "keywords": "",
-        "timeType": 4,  # 自定义时间参数
-        "beginTime": begin_time,  # 格式: xxxx-xx-xxx
-        "endTime": end_time,  # 格式: xxxx-xx-xxx
-        "filtermode": 8,
-        "searchMode": 0,
-        "currentPage": page,  # 页码
-        "numPerPage": max_per_page,  # 每页的条目数
-        "sortType": 1,
-        "noticeSegmentTypeStr": "",
-        "beginAmount": "",
-        "endAmount": "",
-        "purchasingUnitIdList": "",
-        "threeClassifyTagStr": "",
-        "fourLevelCategoryIdListStr": "",
-        "threeLevelCategoryIdListStr": "",
-        "levelId": "",
-        "showContent": 1,
-        "newAreas": area,  # 设置地区
-        "hasChooseSortType": 1,
-        "summaryType": 0
-    }
-    data = json.dumps(data)
-    url = "http://search.vip.qianlima.com/rest/service/website/search/solr"
-    response = crawl_request(url, data)
-    if response is not None and response.status_code == 200:
-        resp_json = response.json()
-        if resp_json['code'] == 200:
-            # print(resp_json["data"]["rowCount"])
-            items = resp_json["data"]["data"]
-            for item in items:
-                cid = sha1(str(item["contentid"]))
-                if not r.hexists(redis_key, cid):
-                    r.hset(redis_key, cid, '')
-                    if "popTitle" in item:
-                        item["title"] = item["popTitle"]
-                    else:
-                        item["title"] = item["showTitle"]
-
-                    addr = str(item["areaName"]).split('-')
-                    _area = addr[0] if len(addr) > 0 else ''
-                    _city = addr[1] if len(addr) > 1 else ''
-                    channel = (item['noticeSegmentTypeName'] or item['progName'])
-                    res = {
-                        'site': '千里马',
-                        'channel': channel,
-                        'area': _area,
-                        'city': _city,
-                        'title': item["title"],
-                        'publishtime': item['updateTime'],
-                        'href': item.get('url', '')
-                    }
-                    results.append(res)
-            request_status = 'success'
-
-            if len(items) < max_per_page:
-                request_status = 'stop'
-        else:
-            '''
-            {
-                "code": 200520,
-                "msg": "抱歉,您在单位时间内的搜索次数已达上限,请联系客服购买会员!咨询电话:400-688-2000",
-                "data": null
-            }
-            '''
-            logger.info(resp_json['msg'])
-    elif response is not None and response.status_code in [401, 403, 404]:
-        request_status = 'disable'
-
-    if len(results) > 0:
-        qlm.insert_many(results)
-
-    if request_status in ['stop', 'success']:
-        logger.info("第{}区-第{}类-第{}页成功上传{}条数据".format(area, types, page, len(results)))
-    return request_status
-
-
-def by_area_crawl_data(area="", types=0, **kwargs):
-    close_spider = False
-    disable_page, max_disable_page = 0, 3
-    pages = list(range(1, 11))  # 目前仅支持前10000数据的搜索
-    while len(pages) > 0:
-        if close_spider:
-            break
-        elif disable_page > max_disable_page:
-            # 此处可以添加通知邮件或者企业微信机器人接口,通知采集异常信息
-            break
-
-        page = pages.pop(0)
-        logger.info(f"访问第{area}区-第{types}类-第{page}页数据")
-        while True:
-            success = crawl_spider(area, types, page, **kwargs)
-            if success == 'failure':
-                n = random.randint(100, 2400)
-                interval = math.log(n, 2)
-                logger.debug(f'等待{interval}s')
-                time.sleep(interval)
-                continue
-            elif success == 'disable':
-                logger.warning(f"账号被禁止访问第{area}区-第{page}页数据")
-                disable_page += 1
-                break
-            elif success == 'stop':
-                close_spider = True
-            logger.info(f"第{area}区-第{page}页数据采集成功")
-            break
-
-
-def select_types(date: str, area: str):
-    for types in [1, 2, 3]:
-        by_area_crawl_data(
-            area=area,
-            types=types,
-            begin_time=date,
-            end_time=date,
-            max_page=1000
-        )
-        logger.info(f"第{area}区-第{types}类采集任务结束")
-
-
-def select_area(date: str):
-    for area in range(1, 32):
-        select_types(date, str(area))
-    logger.info("任务结束")
-
-
-def history(date_lst: list):
-    for date in date_lst:
-        select_area(date)
-
-
-def start():
-    date_str = delay_by_day(-1)
-    # print(date_str)
-    select_area(date_str)
+from source_qianlima import history
 
 
 if __name__ == '__main__':