|
@@ -1,6 +1,5 @@
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
-import datetime
|
|
|
import json
|
|
|
import math
|
|
|
import random
|
|
@@ -19,8 +18,7 @@ from utils.config_parms import (
|
|
|
from utils.databases import mongo_table, redis_client
|
|
|
from utils.log import logger
|
|
|
from utils.sessions_521 import http_session_521
|
|
|
-from utils.tools import sha1
|
|
|
-
|
|
|
+from utils.tools import sha1, get_today_of_day
|
|
|
|
|
|
'''
|
|
|
https://search.vip.qianlima.com/index.html#?sortType=6&isSearchWord=1&tab_index=0
|
|
@@ -35,16 +33,8 @@ qlm = mongo_table('qlm', 'data_merge')
|
|
|
r = redis_client()
|
|
|
redis_key = 'qianlima_2024'
|
|
|
|
|
|
-proxies = {
|
|
|
- 'http': 'socks5://119.3.159.234:8860',
|
|
|
- 'https': 'socks5://119.3.159.234:8860',
|
|
|
-}
|
|
|
session = requests.session()
|
|
|
|
|
|
-account_id = 1 # 账号标识
|
|
|
-captcha_appear_times = 0 # 图形验证出现次数
|
|
|
-stop_use_account = False
|
|
|
-
|
|
|
|
|
|
class AccountViolationRiskError(Exception):
|
|
|
pass
|
|
@@ -66,47 +56,50 @@ def send_wechat_warning(msg, send=True):
|
|
|
logger.info(response.json())
|
|
|
|
|
|
|
|
|
-def get_today_of_day(offset, fmt='%Y-%m-%d'):
|
|
|
- date = datetime.datetime.now() + datetime.timedelta(days=int(offset))
|
|
|
- return date.strftime(fmt)
|
|
|
+def get_account(area):
|
|
|
+ return next((p for p in account_pool if area in p['follow']), None)
|
|
|
+
|
|
|
+
|
|
|
+def disrupt_account_pool():
|
|
|
+ results = []
|
|
|
|
|
|
+ copy_account_pool = list(account_pool)
|
|
|
+ while copy_account_pool:
|
|
|
+ idx = random.randint(0, len(copy_account_pool) - 1)
|
|
|
+ results.append(copy_account_pool.pop(idx))
|
|
|
|
|
|
-def switch_account():
|
|
|
- global account_id, stop_use_account
|
|
|
+ return results
|
|
|
|
|
|
- logger.info(f'切换账号...{account_id}')
|
|
|
- if account_id < len(account_pool):
|
|
|
- account_id += 1 # 切换账号
|
|
|
- else:
|
|
|
- account_id = 1 # 重置账号
|
|
|
- stop_use_account = True
|
|
|
|
|
|
+def request(url, data, account, retries=5):
|
|
|
+ global session
|
|
|
|
|
|
-def request(url, data, retries=5):
|
|
|
- global session, proxies, account_id, stop_use_account
|
|
|
+ ip, _ = str(account['proxies']['http']).replace('socks5://', '').split(':')
|
|
|
+ phone = account['phone']
|
|
|
|
|
|
resp, msg = None, ''
|
|
|
usages, usages_521 = 0, 1
|
|
|
while usages < retries:
|
|
|
- (_, account), = account_pool[account_id].items()
|
|
|
request_params = {}
|
|
|
request_params.setdefault('data', data)
|
|
|
request_params.setdefault('headers', account['headers'])
|
|
|
request_params.setdefault('cookies', account['cookies'])
|
|
|
- request_params.setdefault('proxies', proxies)
|
|
|
+ request_params.setdefault('proxies', account['proxies'])
|
|
|
request_params.setdefault('timeout', 60)
|
|
|
try:
|
|
|
resp = session.post(url, **request_params)
|
|
|
if resp.status_code == 521:
|
|
|
while usages_521 < retries:
|
|
|
- success, _, cookies = http_session_521(session, url,
|
|
|
+ success, _, cookies = http_session_521(session,
|
|
|
+ url,
|
|
|
headers=account['headers'],
|
|
|
cookies=account['cookies'],
|
|
|
data=data,
|
|
|
- proxies=proxies)
|
|
|
+ proxies=account['proxies'])
|
|
|
if success:
|
|
|
break
|
|
|
- msg = f'反爬破解失败,次数:{usages_521}'
|
|
|
+
|
|
|
+ msg = f'账号[{phone}]反爬破解失败,次数:{usages_521}'
|
|
|
logger.warning(msg)
|
|
|
time.sleep(1)
|
|
|
usages_521 += 1
|
|
@@ -114,21 +107,18 @@ def request(url, data, retries=5):
|
|
|
usages += 1
|
|
|
|
|
|
elif resp.status_code == 429:
|
|
|
- if stop_use_account:
|
|
|
- msg = f'访问频繁,图形验证,异常状态码:{resp.status_code}'
|
|
|
- logger.error(msg)
|
|
|
- logger.warning(resp.content.decode())
|
|
|
- break
|
|
|
- else:
|
|
|
- switch_account()
|
|
|
+ msg = f'账号[{phone}]访问频繁,图形验证,异常状态码:{resp.status_code}'
|
|
|
+ logger.error(msg)
|
|
|
+ logger.warning(resp.content.decode())
|
|
|
+ break
|
|
|
|
|
|
elif resp.status_code in [401, 403, 404]:
|
|
|
- msg = f'账号登录已失效或封停,异常状态码:{resp.status_code}'
|
|
|
+ msg = f'账号[{phone}]登录已失效或封停,,异常状态码:{resp.status_code}'
|
|
|
logger.error(msg)
|
|
|
break
|
|
|
|
|
|
elif str(resp.status_code).startswith('4'):
|
|
|
- msg = f'公网IP被封禁,异常状态码:{resp.status_code}'
|
|
|
+ msg = f'公网代理IP[{ip}]被封禁,,异常状态码:{resp.status_code}'
|
|
|
logger.error(msg)
|
|
|
break
|
|
|
|
|
@@ -142,7 +132,7 @@ def request(url, data, retries=5):
|
|
|
return resp, msg
|
|
|
|
|
|
|
|
|
-def downloader(begin_date, end_date, category, address, page, page_size):
|
|
|
+def downloader(begin_date, end_date, category, address, page, page_size, account):
|
|
|
"""
|
|
|
|
|
|
:param str begin_date: 开始时间,格式:xxxx-xx-xxx
|
|
@@ -151,6 +141,7 @@ def downloader(begin_date, end_date, category, address, page, page_size):
|
|
|
:param int address: 地区编号
|
|
|
:param int page: 页码
|
|
|
:param int page_size: 单页数据条数
|
|
|
+ :param dict account: 采集账号
|
|
|
|
|
|
"""
|
|
|
url = 'https://search.vip.qianlima.com/rest/service/website/search/solr'
|
|
@@ -166,7 +157,7 @@ def downloader(begin_date, end_date, category, address, page, page_size):
|
|
|
# 请求资源响应自定义状态, 成功=success 失败=failure 停止=stop IP封停=disable等
|
|
|
request_status = 'failure'
|
|
|
|
|
|
- response, err = request(url, data)
|
|
|
+ response, err = request(url, data, account)
|
|
|
if response is None:
|
|
|
request_status = 'server_error'
|
|
|
return request_status, err
|
|
@@ -277,6 +268,7 @@ def automatic_pagination(**kwargs):
|
|
|
retry_times += 1
|
|
|
send_warning = True
|
|
|
elif err == 'stop':
|
|
|
+ time.sleep(math.log(random.randint(100, 2400), 2))
|
|
|
close_spider = True
|
|
|
else:
|
|
|
time.sleep(math.log(random.randint(100, 2400), 2))
|
|
@@ -286,14 +278,15 @@ def automatic_pagination(**kwargs):
|
|
|
raise AccountViolationRiskError
|
|
|
|
|
|
|
|
|
-def core(date: str, category: int, address: int, page_size=20):
|
|
|
+def core(date: str, category: int, address: int, account, page_size=20):
|
|
|
try:
|
|
|
automatic_pagination(
|
|
|
begin_date=date,
|
|
|
end_date=date,
|
|
|
category=category,
|
|
|
address=address,
|
|
|
- page_size=page_size # 每页数据最大条数
|
|
|
+ page_size=page_size, # 每页数据最大条数
|
|
|
+ account=account
|
|
|
)
|
|
|
return True
|
|
|
except AccountViolationRiskError:
|
|
@@ -307,6 +300,12 @@ def spider(date, page_size=40):
|
|
|
for date in dates:
|
|
|
for category, category_name in channel_dict.items():
|
|
|
for area, cities in area_dict.items():
|
|
|
+ account = get_account(area)
|
|
|
+ if not account:
|
|
|
+ # raise ValueError('采集账号不能为空!')
|
|
|
+ logger.warning('暂无可用采集账号与代理!')
|
|
|
+ continue
|
|
|
+
|
|
|
for city in cities:
|
|
|
logger.info(' && '.join([
|
|
|
date,
|
|
@@ -318,7 +317,7 @@ def spider(date, page_size=40):
|
|
|
if len(cities) == 1:
|
|
|
city = area # 千里马取消了直辖市的分区,直接采集省市区域
|
|
|
|
|
|
- yield core(date, category, city, page_size=page_size)
|
|
|
+ yield core(date, category, city, account, page_size=page_size)
|
|
|
|
|
|
except Exception as e:
|
|
|
logger.error(e)
|