|
@@ -8,21 +8,19 @@ import time
|
|
|
|
|
|
import requests
|
|
import requests
|
|
|
|
|
|
-from utils.config_parms import *
|
|
|
|
|
|
+from utils.config_parms import (
|
|
|
|
+ account_pool,
|
|
|
|
+ area_dict,
|
|
|
|
+ city_dict,
|
|
|
|
+ province_dict,
|
|
|
|
+ channel_dict,
|
|
|
|
+ REQUEST_DATA_MAP
|
|
|
|
+)
|
|
from utils.databases import mongo_table, redis_client
|
|
from utils.databases import mongo_table, redis_client
|
|
from utils.log import logger
|
|
from utils.log import logger
|
|
from utils.sessions_521 import http_session_521
|
|
from utils.sessions_521 import http_session_521
|
|
from utils.tools import sha1
|
|
from utils.tools import sha1
|
|
|
|
|
|
-qlm = mongo_table('qlm', 'data_merge')
|
|
|
|
-r = redis_client()
|
|
|
|
-redis_key = 'qianlima_2024'
|
|
|
|
-
|
|
|
|
-session = requests.session()
|
|
|
|
-proxies = {
|
|
|
|
- 'http': 'socks5://119.3.159.234:8860',
|
|
|
|
- 'https': 'socks5://119.3.159.234:8860',
|
|
|
|
-}
|
|
|
|
|
|
|
|
'''
|
|
'''
|
|
https://search.vip.qianlima.com/index.html#?sortType=6&isSearchWord=1&tab_index=0
|
|
https://search.vip.qianlima.com/index.html#?sortType=6&isSearchWord=1&tab_index=0
|
|
@@ -33,6 +31,20 @@ https://search.vip.qianlima.com/index.html#?sortType=6&isSearchWord=1&tab_index=
|
|
4 = 审批项目
|
|
4 = 审批项目
|
|
'''
|
|
'''
|
|
|
|
|
|
|
|
# --- Shared crawler state -------------------------------------------------
# Handle returned by mongo_table('qlm', 'data_merge').
qlm = mongo_table('qlm', 'data_merge')
# Redis client handle and the key this crawler uses with it.
r = redis_client()
redis_key = 'qianlima_2024'

# Every outbound request is routed through the same SOCKS5 proxy for both
# schemes.
proxies = dict.fromkeys(('http', 'https'), 'socks5://119.3.159.234:8860')
# One HTTP session reused by all requests.
session = requests.session()

# Account identifier: selects the entry of account_pool currently in use.
account_id = 1
# Number of times the image captcha has appeared.
captcha_appear_times = 0
# Set to True by switch_account() once the pool has wrapped around.
stop_use_account = False
|
|
|
|
+
|
|
|
|
|
|
class AccountViolationRiskError(Exception):
    """Crawler-specific exception type with no extra state or behavior.

    NOTE(review): judging by the name it presumably signals that an account
    is at risk of being flagged for violations -- confirm at the raise sites.
    """
|
|
@@ -59,48 +71,74 @@ def get_today_of_day(offset, fmt='%Y-%m-%d'):
|
|
return date.strftime(fmt)
|
|
return date.strftime(fmt)
|
|
|
|
|
|
|
|
|
|
|
|
def switch_account():
    """Select the next account in ``account_pool``.

    Logs the account id that is being left, then either advances the
    module-level ``account_id`` by one or, when it has already reached
    ``len(account_pool)``, resets it to 1 and sets ``stop_use_account`` to
    True so callers can tell the pool has wrapped around.
    """
    global account_id, stop_use_account

    # The log line reports the id *before* the switch happens.
    logger.info(f'切换账号...{account_id}')

    pool_exhausted = account_id >= len(account_pool)
    if pool_exhausted:
        # Wrap around to the first account and flag that the pool is used up.
        account_id = 1
        stop_use_account = True
        return

    account_id += 1
|
|
|
|
+
|
|
|
|
+
|
|
def request(url, data, retries=5):
    """POST ``data`` to ``url`` using the currently selected pooled account.

    The account (headers and cookies) is looked up from ``account_pool`` via
    the module-level ``account_id`` on every attempt, so a switch made by
    ``switch_account()`` takes effect on the next pass of the loop.

    Status handling:
      * 521 -- run ``http_session_521`` up to ``retries - 1`` times in total
        across the whole call (``usages_521`` is not reset between outer
        attempts); on success the refreshed cookies are stored on the
        account, then the request is retried.
      * 429 -- switch to the next account and retry, unless
        ``stop_use_account`` is already set, in which case give up.
      * 401 / 403 / 404 and any other 4xx -- log and give up.
      * anything else -- return the response as is.
    ``requests.RequestException`` counts as a failed attempt and is retried.

    :param url: target URL.
    :param data: request body passed to ``session.post(data=...)``.
    :param retries: upper bound on counted attempts.
    :return: ``(resp, msg)`` -- the last response received (``None`` if no
        attempt produced one) and the last error message (``''`` if none).
    """
    resp, msg = None, ''
    usages, usages_521 = 0, 1
    while usages < retries:
        # Each account_pool entry is expected to be a one-item mapping; the
        # unpacking below raises if it is not.
        (_, account), = account_pool[account_id].items()
        request_params = {
            'data': data,
            'headers': account['headers'],
            'cookies': account['cookies'],
            'proxies': proxies,
            'timeout': 60,
        }
        try:
            resp = session.post(url, **request_params)
            status = resp.status_code
            if status == 521:
                while usages_521 < retries:
                    success, _, cookies = http_session_521(
                        session, url,
                        headers=account['headers'],
                        cookies=account['cookies'],
                        data=data,
                        proxies=proxies,
                    )
                    if success:
                        # Keep the refreshed cookies: the retry sends
                        # account['cookies'] explicitly, so dropping them
                        # would re-send the stale set.
                        # NOTE(review): assumes the helper returns the full
                        # cookie set, not only the anti-bot delta -- confirm.
                        if cookies:
                            account['cookies'] = cookies
                        break
                    msg = f'反爬破解失败,次数:{usages_521}'
                    logger.warning(msg)
                    time.sleep(1)
                    usages_521 += 1

                usages += 1

            elif status == 429:
                if stop_use_account:
                    msg = f'访问频繁,图形验证,异常状态码:{resp.status_code}'
                    logger.error(msg)
                    logger.warning(resp.content.decode())
                    break
                # Not counted as an attempt; bounded because switch_account()
                # sets stop_use_account once the pool has wrapped around.
                switch_account()

            elif status in [401, 403, 404]:
                msg = f'账号登录已失效或封停,异常状态码:{resp.status_code}'
                logger.error(msg)
                break

            elif str(status).startswith('4'):
                msg = f'公网IP被封禁,异常状态码:{resp.status_code}'
                logger.error(msg)
                break

            else:
                break
        except requests.RequestException as e:
            msg = f'访问失败,原因:{e.__class__.__name__}'
            logger.error(msg)
            usages += 1

    return resp, msg
|
|
|
|
|
|
|
|
|
|
@@ -125,8 +163,10 @@ def downloader(begin_date, end_date, category, address, page, page_size):
|
|
data['numPerPage'] = page_size
|
|
data['numPerPage'] = page_size
|
|
data = json.dumps(data)
|
|
data = json.dumps(data)
|
|
|
|
|
|
|
|
+ # 请求资源响应自定义状态, 成功=success 失败=failure 停止=stop IP封停=disable等
|
|
|
|
+ request_status = 'failure'
|
|
|
|
+
|
|
response, err = request(url, data)
|
|
response, err = request(url, data)
|
|
- request_status = 'failure' # 资源请求结果状态, 成功=success 失败=failure 停止=stop 封停=disable
|
|
|
|
if response is None:
|
|
if response is None:
|
|
request_status = 'server_error'
|
|
request_status = 'server_error'
|
|
return request_status, err
|
|
return request_status, err
|
|
@@ -192,7 +232,7 @@ def downloader(begin_date, end_date, category, address, page, page_size):
|
|
|
|
|
|
if request_status in ['stop', 'success']:
|
|
if request_status in ['stop', 'success']:
|
|
if page == 1:
|
|
if page == 1:
|
|
- logger.info(f'千里马 {begin_date} 发布 {row_count} 条数据')
|
|
|
|
|
|
+ logger.info(f'千里马 {begin_date} 网站发布 {row_count} 条数据')
|
|
logger.info(f'入库 {len(results)} 条')
|
|
logger.info(f'入库 {len(results)} 条')
|
|
|
|
|
|
return request_status, err
|
|
return request_status, err
|