Browse Source

添加账号池

dongzhaorui 1 year ago
parent
commit
e13aad642a
3 changed files with 50 additions and 70 deletions
  1. 42 43
      qlm/source_qianlima.py
  2. 0 27
      qlm/utils/config_parms.py
  3. 8 0
      qlm/utils/tools.py

+ 42 - 43
qlm/source_qianlima.py

@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
 
-import datetime
 import json
 import math
 import random
@@ -19,8 +18,7 @@ from utils.config_parms import (
 from utils.databases import mongo_table, redis_client
 from utils.log import logger
 from utils.sessions_521 import http_session_521
-from utils.tools import sha1
-
+from utils.tools import sha1, get_today_of_day
 
 '''
 https://search.vip.qianlima.com/index.html#?sortType=6&isSearchWord=1&tab_index=0
@@ -35,16 +33,8 @@ qlm = mongo_table('qlm', 'data_merge')
 r = redis_client()
 redis_key = 'qianlima_2024'
 
-proxies = {
-    'http': 'socks5://119.3.159.234:8860',
-    'https': 'socks5://119.3.159.234:8860',
-}
 session = requests.session()
 
-account_id = 1  # 账号标识
-captcha_appear_times = 0  # 图形验证出现次数
-stop_use_account = False
-
 
 class AccountViolationRiskError(Exception):
     pass
@@ -66,47 +56,50 @@ def send_wechat_warning(msg, send=True):
     logger.info(response.json())
 
 
-def get_today_of_day(offset, fmt='%Y-%m-%d'):
-    date = datetime.datetime.now() + datetime.timedelta(days=int(offset))
-    return date.strftime(fmt)
+def get_account(area):
+    return next((p for p in account_pool if area in p['follow']), None)
+
+
+def disrupt_account_pool():
+    results = []
 
+    copy_account_pool = list(account_pool)
+    while copy_account_pool:
+        idx = random.randint(0, len(copy_account_pool) - 1)
+        results.append(copy_account_pool.pop(idx))
 
-def switch_account():
-    global account_id, stop_use_account
+    return results
 
-    logger.info(f'切换账号...{account_id}')
-    if account_id < len(account_pool):
-        account_id += 1  # 切换账号
-    else:
-        account_id = 1  # 重置账号
-        stop_use_account = True
 
+def request(url, data, account, retries=5):
+    global session
 
-def request(url, data, retries=5):
-    global session, proxies, account_id, stop_use_account
+    ip, _ = str(account['proxies']['http']).replace('socks5://', '').split(':')
+    phone = account['phone']
 
     resp, msg = None, ''
     usages, usages_521 = 0, 1
     while usages < retries:
-        (_, account), = account_pool[account_id].items()
         request_params = {}
         request_params.setdefault('data', data)
         request_params.setdefault('headers', account['headers'])
         request_params.setdefault('cookies', account['cookies'])
-        request_params.setdefault('proxies', proxies)
+        request_params.setdefault('proxies', account['proxies'])
         request_params.setdefault('timeout', 60)
         try:
             resp = session.post(url, **request_params)
             if resp.status_code == 521:
                 while usages_521 < retries:
-                    success, _, cookies = http_session_521(session, url,
+                    success, _, cookies = http_session_521(session,
+                                                           url,
                                                            headers=account['headers'],
                                                            cookies=account['cookies'],
                                                            data=data,
-                                                           proxies=proxies)
+                                                           proxies=account['proxies'])
                     if success:
                         break
-                    msg = f'反爬破解失败,次数:{usages_521}'
+
+                    msg = f'账号[{phone}]反爬破解失败,次数:{usages_521}'
                     logger.warning(msg)
                     time.sleep(1)
                     usages_521 += 1
@@ -114,21 +107,18 @@ def request(url, data, retries=5):
                 usages += 1
 
             elif resp.status_code == 429:
-                if stop_use_account:
-                    msg = f'访问频繁,图形验证,异常状态码:{resp.status_code}'
-                    logger.error(msg)
-                    logger.warning(resp.content.decode())
-                    break
-                else:
-                    switch_account()
+                msg = f'账号[{phone}]访问频繁,图形验证,异常状态码:{resp.status_code}'
+                logger.error(msg)
+                logger.warning(resp.content.decode())
+                break
 
             elif resp.status_code in [401, 403, 404]:
-                msg = f'账号登录已失效或封停,异常状态码:{resp.status_code}'
+                msg = f'账号[{phone}]登录已失效或封停,,异常状态码:{resp.status_code}'
                 logger.error(msg)
                 break
 
             elif str(resp.status_code).startswith('4'):
-                msg = f'公网IP被封禁,异常状态码:{resp.status_code}'
+                msg = f'公网代理IP[{ip}]被封禁,,异常状态码:{resp.status_code}'
                 logger.error(msg)
                 break
 
@@ -142,7 +132,7 @@ def request(url, data, retries=5):
     return resp, msg
 
 
-def downloader(begin_date, end_date, category, address, page, page_size):
+def downloader(begin_date, end_date, category, address, page, page_size, account):
     """
 
     :param str begin_date: 开始时间,格式:xxxx-xx-xxx
@@ -151,6 +141,7 @@ def downloader(begin_date, end_date, category, address, page, page_size):
     :param int address: 地区编号
     :param int page: 页码
     :param int page_size: 单页数据条数
+    :param dict account: 采集账号
 
     """
     url = 'https://search.vip.qianlima.com/rest/service/website/search/solr'
@@ -166,7 +157,7 @@ def downloader(begin_date, end_date, category, address, page, page_size):
     # 请求资源响应自定义状态, 成功=success 失败=failure 停止=stop IP封停=disable等
     request_status = 'failure'
 
-    response, err = request(url, data)
+    response, err = request(url, data, account)
     if response is None:
         request_status = 'server_error'
         return request_status, err
@@ -277,6 +268,7 @@ def automatic_pagination(**kwargs):
                 retry_times += 1
                 send_warning = True
             elif err == 'stop':
+                time.sleep(math.log(random.randint(100, 2400), 2))
                 close_spider = True
             else:
                 time.sleep(math.log(random.randint(100, 2400), 2))
@@ -286,14 +278,15 @@ def automatic_pagination(**kwargs):
         raise AccountViolationRiskError
 
 
-def core(date: str, category: int, address: int, page_size=20):
+def core(date: str, category: int, address: int, account, page_size=20):
     try:
         automatic_pagination(
             begin_date=date,
             end_date=date,
             category=category,
             address=address,
-            page_size=page_size  # 每页数据最大条数
+            page_size=page_size,  # 每页数据最大条数
+            account=account
         )
         return True
     except AccountViolationRiskError:
@@ -307,6 +300,12 @@ def spider(date, page_size=40):
         for date in dates:
             for category, category_name in channel_dict.items():
                 for area, cities in area_dict.items():
+                    account = get_account(area)
+                    if not account:
+                        # raise ValueError('采集账号不能为空!')
+                        logger.warning('暂无可用采集账号与代理!')
+                        continue
+
                     for city in cities:
                         logger.info(' && '.join([
                             date,
@@ -318,7 +317,7 @@ def spider(date, page_size=40):
                         if len(cities) == 1:
                             city = area  # 千里马取消了直辖市的分区,直接采集省市区域
 
-                        yield core(date, category, city, page_size=page_size)
+                        yield core(date, category, city, account, page_size=page_size)
 
     except Exception as e:
         logger.error(e)

File diff suppressed because it is too large
+ 0 - 27
qlm/utils/config_parms.py


+ 8 - 0
qlm/utils/tools.py

@@ -1,3 +1,6 @@
+# -*- coding: utf-8 -*-
+
+import datetime
 import hashlib
 import socket
 
@@ -22,3 +25,8 @@ def get_host_ip():
     finally:
         s.close()
     return ip
+
+
+def get_today_of_day(offset, fmt='%Y-%m-%d'):
+    date = datetime.datetime.now() + datetime.timedelta(days=int(offset))
+    return date.strftime(fmt)

Some files were not shown because too many files changed in this diff