
元博网 - automatically release the account

dongzhaorui, 3 years ago
commit 8baaa519d6

+ 70 - 0
ybw/crawler/account.py

@@ -0,0 +1,70 @@
+import json
+from pathlib import Path
+
+import requests
+
+from utils.log import logger
+from utils.tools import wait
+
+ROOT_PATH = Path(__file__).parent.parent
+
+_headers = {"Authorization": "Basic amlhbnl1MDAxOjEyM3F3ZSFB"}
+JSON_ACCOUNT_RECORD = (ROOT_PATH / 'config/account_record.json').resolve()
+
+
+def account_record(uid, crawl_type):
+    with open(JSON_ACCOUNT_RECORD, 'w+', encoding='utf-8') as wp:
+        item = {
+            "uid": uid,
+            "crawl_type": crawl_type
+        }
+        wp.write(json.dumps(item, indent=4))
+
+
+def read_account():
+    try:
+        with open(JSON_ACCOUNT_RECORD, encoding='utf-8') as rp:
+            account: dict = json.load(rp)
+            return account
+    except (json.decoder.JSONDecodeError, FileNotFoundError):
+        pass
+
+
+def get_account(site, crawl_type):
+    url = "http://cc.spdata.jianyu360.com/competing_goods/account/fetch"
+    params = {
+        "site": site,
+        "crawl_type": crawl_type
+    }
+    try:
+        response = requests.get(url,
+                                headers=_headers,
+                                params=params,
+                                timeout=10)
+        data = response.json()['data']
+        logger.debug(f"get_account >>> {data}")
+    except requests.RequestException:
+        # network error, unable to fetch an account
+        data = None
+    return data
+
+
+def release_account(uid, crawl_type):
+    url = "http://cc.spdata.jianyu360.com/competing_goods/account/release"
+    if uid is not None:
+        params = {
+            "uid": uid,
+            "crawl_type": crawl_type
+        }
+        while True:
+            try:
+                response = requests.get(url,
+                                        headers=_headers,
+                                        params=params,
+                                        timeout=10)
+                if response.status_code == 200:
+                    logger.debug(f"release_account >>> {response.json()}")
+                    break
+            except requests.RequestException:
+                logger.error("network error, failed to release the account")
+            wait(1)  # back off before retrying; also avoids a tight loop on non-200 responses

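The new helper module wires three calls into one lifecycle: fetch an account from the competing_goods service, persist its uid to config/account_record.json, and return the account when crawling ends. A minimal usage sketch, not part of the commit: the site value "元博网" and crawl_type "detail" are assumptions, and the account dict is assumed to carry the _id, account and password fields that crawl_scheduler.py reads from it.

from crawler.account import account_record, get_account, release_account

account = get_account(site="元博网", crawl_type="detail")  # assumed argument values
if account is not None:
    # persist the uid so a restarted container can return the account (see release_account.py)
    account_record(account['_id'], "detail")
    try:
        ...  # log in with account['account'] / account['password'] and crawl
    finally:
        release_account(account['_id'], "detail")
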
+ 12 - 56
ybw/crawler/crawl_scheduler.py

@@ -5,11 +5,12 @@ from datetime import date, timedelta
 
 import requests
 
+from crawler.account import release_account, get_account, account_record
 from crawler.login import User
 from utils.databases import mongo_table, int2long, object_id
 from utils.execptions import YbwCrawlError
 from utils.log import logger
-from utils.tools import get_host_ip
+from utils.tools import get_host_ip, wait
 
 
 class Scheduler:
@@ -32,8 +33,6 @@ class Scheduler:
         self.spider_code = None
         self.crawl_url = None
 
-        self._headers = {"Authorization": "Basic amlhbnl1MDAxOjEyM3F3ZSFB"}
-
         # self.account_tab = mongo_table('py_spider', 'match_account')
         self.record_tab = mongo_table('py_spider', 'match_account_record')
         self.error_tab = mongo_table('py_spider', 'crawl_error')
@@ -49,54 +48,17 @@ class Scheduler:
             release=dict(crawl_detail=False),
         )
         self._schedule = {'list': list_attr, 'detail': detail_attr}
-        self.account = self.get_account()
+        self.account = get_account(self.site, self.crawl_type)
 
     def _init(self):
         self.account_id = self.account['_id']
+        account_record(self.account_id, self.crawl_type)  # record the account in use so a restarted container can release it
         self.user = User(self.account['account'], self.account['password'])
         logger.info(f'[using account]{self.user.phone}')
         history = self.account_history_crawl_record()
         self.count = history['count']  # number of entries visited
         self.total = history['total']  # daily quota
 
-    def get_account(self):
-        url = "http://cc.spdata.jianyu360.com/competing_goods/account/fetch"
-        params = {
-            "site": self.site,
-            "crawl_type": self.crawl_type
-        }
-
-        try:
-            response = requests.get(url,
-                                    headers=self._headers,
-                                    params=params,
-                                    timeout=10)
-            data = response.json()['data']
-        except requests.RequestException:
-            # network error, unable to fetch an account
-            data = None
-        return data
-
-    def _release_account(self):
-        url = "http://cc.spdata.jianyu360.com/competing_goods/account/release"
-        if self.account_id is not None:
-            params = {
-                "uid": self.account_id,
-                "crawl_type": self.crawl_type
-            }
-            while True:
-                try:
-                    response = requests.get(url,
-                                            headers=self._headers,
-                                            params=params,
-                                            timeout=10)
-                    if response.status_code == 200:
-                        logger.debug(f"_release_account >>> {response.json()}")
-                        break
-                except requests.RequestException:
-                    logger.error("network error, failed to release the account")
-                    self._wait_schedule(1)
-
     def crawl_counter(self, number: int):
         """采集计数器"""
         records = self.record_tab.find_one({'_id': self.record_id})
@@ -117,15 +79,13 @@ class Scheduler:
         }
         self.error_tab.insert_one(rows)
 
-
     def _update_tab(self, collection, mid, **update):
         update['update_time'] = self.current_time
         collection.update_one({'_id': mid}, {'$set': update})
 
-
     def change_account(self):
         """更换账号"""
-        self._release_account()
+        release_account(self.account_id, self.crawl_type)
         self._init()
 
     def account_history_crawl_record(self):
@@ -153,19 +113,15 @@ class Scheduler:
         self.record_id = item['_id']
         return item
 
-    def finished(self, execute_next_time=None):
+    def finished(self, interval=None):
         logger.info("task finished")
-        self._release_account()
-        self._wait_schedule(execute_next_time)
-
-    def wait_for_next_task(self, interval=None):
-        interval = (interval or random.choice(range(5, 11)))
-        self._wait_schedule(interval)
+        release_account(self.account_id, self.crawl_type)
+        wait(interval)
 
     @staticmethod
-    def _wait_schedule(interval=None):
-        interval = (interval or 600)
-        time.sleep(interval)
+    def wait_for_next_task(interval=None):
+        interval = (interval or random.choice(range(5, 15)))
+        wait(interval)
 
     @property
     def crawl_task(self):
@@ -208,7 +164,7 @@ class Scheduler:
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         logger.info(f'[closing scheduler]')
-        self._release_account()
+        release_account(self.account_id, self.crawl_type)
         self.crawl_start = False
 
         if exc_type is not None:

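With the helpers factored out, releasing an account no longer goes through private Scheduler methods: change_account(), finished() and __exit__ all call crawler.account.release_account. A rough sketch of the intended flow, assuming Scheduler takes (site, crawl_type) style constructor arguments and that __enter__ returns the instance; only __exit__ appears in this diff, so both are assumptions.

from crawler.crawl_scheduler import Scheduler

with Scheduler(site="元博网", crawl_type="detail") as scheduler:  # assumed constructor
    ...  # crawl with scheduler.user; scheduler.crawl_counter(n) records progress
    scheduler.wait_for_next_task()  # now a staticmethod, sleeps 5-14 seconds
# leaving the block runs __exit__, which calls release_account(scheduler.account_id, scheduler.crawl_type)
# whether the block finished normally or raised
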
+ 11 - 0
ybw/release_account.py

@@ -0,0 +1,11 @@
+from crawler.account import read_account, release_account
+
+
+def _release_stale_account():
+    account = read_account()
+    if account is not None:
+        release_account(**account)
+
+
+if __name__ == '__main__':
+    _release_stale_account()

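This script is what the start.sh change below runs before the spider, so a restarted container returns whatever account the previous run left recorded. The record it reads is the file written by account_record(); a sample config/account_record.json looks roughly like this (the uid value is invented for illustration):

{
    "uid": "60a7c1e2f3b4a5d6e7f8091a",
    "crawl_type": "detail"
}
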
+ 1 - 0
ybw/start.sh

@@ -2,6 +2,7 @@
 
 # change to the project directory
 cd /mnt/ybw
+/usr/bin/python3 /mnt/ybw/release_account.py  # release any account left over from the previous run
 /usr/bin/python3 /mnt/ybw/detail_spider.py
 # keep the terminal alive to prevent the container from exiting automatically
 /usr/sbin/init

+ 8 - 1
ybw/utils/tools.py

@@ -1,5 +1,7 @@
 import socket
 import hashlib
+import time
+
 
 def get_host_ip():
     s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
@@ -20,4 +22,9 @@ def sha1(text: str):
     """
     _sha1 = hashlib.sha1()
     _sha1.update(text.encode("utf-8"))
-    return _sha1.hexdigest()
+    return _sha1.hexdigest()
+
+
+def wait(interval=None):
+    _interval = (interval or 600)  # default wait is 600 seconds (10 minutes)
+    time.sleep(_interval)