@@ -5,11 +5,12 @@ from datetime import date, timedelta
 
 import requests
 
+from crawler.account import release_account, get_account, account_record
 from crawler.login import User
 from utils.databases import mongo_table, int2long, object_id
 from utils.execptions import YbwCrawlError
 from utils.log import logger
-from utils.tools import get_host_ip
+from utils.tools import get_host_ip, wait
 
 
 class Scheduler:
@@ -32,8 +33,6 @@ class Scheduler:
         self.spider_code = None
         self.crawl_url = None
 
-        self._headers = {"Authorization": "Basic amlhbnl1MDAxOjEyM3F3ZSFB"}
-
         # self.account_tab = mongo_table('py_spider', 'match_account')
         self.record_tab = mongo_table('py_spider', 'match_account_record')
         self.error_tab = mongo_table('py_spider', 'crawl_error')
@@ -49,54 +48,17 @@ class Scheduler:
             release=dict(crawl_detail=False),
         )
         self._schedule = {'list': list_attr, 'detail': detail_attr}
-        self.account = self.get_account()
+        self.account = get_account(self.site, self.crawl_type)
 
     def _init(self):
         self.account_id = self.account['_id']
+        account_record(self.account_id, self.crawl_type)  # 保存使用账号,用于容器自启动归还账号
         self.user = User(self.account['account'], self.account['password'])
         logger.info(f'[启用账号]{self.user.phone}')
         history = self.account_history_crawl_record()
         self.count = history['count']  # 访问条数
         self.total = history['total']  # 每日限量
 
-    def get_account(self):
-        url = "http://cc.spdata.jianyu360.com/competing_goods/account/fetch"
-        params = {
-            "site": self.site,
-            "crawl_type": self.crawl_type
-        }
-
-        try:
-            response = requests.get(url,
-                                    headers=self._headers,
-                                    params=params,
-                                    timeout=10)
-            data = response.json()['data']
-        except requests.RequestException:
-            # 网络不通信时,无法获取账号
-            data = None
-        return data
-
-    def _release_account(self):
-        url = "http://cc.spdata.jianyu360.com/competing_goods/account/release"
-        if self.account_id is not None:
-            params = {
-                "uid": self.account_id,
-                "crawl_type": self.crawl_type
-            }
-            while True:
-                try:
-                    response = requests.get(url,
-                                            headers=self._headers,
-                                            params=params,
-                                            timeout=10)
-                    if response.status_code == 200:
-                        logger.debug(f"_release_account >>> {response.json()}")
-                        break
-                except requests.RequestException:
-                    logger.error("网络异常,归还账号失败")
-                self._wait_schedule(1)
-
     def crawl_counter(self, number: int):
         """采集计数器"""
         records = self.record_tab.find_one({'_id': self.record_id})
@@ -117,15 +79,13 @@ class Scheduler:
         }
         self.error_tab.insert_one(rows)
 
-
     def _update_tab(self, collection, mid, **update):
         update['update_time'] = self.current_time
         collection.update_one({'_id': mid}, {'$set': update})
 
-
     def change_account(self):
         """更换账号"""
-        self._release_account()
+        release_account(self.account_id, self.crawl_type)
         self._init()
 
     def account_history_crawl_record(self):
@@ -153,19 +113,15 @@ class Scheduler:
         self.record_id = item['_id']
         return item
 
-    def finished(self, execute_next_time=None):
+    def finished(self, interval=None):
         logger.info("任务结束")
-        self._release_account()
-        self._wait_schedule(execute_next_time)
-
-    def wait_for_next_task(self, interval=None):
-        interval = (interval or random.choice(range(5, 11)))
-        self._wait_schedule(interval)
+        release_account(self.account_id, self.crawl_type)
+        wait(interval)
 
     @staticmethod
-    def _wait_schedule(interval=None):
-        interval = (interval or 600)
-        time.sleep(interval)
+    def wait_for_next_task(interval=None):
+        interval = (interval or random.choice(range(5, 15)))
+        wait(interval)
 
     @property
     def crawl_task(self):
@@ -208,7 +164,7 @@ class Scheduler:
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         logger.info(f'[关闭调度]')
-        self._release_account()
+        release_account(self.account_id, self.crawl_type)
         self.crawl_start = False
 
         if exc_type is not None: