|
@@ -7,18 +7,18 @@ from datetime import date, timedelta
|
|
|
import requests
|
|
|
|
|
|
from crawler.login import User
|
|
|
-from utils.databases import MongoDBS
|
|
|
+from utils.databases import mongo_table, int2long, object_id
|
|
|
from utils.execptions import JyBasicException
|
|
|
from utils.log import logger
|
|
|
-from utils.tools import int2long, object_id
|
|
|
+from utils.tools import get_host_ip
|
|
|
|
|
|
|
|
|
class Scheduler:
|
|
|
|
|
|
def __init__(self, query: dict):
|
|
|
self.query = query
|
|
|
- self.crawl_account_tab = MongoDBS('py_spider', 'match_account').coll
|
|
|
- self.crawl_error_tab = MongoDBS('py_spider', 'crawl_error').coll
|
|
|
+ self.account_tab = mongo_table('py_spider', 'match_account')
|
|
|
+ self.crawl_error_tab = mongo_table('py_spider', 'crawl_error')
|
|
|
self.crawl_start = False
|
|
|
self.account_id = None
|
|
|
self.user = None
|
|
@@ -29,64 +29,26 @@ class Scheduler:
|
|
|
self.crawl_type = None
|
|
|
self.__records = None
|
|
|
|
|
|
def _update_data(self, item):
    """
    Persist the latest crawl bookkeeping for the account currently held.

    The dict is stamped in place with this host's IP and the current local
    time, then written with ``$set`` to the document whose ``_id`` equals
    ``self.account_id``.

    :param item: fields to store; mutated in place (``ip`` and
        ``update_time`` are added or overwritten)
    """
    item['ip'] = get_host_ip()
    stamp = datetime.datetime.today()
    item['update_time'] = stamp.strftime('%Y-%m-%d %H:%M:%S')
    selector = {'_id': self.account_id}
    self.account_tab.update_one(selector, {'$set': item})
|
|
|
+
|
|
|
def _release_account(self):
|
|
|
rows = dict(
|
|
|
used=False,
|
|
|
update_time=datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')
|
|
|
)
|
|
|
if self.account_id is not None:
|
|
|
- self.crawl_account_tab.update_one(
|
|
|
- {'_id': self.account_id},
|
|
|
- {'$set': rows}
|
|
|
- )
|
|
|
-
|
|
|
- def _set_account(self, item: dict):
|
|
|
- self.account_id = item['_id']
|
|
|
- self.user = User(item['account'], item['password'])
|
|
|
- logger.info(f'[开启调度]启用账号: {self.user.username}')
|
|
|
- usage = int(item['usage'])
|
|
|
- item['usage'] = usage + 1
|
|
|
- use_time = datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')
|
|
|
- item['update_time'] = use_time
|
|
|
- item['used'] = True
|
|
|
- self.crawl_account_tab.update_one(
|
|
|
- {'_id': self.account_id},
|
|
|
- {'$set': item}
|
|
|
- )
|
|
|
-
|
|
|
- def _query_account(self, query: dict):
|
|
|
- return self.crawl_account_tab.find_one(query, sort=[('usage', 1)])
|
|
|
-
|
|
|
- def __enter__(self):
|
|
|
- rows = self._query_account(self.query)
|
|
|
- if rows is not None:
|
|
|
- self._set_account(rows)
|
|
|
- self.crawl_start = True # 控制调度的状态
|
|
|
- else:
|
|
|
- # TODO 没有空闲账号时,取出使用次数最少的账号,暂未实现
|
|
|
- logger.warning(f'请检查mongo表 {self.crawl_account_tab.name} 账号状态')
|
|
|
- return self
|
|
|
-
|
|
|
- def __exit__(self, exc_type, exc_val, exc_tb):
|
|
|
- logger.info(f'[关闭调度]')
|
|
|
- self._release_account()
|
|
|
- self.crawl_start = False
|
|
|
-
|
|
|
- if exc_type is not None:
|
|
|
- errmsg = traceback.extract_tb(exc_tb)
|
|
|
- e = JyBasicException(
|
|
|
- code=10500,
|
|
|
- reason=str(exc_type),
|
|
|
- title='未知系统错误'
|
|
|
- )
|
|
|
- self.err_record(e)
|
|
|
- logger.error(f'错误类型: {exc_type}, 错误内容: {exc_val}, 错误详情: {errmsg}')
|
|
|
- return True
|
|
|
-
|
|
|
- def finished(self, execute_next_time=None):
|
|
|
- logger.info("任务结束")
|
|
|
- self._release_account()
|
|
|
- self.sleep(execute_next_time)
|
|
|
+ self._update_data(rows)
|
|
|
|
|
|
@staticmethod
|
|
|
def sleep(wait_time=None):
|
|
@@ -106,19 +68,6 @@ class Scheduler:
|
|
|
def yesterday(self):
    """Return the previous local calendar day formatted as ``YYYY-MM-DD``."""
    previous_day = date.today() - timedelta(days=1)
    return previous_day.strftime("%Y-%m-%d")
|
|
|
|
|
|
- def err_record(self, e: JyBasicException):
|
|
|
- rows = {
|
|
|
- 'account': self.user.username if self.user is not None else '',
|
|
|
- 'spidercode': self.spider_code,
|
|
|
- 'url': self.crawl_url,
|
|
|
- 'status_code': e.code,
|
|
|
- 'reason': e.reason,
|
|
|
- 'params': getattr(e, 'title', ''),
|
|
|
- 'crawl_time': int2long(int(time.time())),
|
|
|
- 'crawl_type': self.crawl_type,
|
|
|
- }
|
|
|
- self.crawl_error_tab.insert_one(rows)
|
|
|
-
|
|
|
@property
|
|
|
def crawl_task(self):
|
|
|
results = {}
|
|
@@ -137,9 +86,73 @@ class Scheduler:
|
|
|
except requests.RequestException:
|
|
|
return results
|
|
|
|
|
|
def err_record(self, e: JyBasicException):
    """
    Insert one failure record into the crawl-error collection.

    :param e: the exception to record; its ``code`` and ``reason`` are
        stored, and ``title`` (when present) is stored under ``params``
    """
    # No user is attached when no account was acquired; store an empty name.
    account_name = '' if self.user is None else self.user.username
    record = dict(
        account=account_name,
        spidercode=self.spider_code,
        url=self.crawl_url,
        status_code=e.code,
        reason=e.reason,
        params=getattr(e, 'title', ''),
        crawl_time=int2long(int(time.time())),
        crawl_type=self.crawl_type,
    )
    self.crawl_error_tab.insert_one(record)
|
|
|
+
|
|
|
def query_user(self, account: str):
    """
    Look up an account by name and build a ``User`` from it.

    :param account: value matched against the ``account`` field
    :return: ``User`` built from the stored ``account`` and ``password``
    :raises RuntimeError: when no document matches ``account``
    """
    rows = self.account_tab.find_one({'account': account})
    if rows is None:
        # A bare ``raise`` has no active exception here and would surface as
        # an uninformative "No active exception to re-raise" RuntimeError.
        # Raise the same exception type explicitly, naming the account.
        raise RuntimeError(f'account not found: {account}')
    return User(rows['account'], rows['password'])
|
|
|
+
|
|
|
def finished(self, execute_next_time=None):
    """
    Wrap up a task: log completion, release the account, then sleep.

    :param execute_next_time: forwarded unchanged to ``self.sleep``
    """
    logger.info("任务结束")
    self._release_account()
    wait_time = execute_next_time
    self.sleep(wait_time)
|
|
|
+
|
|
|
def update_count(self, number):
    """
    Add ``number`` to today's crawl counter of the current account.

    Counters live in the account document's ``records`` mapping, keyed by
    ``self.today``. The updated document is written back through
    ``self._update_data``.

    :param number: amount to add to today's counter
    :return: None; does nothing when the account document cannot be found
    """
    rows = self.account_tab.find_one({'_id': self.account_id})
    if rows is None:
        # No account was acquired, or its document is gone: there is
        # nothing to count against, so skip instead of crashing on None.
        return

    records = rows.get('records', {self.today: 0})
    count = records.get(self.today, 0) + number
    # Crawl history is meant to cover 7 days: once more than 7 entries have
    # accumulated, the history is reset and only today's counter is kept.
    if len(records) > 7:
        records.clear()
    records[self.today] = count
    rows['records'] = records
    self._update_data(rows)
|
|
|
+
|
|
|
def __enter__(self):
    """
    Start scheduling by acquiring an account that matches ``self.query``.

    The matching account with the smallest ``update_time`` is taken. When
    one is found it is marked as used, given a ``records`` mapping if it has
    none, saved, and ``self.crawl_start`` is set to True. When none is
    found a warning is logged and ``crawl_start`` is left untouched.

    :return: this scheduler instance
    """
    logger.info(f'[开启调度]')
    account = self.account_tab.find_one(self.query, sort=[('update_time', 1)])
    if account is None:
        logger.warning(f'[{self.query.get("site")}采集]暂无闲置账号')
        return self

    self.account_id = account['_id']
    self.user = User(account['account'], account['password'])
    logger.info(f'[启用账号] {self.user.username}')
    account['used'] = True
    # Make sure the document carries a counter mapping before it is saved.
    account.setdefault('records', {self.today: 0})
    self._update_data(account)
    self.crawl_start = True  # flags the scheduler as running
    return self
|
|
|
+
|
|
|
def __exit__(self, exc_type, exc_val, exc_tb):
    """
    Stop scheduling: release the account and record any escaping exception.

    When the ``with`` body raised, the exception is stored through
    ``self.err_record`` as a ``JyBasicException`` with code 10500, logged,
    and suppressed by returning True.

    :param exc_type: exception class raised in the ``with`` body, or None
    :param exc_val: the exception instance, or None
    :param exc_tb: the traceback object, or None
    :return: True when an exception was handled, otherwise None
    """
    logger.info(f'[关闭调度]')
    self._release_account()
    self.crawl_start = False

    if exc_type is None:
        return None

    frames = traceback.extract_tb(exc_tb)
    failure = JyBasicException(
        code=10500,
        reason=str(exc_type),
        title='未知系统错误'
    )
    self.err_record(failure)
    logger.error(f'错误类型: {exc_type}, 错误内容: {exc_val}, 错误详情: {frames}')
    # A truthy return tells the ``with`` statement to swallow the exception.
    return True
|