|
@@ -7,7 +7,7 @@ import requests
|
|
|
|
|
|
from crawler.login import User
|
|
|
from utils.databases import mongo_table, int2long, object_id
|
|
|
-from utils.execptions import JyBasicException
|
|
|
+from utils.execptions import ZbYTbCrawlError
|
|
|
from utils.log import logger
|
|
|
from utils.tools import get_host_ip
|
|
|
|
|
@@ -29,16 +29,76 @@ class Scheduler:
|
|
|
self.crawl_exception = None
|
|
|
self.kwargs = kwargs
|
|
|
|
|
|
+ self._headers = {"Authorization": "Basic amlhbnl1MDAxOjEyM3F3ZSFB"}
|
|
|
+
|
|
|
self.account_tab = mongo_table('py_spider', 'match_account')
|
|
|
self.record_tab = mongo_table('py_spider', 'match_account_record')
|
|
|
self.crawl_error_tab = mongo_table('py_spider', 'crawl_error')
|
|
|
|
|
|
+ list_attr = dict(
|
|
|
+ name='crawl_list',
|
|
|
+ lock=dict(crawl_list=True),
|
|
|
+ release=dict(crawl_list=False),
|
|
|
+ )
|
|
|
+ detail_attr = dict(
|
|
|
+ name='crawl_detail',
|
|
|
+ lock=dict(crawl_detail=True),
|
|
|
+ release=dict(crawl_detail=False),
|
|
|
+ )
|
|
|
+ self._schedule = {'list': list_attr, 'detail': detail_attr}
|
|
|
+ self.account = self.get_account()
|
|
|
+
|
|
|
+ def _init(self):
|
|
|
+ self.account_id = self.account['_id']
|
|
|
+ self.user = User(self.account['account'], self.account['password'])
|
|
|
+ logger.info(f'[启用账号]{self.user.username}')
|
|
|
+ history = self.account_history_crawl_record()
|
|
|
+ self.count = history['count'] # 访问条数
|
|
|
+
|
|
|
+ def get_account(self):
|
|
|
+ url = "http://cc.spdata.jianyu360.com/competing_goods/account/fetch"
|
|
|
+ params = {
|
|
|
+ "site": self.site,
|
|
|
+ "crawl_type": self.crawl_type
|
|
|
+ }
|
|
|
+ try:
|
|
|
+ response = requests.get(url,
|
|
|
+ headers=self._headers,
|
|
|
+ params=params,
|
|
|
+ timeout=10)
|
|
|
+ print(response.json())
|
|
|
+ data = response.json()['data']
|
|
|
+ except requests.RequestException:
|
|
|
+ # 网络不通信时,无法获取账号
|
|
|
+ data = None
|
|
|
+ return data
|
|
|
+
|
|
|
+ def _release_account(self):
|
|
|
+ url = "http://cc.spdata.jianyu360.com/competing_goods/account/release"
|
|
|
+ if self.account_id is not None:
|
|
|
+ params = {
|
|
|
+ "uid": self.account_id,
|
|
|
+ "crawl_type": self.crawl_type
|
|
|
+ }
|
|
|
+ while True:
|
|
|
+ try:
|
|
|
+ response = requests.get(url,
|
|
|
+ headers=self._headers,
|
|
|
+ params=params,
|
|
|
+ timeout=10)
|
|
|
+ if response.status_code == 200:
|
|
|
+ logger.debug(f"_release_account >>> {response.json()}")
|
|
|
+ break
|
|
|
+ except requests.RequestException:
|
|
|
+ logger.error("网络异常,归还账号失败")
|
|
|
+ self._wait_schedule(1)
|
|
|
+
|
|
|
def crawl_counter(self, number: int):
|
|
|
"""采集计数器"""
|
|
|
records = self.record_tab.find_one({'_id': self.record_id})
|
|
|
records['count'] += number
|
|
|
self.count = records['count']
|
|
|
- self._update_tab(self.record_tab, self.record_id, records)
|
|
|
+ self._update_tab(self.record_tab, self.record_id, count=self.count)
|
|
|
|
|
|
def query_user(self, account: str):
|
|
|
query = {'account': account}
|
|
@@ -47,78 +107,25 @@ class Scheduler:
|
|
|
return None
|
|
|
return User(item['account'], item['password'])
|
|
|
|
|
|
- def finished(self, execute_next_time=None):
|
|
|
- logger.info("任务结束")
|
|
|
- self._release_account()
|
|
|
- self.sleep(execute_next_time)
|
|
|
-
|
|
|
- def err_record(self, e: JyBasicException):
|
|
|
+ def err_record(self, err: ZbYTbCrawlError):
|
|
|
rows = {
|
|
|
'account': self.user.username if self.user is not None else '',
|
|
|
'spidercode': self.spider_code,
|
|
|
'url': self.crawl_url,
|
|
|
- 'status_code': e.code,
|
|
|
- 'reason': e.reason,
|
|
|
- 'params': getattr(e, 'title', ''),
|
|
|
+ 'status_code': err.code,
|
|
|
+ 'reason': err.reason,
|
|
|
+ 'params': getattr(err, 'title', ''),
|
|
|
'crawl_time': int2long(int(time.time())),
|
|
|
'crawl_type': self.crawl_type,
|
|
|
}
|
|
|
self.crawl_error_tab.insert_one(rows)
|
|
|
|
|
|
- def _update_tab(self, mgo_coll, _id, item):
|
|
|
- """
|
|
|
- 更新mongo表
|
|
|
-
|
|
|
- :param mgo_coll: mongo表
|
|
|
- :param _id: mongo_id
|
|
|
- :param item: 数据
|
|
|
- """
|
|
|
- item['update_time'] = self.current_time
|
|
|
- mgo_coll.update_one({'_id': _id}, {'$set': item})
|
|
|
-
|
|
|
- def _release_account(self):
|
|
|
- if self.crawl_type == 'detail':
|
|
|
- rows = dict(crawl_detail=False,)
|
|
|
- else:
|
|
|
- rows = dict(crawl_list=False,)
|
|
|
- if self.account_id is not None:
|
|
|
- self._update_tab(self.account_tab, self.account_id, rows)
|
|
|
-
|
|
|
- def __enter__(self):
|
|
|
- logger.info(f'[开启调度]')
|
|
|
- '''获取闲置账号'''
|
|
|
- if self.account is not None:
|
|
|
- self.account_id = self.account['_id']
|
|
|
- self.user = User(self.account['account'], self.account['password'])
|
|
|
- logger.info(f'[启用账号]{self.user.username}')
|
|
|
- '''初始化记录表'''
|
|
|
- records = self.account_records
|
|
|
- if self.crawl_type == 'detail':
|
|
|
- item = {'crawl_detail': True}
|
|
|
- self.total = records['total']
|
|
|
- self.count = records['count']
|
|
|
- else:
|
|
|
- item = {'crawl_list': True}
|
|
|
- '''初始化采集账号记录'''
|
|
|
- self._update_tab(self.account_tab, self.account_id, item)
|
|
|
- self.crawl_start = True
|
|
|
- else:
|
|
|
- logger.warning(f'[{self.site}]暂无闲置账号')
|
|
|
- return self
|
|
|
-
|
|
|
- @staticmethod
|
|
|
- def wait_for_next_task(wait_time=None):
|
|
|
- _sleep = (wait_time or random.choice(range(5, 15)))
|
|
|
- time.sleep(_sleep)
|
|
|
-
|
|
|
- @staticmethod
|
|
|
- def sleep(wait_time=None):
|
|
|
- sleep_time = (wait_time or 600)
|
|
|
- time.sleep(sleep_time)
|
|
|
+ def _update_tab(self, collection, mid, **update):
|
|
|
+ update['update_time'] = self.current_time
|
|
|
+ collection.update_one({'_id': mid}, {'$set': update})
|
|
|
|
|
|
- @property
|
|
|
- def account_records(self):
|
|
|
- """账号使用记录"""
|
|
|
+ def account_history_crawl_record(self):
|
|
|
+ """使用账号采集记录"""
|
|
|
query = dict(
|
|
|
account=self.account['account'],
|
|
|
date=self.today,
|
|
@@ -142,15 +149,19 @@ class Scheduler:
|
|
|
self.record_id = item['_id']
|
|
|
return item
|
|
|
|
|
|
- @property
|
|
|
- def account(self):
|
|
|
- """账号"""
|
|
|
- query = dict(site=self.site)
|
|
|
- if self.crawl_type == 'detail':
|
|
|
- query['crawl_detail'] = False
|
|
|
- else:
|
|
|
- query['crawl_list'] = False
|
|
|
- return self.account_tab.find_one(query, sort=[('update_time', 1)])
|
|
|
+ def wait_for_next_task(self, interval=None):
|
|
|
+ interval = (interval or random.choice(range(5, 15)))
|
|
|
+ self._wait_schedule(interval)
|
|
|
+
|
|
|
+ def finished(self, execute_next_time=None):
|
|
|
+ logger.info("任务结束")
|
|
|
+ self._release_account()
|
|
|
+ self._wait_schedule(execute_next_time)
|
|
|
+
|
|
|
+ @staticmethod
|
|
|
+ def _wait_schedule(interval=None):
|
|
|
+ _interval = (interval or 600)
|
|
|
+ time.sleep(_interval)
|
|
|
|
|
|
@property
|
|
|
def crawl_task(self):
|
|
@@ -181,14 +192,24 @@ class Scheduler:
|
|
|
def yesterday(self):
|
|
|
return (date.today() - timedelta(days=1)).strftime("%Y-%m-%d")
|
|
|
|
|
|
+ def __enter__(self):
|
|
|
+ logger.info('[开启调度]')
|
|
|
+ '''获取闲置账号'''
|
|
|
+ if self.account is not None:
|
|
|
+ self._init()
|
|
|
+ self.crawl_start = True
|
|
|
+ else:
|
|
|
+ logger.warning(f'[{self.site}]暂无闲置账号')
|
|
|
+ return self
|
|
|
+
|
|
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
|
- logger.info(f'[关闭调度]')
|
|
|
+ logger.info('[关闭调度]')
|
|
|
self._release_account()
|
|
|
self.crawl_start = False
|
|
|
|
|
|
if exc_type is not None:
|
|
|
logger.exception(exc_tb)
|
|
|
- e = JyBasicException(
|
|
|
+ e = ZbYTbCrawlError(
|
|
|
code=10500,
|
|
|
reason=str(exc_type),
|
|
|
title='未知系统错误'
|