|
@@ -32,7 +32,9 @@ class Scheduler:
|
|
|
self.spider_code = None
|
|
|
self.crawl_url = None
|
|
|
|
|
|
- self.account_tab = mongo_table('py_spider', 'match_account')
|
|
|
+ self._headers = {"Authorization": "Basic amlhbnl1MDAxOjEyM3F3ZSFB"}
|
|
|
+
|
|
|
+ # self.account_tab = mongo_table('py_spider', 'match_account')
|
|
|
self.record_tab = mongo_table('py_spider', 'match_account_record')
|
|
|
self.error_tab = mongo_table('py_spider', 'crawl_error')
|
|
|
|
|
@@ -55,8 +57,45 @@ class Scheduler:
|
|
|
logger.info(f'[启用账号]{self.user.phone}')
|
|
|
history = self.account_history_crawl_record()
|
|
|
self.count = history['count'] # 访问条数
|
|
|
- lock = self._schedule[self.crawl_type]['lock']
|
|
|
- self._update_tab(self.account_tab, self.account_id, **lock)
|
|
|
+ self.total = history['total'] # 每日限量
|
|
|
+
|
|
|
def get_account(self):
    """Fetch an account for the current site and crawl type.

    Sends a GET request to the account service's ``fetch`` endpoint with
    ``self.site`` and ``self.crawl_type`` as query parameters and
    ``self._headers`` as request headers.

    Returns:
        The value stored under the ``data`` key of the JSON response, or
        ``None`` when the request fails, the body is not valid JSON, or
        the body is not a JSON object containing ``data``.
    """
    url = "http://cc.spdata.jianyu360.com/competing_goods/account/fetch"
    params = {
        "site": self.site,
        "crawl_type": self.crawl_type,
    }

    try:
        response = requests.get(url,
                                headers=self._headers,
                                params=params,
                                timeout=10)
        payload = response.json()
    except (requests.RequestException, ValueError):
        # Network unreachable or the body is not JSON: no account can be
        # obtained. ValueError covers JSON decode errors on requests
        # versions where they are not a RequestException subclass.
        return None

    # Only a JSON object can carry an account; anything else (list, null,
    # string) is treated the same as "no account available".
    if not isinstance(payload, dict):
        return None
    return payload.get('data')
|
|
|
+
|
|
|
def _release_account(self):
    """Return the currently held account to the account service.

    Does nothing when ``self.account_id`` is ``None``. Otherwise sends a
    GET request to the ``release`` endpoint with the account id and crawl
    type, retrying until the service answers with HTTP 200.

    Every failed attempt (network error or non-200 status) is logged and
    followed by ``self._wait_schedule(1)`` so the service is never hit in
    a tight loop.
    """
    if self.account_id is None:
        return

    url = "http://cc.spdata.jianyu360.com/competing_goods/account/release"
    params = {
        "uid": self.account_id,
        "crawl_type": self.crawl_type,
    }
    while True:
        try:
            response = requests.get(url,
                                    headers=self._headers,
                                    params=params,
                                    timeout=10)
        except requests.RequestException:
            logger.error("网络异常,归还账号失败")
        else:
            if response.status_code == 200:
                # The release succeeded; the body is only used for the
                # debug log, so a non-JSON body must not trigger a retry.
                try:
                    body = response.json()
                except ValueError:
                    body = response.text
                logger.debug(f"_release_account >>> {body}")
                return
            logger.error(
                f"_release_account >>> status_code={response.status_code}"
            )
        # Back off before the next attempt.
        # NOTE(review): presumably _wait_schedule(1) pauses briefly - confirm.
        self._wait_schedule(1)
|
|
|
|
|
|
def crawl_counter(self, number: int):
|
|
|
"""采集计数器"""
|
|
@@ -78,20 +117,11 @@ class Scheduler:
|
|
|
}
|
|
|
self.error_tab.insert_one(rows)
|
|
|
|
|
|
- def _release_account(self):
|
|
|
- if self.account_id is not None:
|
|
|
- release = self._schedule[self.crawl_type]['release']
|
|
|
- self._update_tab(self.account_tab, self.account_id, **release)
|
|
|
|
|
|
def _update_tab(self, collection, mid, **update):
    """Set the given fields on the document whose ``_id`` equals ``mid``.

    ``update_time`` is always written with ``self.current_time``,
    overriding any ``update_time`` passed in ``update``.
    """
    fields = dict(update, update_time=self.current_time)
    selector = {'_id': mid}
    collection.update_one(selector, {'$set': fields})
|
|
|
|
|
|
- def get_account(self):
|
|
|
- """获取账号"""
|
|
|
- release = self._schedule[self.crawl_type]['release']
|
|
|
- query = dict(site=self.site, **release)
|
|
|
- return self.account_tab.find_one(query, sort=[('update_time', 1)])
|
|
|
|
|
|
def change_account(self):
|
|
|
"""更换账号"""
|
|
@@ -106,26 +136,18 @@ class Scheduler:
|
|
|
type=self.crawl_type,
|
|
|
site=self.site,
|
|
|
)
|
|
|
- if self.channel is not None:
|
|
|
- query['channel'] = self.channel
|
|
|
-
|
|
|
item = self.record_tab.find_one(query)
|
|
|
if item is None:
|
|
|
item = dict(
|
|
|
site=self.site,
|
|
|
account=self.account['account'],
|
|
|
type=self.crawl_type,
|
|
|
+ total=self.account.get('total', 0), # 任务总数默认值:0
|
|
|
count=0,
|
|
|
ip=get_host_ip(),
|
|
|
date=self.today,
|
|
|
update_time=self.current_time,
|
|
|
)
|
|
|
-
|
|
|
- if self.crawl_type == 'detail':
|
|
|
- self.total = item['total'] = self.account['total'] # 访问上限
|
|
|
- else:
|
|
|
- item['channel'] = (self.channel or '')
|
|
|
-
|
|
|
result = self.record_tab.insert_one(item)
|
|
|
item['_id'] = result.inserted_id
|
|
|
self.record_id = item['_id']
|