@@ -11,7 +11,6 @@ import re
 import time

 import requests.exceptions
-from loguru import logger
 from lxml.html import fromstring, HtmlElement, tostring
 from lxml.html.clean import Cleaner
 from pymongo import MongoClient
@@ -19,6 +18,7 @@ from pymongo import MongoClient
 import setting
 import utils.tools as tool
 from dbs.RedisDB import RedisFilter
+from log import logger
 from utils.check_utils import CheckText, CheckTask
 from utils.clean_html import cleaner
 from utils.login import User, load_login_cookies, login, login_check
@@ -102,17 +102,15 @@ class DetailSpider:
         params = {
             "fid": f"{fid}"
         }
-
-        res = requests.get(url, headers=headers, params=params, **request_params)
-        return res
+        return requests.get(url, headers=headers, params=params, **request_params)

     def crawl_request(self, item: dict):
-        url = item['competehref']
+        url = item["competehref"]
         headers = {
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
             'Host': 'www.chinabidding.cn',
             'Upgrade-Insecure-Requests': '1',
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
             'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
         }

@@ -136,8 +134,10 @@ class DetailSpider:
             if login_cookies is None:
                 login(*self.user, **proxy_params)
                 self.login_times += 1
+                self.update_account_login_times()
                 continue
-            elif 'cookies' not in request_params:
+
+            if 'cookies' not in request_params:
                 request_params.setdefault('cookies', login_cookies)
             else:
                 request_params.update({'cookies': login_cookies})
@@ -153,17 +153,18 @@ class DetailSpider:
                         logger.info(f"[重新登录]{self.user.phone}")
                         _, code = login(*self.user, **proxy_params)
                         self.login_times += 1
-                        if code == 200:
-                            retries += 1
-                        else:
+                        retries += 1
+                        if code != 200:
                             time.sleep(600)
-                            retries += 1
+
                         continue
+
                     logger.info(f'[采集正文] fid_{fid}')
                     return r
                 except:
                     retries += 1
-                    continue
+                finally:
+                    self.update_account_login_times()
             else:
                 try:
                     r = requests.get(url, **request_params)
@@ -173,26 +174,25 @@ class DetailSpider:
                         logger.info(f"[重新登录]{self.user.phone}")
                         _, code = login(*self.user, **proxy_params)
                         self.login_times += 1
-                        if code == 200:
-                            retries += 1
-                        else:
+                        retries += 1
+                        if code != 200:
                             time.sleep(1800)
-                            retries += 1
+
                         continue

-                    element = fromstring(r.text)
+                    element = fromstring(r.content.decode())
                     nodes = element.xpath('//*[@id="main_dom"]/div[1]')
                     if len(nodes) != 1:
                         retries_502 += 1
                         logger.debug(f'"main_dom"属性匹配个数:{len(nodes)}, {r.status_code} - {url}')
-                        continue
                     else:
-                        node = nodes[0]
+                        node = nodes[0]  # list index out of range
                         logger.info(f'[采集正文] id={node.attrib.get("id")}')
                         return r
                 except requests.RequestException:
                     retries += 1
-                    continue
+                finally:
+                    self.update_account_login_times()

         return None

@@ -239,36 +239,36 @@ class DetailSpider:
         self.save_tab.insert_one(insert)
         logger.info('[采集成功]{}-{}'.format(item['title'], item['publishtime']))

-    def crawl_spider(self, schedule, item):
-        count = schedule['count']
-        self.login_times = schedule['login_times']
-        if count >= schedule['total'] or self.login_times >= 3:
-            ''' 账号限制 '''
-            logger.warning("账号限制")
-            return '账号限制'
+    def update_account_login_times(self):
+        self.ybw_info.update_one(
+            {"account": self.user.phone},
+            {"$set": {
+                "login_times": self.login_times,
+                "update_time": tool.get_current_date()
+            }}
+        )

+    def crawl_spider(self, account, item):
         _id = item["_id"]
-        err = "error"
-        for _ in range(3):
-            try:
-                CheckTask(item)  # 检查请求采集任务
-                response = self.crawl_request(item)
-                if response is not None:
-                    self.crawl_response(response, item)
-                    count += 1
-                    self.ybw_list.update_one({"_id": _id}, {"$set": {"crawl_status": "finished"}})
-                    self.ybw_info.update_one(
-                        {"account": self.user.phone},
-                        {"$set": {
-                            "count": count,
-                            "update_time": tool.get_current_date(),
-                            "login_times": self.login_times
-                        }}
-                    )
-                    return True
-            except Exception as e:
-                err = e
-                logger.error(f"请求错误:{err}")
+        err = "unknown error"
+
+        try:
+            CheckTask(item)  # 检查请求采集任务
+            response = self.crawl_request(item)
+            if response is not None:
+                self.crawl_response(response, item)
+                self.ybw_list.update_one({"_id": _id}, {"$set": {"crawl_status": "finished"}})
+                self.ybw_info.update_one(
+                    {"account": self.user.phone},
+                    {"$set": {
+                        "count": account["count"] + 1,
+                        "update_time": tool.get_current_date(),
+                    }}
+                )
+                return True
+        except Exception as e:
+            err = e
+            logger.error(f"请求错误:{err}")

         self.ybw_list.update_one({'_id': _id}, {'$set': {'crawl_status': f'{err}'}})
         return False
@@ -276,11 +276,6 @@ class DetailSpider:
     def start(self):
         logger.debug(" *** start ***")

-        schedule = self.ybw_info.find_one({"account": self.user.phone})
-        if schedule is None:
-            logger.error(f"数据库无此账号信息|{self.user.phone}")
-            return
-
         query = {"crawl_status": {"$exists": False}, "es_count": 0}
         sort = [('publishtime', -1)]
         limit = 100
@@ -288,23 +283,36 @@ class DetailSpider:
         tasks = [doc for doc in cursor]

         download_count = 0
-        rdm = random.randint(30, 50)
         for item in tasks:
-            publish_ts = tool.date_to_timestamp(item['publishtime'])
+            # 检查账号
+            account = self.ybw_info.find_one({"account": self.user.phone})
+            if account is None:
+                logger.error(f"数据库无此账号信息|{self.user.phone}")
+                return
+
+            # 登录次数检查
+            self.login_times = account["login_times"]
+            if self.login_times >= 3:
+                logger.warning(f"账号限制|{self.user.phone}")
+                return
+
+            # 数据发布时间延迟采集
+            publish_ts = tool.date_to_timestamp(item["publishtime"])
             if publish_ts > int(time.time()) - 43200:
                 logger.warning("未到采集时间")
                 continue

-            fp = 'detail_' + item.get('competehref')
+            fp = "detail_" + item.get("competehref")
             if not self.dedup.get(fp):
                 self.dedup.add(fp)

                 download_count += 1
-                rst = self.crawl_spider(schedule, item)
-                if not rst or '账号限制' in str(rst):
+                rst = self.crawl_spider(account, item)
+                if not rst:
                     self.dedup.delete(fp)

-                if download_count >= rdm or '账号限制' in str(rst):
+                if download_count >= account["total"]:
+                    logger.warning("当日采集数量已达上限")
                     break

                 time.sleep(random.randint(80, 180))
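For orientation, a minimal sketch of the retry-and-bookkeeping flow that crawl_request settles on after this patch: every re-login attempt bumps the retry counter, a non-200 login backs off before the next pass, and a finally clause persists login_times no matter how the attempt ended. The helpers are injected as callables so the sketch runs standalone; fetch, needs_login, relogin, and persist_login_times are placeholders standing in for requests.get, login_check, login, and update_account_login_times, not the module's actual signatures.

# Hedged sketch, not part of the patch: it condenses the pattern used above.
import time
from typing import Callable, Optional


def fetch_with_relogin(
    fetch: Callable[[], object],            # e.g. lambda: requests.get(url, **request_params)
    needs_login: Callable[[object], bool],  # plays the role of login_check(...)
    relogin: Callable[[], int],             # performs login(...) and returns its status code
    persist_login_times: Callable[[], None],
    max_retries: int = 3,
    backoff: int = 600,
) -> Optional[object]:
    retries = 0
    while retries < max_retries:
        try:
            response = fetch()
            if needs_login(response):
                code = relogin()
                retries += 1             # count the attempt whether or not the re-login worked
                if code != 200:
                    time.sleep(backoff)  # back off before the next pass
                continue
            return response
        except Exception:
            retries += 1
        finally:
            # mirrors the patch: the login counter is persisted on every pass,
            # whether the attempt returned, looped, or raised
            persist_login_times()
    return None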