|
@@ -17,7 +17,7 @@ import feapder.setting as setting
|
|
|
import feapder.utils.tools as tools
|
|
|
from feapder.buffer.item_buffer import ItemBuffer
|
|
|
from feapder.db.memory_db import MemoryDB
|
|
|
-from feapder.network.item import Item
|
|
|
+from feapder.network.item import Item, HeartBeatItem
|
|
|
from feapder.network.request import Request
|
|
|
from feapder.utils import metrics
|
|
|
from feapder.utils.log import log
|
|
@@ -35,16 +35,24 @@ class PaserControl(threading.Thread):
|
|
|
_success_task_count = 0
|
|
|
_failed_task_count = 0
|
|
|
|
|
|
- def __init__(self, collector, redis_key, request_buffer, item_buffer):
|
|
|
+ def __init__(self, collector, redis_key, request_buffer, item_buffer, heartbeat_buffer):  # NOTE(review): heartbeat_buffer is a new required positional argument; callers passing only four arguments will raise TypeError
|
|
|
super(PaserControl, self).__init__()
|
|
|
self._parsers = []
|
|
|
self._collector = collector
|
|
|
self._redis_key = redis_key
|
|
|
self._request_buffer = request_buffer
|
|
|
self._item_buffer = item_buffer
|
|
|
+ self._heartbeat_buffer = heartbeat_buffer  # publish_heartbeat() puts its HeartBeatItem into this buffer
|
|
|
|
|
|
self._thread_stop = False
|
|
|
|
|
|
+ def is_not_task(self):
|
|
|
+ return self.is_show_tip  # NOTE(review): is_show_tip is not assigned in the visible part of __init__ - presumably set elsewhere in the class; confirm
|
|
|
+
|
|
|
+ @classmethod
|
|
|
+ def get_task_status_count(cls):
|
|
|
+ return cls._failed_task_count, cls._success_task_count  # class-level counters, returned in the order (failed, success)
|
|
|
+
|
|
|
def run(self):
|
|
|
self._thread_stop = False
|
|
|
while not self._thread_stop:
|
|
@@ -61,25 +69,17 @@ class PaserControl(threading.Thread):
|
|
|
except (Exception, BaseException) as e:
|
|
|
log.exception(e)
|
|
|
|
|
|
- def is_not_task(self):
|
|
|
- return self.is_show_tip
|
|
|
-
|
|
|
- @classmethod
|
|
|
- def get_task_status_count(cls):
|
|
|
- return cls._failed_task_count, cls._success_task_count
|
|
|
-
|
|
|
def deal_request(self, request):
|
|
|
response = None
|
|
|
request_redis = request["request_redis"]
|
|
|
request = request["request_obj"]
|
|
|
+ now_page = request.page or -1
|
|
|
|
|
|
- is_sent_heartbeat = False # 发送心跳的标识
|
|
|
- heartbeat_lst = [] # 待推送的心跳信息列表
|
|
|
for parser in self._parsers:
|
|
|
- now_page = request.page or -1
|
|
|
counter = {
|
|
|
- 'realQuantity': 0, # 去重后实际入库数量
|
|
|
- 'extractQuantity': 0, # 列表页抽取的列表数量
|
|
|
+ 'now_page': now_page,
|
|
|
+ 'extract_count': 0, # 列表页抽取的列表数量
|
|
|
+ 'rel_count': 0, # 去重后实际入库数量
|
|
|
}
|
|
|
if parser.name == request.parser_name:
|
|
|
used_download_midware_enable = False
|
|
@@ -210,9 +210,9 @@ class PaserControl(threading.Thread):
|
|
|
if "List" in parser.__business_type__ and hasattr(result, 'contenthtml'):
|
|
|
result.is_mixed = True
|
|
|
|
|
|
- counter['extractQuantity'] += 1 # 统计抽取列表数
|
|
|
+ counter['extract_count'] += 1 # 统计抽取列表数
|
|
|
if not self.is_duplicate(result):
|
|
|
- counter['realQuantity'] += 1 # 统计实际列表数
|
|
|
+ counter['rel_count'] += 1 # 统计实际列表数
|
|
|
|
|
|
# 将item入库(异步)
|
|
|
self._item_buffer.put_item(result)
|
|
@@ -239,9 +239,6 @@ class PaserControl(threading.Thread):
|
|
|
f"{function_name} result expect Request、Item or callback, but get type: {type(result)}"
|
|
|
)
|
|
|
|
|
|
- # 发送心跳的条件
|
|
|
- is_sent_heartbeat = True
|
|
|
-
|
|
|
except (Exception, BaseException) as e:
|
|
|
exception_type = (
|
|
|
str(type(e)).replace("<class '", "").replace("'>", "")
|
|
@@ -339,7 +336,6 @@ class PaserControl(threading.Thread):
|
|
|
elif isinstance(result, Item):
|
|
|
self._item_buffer.put_item(result)
|
|
|
|
|
|
- is_sent_heartbeat = True
|
|
|
else:
|
|
|
# 将 requests 重新入库 爬取
|
|
|
request.retry_times += 1
|
|
@@ -394,25 +390,10 @@ class PaserControl(threading.Thread):
|
|
|
if response and hasattr(response, "browser"):
|
|
|
request._webdriver_pool.put(response.browser)
|
|
|
|
|
|
- # 收集爬虫心跳
|
|
|
- if hasattr(parser, "__business_type__"):
|
|
|
- heartbeat_lst.append(dict(
|
|
|
- parser=parser,
|
|
|
- now_page=now_page,
|
|
|
- extract_count=counter['extractQuantity'],
|
|
|
- rel_count=counter['realQuantity'],
|
|
|
- request=request,
|
|
|
- response=response,
|
|
|
- filepath=str(pathlib.Path(setting.sys.argv[0])),
|
|
|
- ))
|
|
|
-
|
|
|
+ # 发布心跳
|
|
|
+ self.publish_heartbeat(parser, request, **counter)
|
|
|
break
|
|
|
|
|
|
- # 发送心跳
|
|
|
- if is_sent_heartbeat:
|
|
|
- for heartbeat in heartbeat_lst:
|
|
|
- self.spider_heartbeat(**heartbeat)
|
|
|
-
|
|
|
if setting.SPIDER_SLEEP_TIME:
|
|
|
if (
|
|
|
isinstance(setting.SPIDER_SLEEP_TIME, (tuple, list))
|
|
@@ -447,105 +428,40 @@ class PaserControl(threading.Thread):
|
|
|
return True
|
|
|
return False
|
|
|
|
|
|
- def sent_heartbeat(self, items, table=None):
|
|
|
- """发送汇总采集详情"""
|
|
|
- send_success = True
|
|
|
- items = items if isinstance(items, list) else [items]
|
|
|
- log.debug("发送心跳")
|
|
|
- table = table or setting.SPIDER_HEARTBEAT_RECORD
|
|
|
- if not self._item_buffer.export_to_db(table, items):
|
|
|
- send_success = False
|
|
|
- log.error("失败心跳:\n {}".format(tools.dumps_json(items)))
|
|
|
- return send_success
|
|
|
-
|
|
|
- @staticmethod
|
|
|
- def get_spider_attribute(name, *args):
|
|
|
- """获取对象属性"""
|
|
|
- arg1, arg2 = args or (None, None)
|
|
|
-
|
|
|
- val = None
|
|
|
- if arg1 is not None:
|
|
|
- if isinstance(arg1, dict):
|
|
|
- val = arg1.get(name)
|
|
|
- if not val and name == "spidercode":
|
|
|
- val = arg1.get("code")
|
|
|
- else:
|
|
|
- val = getattr(arg1, name, None)
|
|
|
-
|
|
|
- if not val and arg2 is not None:
|
|
|
- val = getattr(arg2, name, None)
|
|
|
-
|
|
|
- return val if val is not None else ""
|
|
|
-
|
|
|
- def spider_heartbeat(self, request, response, **kwargs):
|
|
|
- """爬虫心跳"""
|
|
|
- parser = kwargs["parser"]
|
|
|
- now_page = kwargs["now_page"]
|
|
|
- extract_count = kwargs["extract_count"]
|
|
|
- request_count = sum(self.get_task_status_count()) # 采集任务总数(本次爬虫运行时发起的总请求数)
|
|
|
- rel_count = kwargs["rel_count"]
|
|
|
- filepath = kwargs["filepath"]
|
|
|
- status_code = getattr(response, "status_code", -1)
|
|
|
-
|
|
|
- spider_info = getattr(request, "item", {})
|
|
|
- site = self.get_spider_attribute("site", spider_info, parser)
|
|
|
- channel = self.get_spider_attribute("channel", spider_info, parser)
|
|
|
- code = self.get_spider_attribute("spidercode", spider_info, parser)
|
|
|
-
|
|
|
+ def publish_heartbeat(self, parser, request, **kwargs):  # builds a HeartBeatItem for this request and queues it on the heartbeat buffer; kwargs must carry now_page and rel_count (plus extract_count for "List" parsers)
|
|
|
+ request_item = getattr(request, "item")  # NOTE(review): no default given, so a request without an "item" attribute raises AttributeError here
|
|
|
business_type: str = parser.__business_type__ # 爬虫业务类型
|
|
|
+ if business_type.endswith("List"):  # list-type parser: site is read from the parser, spider code from request_item["code"]
|
|
|
+ site = getattr(parser, "site")
|
|
|
+ spidercode = request_item["code"]
|
|
|
+ count = kwargs["extract_count"]
|
|
|
+ else:  # any other business type: site and spider code both come from the request item, count is reported as 0
|
|
|
+ site = request_item["site"]
|
|
|
+ spidercode = request_item["spidercode"]
|
|
|
+ count = 0
|
|
|
+
|
|
|
run_time = tools.get_current_date(date_format="%Y-%m-%d") # 运行时间,单位:天
|
|
|
- spider_id = tools.get_md5(code + business_type + run_time)
|
|
|
- heartbeat_content = dict(
|
|
|
+ heartbeat_item = HeartBeatItem(
|
|
|
node_ip=tools.os.environ.get("CRAWLAB_SERVER_REGISTER_IP"), # crawlab节点名称
|
|
|
crawlab_taskid=tools.os.environ.get("CRAWLAB_TASK_ID"), # crawlab平台爬虫的任务id
|
|
|
site=site,
|
|
|
- channel=channel,
|
|
|
- spidercode=code,
|
|
|
- url=request.url, # 访问地址
|
|
|
- status_code=status_code, # 响应状态码
|
|
|
- runtime=run_time,
|
|
|
+ channel=request_item["channel"],  # NOTE(review): direct subscript - a missing "channel" key raises KeyError
|
|
|
+ spidercode=spidercode,
|
|
|
business_type=business_type,
|
|
|
- spider_id=spider_id,
|
|
|
- filepath=filepath, # 文件路径
|
|
|
- create_at=tools.ensure_int64(tools.get_current_timestamp()), # 执行时间, 单位:秒
|
|
|
+ spider_id=tools.get_md5(spidercode + business_type + run_time),
|
|
|
+ filepath=str(pathlib.Path(setting.sys.argv[0])), # file path (taken from argv[0])
|
|
|
+ runtime=run_time,
|
|
|
+ nowpage=kwargs["now_page"], # current list-page number
|
|
|
+ count=count, # number of list entries extracted
|
|
|
+ rel_count=kwargs["rel_count"], # total actually stored
|
|
|
+ failed_task_count=self._failed_task_count,
|
|
|
+ success_task_count=self._success_task_count,
|
|
|
+ create_at=tools.ensure_int64(tools.get_current_timestamp()), # creation time, unit: seconds
|
|
|
+ expire_at=tools.get_utcnow(), # UTC time set so the record can be deleted periodically (5 days)
|
|
|
)
|
|
|
-
|
|
|
- if hasattr(request, "error_msg") and status_code != 200:
|
|
|
- error = getattr(request, "error_msg")
|
|
|
- feature = dict(
|
|
|
- err_type=str(error.split(": ")[0]),
|
|
|
- err_msg=getattr(request, "error_msg"),
|
|
|
- )
|
|
|
- feature.setdefault("request_success", False)
|
|
|
- if business_type.endswith("List"):
|
|
|
- feature.update(dict(nowpage=now_page, ))
|
|
|
- else:
|
|
|
- feature.update(dict(count=request_count, ))
|
|
|
- else:
|
|
|
- if business_type.endswith("List"):
|
|
|
- # 列表页
|
|
|
- list_feature = dict(
|
|
|
- nowpage=now_page, # 当前页码
|
|
|
- count=extract_count, # 列表提取总数
|
|
|
- rel_count=rel_count, # 实际入库总数
|
|
|
- )
|
|
|
- feature = list_feature
|
|
|
- else:
|
|
|
- # 详情页
|
|
|
- detail_feature = dict(
|
|
|
- count=request_count, # 发起请求的总数
|
|
|
- rel_count=rel_count, # 实际入库总数
|
|
|
- )
|
|
|
- feature = detail_feature
|
|
|
- feature.setdefault("request_success", True)
|
|
|
-
|
|
|
- feature.update({
|
|
|
- 'failed_task_count': self._failed_task_count,
|
|
|
- 'success_task_count': self._success_task_count
|
|
|
- })
|
|
|
- feature['expire_at'] = tools.get_utcnow() # 设置utc时间,定期删除(5天)
|
|
|
- heartbeat_content.update(feature)
|
|
|
- return self.sent_heartbeat(heartbeat_content)
|
|
|
+ # total crawl tasks (requests issued during this spider run) = failed_task_count + success_task_count
|
|
|
+ heartbeat_item.table_name = setting.SPIDER_HEARTBEAT_RECORD # set the table name
|
|
|
+ return self._heartbeat_buffer.put_item(heartbeat_item)
|
|
|
|
|
|
|
|
|
class AirSpiderParserControl(PaserControl):
|