|
@@ -11,8 +11,8 @@ import threading
|
|
|
import bson
|
|
|
import numpy as np
|
|
|
import requests
|
|
|
+import requests.exceptions as requests_exceptions
|
|
|
from playwright._impl._api_types import Error
|
|
|
-from requests.exceptions import SSLError
|
|
|
|
|
|
import utils.tools as tools
|
|
|
from db.mongodb import MongoDB
|
|
@@ -40,10 +40,16 @@ class MonitorParser(threading.Thread):
|
|
|
finally:
|
|
|
return items
|
|
|
|
|
|
- @staticmethod
|
|
|
- def get_response(url, render=False, **kwargs):
|
|
|
+ def get_response(self, url, render=False, **kwargs):
|
|
|
+ response = Response.from_dict({
|
|
|
+ "url": url,
|
|
|
+ "_content": b"",
|
|
|
+ "cookies": {},
|
|
|
+ "status_code": -1,
|
|
|
+ "elapsed": 666,
|
|
|
+ "headers": {}
|
|
|
+ })
|
|
|
request = Request(url=url, render=render, **kwargs)
|
|
|
- response = None
|
|
|
for i in range(3):
|
|
|
try:
|
|
|
response = request.get_response()
|
|
@@ -51,34 +57,43 @@ class MonitorParser(threading.Thread):
|
|
|
if 'The certificate for this server is invalid.' in e.message:
|
|
|
url = url.replace('https', 'http')
|
|
|
request = Request(url=url, render=render, **kwargs)
|
|
|
- except SSLError as e:
|
|
|
+ except requests_exceptions.SSLError:
|
|
|
url = url.replace('https', 'http')
|
|
|
request = Request(url=url, render=True, **kwargs)
|
|
|
- except Exception as e:
|
|
|
- response.reason = e.args
|
|
|
+ except requests_exceptions as e:
|
|
|
+ logger.exception(e)
|
|
|
+ break
|
|
|
else:
|
|
|
if response.status_code != 200:
|
|
|
- continue
|
|
|
-
|
|
|
- if response.text is None:
|
|
|
- continue
|
|
|
-
|
|
|
- if len(response.plain_text) == 0:
|
|
|
- continue
|
|
|
-
|
|
|
- if response.tags()['tags_count'] == 0:
|
|
|
- continue
|
|
|
+ if any([
|
|
|
+ response.text is None,
|
|
|
+ len(response.plain_text) == 0,
|
|
|
+ response.tags()['tags_count'] == 0
|
|
|
+ ]):
|
|
|
+ continue
|
|
|
|
|
|
break
|
|
|
else:
|
|
|
- response = Response.from_dict({
|
|
|
- "url": url,
|
|
|
- "_content": b"",
|
|
|
- "cookies": {},
|
|
|
- "status_code": getattr(response, 'status_code', -1),
|
|
|
- "elapsed": 666,
|
|
|
- "headers": {}
|
|
|
- })
|
|
|
+ if response.status_code != -1:
|
|
|
+ response = Response(response)
|
|
|
+ # 设置编码
|
|
|
+ response.encoding = response.encoding or "utf-8"
|
|
|
+
|
|
|
+ logger.debug(
|
|
|
+ """
|
|
|
+ -------------- response for ----------------
|
|
|
+ thread = %s
|
|
|
+ url = %s
|
|
|
+ title = %s
|
|
|
+ response = %s
|
|
|
+ """
|
|
|
+ % (
|
|
|
+ self.getName(),
|
|
|
+ url,
|
|
|
+ response.title(),
|
|
|
+ response
|
|
|
+ )
|
|
|
+ )
|
|
|
|
|
|
return response
|
|
|
|
|
@@ -88,21 +103,14 @@ class MonitorParser(threading.Thread):
|
|
|
condition={'_id': task['_id']},
|
|
|
data=items
|
|
|
)
|
|
|
- print({'_id': task['_id']})
|
|
|
+ # print({'_id': task['_id']})
|
|
|
return result
|
|
|
|
|
|
def deal_task(self, task):
|
|
|
- is_first_monitor = False
|
|
|
-
|
|
|
- # 网站主页
|
|
|
- host = task['host']
|
|
|
- response = self.get_response(host, render=False, proxies=False)
|
|
|
- host_status_code = response.status_code
|
|
|
-
|
|
|
# 栏目
|
|
|
url = task['url']
|
|
|
response = self.get_response(url, render=True, proxies=False)
|
|
|
- channel_status_code = response.status_code
|
|
|
+ status_code = response.status_code
|
|
|
|
|
|
# 栏目页面标签
|
|
|
tags_count = response.tags()['tags_count']
|
|
@@ -118,66 +126,41 @@ class MonitorParser(threading.Thread):
|
|
|
if tags_count_diff not in std_range:
|
|
|
channel_ischange = True
|
|
|
|
|
|
+ if len(tags_count_diff_lst) > 3 and sum(tags_count_diff_lst) == 0:
|
|
|
+ channel_ischange = True
|
|
|
+ status_code = 500
|
|
|
+
|
|
|
# 访问频次
|
|
|
update_dt = tools.timestamp_to_date(task['update_at'], '%Y-%m-%d')
|
|
|
- if tools.get_current_date('%Y-%m-%d') != update_dt:
|
|
|
- is_first_monitor = True
|
|
|
-
|
|
|
- channel_visit_count, channel_failure_count = 1, 0
|
|
|
- if channel_status_code != 200:
|
|
|
- channel_failure_count = 1
|
|
|
-
|
|
|
- host_visit_count, host_failure_count = 1, 0
|
|
|
- if host_status_code != 200:
|
|
|
- host_failure_count = 1
|
|
|
+ is_first_monitor = tools.get_current_date('%Y-%m-%d') != update_dt
|
|
|
+ if is_first_monitor:
|
|
|
+ visit_count, failure_count = 1, 0
|
|
|
+ if status_code != 200:
|
|
|
+ failure_count = 1
|
|
|
|
|
|
tags_count_diff_lst = []
|
|
|
tags_count_diff_lst.insert(0, tags_count_diff)
|
|
|
else:
|
|
|
- channel_visit_count = task['channel_visit_count'] + 1
|
|
|
- channel_failure_count = task['channel_failure_count']
|
|
|
- if channel_status_code != 200:
|
|
|
- channel_failure_count += 1
|
|
|
-
|
|
|
- host_visit_count = task['host_visit_count'] + 1
|
|
|
- host_failure_count = task['host_failure_count']
|
|
|
- if host_status_code != 200:
|
|
|
- host_failure_count += 1
|
|
|
+ visit_count = task['visit_count'] + 1
|
|
|
+ failure_count = task['failure_count']
|
|
|
+ if status_code != 200:
|
|
|
+ failure_count += 1
|
|
|
|
|
|
tags_count_diff_lst.insert(0, tags_count_diff)
|
|
|
|
|
|
- if is_first_monitor:
|
|
|
- pass
|
|
|
-
|
|
|
items = {
|
|
|
+ 'title': response.title(), # 页面标头
|
|
|
'tags_count': tags_count,
|
|
|
'tags_count_diff': tags_count_diff,
|
|
|
'tags_count_diff_lst': tags_count_diff_lst,
|
|
|
'channel_ischange': channel_ischange,
|
|
|
- 'channel_status_code': channel_status_code,
|
|
|
- 'channel_visit_count': channel_visit_count,
|
|
|
- 'channel_failure_count': channel_failure_count,
|
|
|
- 'host_status_code': host_status_code,
|
|
|
- 'host_visit_count': host_visit_count,
|
|
|
- 'host_failure_count': host_failure_count,
|
|
|
+ 'status_code': status_code,
|
|
|
+ 'visit_count': visit_count,
|
|
|
+ 'failure_count': failure_count,
|
|
|
'update_at': tools.ensure_int64(tools.get_current_timestamp())
|
|
|
}
|
|
|
self.__add_items_to_db(task, items)
|
|
|
|
|
|
- logger.debug(
|
|
|
- """
|
|
|
- -------------- 处理完成 ----------------
|
|
|
- id = Object('%s')
|
|
|
- thread = %s
|
|
|
- response = %s
|
|
|
- """
|
|
|
- % (
|
|
|
- str(task['_id']),
|
|
|
- self.getName(),
|
|
|
- response
|
|
|
- )
|
|
|
- )
|
|
|
-
|
|
|
def run(self):
|
|
|
while True:
|
|
|
task = self.get_task()
|
|
@@ -215,4 +198,4 @@ class MonitorServer(threading.Thread):
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
- MonitorServer(thread_nums=2).start()
|
|
|
+ MonitorServer(thread_nums=4).start()
|