Browse Source

update:添加最大公共子串抽取

dongzhaorui 2 years ago
parent
commit
41ca614e00

+ 60 - 77
A数据处理/site_monitor/monitor.py

@@ -11,8 +11,8 @@ import threading
 import bson
 import numpy as np
 import requests
+import requests.exceptions as requests_exceptions
 from playwright._impl._api_types import Error
-from requests.exceptions import SSLError
 
 import utils.tools as tools
 from db.mongodb import MongoDB
@@ -40,10 +40,16 @@ class MonitorParser(threading.Thread):
         finally:
             return items
 
-    @staticmethod
-    def get_response(url, render=False, **kwargs):
+    def get_response(self, url, render=False, **kwargs):
+        response = Response.from_dict({
+            "url": url,
+            "_content": b"",
+            "cookies": {},
+            "status_code": -1,
+            "elapsed": 666,
+            "headers": {}
+        })
         request = Request(url=url, render=render, **kwargs)
-        response = None
         for i in range(3):
             try:
                 response = request.get_response()
@@ -51,34 +57,43 @@ class MonitorParser(threading.Thread):
                 if 'The certificate for this server is invalid.' in e.message:
                     url = url.replace('https', 'http')
                     request = Request(url=url, render=render, **kwargs)
-            except SSLError as e:
+            except requests_exceptions.SSLError:
                 url = url.replace('https', 'http')
                 request = Request(url=url, render=True, **kwargs)
-            except Exception as e:
-                response.reason = e.args
+            except requests_exceptions as e:
+                logger.exception(e)
+                break
             else:
                 if response.status_code != 200:
-                    continue
-
-                if response.text is None:
-                    continue
-
-                if len(response.plain_text) == 0:
-                    continue
-
-                if response.tags()['tags_count'] == 0:
-                    continue
+                    if any([
+                        response.text is None,
+                        len(response.plain_text) == 0,
+                        response.tags()['tags_count'] == 0
+                    ]):
+                        continue
 
                 break
         else:
-            response = Response.from_dict({
-                "url": url,
-                "_content": b"",
-                "cookies": {},
-                "status_code": getattr(response, 'status_code', -1),
-                "elapsed": 666,
-                "headers": {}
-            })
+            if response.status_code != -1:
+                response = Response(response)
+            # 设置编码
+            response.encoding = response.encoding or "utf-8"
+
+        logger.debug(
+            """
+                -------------- response for ----------------
+                thread = %s
+                url = %s
+                title = %s
+                response = %s
+                """
+            % (
+                self.getName(),
+                url,
+                response.title(),
+                response
+            )
+        )
 
         return response
 
@@ -88,21 +103,14 @@ class MonitorParser(threading.Thread):
             condition={'_id': task['_id']},
             data=items
         )
-        print({'_id': task['_id']})
+        # print({'_id': task['_id']})
         return result
 
     def deal_task(self, task):
-        is_first_monitor = False
-
-        # 网站主页
-        host = task['host']
-        response = self.get_response(host, render=False, proxies=False)
-        host_status_code = response.status_code
-
         # 栏目
         url = task['url']
         response = self.get_response(url, render=True, proxies=False)
-        channel_status_code = response.status_code
+        status_code = response.status_code
 
         # 栏目页面标签
         tags_count = response.tags()['tags_count']
@@ -118,66 +126,41 @@ class MonitorParser(threading.Thread):
             if tags_count_diff not in std_range:
                 channel_ischange = True
 
+        if len(tags_count_diff_lst) > 3 and sum(tags_count_diff_lst) == 0:
+            channel_ischange = True
+            status_code = 500
+
         # 访问频次
         update_dt = tools.timestamp_to_date(task['update_at'], '%Y-%m-%d')
-        if tools.get_current_date('%Y-%m-%d') != update_dt:
-            is_first_monitor = True
-
-            channel_visit_count, channel_failure_count = 1, 0
-            if channel_status_code != 200:
-                channel_failure_count = 1
-
-            host_visit_count, host_failure_count = 1, 0
-            if host_status_code != 200:
-                host_failure_count = 1
+        is_first_monitor = tools.get_current_date('%Y-%m-%d') != update_dt
+        if is_first_monitor:
+            visit_count, failure_count = 1, 0
+            if status_code != 200:
+                failure_count = 1
 
             tags_count_diff_lst = []
             tags_count_diff_lst.insert(0, tags_count_diff)
         else:
-            channel_visit_count = task['channel_visit_count'] + 1
-            channel_failure_count = task['channel_failure_count']
-            if channel_status_code != 200:
-                channel_failure_count += 1
-
-            host_visit_count = task['host_visit_count'] + 1
-            host_failure_count = task['host_failure_count']
-            if host_status_code != 200:
-                host_failure_count += 1
+            visit_count = task['visit_count'] + 1
+            failure_count = task['failure_count']
+            if status_code != 200:
+                failure_count += 1
 
             tags_count_diff_lst.insert(0, tags_count_diff)
 
-        if is_first_monitor:
-            pass
-
         items = {
+            'title': response.title(),  # 页面标头
             'tags_count': tags_count,
             'tags_count_diff': tags_count_diff,
             'tags_count_diff_lst': tags_count_diff_lst,
             'channel_ischange': channel_ischange,
-            'channel_status_code': channel_status_code,
-            'channel_visit_count': channel_visit_count,
-            'channel_failure_count': channel_failure_count,
-            'host_status_code': host_status_code,
-            'host_visit_count': host_visit_count,
-            'host_failure_count': host_failure_count,
+            'status_code': status_code,
+            'visit_count': visit_count,
+            'failure_count': failure_count,
             'update_at': tools.ensure_int64(tools.get_current_timestamp())
         }
         self.__add_items_to_db(task, items)
 
-        logger.debug(
-            """
-                -------------- 处理完成 ----------------
-                id  = Object('%s')
-                thread = %s
-                response = %s
-                """
-            % (
-                str(task['_id']),
-                self.getName(),
-                response
-            )
-        )
-
     def run(self):
         while True:
             task = self.get_task()
@@ -215,4 +198,4 @@ class MonitorServer(threading.Thread):
 
 
 if __name__ == '__main__':
-    MonitorServer(thread_nums=2).start()
+    MonitorServer(thread_nums=4).start()

+ 19 - 1
A数据处理/site_monitor/network/response.py

@@ -24,7 +24,7 @@ from w3lib.encoding import (
     http_content_type_encoding,
     html_body_declared_encoding
 )
-
+import utils.tools as tools
 from utils.log import logger as log
 
 FAIL_ENCODING = "ISO-8859-1"
@@ -394,3 +394,21 @@ class Response(res):
 
         tags_dict['tags_count'] = count
         return tags_dict
+
+    def title(self):
+        """Pick a display title for the page from its <title> and heading text.
+
+        The candidates are tried in this order; the first non-empty one is
+        returned with surrounding whitespace stripped:
+
+        1. the longest substring shared by the <title> text and any single
+           h1-h4 text node, kept only when it is longer than 8 characters;
+        2. the <title> text itself;
+        3. the first extracted h1-h4 text node.
+
+        :return: the chosen title; may be an empty string.
+        """
+        # NOTE(review): "" is presumably the default returned when no <title>
+        # text node matches -- confirm against the selector's extract_first().
+        title_text = self.xpath('//title/text()').extract_first("")
+
+        # All text nodes found anywhere under h1, h2, h3 and h4 elements.
+        htag = '//h1//text() | //h2//text() | //h3//text() | //h4//text()'
+        h_tag_texts_list = self.xpath(htag).extract()
+        # Last-resort fallback: the first extracted heading text, if any.
+        htag_text = h_tag_texts_list[0] if len(h_tag_texts_list) > 0 else ''
+
+        # Keep the longest overlap between the <title> text and any one
+        # heading text node.
+        news_title = ''
+        for h_tag_text in h_tag_texts_list:
+            lcs = tools.get_longest_common_sub_string(title_text, h_tag_text)
+            if len(lcs) > len(news_title):
+                news_title = lcs
+
+        # An overlap of 8 characters or fewer is discarded.
+        news_title = news_title if len(news_title) > 8 else ''
+
+        title = (news_title or title_text or htag_text)
+        return title.strip()

+ 33 - 70
A数据处理/site_monitor/utils/tools.py

@@ -137,80 +137,43 @@ def memoizemethod_noargs(method):
 
 
 ########################【网页解析相关】###############################
+def get_longest_common_sub_string(str1: str, str2: str) -> str:
+    """
+    Return the longest common substring of two strings.
 
+    Build a matrix with string 1 along one axis and string 2 along the other, for example:
 
-# @log_function_time
-def get_html_by_requests(
-    url, headers=None, code="utf-8", data=None, proxies={}, with_response=False
-):
-    html = ""
-    r = None
-    try:
-        if data:
-            r = requests.post(
-                url, headers=headers, timeout=TIME_OUT, data=data, proxies=proxies
-            )
-        else:
-            r = requests.get(url, headers=headers, timeout=TIME_OUT, proxies=proxies)
-
-        if code:
-            r.encoding = code
-        html = r.text
-
-    except Exception as e:
-        log.error(e)
-    finally:
-        r and r.close()
-
-    if with_response:
-        return html, r
-    else:
-        return html
-
+      青南是天才!?
+    听0 0 0 0 00 0
+    说0 0 0 0 00 0
+    青1 0 0 0 00 0
+    南0 1 0 0 00 0
+    是0 0 1 0 00 0
+    天0 0 0 1 00 0
+    才0 0 0 0 10 0
+    !0 0 0 0 01 0
 
-def get_json_by_requests(
-    url,
-    params=None,
-    headers=None,
-    data=None,
-    proxies={},
-    with_response=False,
-    cookies=None,
-):
-    json = {}
-    response = None
-    try:
-        # response = requests.get(url, params = params)
-        if data:
-            response = requests.post(
-                url,
-                headers=headers,
-                data=data,
-                params=params,
-                timeout=TIME_OUT,
-                proxies=proxies,
-                cookies=cookies,
-            )
-        else:
-            response = requests.get(
-                url,
-                headers=headers,
-                params=params,
-                timeout=TIME_OUT,
-                proxies=proxies,
-                cookies=cookies,
-            )
-        response.encoding = "utf-8"
-        json = response.json()
-    except Exception as e:
-        log.error(e)
-    finally:
-        response and response.close()
+    The longest diagonal run of matches is the longest common substring.
 
-    if with_response:
-        return json, response
-    else:
-        return json
+    :param str1: first string; the returned substring is sliced from it
+    :param str2: second string
+    :return: the longest common substring, or '' when either input is empty
+        or the strings share no character
+    """
+    if not all([str1, str2]):
+        return ''
+    # matrix[i][j] is the length of the common run that ends at
+    # str1[i - 1] and str2[j - 1]; row 0 and column 0 stay 0 as padding.
+    matrix = [[0] * (len(str2) + 1) for _ in range(len(str1) + 1)]
+    max_length = 0
+    start_position = 0
+    for index_of_str1 in range(1, len(str1) + 1):
+        for index_of_str2 in range(1, len(str2) + 1):
+            if str1[index_of_str1 - 1] == str2[index_of_str2 - 1]:
+                # Extend the run that ended on the previous diagonal cell.
+                matrix[index_of_str1][index_of_str2] = matrix[index_of_str1 - 1][index_of_str2 - 1] + 1
+                # Strict '>' keeps the first run found when several tie.
+                if matrix[index_of_str1][index_of_str2] > max_length:
+                    max_length = matrix[index_of_str1][index_of_str2]
+                    start_position = index_of_str1 - max_length
+            else:
+                matrix[index_of_str1][index_of_str2] = 0
+    return str1[start_position: start_position + max_length]
 
 
 def get_cookies(response):