Browse Source

添加文本有效内容长度检查

萤火也是火 3 years ago
parent
commit
133957fff7
1 changed file with 9 additions and 1 deletion
  1. 9 1
      ybw/crawler/check_utils.py

+ 9 - 1
ybw/crawler/check_utils.py

@@ -22,8 +22,16 @@ class CheckContent:
     def check_text_length(val: str):
         if len(val) == 0:
             raise CustomCheckError(code=10101, reason='文本内容为空')
-        elif not re.findall(r'[\u4e00-\u9fa5]', val, re.S):
+        elif not re.findall(r'[\u4e00-\u9fa5]+', val, re.S):
             raise CustomCheckError(code=10102, reason='不存在中文字符')
+        else:
+            '''清洗数字、字母、中文之外的干扰元素'''
+            sub_pattern = ['<[^>]+>', '[^0-9a-zA-Z\u4e00-\u9fa5]+']
+            for pattern in sub_pattern:
+                val = re.sub(pattern, '', val)
+            # 若文本长度小于50,表示页面内容无详情内容
+            if len(val) < 50:
+                raise CustomCheckError(code=10102, reason='页面无有效内容')
 
     @staticmethod
     def check_content(val: str):