dongzhaorui@topnet.net.cn 3 years ago
Parent commit 7d82debbcf
2 files changed with 156 additions and 86 deletions
  1. +132 -76  zbytb/crawler/clean_html.py
  2. +24 -10   zbytb/crawler/spiders/DetailPageSpider.py

+ 132 - 76
zbytb/crawler/clean_html.py

@@ -1,90 +1,146 @@
-# HTML replacement
 import re
 
+__all__ = ['cleaner', 'clean_js']
+
+# standalone elements
+INDEPENDENT_TAGS = {
+    '<head>[\s\S]*?</head>': '',
+    '<html>|<html [^>]*>|</html>': '',
+    '<body>|<body [^>]*>|</body>': '',
+    '<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # metadata
+    '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # whitespace entities
+    '\\xa0|\\u3000': '',  # non-breaking / full-width spaces
+    '<!--[\s\S]*?-->': '',  # comments
+    '<style[^<>]*>[\s\S]*?</style>': '',  # styles
+    '<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
+    '<input[^>]*>': '',  # input fields
+    '<img[^>]*>': '<br>',  # images
+}
+# inline elements
+INLINE_TAGS = {
+    '<a>|<a [^>]*>|</a>': '',  # hyperlinks
+    '<span>|<span [^>]*>|</span>': '',  # span
+    '<label>|<label [^>]*>|</label>': '<br>',  # label
+    '<font>|<font [^>]*>|</font>': '',  # font
+}
+# block-level elements
+BLOCK_TAGS = {
+    '<h[1-6][^>]*>[\s\S]*?</h[1-6]>': '',  # headings, including their text
+    # '<h[1-6][^>]*>|</h[1-6]>': '',  # headings, tags only
+    '<p>|<p [^>]*>|</p>': '<br>',  # paragraphs
+    '<div>|<div [^>]*>|</div>': '<br>',  # divisions
+    '<o:p>|<o:p [^>]*>|</o:p>': ''  # Microsoft Office Word paragraphs
+}
+# miscellaneous page chrome
+OTHER = {
+    '<\?xml[^>]*>|<\?xml:.*?>': '',  # XML declarations ('?' must be escaped in a regex)
+    '<epointform>': '',
+    '<!doctype html>|<!doctype html [^>]*>': '',
+    '【关闭】|关闭': '',  # "close" link text
+    '【打印】|打印本页': '',  # "print" link text
+    '【字体:[\s\S]*】': '',  # "font size" widget text
+    '文章来源:[\u4e00-\u9fa5]+': '',  # "article source: ..." line
+    '浏览次数:.*[<]+': '',  # "view count: ..." line
+    '(责任编辑:.*?)': '',  # "(editor in charge: ...)" line
+    '分享到[:]': '',  # "share to:" text
+    '阅读数[::]\d+': '',  # "read count: N" text
+}
+# inline CSS attributes
+CSS_STYLE = {
+    'style="[\s\S]*?"|style ="[\s\S]*?"': '',
+    'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
+    'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
+    'class="[\s\S]*?"|class ="[\s\S]*?"': '',
+    'align="[\s\S]*?"|align ="[\s\S]*?"': '',
+    'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
+}
+# whitespace runs
+BLANKS = {
+    '\n\s*\n': '\n',
+    '\s*\n\s*': '\n',
+    '[^\S\n]': ' ',
+    '\s+': ' ',
+}
+# tag names covered by _repair_tag
+TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
+# attribute names covered by _repair_tag
+ATTRS = {'id', 'class', 'style', 'width'}
+
+
+def _repair_tag():
+    """异常的标签组合,用来替换非标准页面的标签"""
+    _repairs = {}
+    for tag in TAGS:
+        for attr in ATTRS:
+            key = '{}{}'.format(tag, attr)
+            val = '{} {}'.format(tag, attr)
+            _repairs[key] = val
+    return _repairs
 
-def th(neirong):
-    tihuan = {
-        '<!--.*?-->': '',
-        '"': "'",
-        '\n': '',
-        '\xa0': "",
-        '<script .*?>': '',
-        '</script>': '',
-        '<span .*?>': '',
-        '</span> ': '',
-        '<p.*?>': '<br>',
-        '</p>': '<br>',
-        '<div>': '<br>',
-        '<div .*?>': '<br>',
-        '<img .*?>': '<br>',
-        '</div>': '<br>',
-        '<style.*?</style>': '',
-        '<EpointForm>': '',
-        '<html.*?</head>': '',
-        '<input .*?>': '',
-        '<!DOCTYPE.*?>': '',
-        '</meta>': '',
-        '<?xml:.*?>': '',
-        '<label.*?>': '<br>',
-        '</label>': '',
-        'style=".*?"': '',
-        "style='.*?'": '',
-        'class=".*?"': '',
-        "class='.*?'": '',
-        "bordercolor='.*?'": '',
-        'bgcolor=".*?"': '',
-        'BORDERCOLOR=".*?"': '',
-        'width=".*?"': '',
-        '<a name=".*?">': '',
-        '<o:p>': '',
-        '</o:p>': '',
-        '<A name=.*?>': '',
-        '<a .*?>': '',
-        '</a>': '',
-        '<font .*?>': '',
-        '</font>': '',
-        '<body>': '',
-        '</body>': '',
-        '<h\d{1}\s{0,10}>.*</h\d{1}\s{0,10}>': '',
-        '</h\d{1}\s{0,10}>': '',
-        '<h\d{1}\s{0,10}}>': '',
-        '【关闭】': '',
-        '【打印】': '',
-    }
 
-    nr = neirong
+def _escape_character(html):
+    """转义字符"""
+    html = html.replace('&lt;', '<')
+    html = html.replace('&gt;', '>')
+    html = html.replace('&quot;', '"')
+    html = html.replace('&amp;', '&')
+    return html
 
-    all_tag = re.findall("<[^>]+>", nr)
-    for tag in all_tag:
-        nr = nr.replace(tag, str(tag).lower())
 
-    def thh(k, v, c):
-        return re.sub(k, v, c)
+def _lowercase_tag(html):
+    """标签归一化处理(全部小写)"""
+    tags = re.findall("<[^>]+>", html)
+    for tag in tags:
+        html = html.replace(tag, str(tag).lower())
 
-    for k, v in tihuan.items():
-        nr = re.sub(k, v, thh(k, v, nr), re.S, re.M)
-    return nr
+    repair_tags = _repair_tag()
+    for err, right in repair_tags.items():
+        html = html.replace(err, right)
 
+    return html
 
-def th_1(neirong):
-    tihuan = {
-        '<!--.*?-->': '',
-        '"': "'",
-        '\n': '',
-        '\xa0': "",
-        '<script .*?>': '',
-        '</script>': '',
+
+def cleaner(html, special=None, completely=False):
+    """
+    数据清洗
+
+    :param html: 清洗的页面
+    :param special: 额外指定页面清洗规则
+    :param completely: 是否完全清洗页面
+    :return: 清洗后的页面源码
+    """
+    if special is None:
+        special = {}
+    remove_tags = {
+        **INDEPENDENT_TAGS,
+        **INLINE_TAGS,
+        **BLOCK_TAGS,
+        **OTHER,
+        **special,  # merge per-call rules here instead of mutating OTHER
+        **CSS_STYLE,
+        **BLANKS,
     }
+    html = _lowercase_tag(html)
+    for tag, repl in remove_tags.items():
+        html = re.sub(tag, repl, html)
 
-    nr = neirong
+    if completely:
+        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # canvas
+        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # inline frames
+        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)  # leftover tags with no Chinese text (font names excepted)
 
-    all_tag = re.findall("<[^>]+>", nr)
-    for tag in all_tag:
-        nr = nr.replace(tag, str(tag).lower())
+    html = _escape_character(html)
+    return html
 
-    def thh(k, v, c):
-        return re.sub(k, v, c)
 
-    for k, v in tihuan.items():
-        nr = re.sub(k, v, thh(k, v, nr), re.S, re.M)
-    return nr
+def clean_js(html):
+    """Strip scripts and whitespace noise only, keeping the rest of the markup."""
+    remove_tags = {
+        '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # whitespace entities
+        '\\xa0|\\u3000': '',  # non-breaking / full-width spaces
+        '<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
+        **BLANKS,
+    }
+    html = _lowercase_tag(html)
+    for tag, repl in remove_tags.items():
+        html = re.sub(tag, repl, html)
+    return html
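
For reference, a minimal usage sketch of the refactored API (run from the zbytb package root; the sample HTML string and the extra `special` rule are hypothetical):

from crawler.clean_html import cleaner, clean_js

raw = '<div class="main"><p>公告内容</p><script>track()</script></div>'

# clean_js strips only <script> blocks, whitespace entities and blank runs,
# which suits the "contenthtml" field where markup must survive.
print(clean_js(raw))

# cleaner applies the full rule set; `special` layers page-specific rules on
# top, and completely=True additionally drops <canvas>/<iframe> blocks.
print(cleaner(raw, special={'<table[^>]*>|</table>': ''}, completely=True))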

+ 24 - 10
zbytb/crawler/spiders/DetailPageSpider.py

@@ -7,7 +7,7 @@ from bs4 import BeautifulSoup
 from pymongo.errors import DuplicateKeyError
 
 from crawler.check_utils import CheckText, CheckTask
-from crawler.clean_html import th_1, th
+from crawler.clean_html import cleaner, clean_js
 from crawler.crawl_scheduler import Scheduler
 from crawler.defaults import http_request_get
 from crawler.login import load_login_cookies, login, User, login_status_check
@@ -100,22 +100,27 @@ class CrawlDetailPageSpider:
 
     def process_attachment(self, content: str, rows: dict):
         soup = BeautifulSoup(content, "lxml")
-        all_a = soup.findAll("a")
+        all_node = soup.findAll("a") or soup.findAll("iframe")
         attachments = {}
         index = 0
-        for tag_a in all_a:
-            file_name, file_type = (tag_a.string or tag_a.text), None
-            file_path = tag_a.attrs.get("href", "")
-            if file_type is None:
-                # 抽取文件类型
+        for node in all_node:
+            file_name, file_type = (node.string or node.text), None
+            file_path = node.attrs.get("href", "") or node.attrs.get("src", "")
+            # the attachment may be embedded in an iframe
+            _id = node.attrs.get('id')
+            if _id == 'pdfContainer':
+                file_type = 'pdf'
+            # extract the file type
+            elif file_type is None:
                 file_type = (extract_file_type(file_name)
                              or extract_file_type(file_path))
+
             # extract the file name
             parser = urlparse(file_path)
             if parser.scheme in ['https', 'http'] and file_type is not None:
                 if not file_name:
                     name = extract_file_name_by_href(file_path, file_type)
-                    if name is None:
+                    if name is not None:
                         file_name = name
                     else:
                         file_name = f"{rows['title']}_{index}"
@@ -133,8 +138,17 @@ class CrawlDetailPageSpider:
 
     def process_content(self, content, rows: dict):
         self.process_attachment(content, rows)
-        rows["contenthtml"] = th_1(content)
-        rows["detail"] = th(content)
+        rows["contenthtml"] = clean_js(content)
+        special = {
+            '<iframe[^<>]*>[\s\S]*?</iframe>': ''
+        }
+        rows["detail"] = cleaner(content, special=special)
+        try:
+            CheckText(rows["detail"])
+        except CustomCheckError:
+            # the page is a PDF viewer, e.g. https://www.zbytb.com/s-zhongbiao-10119392.html
+            rows["detail"] = "<br/>详细内容请访问原网页!"  # "please visit the original page for details"
+
         rows["comeintime"] = int2long(int(time.time()))
        '''remove crawl bookkeeping fields'''
         if 'crawl_status' in rows:
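
A standalone sketch of the new attachment branch (extract_file_type below is a simplified stand-in for the project's helper of the same name, and the iframe URL is hypothetical):

from urllib.parse import urlparse
from bs4 import BeautifulSoup

def extract_file_type(text):
    # simplified stand-in: recognize a few common extensions
    for ext in ('pdf', 'doc', 'docx', 'xls', 'xlsx', 'zip', 'rar'):
        if text and text.lower().endswith('.' + ext):
            return ext
    return None

html = '<iframe id="pdfContainer" src="https://example.com/files/notice.pdf"></iframe>'
soup = BeautifulSoup(html, "lxml")
for node in soup.findAll("a") or soup.findAll("iframe"):
    file_path = node.attrs.get("href", "") or node.attrs.get("src", "")
    # an id of 'pdfContainer' marks an embedded PDF viewer, so the type is known
    file_type = 'pdf' if node.attrs.get('id') == 'pdfContainer' else extract_file_type(file_path)
    if urlparse(file_path).scheme in ('http', 'https') and file_type is not None:
        print(file_type, file_path)  # -> pdf https://example.com/files/notice.pdf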