dongzhaorui@topnet.net.cn 3 years ago
Parent commit 7d82debbcf
2 files changed with 156 additions and 86 deletions
  1. +132 -76  zbytb/crawler/clean_html.py
  2. +24 -10   zbytb/crawler/spiders/DetailPageSpider.py

+ 132 - 76
zbytb/crawler/clean_html.py

@@ -1,90 +1,146 @@
-# HTML replacement
 import re
 
+__all__ = ['cleaner', 'clean_js']
+
+# standalone elements
+INDEPENDENT_TAGS = {
+    '<head>[\s\S]*?</head>': '',
+    '<html>|<html [^>]*>|</html>': '',
+    '<body>|<body [^>]*>|</body>': '',
+    '<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # metadata
+    '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # whitespace entities
+    '\\xa0|\\u3000': '',  # non-breaking / full-width spaces
+    '<!--[\s\S]*?-->': '',  # comments
+    '<style[^<>]*>[\s\S]*?</style>': '',  # styles
+    '<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
+    '<input[^>]*>': '',  # input fields
+    '<img[^>]*>': '<br>',  # images
+}
+# inline elements
+INLINE_TAGS = {
+    '<a>|<a [^>]*>|</a>': '',  # hyperlinks
+    '<span>|<span [^>]*>|</span>': '',  # span
+    '<label>|<label [^>]*>|</label>': '<br>',  # label
+    '<font>|<font [^>]*>|</font>': '',  # font
+}
+# block-level elements
+BLOCK_TAGS = {
+    '<h[1-6][^>]*>[\s\S]*?</h[1-6]>': '',  # headings, including their text
+    # '<h[1-6][^>]*>|</h[1-6]>': '',  # headings, tags only
+    '<p>|<p [^>]*>|</p>': '<br>',  # paragraphs
+    '<div>|<div [^>]*>|</div>': '<br>',  # divisions
+    '<o:p>|<o:p [^>]*>|</o:p>': ''  # Microsoft Office Word paragraphs
+}
+# miscellaneous page chrome
+OTHER = {
+    '<\?xml[^>]*>|<\?xml:.*?>': '',  # XML declarations ('?' must be escaped in a regex)
+    '<epointform>': '',
+    '<!doctype html>|<!doctype html [^>]*>': '',
+    '【关闭】|关闭': '',  # "close" link text
+    '【打印】|打印本页': '',  # "print" link text
+    '【字体:[\s\S]*】': '',  # "font size" widget text
+    '文章来源:[\u4e00-\u9fa5]+': '',  # "article source: ..." line
+    '浏览次数:.*[<]+': '',  # "view count: ..." line
+    '(责任编辑:.*?)': '',  # "(editor in charge: ...)" line
+    '分享到[:]': '',  # "share to:" text
+    '阅读数[::]\d+': '',  # "read count: N" text
+}
+# inline CSS attributes
+CSS_STYLE = {
+    'style="[\s\S]*?"|style ="[\s\S]*?"': '',
+    'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
+    'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
+    'class="[\s\S]*?"|class ="[\s\S]*?"': '',
+    'align="[\s\S]*?"|align ="[\s\S]*?"': '',
+    'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
+}
+# whitespace runs
+BLANKS = {
+    '\n\s*\n': '\n',
+    '\s*\n\s*': '\n',
+    '[^\S\n]': ' ',
+    '\s+': ' ',
+}
+# tag names covered by _repair_tag
+TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
+# attribute names covered by _repair_tag
+ATTRS = {'id', 'class', 'style', 'width'}
+
+
+def _repair_tag():
+    """异常的标签组合,用来替换非标准页面的标签"""
+    _repairs = {}
+    for tag in TAGS:
+        for attr in ATTRS:
+            key = '{}{}'.format(tag, attr)
+            val = '{} {}'.format(tag, attr)
+            _repairs[key] = val
+    return _repairs
 
-def th(neirong):
-    tihuan = {
-        '<!--.*?-->': '',
-        '"': "'",
-        '\n': '',
-        '\xa0': "",
-        '<script .*?>': '',
-        '</script>': '',
-        '<span .*?>': '',
-        '</span> ': '',
-        '<p.*?>': '<br>',
-        '</p>': '<br>',
-        '<div>': '<br>',
-        '<div .*?>': '<br>',
-        '<img .*?>': '<br>',
-        '</div>': '<br>',
-        '<style.*?</style>': '',
-        '<EpointForm>': '',
-        '<html.*?</head>': '',
-        '<input .*?>': '',
-        '<!DOCTYPE.*?>': '',
-        '</meta>': '',
-        '<?xml:.*?>': '',
-        '<label.*?>': '<br>',
-        '</label>': '',
-        'style=".*?"': '',
-        "style='.*?'": '',
-        'class=".*?"': '',
-        "class='.*?'": '',
-        "bordercolor='.*?'": '',
-        'bgcolor=".*?"': '',
-        'BORDERCOLOR=".*?"': '',
-        'width=".*?"': '',
-        '<a name=".*?">': '',
-        '<o:p>': '',
-        '</o:p>': '',
-        '<A name=.*?>': '',
-        '<a .*?>': '',
-        '</a>': '',
-        '<font .*?>': '',
-        '</font>': '',
-        '<body>': '',
-        '</body>': '',
-        '<h\d{1}\s{0,10}>.*</h\d{1}\s{0,10}>': '',
-        '</h\d{1}\s{0,10}>': '',
-        '<h\d{1}\s{0,10}}>': '',
-        '【关闭】': '',
-        '【打印】': '',
-    }
 
-    nr = neirong
+def _escape_character(html):
+    """转义字符"""
+    html = html.replace('&lt;', '<')
+    html = html.replace('&gt;', '>')
+    html = html.replace('&quot;', '"')
+    html = html.replace('&amp;', '&')
+    return html
 
-    all_tag = re.findall("<[^>]+>", nr)
-    for tag in all_tag:
-        nr = nr.replace(tag, str(tag).lower())
 
-    def thh(k, v, c):
-        return re.sub(k, v, c)
+def _lowercase_tag(html):
+    """标签归一化处理(全部小写)"""
+    tags = re.findall("<[^>]+>", html)
+    for tag in tags:
+        html = html.replace(tag, str(tag).lower())
 
-    for k, v in tihuan.items():
-        nr = re.sub(k, v, thh(k, v, nr), re.S, re.M)
-    return nr
+    repair_tags = _repair_tag()
+    for err, right in repair_tags.items():
+        html = html.replace(err, right)
 
+    return html
 
-def th_1(neirong):
-    tihuan = {
-        '<!--.*?-->': '',
-        '"': "'",
-        '\n': '',
-        '\xa0': "",
-        '<script .*?>': '',
-        '</script>': '',
+
+def cleaner(html, special=None, completely=False):
+    """
+    数据清洗
+
+    :param html: 清洗的页面
+    :param special: 额外指定页面清洗规则
+    :param completely: 是否完全清洗页面
+    :return: 清洗后的页面源码
+    """
+    if special is None:
+        special = {}
+    remove_tags = {
+        **INDEPENDENT_TAGS,
+        **INLINE_TAGS,
+        **BLOCK_TAGS,
+        **OTHER,
+        **special,  # merge per-call rules here instead of mutating OTHER
+        **CSS_STYLE,
+        **BLANKS,
     }
+    html = _lowercase_tag(html)
+    for tag, repl in remove_tags.items():
+        html = re.sub(tag, repl, html)
 
-    nr = neirong
+    if completely:
+        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # canvas
+        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # inline frames
+        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)  # leftover tags with no Chinese text (font names excepted)
 
-    all_tag = re.findall("<[^>]+>", nr)
-    for tag in all_tag:
-        nr = nr.replace(tag, str(tag).lower())
+    html = _escape_character(html)
+    return html
 
-    def thh(k, v, c):
-        return re.sub(k, v, c)
 
-    for k, v in tihuan.items():
-        nr = re.sub(k, v, thh(k, v, nr), re.S, re.M)
-    return nr
+def clean_js(html):
+    """Strip scripts and whitespace noise only, keeping the rest of the markup."""
+    remove_tags = {
+        '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # whitespace entities
+        '\\xa0|\\u3000': '',  # non-breaking / full-width spaces
+        '<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
+        **BLANKS,
+    }
+    html = _lowercase_tag(html)
+    for tag, repl in remove_tags.items():
+        html = re.sub(tag, repl, html)
+    return html
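
For reference, a minimal usage sketch of the refactored API (run from the zbytb package root; the sample HTML string and the extra `special` rule are hypothetical):

from crawler.clean_html import cleaner, clean_js

raw = '<div class="main"><p>公告内容</p><script>track()</script></div>'

# clean_js strips only <script> blocks, whitespace entities and blank runs,
# which suits the "contenthtml" field where markup must survive.
print(clean_js(raw))

# cleaner applies the full rule set; `special` layers page-specific rules on
# top, and completely=True additionally drops <canvas>/<iframe> blocks.
print(cleaner(raw, special={'<table[^>]*>|</table>': ''}, completely=True))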

+ 24 - 10
zbytb/crawler/spiders/DetailPageSpider.py

@@ -7,7 +7,7 @@ from bs4 import BeautifulSoup
 from pymongo.errors import DuplicateKeyError
 
 from crawler.check_utils import CheckText, CheckTask
-from crawler.clean_html import th_1, th
+from crawler.clean_html import cleaner, clean_js
 from crawler.crawl_scheduler import Scheduler
 from crawler.defaults import http_request_get
 from crawler.login import load_login_cookies, login, User, login_status_check
@@ -100,22 +100,27 @@ class CrawlDetailPageSpider:
 
     def process_attachment(self, content: str, rows: dict):
         soup = BeautifulSoup(content, "lxml")
-        all_a = soup.findAll("a")
+        all_node = soup.findAll("a") or soup.findAll("iframe")
         attachments = {}
         index = 0
-        for tag_a in all_a:
-            file_name, file_type = (tag_a.string or tag_a.text), None
-            file_path = tag_a.attrs.get("href", "")
-            if file_type is None:
-                # 抽取文件类型
+        for node in all_node:
+            file_name, file_type = (node.string or node.text), None
+            file_path = node.attrs.get("href", "") or node.attrs.get("src", "")
+            # the attachment may be embedded in an iframe
+            _id = node.attrs.get('id')
+            if _id == 'pdfContainer':
+                file_type = 'pdf'
+            # extract the file type
+            elif file_type is None:
                 file_type = (extract_file_type(file_name)
                              or extract_file_type(file_path))
+
             # extract the file name
             parser = urlparse(file_path)
             if parser.scheme in ['https', 'http'] and file_type is not None:
                 if not file_name:
                     name = extract_file_name_by_href(file_path, file_type)
-                    if name is None:
+                    if name is not None:
                         file_name = name
                     else:
                         file_name = f"{rows['title']}_{index}"
@@ -133,8 +138,17 @@ class CrawlDetailPageSpider:
 
     def process_content(self, content, rows: dict):
         self.process_attachment(content, rows)
-        rows["contenthtml"] = th_1(content)
-        rows["detail"] = th(content)
+        rows["contenthtml"] = clean_js(content)
+        special = {
+            '<iframe[^<>]*>[\s\S]*?</iframe>': ''
+        }
+        rows["detail"] = cleaner(content, special=special)
+        try:
+            CheckText(rows["detail"])
+        except CustomCheckError:
+            # the page is a PDF viewer, e.g. https://www.zbytb.com/s-zhongbiao-10119392.html
+            rows["detail"] = "<br/>详细内容请访问原网页!"  # "please visit the original page for details"
+
         rows["comeintime"] = int2long(int(time.time()))
        '''remove crawl bookkeeping fields'''
         if 'crawl_status' in rows:
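
A standalone sketch of the new attachment branch (extract_file_type below is a simplified stand-in for the project's helper of the same name, and the iframe URL is hypothetical):

from urllib.parse import urlparse
from bs4 import BeautifulSoup

def extract_file_type(text):
    # simplified stand-in: recognize a few common extensions
    for ext in ('pdf', 'doc', 'docx', 'xls', 'xlsx', 'zip', 'rar'):
        if text and text.lower().endswith('.' + ext):
            return ext
    return None

html = '<iframe id="pdfContainer" src="https://example.com/files/notice.pdf"></iframe>'
soup = BeautifulSoup(html, "lxml")
for node in soup.findAll("a") or soup.findAll("iframe"):
    file_path = node.attrs.get("href", "") or node.attrs.get("src", "")
    # an id of 'pdfContainer' marks an embedded PDF viewer, so the type is known
    file_type = 'pdf' if node.attrs.get('id') == 'pdfContainer' else extract_file_type(file_path)
    if urlparse(file_path).scheme in ('http', 'https') and file_type is not None:
        print(file_type, file_path)  # -> pdf https://example.com/files/notice.pdf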