|
@@ -1,11 +1,18 @@
|
|
import re
|
|
import re
|
|
|
|
+from html import unescape
|
|
from urllib.parse import urlencode, urljoin
|
|
from urllib.parse import urlencode, urljoin
|
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
from bs4 import BeautifulSoup
|
|
-from lxml.html import HtmlElement, fromstring, tostring
|
|
|
|
|
|
+from lxml.html import etree, HtmlElement, fromstring, tostring
|
|
from urllib3 import get_host
|
|
from urllib3 import get_host
|
|
|
|
|
|
from common.log import logger
|
|
from common.log import logger
|
|
|
|
+from crawler.defaults import (
|
|
|
|
+ USELESS_TAG,
|
|
|
|
+ USELESS_ATTR,
|
|
|
|
+ TAGS_CAN_BE_REMOVE_IF_EMPTY,
|
|
|
|
+ KEYWORDS,
|
|
|
|
+)
|
|
|
|
|
|
|
|
|
|
def err_details(worker):
|
|
def err_details(worker):
|
|
@@ -39,8 +46,8 @@ def extract_domain(url):
|
|
return f"{host}" if port is None else f"{host}:{port}"
|
|
return f"{host}" if port is None else f"{host}:{port}"
|
|
|
|
|
|
|
|
|
|
-def extract_page_title(html):
|
|
|
|
- element = html2element(html)
|
|
|
|
|
|
+def extract_page_title(source):
|
|
|
|
+ element = html2element(source)
|
|
nodes = element.xpath('/html/head/title/text()|//title/text()')
|
|
nodes = element.xpath('/html/head/title/text()|//title/text()')
|
|
if len(nodes) > 1:
|
|
if len(nodes) > 1:
|
|
return "".join(";".join(nodes).split())
|
|
return "".join(";".join(nodes).split())
|
|
@@ -86,31 +93,59 @@ def get_url(url: str, parameters: dict):
|
|
return urljoin(url, _data)
|
|
return urljoin(url, _data)
|
|
|
|
|
|
|
|
|
|
-def iter_node(element: HtmlElement, depth=1):
|
|
|
|
- yield element, depth
|
|
|
|
- depth += 1
|
|
|
|
- for sub_element in element:
|
|
|
|
- if isinstance(sub_element, HtmlElement):
|
|
|
|
- yield from iter_node(sub_element, depth)
|
|
|
|
|
|
def clean_html(source: str):
    """Strip non-content markup from *source*.

    Removes, in order: HTML comments, the <html>/</html> wrapper tags,
    the entire <head> section, <script> and <style> blocks, <link> tags
    and <img> tags. Returns the remaining HTML as a string.
    """
    noise_patterns = (
        r'<!--[\s\S]*?-->',
        r'<html>|<html [^>]*>|</html>',
        r'<head>[\s\S]*?</head>',
        r'<script[^<>]*>[\s\S]*?</script>|</script>',
        r'<style[^<>]*>[\s\S]*?</style>',
        r'<link[^<>]*>[\s\S]*?',
        r'<img[^>]*>',
    )
    cleaned = source
    for pattern in noise_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def extract_text(source: str):
    """Return the visible plain text of *source*, parsed with BeautifulSoup's lxml backend."""
    return BeautifulSoup(source, "lxml").get_text()
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def verify_text(val: str, length=50):
    """Check whether *val* contains enough real content.

    Strips HTML tags, then everything that is not a digit, an ASCII letter
    or a Chinese character; returns True when at least *length* such
    characters remain, False otherwise (including when *val* is None).
    """
    if val is None:
        return False
    stripped = re.sub('<[^>]+>', '', val)
    stripped = re.sub('[^0-9a-zA-Z\u4e00-\u9fa5]+', '', stripped)
    # Fewer than *length* meaningful characters means the page has no real detail content.
    return len(stripped) >= length
|
|
|
|
|
|
|
|
|
|
def element2html(element: HtmlElement) -> str:
    """Serialize *element* back to an HTML string, with HTML entities unescaped."""
    raw = tostring(element, encoding="utf-8")
    return unescape(raw.decode())
|
|
|
|
|
|
|
|
|
|
-def html2element(html_str: str, base_url=None) -> HtmlElement:
|
|
|
|
- html_str = re.sub('\ufeff|\xa0|\u3000|\x00', '', html_str)
|
|
|
|
|
|
def html2element(source: str, base_url=None) -> HtmlElement:
    """Parse *source* into an lxml HtmlElement after scrubbing noise.

    Removes BOM/non-breaking-space control characters, HTML comments,
    <style> and <script> blocks, <br> tags, the XML declaration and the
    DOCTYPE before handing the string to lxml's fromstring.
    """
    scrub_patterns = (
        '\ufeff|\xa0|\u3000|\x00',
        '<!--[\s\S]*?-->',                    # comments
        r'<style[^<>]*>[\s\S]*?</style>',     # stylesheets
        r'<script[^<>]*>[\s\S]*?</script>',   # javascript
        '</?br.*?>',
        r'<\?xml.*?>',
        r'<[!]DOCTYPE.*?>',
    )
    text = source
    for pattern in scrub_patterns:
        text = re.sub(pattern, '', text)
    return fromstring(text, base_url=base_url)
|
|
|
|
|
|
|
|
|
|
-def valid_element(node: HtmlElement, feature: str):
|
|
|
|
- if len(node.xpath(feature)) > 0:
|
|
|
|
- return True
|
|
|
|
- else:
|
|
|
|
- return False
|
|
|
|
|
|
def iter_node(element: HtmlElement, depth=1):
    """Yield (node, depth) pairs in depth-first pre-order, starting with *element* itself."""
    yield element, depth
    for child in element:
        # Skip comment/processing-instruction nodes; recurse only into real elements.
        if isinstance(child, HtmlElement):
            yield from iter_node(child, depth + 1)
|
|
|
|
|
|
|
|
|
|
def remove_node(node: HtmlElement):
|
|
def remove_node(node: HtmlElement):
|
|
@@ -121,7 +156,8 @@ def remove_node(node: HtmlElement):
|
|
"""
|
|
"""
|
|
parent = node.getparent()
|
|
parent = node.getparent()
|
|
if parent is not None:
|
|
if parent is not None:
|
|
- parent.remove(node)
|
|
|
|
|
|
+ node.drop_tree()
|
|
|
|
+ # parent.remove(node)
|
|
|
|
|
|
|
|
|
|
def drop_tag(node: HtmlElement):
|
|
def drop_tag(node: HtmlElement):
|
|
@@ -135,32 +171,66 @@ def drop_tag(node: HtmlElement):
|
|
node.drop_tag()
|
|
node.drop_tag()
|
|
|
|
|
|
|
|
|
|
-def clean_html(html_str: str):
|
|
|
|
- html_str = re.sub(r'<!--[\s\S]*?-->', '', html_str)
|
|
|
|
- html_str = re.sub(r'<html>|<html [^>]*>|</html>', '', html_str)
|
|
|
|
- html_str = re.sub(r'<head>[\s\S]*?</head>', '', html_str)
|
|
|
|
- html_str = re.sub(r'<script[^<>]*>[\s\S]*?</script>|</script>', '', html_str)
|
|
|
|
- html_str = re.sub(r'<link[^<>]*>[\s\S]*?', '', html_str)
|
|
|
|
- html_str = re.sub(r'<style[^<>]*>[\s\S]*?</style>', '', html_str)
|
|
|
|
- html_str = re.sub(r'<img[^>]*>', '', html_str)
|
|
|
|
- return html_str
|
|
|
|
|
|
def is_empty_element(node: HtmlElement):
    """Return True when *node* has neither child elements nor any text content."""
    has_children = bool(node.getchildren())
    return not (has_children or node.text)
|
|
|
|
|
|
|
|
|
|
-def extract_text(html_str: str):
|
|
|
|
- soup = BeautifulSoup(html_str, "lxml")
|
|
|
|
- return soup.get_text()
|
|
|
|
|
|
def normalize_node(element: HtmlElement):
    """Normalize *element* in place for content extraction.

    Strips tags listed in USELESS_TAG, removes empty removable tags,
    flattens span/strong inside <p>, converts childless div/span to <p>,
    drops empty <p> tags without images, deletes inline styles, and removes
    nodes whose id/class exactly matches an entry in USELESS_ATTR.
    """
    etree.strip_elements(element, *USELESS_TAG, with_tail=False)
    # Node pre-processing. NOTE(review): when a node-removal and a node-update
    # happen in the same loop pass, the update does not take effect — reason unknown.
    # Merge empty nodes and drop noise nodes.
    for node, _ in iter_node(element):
        if node.tag.lower() in TAGS_CAN_BE_REMOVE_IF_EMPTY and is_empty_element(node):
            remove_node(node)

        # Flatten inline formatting inside paragraphs.
        if node.tag.lower() == 'p':
            etree.strip_tags(node, 'span')
            etree.strip_tags(node, 'strong')

        # if a div tag does not contain any sub node, it could be converted to p node.
        if node.tag.lower() == 'div' and not node.getchildren():
            node.tag = 'p'

        if node.tag.lower() == 'span' and not node.getchildren():
            node.tag = 'p'

        # remove empty p tag
        if node.tag.lower() == 'p' and not node.xpath('.//img'):
            if not (node.text and node.text.strip()):
                drop_tag(node)

        # Delete inline styles
        style = node.get('style')
        if style:
            del node.attrib['style']

    # Remove nodes whose id/class is a known noise attribute (exact match).
    # NOTE(review): the break stops after the FIRST matching node, leaving any
    # later noisy nodes in place — confirm whether that is intentional.
    for node, _ in iter_node(element):
        attr = (node.get('id') or node.get('class'))
        if attr:
            if attr.lower() in USELESS_ATTR:
                remove_node(node)
                break

    # # Remove invalid nodes (fuzzy match) — kept disabled in the original.
    # for node, _ in iter_node(element):
    #     attrib = (node.get('id') or node.get('class'))
    #     if attrib:
    #         for attr in USELESS_ATTR:
    #             if re.match(attr, attrib.lower()) is not None:
    #                 remove_node(node)
    #                 break
|
|
|
|
+
|
|
|
|
def pre_parse(element):
    """Apply the node-normalization pass to *element* in place and return it."""
    normalize_node(element)
    return element
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def is_title(val: str):
    """Return True when *val* matches any of the configured KEYWORDS patterns.

    Each entry of crawler.defaults.KEYWORDS is used as a regular expression
    searched anywhere in *val*; the first hit wins.

    Note: the original docstring ("检查数字、字母、中文的个数" — count digits/
    letters/Chinese characters) was copy-pasted from verify_text and did not
    describe this function's actual behavior.
    """
    return any(re.search(keyword, val) is not None for keyword in KEYWORDS)
|