@@ -2,8 +2,8 @@ import re
 from html import unescape
 from urllib.parse import urlencode, urljoin
 
+import tldextract
 from bs4 import BeautifulSoup
-from lxml.etree import ParseError
 from lxml.html import etree, HtmlElement, fromstring, tostring
 from urllib3 import get_host
 
@@ -13,7 +13,9 @@ from crawler.defaults import (
     USELESS_ATTR,
     TAGS_CAN_BE_REMOVE_IF_EMPTY,
     VALID_WORDS,
-    VOID_WORDS
+    VOID_WORDS,
+    PAGE_TEXT_CHECK_WORDS,
+    PAGE_TEXT_FILTER_WORDS
 )
 
 
@@ -24,6 +26,12 @@ def err_details(worker):
     return worker
 
 
+def split_domain(val: str):
+    if re.match(r'\d+', val) is None:
+        return re.split(r'[\\.:]', val)
+    return [val]
+
+
 def extract_host(url):
     """
@@ -33,19 +41,23 @@ def extract_host(url):
     return f"{_s}://{_h}/" if _p is None else f"{_s}://{_h}:{_p}/"
 
 
-def split_domain(val: str):
-    if re.match(r'\d+', val) is None:
-        return re.split(r'[\\.:]', val)
-    return [val]
-
-
 def extract_domain(url):
     """
+    抽取一级域名,使用点连接域和后缀字段(如果提供的域名是ipv4,就返回ipv4;)
 
-    # >>> base_url = extract_domain('http://192.168.3.207:8080/')
+    # >>> extract_domain('http://192.168.3.207:8080/')
+    192.168.3.207
+    # >>> extract_domain('http://forums.bbc.co.uk')
+    'bbc.co.uk'
     """
-    _, host, port = get_host(url)
-    return f"{host}" if port is None else f"{host}:{port}"
+    ext = tldextract.extract(url)
+    return ext.registered_domain or ext.ipv4
+
+
+def extract_fqdn(url):
+    """返回一个完全限定的域名"""
+    ext = tldextract.extract(url)
+    return ext.fqdn or ext.ipv4
 
 
 def extract_page_title(source):
@@ -53,7 +65,7 @@ def extract_page_title(source):
     try:
         element = html2element(source)
         node = element.xpath('/html/head/title/text()|//title/text()')
-    except ParseError:
+    except etree.ParserError:
         pass
     if len(node) > 1:
         return "".join(";".join(node).split())
@@ -73,11 +85,10 @@ def is_url(url):
 
 
 def is_domain(domain):
-    _regex = re.compile(
-        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
-        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
-        r'(?::\d+)?', re.IGNORECASE)
-    return re.match(_regex, domain) is not None
+    ext = tldextract.extract(domain)
+    if not ext.domain:
+        return False
+    return True
 
 
 def label_split(val):
@@ -142,6 +153,9 @@ def html2element(source: str, base_url=None) -> HtmlElement:
     html_str = re.sub('</?br.*?>', '', html_str)
     html_str = re.sub(r'<\?xml.*?>', '', html_str)
     html_str = re.sub(r'<[!]DOCTYPE.*?>', '', html_str)
+    if len(html_str) == 0:
+        # 防止因清洗页面元素,实例elementHtml对象时报错
+        html_str = '''<html lang="en"></html>'''
     return fromstring(html_str, base_url=base_url)
 
 
@@ -239,3 +253,16 @@ def check_text_by_words(val: str):
         if search is not None:
             return True
     return False
+
+
+def check_page_by_words(val: str):
+    if 7 < len(val) < 100:
+        for word in PAGE_TEXT_FILTER_WORDS:
+            search = re.search(word, val)
+            if search is not None:
+                return False
+        for keyword in PAGE_TEXT_CHECK_WORDS:
+            search = re.search(keyword, val)
+            if search is not None:
+                return True
+    return False