|
@@ -3,7 +3,6 @@ import hashlib
|
|
|
import re
|
|
|
import time
|
|
|
from collections import namedtuple
|
|
|
-from urllib.parse import urlparse, urljoin
|
|
|
|
|
|
from lxml.html import HtmlElement, fromstring, tostring
|
|
|
|
|
@@ -36,21 +35,6 @@ def remove_node(node: HtmlElement):
|
|
|
parent.remove(node)
|
|
|
|
|
|
|
|
|
-def text_search(text: str) -> SearchText:
|
|
|
- """
|
|
|
- 中文检索
|
|
|
-
|
|
|
- :param text: 文本
|
|
|
- :return: 中文数量
|
|
|
- """
|
|
|
- if not text:
|
|
|
- return SearchText(0)
|
|
|
-
|
|
|
- results = re.findall('[\u4e00-\u9fa5]', text, re.S)
|
|
|
- # 列表长度即是中文的字数
|
|
|
- return SearchText(len(results))
|
|
|
-
|
|
|
-
|
|
|
def verify_text(val: str):
|
|
|
"""检查数字、字母、中文的个数"""
|
|
|
if val is None:
|
|
@@ -137,20 +121,3 @@ def delay_by_day(days, fmt="%Y-%m-%d %H:%M:%S"):
|
|
|
_days = int(days)
|
|
|
_current_now = datetime.datetime.now()
|
|
|
return (_current_now + datetime.timedelta(days=_days)).strftime(fmt)
|
|
|
-
|
|
|
-
|
|
|
-def compliance_href(href: str):
|
|
|
- if href in [None, ''] or re.match("^((https|http|ftp|rtsp|mms)?://|/\w+\\?)", href) is None:
|
|
|
- return False
|
|
|
- return True
|
|
|
-
|
|
|
-
|
|
|
-def is_href(href: str):
|
|
|
- result = urlparse(href)
|
|
|
- if all([len(result.scheme) == 0 and len(result.netloc) == 0]):
|
|
|
- return False
|
|
|
- return True
|
|
|
-
|
|
|
-
|
|
|
-def join_url(base, url):
|
|
|
- return urljoin(base, url)
|