import datetime
import hashlib
import re
import time
from bs4 import BeautifulSoup
from lxml.html import HtmlElement, fromstring, tostring
def element2html(element: HtmlElement) -> str:
    """Serialize an lxml HTML element to a markup string.

    :param element: the element to serialize
    :return: the markup, produced as UTF-8 bytes by ``tostring`` and then
        decoded back to ``str``
    """
    raw_bytes = tostring(element, encoding="utf-8")
    return raw_bytes.decode()
def html2element(html_str: str, base_url=None) -> HtmlElement:
    """Parse an HTML string into an lxml element after light sanitizing.

    Before parsing, the following are stripped from the text:

    * BOM, non-breaking space, ideographic space and NUL characters
    * ``<br>`` tags (opening, self-closing and closing forms)
    * XML declarations (``<?xml ... ?>``)
    * DOCTYPE declarations

    :param html_str: raw HTML text
    :param base_url: optional base URL handed through to ``fromstring``
    :return: the parsed element
    """
    # Invisible / padding characters that only add noise to extracted text.
    html_str = re.sub('\ufeff|\xa0|\u3000|\x00', '', html_str)
    # The previous pattern started with a bare "?" quantifier, which makes
    # ``re`` raise "nothing to repeat" on every call. It is restored here as
    # a real tag pattern; ``\b`` keeps it from eating tags that merely start
    # with "br". NOTE(review): reconstructed pattern -- confirm against the
    # intended original.
    html_str = re.sub(r'</?br\b[^>]*>', '', html_str)
    # NOTE(review): presumably removed because lxml rejects ``str`` input
    # that carries an encoding declaration -- confirm.
    html_str = re.sub(r'<\?xml.*?>', '', html_str)
    html_str = re.sub(r'<[!]DOCTYPE.*?>', '', html_str)
    return fromstring(html_str, base_url=base_url)
def valid_element(node: HtmlElement, feature: str):
    """Report whether an XPath expression selects anything from ``node``.

    :param node: element the expression is evaluated against
    :param feature: XPath expression
    :return: ``True`` when the XPath result has at least one item,
        ``False`` otherwise
    """
    matches = node.xpath(feature)
    return len(matches) > 0
def remove_node(node: HtmlElement):
    """Detach ``node`` from its parent element.

    The tree is modified in place, so nothing is returned. A node that has
    no parent is left untouched.

    :param node: element to remove from its parent
    :return: None
    """
    owner = node.getparent()
    if owner is None:
        return
    owner.remove(node)
def clean_html(html_str: str):
    """Strip non-content markup from an HTML string.

    Removed, in order: HTML comments, the ``<html>`` wrapper tags, the whole
    ``<head>`` section, ``<script>`` elements (plus stray closing tags),
    ``<style>`` elements, ``<link>`` tags and ``<img>`` tags.

    NOTE(review): the patterns previously stored here had lost their tag
    text (several were empty, and two string literals were split across
    lines, which is a syntax error). They were rebuilt from the fragments
    that survived; confirm the exact tag list against the intended original.

    :param html_str: raw HTML text
    :return: the text with the markup listed above removed
    """
    patterns = (
        r'<!--[\s\S]*?-->',                           # comments
        r'<html>|<html [^>]*>|</html>',               # document wrapper tags
        r'<head>[\s\S]*?</head>',                     # entire head section
        r'<script[^<>]*>[\s\S]*?</script>|</script>',  # scripts, stray closers
        r'<style[^<>]*>[\s\S]*?</style>',             # inline stylesheets
        r'<link[^<>]*>',                              # external resource links
        r'<img[^>]*>',                                # images
    )
    for pattern in patterns:
        html_str = re.sub(pattern, '', html_str)
    return html_str
def extract_text(html_str: str):
    """Return the text content of an HTML string.

    The markup is parsed by BeautifulSoup using the ``lxml`` parser and the
    result of ``get_text()`` is returned.

    :param html_str: HTML text
    :return: the extracted text
    """
    return BeautifulSoup(html_str, "lxml").get_text()
def verify_text(val: str, length=50):
    """Check that text holds enough digits, ASCII letters and CJK characters.

    Tags of the form ``<...>`` are dropped first, then every character that
    is not a digit, an ASCII letter or in the range U+4E00..U+9FA5 is
    dropped. What remains is measured against ``length``.

    :param val: text to check; ``None`` is treated as invalid
    :param length: minimum number of remaining characters required
    :return: ``False`` when ``val`` is ``None`` or the remaining text is
        shorter than ``length`` (no real detail content), ``True`` otherwise
    """
    if val is None:
        return False

    without_tags = re.sub('<[^>]+>', '', val)
    significant = re.sub('[^0-9a-zA-Z\u4e00-\u9fa5]+', '', without_tags)

    # Text shorter than the threshold means the page has no detail content.
    too_short = len(significant) < length
    return not too_short
def sha1(text: str):
    """Return the SHA-1 digest of ``text`` as a hexadecimal string.

    :param text: string to hash; it is encoded as UTF-8 first
    :return: hexadecimal digest
    """
    return hashlib.sha1(text.encode("utf-8")).hexdigest()
def get_ms() -> int:
    """Return the current Unix time in milliseconds, rounded to an integer."""
    millis = time.time() * 1000
    return int(round(millis))
def get_current_date():
    """Return today's local date formatted as ``YYYY-MM-DD``."""
    return f"{datetime.datetime.now():%Y-%m-%d}"
def ms2date(ms: int, fmt="%Y-%m-%d %H:%M:%S"):
    """Convert a millisecond timestamp to a formatted local-time string.

    :param ms: timestamp in milliseconds
    :param fmt: ``time.strftime`` format string
    :return: the formatted local date/time
    """
    seconds = float(ms / 1000)
    return time.strftime(fmt, time.localtime(seconds))
def convert2type(ts_str):
    """Convert a millisecond timestamp given as a string to integer seconds.

    The value is parsed as a float, divided by 1000 and truncated by ``int``.

    :param ts_str: millisecond timestamp (anything ``float()`` accepts)
    :return: timestamp in whole seconds
    """
    millis = float(ts_str)
    return int(millis / 1000)
def ts2date(ts_str, fmt="%Y-%m-%d %H:%M:%S") -> str:
    """Convert a millisecond timestamp to a formatted local-time string.

    :param ts_str: millisecond timestamp (anything ``float()`` accepts)
    :param fmt: ``time.strftime`` format string
    :return: the formatted local date/time
    """
    whole_seconds = int(float(ts_str) / 1000)
    local_struct = time.localtime(whole_seconds)
    return time.strftime(fmt, local_struct)
def date2ts(date_str: str, fmt="%Y-%m-%d"):
    """Convert a date string to a Unix timestamp in whole seconds.

    The string is parsed with ``time.strptime`` and converted with
    ``time.mktime``, i.e. it is interpreted as local time.

    :param date_str: date text matching ``fmt``
    :param fmt: ``time.strptime`` format string
    :return: integer timestamp in seconds
    """
    parsed = time.strptime(date_str, fmt)
    return int(time.mktime(parsed))
def delay_by(delay=0, method='seconds', fmt="%Y-%m-%d %H:%M:%S"):
    """Return the current local time shifted by ``delay`` units, formatted.

    :param delay: amount to shift by (negative values shift into the past)
    :param method: unit of ``delay``: ``'days'``, ``'hours'``, ``'minutes'``
        or ``'microseconds'``; any other value is treated as seconds
    :param fmt: ``strftime`` format string for the result
    :return: the shifted date/time as a formatted string
    """
    now = datetime.datetime.now()
    # A tuple (not a set) so an unhashable ``method`` still falls through
    # to the seconds default instead of raising.
    known_units = ('days', 'hours', 'minutes', 'microseconds')
    unit = method if method in known_units else 'seconds'
    shift = datetime.timedelta(**{unit: delay})
    return (now + shift).strftime(fmt)