import datetime
import hashlib
import re
import time

from lxml.html import HtmlElement, fromstring, tostring


def element2html(element: HtmlElement) -> str:
    """Serialize an lxml element to an HTML string.

    :param element: element to serialize
    :return: the markup, serialized as UTF-8 bytes and decoded back to ``str``
    """
    return tostring(element, encoding="utf-8").decode()


def html2element(html_str: str) -> HtmlElement:
    """Parse an HTML string into an lxml element after light cleanup.

    Before parsing, the following are stripped from the text: the characters
    U+FEFF, U+00A0 and U+3000, any ``<?xml ...>`` declaration and any
    ``<!DOCTYPE ...>`` declaration (``.`` does not cross newlines, so only
    declarations that close on the same line are matched).

    :param html_str: raw HTML text
    :return: the element returned by ``lxml.html.fromstring``
    """
    html_str = re.sub('\ufeff|\xa0|\u3000', '', html_str)
    # NOTE(review): as written this substitution has an empty pattern and
    # therefore changes nothing. The intended pattern appears to have been
    # lost (possibly an HTML-comment stripper) -- confirm and restore it, or
    # delete this line.
    html_str = re.sub('', '', html_str)
    html_str = re.sub(r'<\?xml.*?>', '', html_str)
    html_str = re.sub(r'<[!]DOCTYPE.*?>', '', html_str)
    return fromstring(html_str)


def valid_element(node: HtmlElement, feature: str) -> bool:
    """Report whether an XPath expression selects anything under ``node``.

    :param node: element the expression is evaluated against
    :param feature: XPath expression
    :return: ``True`` when the XPath result is non-empty
    """
    return len(node.xpath(feature)) > 0


def remove_node(node: HtmlElement) -> None:
    """Detach ``node`` from its parent.

    This is an in-place operation; nothing is returned. A node without a
    parent is left untouched.

    :param node: element to remove from the tree
    """
    parent = node.getparent()
    if parent is not None:
        parent.remove(node)


def verify_text(val: str) -> bool:
    """Check that a text holds enough digits, ASCII letters and CJK characters.

    Tag-like ``<...>`` spans are removed first, then every character outside
    ``0-9``, ``a-z``, ``A-Z`` and U+4E00..U+9FA5 is removed.

    :param val: text (may contain markup), or ``None``
    :return: ``False`` for ``None`` or when fewer than 50 characters remain,
        ``True`` otherwise
    """
    if val is None:
        return False

    for pattern in ('<[^>]+>', '[^0-9a-zA-Z\u4e00-\u9fa5]+'):
        val = re.sub(pattern, '', val)

    # Fewer than 50 remaining characters is treated as "no detail content".
    return len(val) >= 50


def sha1(text: str) -> str:
    """Return the SHA-1 digest of ``text`` as a hexadecimal string.

    :param text: text to hash; it is encoded as UTF-8 first
    :return: hexadecimal digest
    """
    digest = hashlib.sha1()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()


def get_ms() -> int:
    """Return the current Unix time in milliseconds, rounded to an integer."""
    return int(round(time.time() * 1000))


def get_current_date() -> str:
    """Return today's local date formatted as ``YYYY-MM-DD``."""
    return datetime.datetime.now().strftime("%Y-%m-%d")


def ms2date(ms: int, fmt="%Y-%m-%d %H:%M:%S") -> str:
    """Convert a millisecond timestamp to a formatted local-time string.

    :param ms: Unix timestamp in milliseconds
    :param fmt: ``time.strftime`` format
    :return: formatted date string
    """
    timestamp = float(ms / 1000)
    return time.strftime(fmt, time.localtime(timestamp))


def convert2type(ts_str) -> int:
    """Convert a millisecond timestamp (string or number) to integer seconds.

    :param ts_str: millisecond timestamp, anything ``float()`` accepts
    :return: whole seconds, truncated toward zero
    """
    return int(float(ts_str) / 1000)


def ts2date(ts_str, fmt="%Y-%m-%d %H:%M:%S") -> str:
    """Convert a millisecond timestamp to a formatted local-time string.

    :param ts_str: millisecond timestamp, anything ``float()`` accepts
    :param fmt: ``time.strftime`` format
    :return: formatted date string
    """
    # Same milliseconds-to-seconds conversion that convert2type performs.
    return time.strftime(fmt, time.localtime(convert2type(ts_str)))


def date2ts(date_str: str, fmt="%Y-%m-%d") -> int:
    """Convert a date string to a Unix timestamp in whole seconds.

    :param date_str: date text matching ``fmt``
    :param fmt: ``time.strptime`` format
    :return: timestamp in seconds; ``time.mktime`` treats the parsed value
        as local time
    """
    return int(time.mktime(time.strptime(date_str, fmt)))


def delay_by(delay=0, method='seconds', fmt="%Y-%m-%d %H:%M:%S") -> str:
    """Return the current local time shifted by ``delay``, as a string.

    :param delay: amount to add (may be negative)
    :param method: unit of ``delay``: ``'days'``, ``'hours'``, ``'minutes'``
        or ``'microseconds'``; any other value is treated as seconds
    :param fmt: ``strftime`` format of the result
    :return: formatted shifted time
    """
    now = datetime.datetime.now()
    # Only these units are dispatched by name; everything else means seconds.
    unit = method if method in ('days', 'hours', 'minutes', 'microseconds') else 'seconds'
    shift = datetime.timedelta(**{unit: delay})
    return (now + shift).strftime(fmt)