|
@@ -23,7 +23,7 @@ def extract_base_url(url):
|
|
|
return f"{_s}://{_h}/" if _p is None else f"{_s}://{_h}:{_p}/"
|
|
|
|
|
|
|
|
|
def split_domain(val: str):
    """Split *val* into its dot/colon-separated labels.

    Values that begin with a digit (presumably IP addresses) are returned
    whole as a single-element list instead of being split.
    """
    # NOTE(review): re.match anchors at the start only, so ANY leading digit
    # (e.g. "163.com", not just full IPs) suppresses splitting — confirm intended.
    starts_with_digit = re.match(r'\d+', val) is not None
    if starts_with_digit:
        return [val]
    # NOTE(review): the class also contains a literal backslash — confirm
    # that splitting on '\\' is deliberate and not an over-escaped '.'.
    return re.split(r'[\\.:]', val)
|
|
@@ -40,9 +40,9 @@ def extract_domain(url):
|
|
|
|
|
|
def extract_page_title(html):
    """Return the page <title> text with all internal whitespace removed.

    When the document yields several title text nodes, the last one wins;
    with zero or one node the (possibly empty) concatenation is used.
    """
    element = html2element(html)
    nodes = element.xpath('/html/head/title/text()|//title/text()')
    # Pick the text to clean: last node if there are duplicates, else join all.
    raw = nodes[-1] if len(nodes) > 1 else "".join(nodes)
    # Collapse every run of whitespace (spaces, tabs, newlines) to nothing.
    return "".join(raw.split())
|
|
|
|
|
|
|
|
@@ -67,7 +67,7 @@ def is_domain(domain):
|
|
|
|
|
|
|
|
|
def label_split(val):
    """Split a label string on common ASCII/CJK separator punctuation.

    Separators: '-', '_', space, ',', ',', '\\', '.', '|', '「', '」',
    '【', '】', '?', '?', '!', '!', '/', '、' (each optionally followed
    by spaces).  Empty fragments are dropped from the result.
    """
    # Bug fix: in the old pattern the sequence ``|-「`` formed an accidental
    # character range U+007C..U+300C, so '}', '~', accented letters and most
    # pre-CJK code points were silently treated as separators.  The hyphen is
    # now placed first in the class, where it is an unambiguous literal.
    result = re.split(r'[-_ ,,\\.|「」【】??!!/、] *', val)
    return [v for v in result if v]
|