dongzhaorui hai 3 anos
pai
achega
0c2316c60f
Modificáronse 1 ficheiros con 4 adicións e 4 borrados
  1. 4 4
      find_source/crawler/utils.py

+ 4 - 4
find_source/crawler/utils.py

@@ -23,7 +23,7 @@ def extract_base_url(url):
     return f"{_s}://{_h}/" if _p is None else f"{_s}://{_h}:{_p}/"
     return f"{_s}://{_h}/" if _p is None else f"{_s}://{_h}:{_p}/"
 
 
 
 
-def parser_domain(val: str):
+def split_domain(val: str):
     if re.match(r'\d+', val) is None:
     if re.match(r'\d+', val) is None:
         return re.split(r'[\\.:]', val)
         return re.split(r'[\\.:]', val)
     return [val]
     return [val]
@@ -40,9 +40,9 @@ def extract_domain(url):
 
 
 def extract_page_title(html):
 def extract_page_title(html):
     element = html2element(html)
     element = html2element(html)
-    nodes = element.xpath('/html/head/title/text()')
+    nodes = element.xpath('/html/head/title/text()|//title/text()')
     if len(nodes) > 1:
     if len(nodes) > 1:
-        return "".join("".format(nodes[-1]).split())
+        return "".join("".join(nodes[-1]).split())
     return "".join("".join(nodes).split())
     return "".join("".join(nodes).split())
 
 
 
 
@@ -67,7 +67,7 @@ def is_domain(domain):
 
 
 
 
 def label_split(val):
 def label_split(val):
-    '~`!#$%^&*()_+-=|\';"":/.,?><~·!@#¥%……&*()——+-=“:’;、。,?》{《}】【\n\]\[ '
+    # '~`!#$%^&*()_+-=|\';"":/.,?><~·!@#¥%……&*()——+-=“:’;、。,?》{《}】【\n\]\[ '
     result = re.split(r'[- _,,\\.|-「」【】??!!/、] *', val)
     result = re.split(r'[- _,,\\.|-「」【】??!!/、] *', val)
     result = [v for v in result if len(v) > 0]
     result = [v for v in result if len(v) > 0]
     return result
     return result