dongzhaorui 3 жил өмнө
parent
commit
0c2316c60f

+ 4 - 4
find_source/crawler/utils.py

@@ -23,7 +23,7 @@ def extract_base_url(url):
     return f"{_s}://{_h}/" if _p is None else f"{_s}://{_h}:{_p}/"
 
 
-def parser_domain(val: str):
+def split_domain(val: str):
     if re.match(r'\d+', val) is None:
         return re.split(r'[\\.:]', val)
     return [val]
@@ -40,9 +40,9 @@ def extract_domain(url):
 
 def extract_page_title(html):
     element = html2element(html)
-    nodes = element.xpath('/html/head/title/text()')
+    nodes = element.xpath('/html/head/title/text()|//title/text()')
     if len(nodes) > 1:
-        return "".join("".format(nodes[-1]).split())
+        return "".join("".join(nodes[-1]).split())
     return "".join("".join(nodes).split())
 
 
@@ -67,7 +67,7 @@ def is_domain(domain):
 
 
 def label_split(val):
-    '~`!#$%^&*()_+-=|\';"":/.,?><~·!@#¥%……&*()——+-=“:’;、。,?》{《}】【\n\]\[ '
+    # '~`!#$%^&*()_+-=|\';"":/.,?><~·!@#¥%……&*()——+-=“:’;、。,?》{《}】【\n\]\[ '
     result = re.split(r'[- _,,\\.|-「」【】??!!/、] *', val)
     result = [v for v in result if len(v) > 0]
     return result