3 lat temu · 3f4ab35e1e
--- a/find_source/crawler/utils.py
+++ b/find_source/crawler/utils.py
@@ -1,3 +1,5 @@
 
				+import re
			
 
				+
			
 
				 from urllib3 import get_host
			
 
				 
			
 
				 from common.log import logger
			
@@ -35,3 +37,15 @@ def extract_page_title(html):
 
				     if len(nodes) > 1:
			
 
				         return "".format(nodes[-1]).strip()
			
 
				     return "".join(nodes).strip()
			
 
				+
			
 
				+
			
 
				+def valid_url(url):
			
 
				+    """判断url格式畸形与否"""
			
 
				+    _regex = re.compile(
			
 
				+        r'^(?:http|ftp)s?://'  # http:// or https://
			
 
				+        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
			
 
				+        r'localhost|'  # localhost...
			
 
				+        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
			
 
				+        r'(?::\d+)?'  # optional port
			
 
				+        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
			
 
				+    return re.match(_regex, url) is not None