před 3 roky · 6886bf314d
--- a/find_source/crawler/utils.py
+++ b/find_source/crawler/utils.py
@@ -39,7 +39,7 @@ def extract_page_title(html):
 
				     return "".join(nodes).strip()
			
 
				 
			
 
				 
			
 
				-def valid_url(url):
			
 
				+def is_url(url):
			
 
				     """判断url格式畸形与否"""
			
 
				     _regex = re.compile(
			
 
				         r'^(?:http|ftp)s?://'  # http:// or https://
			
@@ -49,3 +49,11 @@ def valid_url(url):
 
				         r'(?::\d+)?'  # optional port
			
 
				         r'(?:/?|[/?]\S+)$', re.IGNORECASE)
			
 
				     return re.match(_regex, url) is not None
			
 
				+
			
 
				+
			
 
				+def is_domain(domain):
			
 
				+    _regex = re.compile(
			
 
				+        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
			
 
				+        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
			
 
				+        r'(?::\d+)?', re.IGNORECASE)
			
 
				+    return re.match(_regex, domain) is not None