|
@@ -39,7 +39,7 @@ def extract_page_title(html):
|
|
|
return "".join(nodes).strip()
|
|
|
|
|
|
|
|
|
-def valid_url(url):
|
|
|
+def is_url(url):
|
|
|
"""判断url格式畸形与否"""
|
|
|
_regex = re.compile(
|
|
|
r'^(?:http|ftp)s?://' # http:// or https://
|
|
@@ -49,3 +49,11 @@ def valid_url(url):
|
|
|
r'(?::\d+)?' # optional port
|
|
|
r'(?:/?|[/?]\S+)$', re.IGNORECASE)
|
|
|
return re.match(_regex, url) is not None
|
|
|
+
|
|
|
+
|
|
|
def is_domain(domain):
    """Return True if ``domain`` is a host name or dotted IP, optionally with a port.

    The whole string must match; input that merely *starts* with a domain
    (for example ``"example.com/path"``) is rejected.

    Accepted forms (case-insensitive):
      * one or more dot-separated labels followed by a top-level label,
        with an optional trailing dot, e.g. ``example.com`` or
        ``sub.example.co.uk.``
      * four dot-separated groups of 1-3 digits, e.g. ``192.168.1.1``
        (the numeric range of each group is not checked)
      * either of the above followed by ``:<digits>`` as a port

    :param domain: the string to check.
    :return: ``True`` when the string is well-formed, ``False`` otherwise.
    """
    _regex = re.compile(
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        # re.match only anchors the start; without an end anchor any
        # trailing text after a valid prefix would be accepted.
        r'\Z',
        re.IGNORECASE)
    return _regex.match(domain) is not None
|