|
@@ -1,3 +1,5 @@
|
|
|
+import re
|
|
|
+
|
|
|
from urllib3 import get_host
|
|
|
|
|
|
from common.log import logger
|
|
@@ -35,3 +37,15 @@ def extract_page_title(html):
|
|
|
if len(nodes) > 1:
|
|
|
return "".format(nodes[-1]).strip()
|
|
|
return "".join(nodes).strip()
|
|
|
+
|
|
|
+
|
|
|
# Compiled once at import time so valid_url() does not rebuild it per call.
_URL_REGEX = re.compile(
    r'^(?:http|ftp)s?://'  # scheme: http, https, ftp or ftps
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or dotted-quad IP (octets are not range-checked)
    r'(?::\d+)?'  # optional port
    # \Z instead of $: "$" also matches just before a trailing newline,
    # which would let "http://example.com\n" pass as well-formed.
    r'(?:/?|[/?]\S+)\Z',
    re.IGNORECASE,
)


def valid_url(url):
    """Return True if *url* is a well-formed http(s)/ftp(s) URL, else False.

    The check is purely syntactic: a scheme (http, https, ftp, ftps), then a
    host (domain name, ``localhost`` or a dotted-quad of 1-3 digit groups),
    an optional ``:port`` and an optional path/query with no whitespace.
    Matching is case-insensitive. No network access is performed.

    Args:
        url: The candidate URL string.

    Returns:
        bool: True when the whole string matches the URL pattern; False
        otherwise, including when *url* is not a string (e.g. ``None``).
    """
    try:
        return _URL_REGEX.match(url) is not None
    except TypeError:
        # Non-string input (None, bytes, numbers, ...) cannot be a valid URL.
        return False
|