Przeglądaj źródła

添加url格式畸形检验

dongzhaorui 3 lat temu
rodzic
commit
3f4ab35e1e
1 zmienionych plików z 14 dodań i 0 usunięć
  1. 14 0
      find_source/crawler/utils.py

+ 14 - 0
find_source/crawler/utils.py

@@ -1,3 +1,5 @@
+import re
+
 from urllib3 import get_host
 
 from common.log import logger
@@ -35,3 +37,15 @@ def extract_page_title(html):
     if len(nodes) > 1:
         return "".format(nodes[-1]).strip()
     return "".join(nodes).strip()
+
+
def valid_url(url):
    """Return True when *url* is a well-formed URL, False when it is malformed.

    A URL is accepted when it consists of, in order:

    * a scheme: ``http``, ``https``, ``ftp`` or ``ftps`` followed by ``://``;
    * a host: a dotted domain name, ``localhost``, or four dot-separated
      groups of 1-3 digits (the groups are not range-checked);
    * an optional ``:port``;
    * an optional path/query starting with ``/`` or ``?`` that contains no
      whitespace.

    Matching is case-insensitive. Anything that is not a ``str`` (for
    example ``None`` or ``bytes``) is reported as invalid instead of raising.

    :param url: the candidate URL
    :return: ``True`` if *url* matches the expected format, else ``False``
    """
    if not isinstance(url, str):
        return False

    _regex = re.compile(
        r'^(?:http|ftp)s?://'  # http://, https://, ftp:// or ftps://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        # \Z instead of $: "$" also matches just before a trailing newline,
        # which would let "http://example.com\n" pass as valid.
        r'(?:/?|[/?]\S+)\Z', re.IGNORECASE)
    return _regex.match(url) is not None