Add three filter modes for href extraction

dongzhaorui 3 years ago
parent commit 93db197ac1

+ 16 - 3
find_source/crawler/analysis/DomAnalysis.py

@@ -3,6 +3,13 @@ import re
 from bs4 import BeautifulSoup
 
 from crawler.analysis.FilterUrl import FilterUrl
+from crawler.utils import extract_fqdn
+
+FILTER_MODES = {
+    0: 'general_mode',
+    1: 'same_origin_mode',
+    2: 'non_origin_mode',
+}
 
 
 class DomAnalysis(FilterUrl):
@@ -14,11 +21,17 @@ class DomAnalysis(FilterUrl):
         Comment
     """
 
-    def __init__(self, isogeny: bool, dom: str, addr: str):
+    def __init__(self, dom: str, url: str, mode=0):
+        """
+        :param dom: page source (HTML) to be parsed
+        :param url: URL of the page currently being visited
+        :param mode: filter mode: same-origin mode = 1; non-origin mode = 2; general mode = 0
+        """
         self.pattern = re.compile("href=([a-zA-Z0-9'\"+?=.%/_]*)")
-        self.isogeny = isogeny
         self.soup = BeautifulSoup(dom, "lxml")
-        self.addr = addr  # request URL
+        self.mode = FILTER_MODES[mode]
+        self.request_url = url
+        self.domain = extract_fqdn(url)
 
     def show_html(self):
        # https://www.crummy.com/software/BeautifulSoup/bs3/documentation.zh.html notes that prettify() returns a unicode string
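
For reference, a minimal sketch of how the reworked constructor could be driven (the page URL and markup are hypothetical; extract_fqdn is assumed to return the fully qualified domain name of the URL, per crawler.utils):

    from crawler.analysis.DomAnalysis import DomAnalysis, FILTER_MODES

    html = '<a href="./news/detail.html">news</a>'
    # mode=1 maps to 'same_origin_mode' through the FILTER_MODES lookup
    dom = DomAnalysis(dom=html, url='http://www.example.com/index.html', mode=1)
    print(dom.mode)    # 'same_origin_mode'
    print(dom.domain)  # e.g. 'www.example.com', as returned by extract_fqdn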

+ 34 - 16
find_source/crawler/analysis/FilterUrl.py

@@ -1,40 +1,58 @@
 import re
 from urllib.parse import urljoin
 
-from crawler.utils import extract_host, is_url
+from crawler.utils import is_url
 
 
 class FilterUrl:
 
     def judge(self, link):
-        if link == None:
+        if link is None:
             return False
-        elif link == '/' or link == '#':
-            return False
-        elif link.find('javascript:') == 0 or link.find('//') == 0:
+
+        if any([
+            link == '/',
+            link == '#',
+            link.find('javascript:') == 0,
+            link.find('//') == 0
+        ]):
             return False
-        elif link.find('.htm') == -1 and re.search('([.][a-zA-Z]{3,5})$', link) is not None:
+
+        link_suffix = re.search('([.][a-zA-Z]{3,5})$', link)
+        if all([link.find('.htm') == -1, link_suffix is not None]):
             return False
+
         return True
 
     def urljoin(self, link):
         if link.find('http') != 0:
-            return urljoin(self.addr, link).rstrip(' ')
+            # if the href is a relative path like ./xxx/xxx.html, join it with request_url to build the full URL
+            return urljoin(self.request_url, link).rstrip(' ')
         else:
             return link.rstrip(' ')
 
     def filter(self, link):
         if not is_url(link):
             return False
-
-        if self.isogeny:
-            # 同源策略
-            if link.find('http') != -1 and link.find(self.addr) == -1:
-                return False
-        else:
-            # 非同源策略
-            if link.find(extract_host(self.addr)) != -1:
-                return False
+        # general mode (keep every link that conforms to the URL spec)
+        general_mode = all([
+            self.mode == 'general_mode',
+            link.find('http') == -1
+        ])
+        # same-origin mode (keep only links whose host matches the request URL's host)
+        same_origin_mode = all([
+            self.mode == 'same_origin_mode',
+            link.find('http') != -1,
+            link.find(self.domain) == -1,  # link does not contain the request domain (host)
+        ])
+        # non-origin mode (drop links whose host matches the request URL's host)
+        non_origin_mode = all([
+            self.mode == 'non_origin_mode',
+            link.find('http') != -1,
+            link.find(self.domain) != -1,  # link contains the request domain (host)
+        ])
+        if any([general_mode, same_origin_mode, non_origin_mode]):
+            return False
         return True
 
     def onclick_filter(self, link):
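
The filter() logic condenses to a truth table: return False (drop the link) as soon as the active mode's condition set matches, otherwise True (keep it). A self-contained sketch of that table, using a hypothetical keep() helper and domain, handy for checking the three modes outside the crawler:

    def keep(mode: str, domain: str, link: str) -> bool:
        """Mirrors FilterUrl.filter(): True = keep, False = drop."""
        has_http = link.find('http') != -1
        has_domain = link.find(domain) != -1
        if mode == 'general_mode' and not has_http:
            return False  # general mode drops anything that is not an http(s) URL
        if mode == 'same_origin_mode' and has_http and not has_domain:
            return False  # same-origin mode drops foreign hosts
        if mode == 'non_origin_mode' and has_http and has_domain:
            return False  # non-origin mode drops the request host itself
        return True

    assert keep('same_origin_mode', 'example.com', 'http://example.com/a')
    assert not keep('same_origin_mode', 'example.com', 'http://other.org/b')
    assert not keep('non_origin_mode', 'example.com', 'http://example.com/a')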

+ 8 - 18
find_source/crawler/analysis/__init__.py

@@ -9,30 +9,20 @@ __all__ = [
 ]
 
 
-def parse_urls(dom: str, **kwargs):
-    dom_handler = DomAnalysis(dom=dom, **kwargs)
+def parse_urls(dom, url, **kwargs):
+    dom_handler = DomAnalysis(dom=dom, url=url, **kwargs)
     return dom_handler.get_urls()
 
 
-def parser_items(dom: str, **kwargs):
-    dom_handler = DomAnalysis(dom=dom, **kwargs)
+def parser_items(dom, url, **kwargs):
+    dom_handler = DomAnalysis(dom=dom, url=url, **kwargs)
     return dom_handler.get_items()
 
 
 class Parser:
 
-    def __init__(self):
-        self._parse_urls = parse_urls
-        self._parser_items = parser_items
+    def urls(self, dom, url, **kwargs):
+        return parse_urls(dom, url=url, **kwargs)
 
-    def _site_items(self, dom, **kwargs):
-        return self._parser_items(dom, **kwargs)
-
-    def urls(self, isogeny, dom, **kwargs):
-        return self._parse_urls(dom, isogeny=isogeny, **kwargs)
-
-    def same_origin(self, dom, host):
-        return self._site_items(dom, isogeny=True, addr=host)
-
-    def non_origin(self, dom, url):
-        return self._site_items(dom, isogeny=False, addr=url)
+    def turls(self, dom, url, **kwargs):
+        return parser_items(dom, url=url, **kwargs)