dongzhaorui committed 3 years ago (parent commit 2860fdd7f4)

find_source/crawler/analysis/DomAnalysis.py (+24, -9)

@@ -14,10 +14,25 @@ class DomAnalysis(FilterUrl):
         Comment
     """
 
-    def __init__(self, dom, base_url):
+    def __init__(self, isogeny: bool, dom: str, host=None, request_url=None):
         self.soup = BeautifulSoup(dom, "lxml")
-        self.url = base_url
         self.pattern = re.compile("href=([a-zA-Z0-9'\"+?=.%/_]*)")
+        self.isogeny = isogeny
+        if self.isogeny:
+            if host is None:
+                raise TypeError(
+                    '{} missing 1 required positional argument: {}'.format(
+                        self.__class__.__name__, 'host')
+                )
+            self.host = host  # host address of the target site
+        if not self.isogeny:
+            if request_url is None:
+                raise TypeError(
+                    '{} missing 1 required positional argument: {}'.format(
+                        self.__class__.__name__, 'request_url'
+                    )
+                )
+            self.request_url = request_url  # URL of the current request
 
     def show_html(self):
         # https://www.crummy.com/software/BeautifulSoup/bs3/documentation.zh.html (note: prettify() returns a unicode string)
@@ -26,22 +41,22 @@ class DomAnalysis(FilterUrl):
     def _is_input_with_onclick(self, tag):
         return (tag.name == 'input') and (tag.get('type')=='button') and tag.has_attr('onclick')
 
-    def get_urls(self, **kwargs):
+    def get_urls(self):
         urls = []
         # parse links from the static page and from dynamic JavaScript
         for tag in self.soup.find_all('a'):
-            if self.judge(tag.get('href'), **kwargs):
+            if self.judge(tag.get('href')):
                 urls.append(self.filter(tag.get('href')))
 
         # automatic interaction: extract interactively generated links via static parsing
         for tag in self.soup.find_all():
             if self._is_input_with_onclick(tag):
                 for item in re.findall(self.pattern, tag.get('onclick')):
-                    if self.judge(self.onclick_filter(item), **kwargs):
+                    if self.judge(self.onclick_filter(item)):
                         urls.append(self.filter(self.onclick_filter(item)))
         return urls
 
-    def get_items(self, **kwargs):
+    def get_items(self):
         items = []
 
         def _extract():
@@ -55,17 +70,17 @@ class DomAnalysis(FilterUrl):
                 href = self.filter(tag.get('href'))
             except ValueError:
                 return
-            data = {'name': name, 'host': href}
+            data = {'title': name, 'href': href}
             if data not in items:
                 items.append(data)
 
         for tag in self.soup.find_all('a'):
-            if self.judge(tag.get('href'), **kwargs):
+            if self.judge(tag.get('href')):
                 _extract()
 
         for tag in self.soup.find_all():
             if self._is_input_with_onclick(tag):
                 for item in re.findall(self.pattern, tag.get('onclick')):
-                    if self.judge(self.onclick_filter(item), **kwargs):
+                    if self.judge(self.onclick_filter(item)):
                         _extract()
         return items
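
The reworked constructor now selects its mode with `isogeny`: same-origin analysis requires `host`, cross-origin analysis requires `request_url`, and omitting the one the chosen mode needs raises a TypeError. A minimal usage sketch, assuming the `crawler` package is on the import path; the HTML string and URLs are illustrative, not from the repository:

    from crawler.analysis.DomAnalysis import DomAnalysis

    html = ('<a href="/news/list.html">News</a>'
            '<a href="http://other.example/x.html">Other</a>')

    # Same-origin mode: 'host' is required; absolute links off the host are dropped.
    same = DomAnalysis(isogeny=True, dom=html, host='http://www.example.gov.cn')
    print(same.get_urls())    # ['http://www.example.gov.cn/news/list.html']

    # Cross-origin mode: 'request_url' is required instead.
    other = DomAnalysis(isogeny=False, dom=html,
                        request_url='http://www.example.gov.cn/index.html')
    print(other.get_items())  # [{'title': ..., 'href': ...}, ...]

    # DomAnalysis(isogeny=True, dom=html) raises:
    # TypeError: DomAnalysis missing 1 required positional argument: host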

find_source/crawler/analysis/FilterUrl.py (+20, -13)

@@ -1,32 +1,39 @@
 import re
 from urllib.parse import urljoin
 
+from crawler.utils import extract_host
+
 
 class FilterUrl:
 
-    def judge(self, link, isogeny=False):
+    def judge(self, link):
         if link == None:
             return False
-        elif link == '/':
-            return False
-        elif link.find('javascript:') == 0:
-            return False
-        elif re.match('.*(js|jpg|png|pdf|zip)$', link) is not None:
+        elif link == '/' or link == '#':
             return False
-        elif isogeny and link.find('http') != -1 and link.find(self.url) == -1:
-            # keep links within the same-origin scope
+        elif link.find('javascript:') == 0 or link.find('//') == 0:
             return False
-        elif not isogeny and link.find('http') != -1 and link.find(self.url) != -1:
-            # drop same-origin links (keep only external ones)
+        elif link.find('.htm') == -1 and re.search('([.][a-zA-Z]{3,5})$', link) is not None:
             return False
+        elif self.isogeny:
+            if link.find('http') != -1 and link.find(self.host) == -1:
+                # keep links within the same-origin scope
+                return False
+        elif not self.isogeny:
+            host = extract_host(self.request_url)
+            if link.find(host) != -1:
+                # keep links outside the same-origin scope
+                return False
         return True
 
     def filter(self, link):
         if link.find('http') != 0:
-            # return self.url.rstrip('/') + "/" + link.lstrip('/')
-            return urljoin(self.url, link)
+            if self.isogeny:
+                return urljoin(self.host, link).rstrip(' ')
+            elif not self.isogeny:
+                return urljoin(self.request_url, link).rstrip(' ')
         else:
-            return link
+            return link.rstrip(' ')
 
     def onclick_filter(self, link):
         link_pattern = re.compile("[\"'][ ]*\+[ ]*[\"']")
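
`extract_host` is imported from `crawler.utils`, but its implementation is not part of this commit; from its use in `judge`, it evidently reduces the current request URL to a host prefix that `link.find(host)` can test against. A hypothetical stand-in consistent with that usage:

    from urllib.parse import urlparse

    def extract_host(url: str) -> str:
        # Hypothetical sketch of crawler.utils.extract_host (not shown in
        # this commit): strip a request URL down to scheme + netloc so the
        # caller can test whether a link points back at the same site.
        parsed = urlparse(url)
        return '{}://{}/'.format(parsed.scheme, parsed.netloc)

    # extract_host('http://www.example.gov.cn/news/1.html')
    # -> 'http://www.example.gov.cn/'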

find_source/crawler/analysis/__init__.py (+16, -10)

@@ -7,14 +7,14 @@ __all__ = [
 ]
 
 
-def parse_urls(dom, base_url, **kwargs):
-    dom_handler = DomAnalysis(dom, base_url)
-    return dom_handler.get_urls(**kwargs)
+def parse_urls(dom: str, **kwargs):
+    dom_handler = DomAnalysis(dom=dom, **kwargs)
+    return dom_handler.get_urls()
 
 
-def parser_items(dom, base_url, **kwargs):
-    dom_handler = DomAnalysis(dom, base_url)
-    return dom_handler.get_items(**kwargs)
+def parser_items(dom: str, **kwargs):
+    dom_handler = DomAnalysis(dom=dom, **kwargs)
+    return dom_handler.get_items()
 
 
 class Parser:
@@ -23,8 +23,14 @@ class Parser:
         self._parse_urls = parse_urls
         self._parser_items = parser_items
 
-    def site_items(self, dom, base_url):
-        return self._parser_items(dom, base_url)
+    def _site_items(self, dom, **kwargs):
+        return self._parser_items(dom, **kwargs)
 
-    def urls(self, dom, base_url):
-        return self._parse_urls(dom, base_url)
+    def urls(self, isogeny, dom, **kwargs):
+        return self._parse_urls(dom, isogeny=isogeny, **kwargs)
+
+    def same_origin(self, dom, host):
+        return self._site_items(dom, isogeny=True, host=host)
+
+    def non_origin(self, dom, url):
+        return self._site_items(dom, isogeny=False, request_url=url)
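
The new wrappers let callers choose the origin policy by method name instead of threading the `isogeny` flag through manually. A brief sketch, assuming `Parser` is importable from `crawler.analysis`, with illustrative URLs:

    from crawler.analysis import Parser

    parser = Parser()
    html = '<a href="/about.html">About</a>'

    # Same-origin extraction: keep only links on the given host.
    internal = parser.same_origin(html, host='http://www.example.gov.cn')

    # Cross-origin extraction: keep links leaving the request URL's host.
    external = parser.non_origin(html, url='http://www.example.gov.cn/index.html')

    # Raw URL list; the mode-specific argument passes through **kwargs.
    urls = parser.urls(isogeny=True, dom=html, host='http://www.example.gov.cn')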