dongzhaorui 3 năm trước cách đây
mục cha
commit
b4198dfe5c

+ 63 - 0
find_source/common/analysis/DomAnalysis.py

@@ -0,0 +1,63 @@
+import re
+
+from bs4 import BeautifulSoup
+
+from common.analysis.FilterUrl import FilterUrl
+
+
class DomAnalysis(FilterUrl):
    """Extract links and (name, link) items from an HTML document.

    Beautiful Soup parses complex HTML into a tree whose nodes are one of
    four Python object kinds: Tag, NavigableString, BeautifulSoup, Comment.
    This class walks that tree to collect URLs from static ``<a href>``
    attributes and from the ``onclick`` handlers of
    ``<input type="button">`` elements (static analysis of the JavaScript,
    no execution).
    """

    def __init__(self, dom, base_url):
        # dom: HTML text to parse; base_url: base used to absolutize
        # relative links and to apply the FilterUrl origin rules.
        self.soup = BeautifulSoup(dom, "lxml")
        self.url = base_url
        # Matches href=... fragments embedded inside onclick JavaScript.
        self.pattern = re.compile(r"href=([a-zA-Z0-9'\"+?=.%/_]*)")

    def show_html(self):
        # prettify() returns unicode text; encode to avoid console encoding
        # errors (see the bs3 documentation note that prettify yields u'' strings:
        # https://www.crummy.com/software/BeautifulSoup/bs3/documentation.zh.html).
        print(self.soup.prettify().encode('utf-8', 'ignore'))

    def _is_input_with_onclick(self, tag):
        """Predicate for find_all: True for <input type="button" onclick=...>."""
        return (tag.name == 'input') and (tag.get('type') == 'button') and tag.has_attr('onclick')

    def _onclick_links(self, tag):
        """Yield accepted, absolutized links mined from *tag*'s onclick handler."""
        for fragment in re.findall(self.pattern, tag.get('onclick')):
            link = self.onclick_filter(fragment)
            if self.judge(link):
                yield self.filter(link)

    def _link_name(self, tag):
        """Best-effort display name: the tag's own text, else its parent's text."""
        return tag.text if len(tag.text) != 0 else tag.parent.text

    def get_urls(self):
        """Return every accepted absolute URL found in the document."""
        urls = []
        # Static page links (<a href>), plus javascript-filtered ones.
        for tag in self.soup.find_all('a'):
            if self.judge(tag.get('href')):
                urls.append(self.filter(tag.get('href')))

        # Interactive elements: statically mine links out of onclick handlers.
        for tag in self.soup.find_all(self._is_input_with_onclick):
            urls.extend(self._onclick_links(tag))
        return urls

    def get_items(self):
        """Return unique ``{'name': ..., 'host': ...}`` dicts for accepted links."""
        items = []
        # Static page links (<a href>).
        for tag in self.soup.find_all('a'):
            if self.judge(tag.get('href')):
                item = {'name': self._link_name(tag),
                        'host': self.filter(tag.get('href'))}
                if item not in items:
                    items.append(item)

        # Interactive elements: statically mine links out of onclick handlers.
        for tag in self.soup.find_all(self._is_input_with_onclick):
            # BUG FIX: the original read tag.get('href') here, but <input>
            # tags have no href attribute, so filter(None) raised
            # AttributeError. Use the link extracted from the onclick
            # handler instead, exactly as get_urls() does.
            for href in self._onclick_links(tag):
                item = {'name': self._link_name(tag), 'host': href}
                if item not in items:
                    items.append(item)
        return items

+ 31 - 0
find_source/common/analysis/FilterUrl.py

@@ -0,0 +1,31 @@
+import re
+from urllib.parse import urljoin
+
+
class FilterUrl:
    """Mixin providing URL acceptance and normalization rules.

    The consuming class must define ``self.url`` (the crawl's base URL)
    before these methods are called.
    """

    # Matches JavaScript string-concatenation joins such as `' + '` or
    # `" + "`, so that "'a' + 'b'" collapses to "'ab'". Compiled once at
    # class creation instead of on every onclick_filter() call.
    _CONCAT_RE = re.compile(r"[\"'][ ]*\+[ ]*[\"']")

    def judge(self, link):
        """Return True if *link* should be kept.

        Rejected: None, a bare '/', javascript: pseudo-links, and absolute
        links that contain self.url. NOTE: per the original comments, links
        matching the base URL (same origin) are deliberately dropped and
        only off-site absolute links plus relative links are kept.
        """
        if link is None:
            return False
        if link == '/':
            return False
        if link.startswith('javascript:'):
            return False
        if 'http' in link and self.url in link:
            # Keep only links outside the base URL's origin.
            return False
        return True

    def filter(self, link):
        """Absolutize *link* against self.url unless it already starts with 'http'."""
        if not link.startswith('http'):
            return urljoin(self.url, link)
        return link

    def onclick_filter(self, link):
        """Collapse JavaScript string concatenations: "'a' + 'b'" -> "'ab'"."""
        return self._CONCAT_RE.sub('', link)

+ 16 - 0
find_source/common/analysis/__init__.py

@@ -0,0 +1,16 @@
from .DomAnalysis import DomAnalysis

# Public API of the analysis package: thin module-level wrappers around
# DomAnalysis. NOTE(review): 'parser_items' is inconsistently named next
# to 'parse_urls' — kept as-is because external callers may already
# import it by this name.
__all__ = [
    'parse_urls',
    'parser_items'
]
+
+
def parse_urls(dom, base_url):
    """Parse *dom* (HTML text) and return all accepted absolute URLs.

    *base_url* is used to resolve relative links and to apply the
    origin-filtering rules.
    """
    return DomAnalysis(dom, base_url).get_urls()
+
+
def parser_items(dom, base_url):
    """Parse *dom* and return unique {'name': ..., 'host': ...} link items.

    *base_url* is used to resolve relative links and to apply the
    origin-filtering rules.
    """
    return DomAnalysis(dom, base_url).get_items()