|
@@ -0,0 +1,63 @@
|
|
|
+import re
|
|
|
+
|
|
|
+from bs4 import BeautifulSoup
|
|
|
+
|
|
|
+from common.analysis.FilterUrl import FilterUrl
|
|
|
+
|
|
|
+
|
|
|
class DomAnalysis(FilterUrl):
    """Extract candidate links from an HTML document parsed by Beautiful Soup.

    Beautiful Soup turns a complex HTML document into a tree in which every
    node is a Python object.  All of those objects fall into four kinds:
        Tag
        NavigableString
        BeautifulSoup
        Comment

    Links are gathered from two places: the ``href`` attribute of ``<a>``
    tags, and ``href=...`` fragments found inside the ``onclick`` handler of
    ``<input type="button">`` tags.  Candidates are screened with ``judge``
    and normalised with ``filter``; onclick matches are first passed through
    ``onclick_filter``.  Those three methods are not defined in this class
    (presumably inherited from ``FilterUrl`` -- confirm there).
    """

    def __init__(self, dom, base_url):
        """Parse *dom* and store *base_url*.

        Args:
            dom: HTML markup handed to ``BeautifulSoup`` (lxml parser).
            base_url: stored as ``self.url``; not read by this class itself
                (presumably consumed by ``FilterUrl`` -- confirm).
        """
        self.soup = BeautifulSoup(dom, "lxml")
        self.url = base_url
        # Captures whatever follows "href=" inside an onclick handler,
        # including the surrounding quote characters.
        self.pattern = re.compile("href=([a-zA-Z0-9'\"+?=.%/_]*)")

    def show_html(self):
        """Print the pretty-printed document, encoded as UTF-8."""
        # prettify() returns a unicode string (see
        # https://www.crummy.com/software/BeautifulSoup/bs3/documentation.zh.html),
        # so it is encoded explicitly before printing.
        print(self.soup.prettify().encode('utf-8', 'ignore'))

    def _is_input_with_onclick(self, tag):
        """Return True for ``<input type="button">`` tags that have ``onclick``."""
        return (tag.name == 'input') and (tag.get('type') == 'button') and tag.has_attr('onclick')

    def _iter_accepted_links(self):
        """Yield ``(tag, raw_link)`` for every link accepted by ``judge``.

        Anchor links are yielded first, then links recovered from onclick
        handlers, each group in document order.  ``raw_link`` has not yet
        been passed through ``filter``.
        """
        # Static page links: plain <a href="..."> anchors.
        for tag in self.soup.find_all('a'):
            href = tag.get('href')
            if self.judge(href):
                yield tag, href

        # Automatic interaction: rather than executing the handler, statically
        # scan the onclick source for the links it would navigate to.
        for tag in self.soup.find_all(self._is_input_with_onclick):
            for match in self.pattern.findall(tag.get('onclick')):
                link = self.onclick_filter(match)
                if self.judge(link):
                    yield tag, link

    def get_urls(self):
        """Return the filtered URL of every accepted link (duplicates kept)."""
        return [self.filter(link) for _tag, link in self._iter_accepted_links()]

    def get_items(self):
        """Return unique ``{'name': ..., 'host': ...}`` dicts for accepted links.

        ``name`` is the tag's own text, or its parent's text when the tag has
        none (always the case for ``<input>``).  ``host`` is the filtered
        link.  First-seen order is preserved.
        """
        items = []
        for tag, link in self._iter_accepted_links():
            entry = {
                'name': tag.text or tag.parent.text,
                # For onclick buttons the link comes from the handler source;
                # the <input> tag itself carries no href attribute.
                'host': self.filter(link),
            }
            if entry not in items:
                items.append(entry)
        return items
|