|
@@ -42,22 +42,22 @@ class DomAnalysis(FilterUrl):
|
|
|
|
|
|
def get_items(self):
    """Collect the unique links found in the parsed page.

    Two sources are scanned on ``self.soup``:

    * the ``href`` attribute of every ``<a>`` tag;
    * every match of ``self.pattern`` inside the ``onclick`` attribute of
      the elements selected by ``self._is_input_with_onclick``, after the
      match has been passed through ``self.onclick_filter``.

    A candidate is kept only when ``self.judge`` accepts it.

    Returns:
        list[dict]: ``{'name': ..., 'host': ...}`` entries in discovery
        order, without duplicates. ``name`` is the tag's text (or its
        parent's text when the tag has none) and ``host`` is the candidate
        link after ``self.filter``.
    """
    items = []

    def _collect(tag, link):
        """Record ``link`` (found on ``tag``) unless it is missing or a duplicate."""
        if link is None:
            return
        # Label the link with the tag's own text; fall back to the parent's
        # text when the tag itself has none.
        name = tag.text or tag.parent.text
        entry = {'name': name, 'host': self.filter(link)}
        if entry not in items:
            items.append(entry)

    # Static page link analysis: plain anchors.
    for tag in self.soup.find_all('a'):
        href = tag.get('href')
        if self.judge(href):
            _collect(tag, href)

    # Automatic interaction: links that would be generated interactively are
    # extracted statically from the onclick handlers.
    for tag in self.soup.find_all(self._is_input_with_onclick):
        for match in re.findall(self.pattern, tag.get('onclick')):
            # The link is the one that was validated, not the tag's href:
            # elements selected for their onclick normally have no href.
            # NOTE(review): assumes self.filter accepts the value returned by
            # self.onclick_filter, as it does for anchor hrefs -- confirm.
            link = self.onclick_filter(match)
            if self.judge(link):
                _collect(tag, link)

    return items
|