|
@@ -28,16 +28,17 @@ class DomAnalysis(FilterUrl):
|
|
|
|
|
|
def get_urls(self):
|
|
|
urls = []
|
|
|
- # 静态页面链接分析 和 javascript动态解析
|
|
|
+ # 静态页面链接解析 和 javascript动态解析
|
|
|
for tag in self.soup.find_all('a'):
|
|
|
if self.judge(tag.get('href')):
|
|
|
urls.append(self.filter(tag.get('href')))
|
|
|
|
|
|
- # 自动交互. 这里采用静态析的思路提取交互式生成的链接
|
|
|
- for tag in self.soup.find_all(self._is_input_with_onclick):
|
|
|
- for item in re.findall(self.pattern, tag.get('onclick')):
|
|
|
- if self.judge(self.onclick_filter(item)):
|
|
|
- urls.append(self.filter(self.onclick_filter(item)))
|
|
|
+ # 自动交互. 这里采用静态解析的思路提取交互式生成的链接
|
|
|
+ for tag in self.soup.find_all():
|
|
|
+ if self._is_input_with_onclick(tag):
|
|
|
+ for item in re.findall(self.pattern, tag.get('onclick')):
|
|
|
+ if self.judge(self.onclick_filter(item)):
|
|
|
+ urls.append(self.filter(self.onclick_filter(item)))
|
|
|
return urls
|
|
|
|
|
|
def get_items(self):
|
|
@@ -62,8 +63,9 @@ class DomAnalysis(FilterUrl):
|
|
|
if self.judge(tag.get('href')):
|
|
|
_extract()
|
|
|
|
|
|
- for tag in self.soup.find_all(self._is_input_with_onclick):
|
|
|
- for item in re.findall(self.pattern, tag.get('onclick')):
|
|
|
- if self.judge(self.onclick_filter(item)):
|
|
|
- _extract()
|
|
|
+ for tag in self.soup.find_all():
|
|
|
+ if self._is_input_with_onclick(tag):
|
|
|
+ for item in re.findall(self.pattern, tag.get('onclick')):
|
|
|
+ if self.judge(self.onclick_filter(item)):
|
|
|
+ _extract()
|
|
|
return items
|