dongzhaorui 3 years ago
parent
commit
04877055e7
1 changed file with 12 additions and 10 deletions
  1. 12 10
      find_source/crawler/analysis/DomAnalysis.py

+ 12 - 10
find_source/crawler/analysis/DomAnalysis.py

@@ -28,16 +28,17 @@ class DomAnalysis(FilterUrl):
 
 
     def get_urls(self):
     def get_urls(self):
         urls = []
         urls = []
-        # 静态页面链接析 和 javascript动态解析
+        # 静态页面链接解析 和 javascript动态解析
         for tag in self.soup.find_all('a'):
         for tag in self.soup.find_all('a'):
             if self.judge(tag.get('href')):
             if self.judge(tag.get('href')):
                 urls.append(self.filter(tag.get('href')))
                 urls.append(self.filter(tag.get('href')))
 
 
-        #  自动交互. 这里采用静态析的思路提取交互式生成的链接
-        for tag in self.soup.find_all(self._is_input_with_onclick):
-            for item in re.findall(self.pattern, tag.get('onclick')):
-                if self.judge(self.onclick_filter(item)):
-                    urls.append(self.filter(self.onclick_filter(item)))
+        # 自动交互. 这里采用静态解析的思路提取交互式生成的链接
+        for tag in self.soup.find_all():
+            if self._is_input_with_onclick(tag):
+                for item in re.findall(self.pattern, tag.get('onclick')):
+                    if self.judge(self.onclick_filter(item)):
+                        urls.append(self.filter(self.onclick_filter(item)))
         return urls
         return urls
 
 
     def get_items(self):
     def get_items(self):
@@ -62,8 +63,9 @@ class DomAnalysis(FilterUrl):
             if self.judge(tag.get('href')):
             if self.judge(tag.get('href')):
                 _extract()
                 _extract()
 
 
-        for tag in self.soup.find_all(self._is_input_with_onclick):
-            for item in re.findall(self.pattern, tag.get('onclick')):
-                if self.judge(self.onclick_filter(item)):
-                    _extract()
+        for tag in self.soup.find_all():
+            if self._is_input_with_onclick(tag):
+                for item in re.findall(self.pattern, tag.get('onclick')):
+                    if self.judge(self.onclick_filter(item)):
+                        _extract()
         return items
         return items