dongzhaorui committed 3 years ago
parent commit b1675df2c5
3 changed files with 33 additions and 14 deletions
  1. find_source/common/tools.py (+19 -2)
  2. find_source/crawler/analysis/DomAnalysis.py (+6 -6)
  3. find_source/crawler/analysis/FilterUrl.py (+8 -6)

find_source/common/tools.py (+19 -2)

@@ -3,6 +3,7 @@ import hashlib
 import re
 import time
 
+from bs4 import BeautifulSoup
 from lxml.html import HtmlElement, fromstring, tostring
 
 
@@ -10,12 +11,12 @@ def element2html(element: HtmlElement) -> str:
     return tostring(element, encoding="utf-8").decode()
 
 
-def html2element(html_str: str) -> HtmlElement:
+def html2element(html_str: str, base_url=None) -> HtmlElement:
     html_str = re.sub('\ufeff|\xa0|\u3000|\x00', '', html_str)
     html_str = re.sub('</?br.*?>', '', html_str)
     html_str = re.sub(r'<\?xml.*?>', '', html_str)
     html_str = re.sub(r'<[!]DOCTYPE.*?>', '', html_str)
-    return fromstring(html_str)
+    return fromstring(html_str, base_url=base_url)
 
 
 def valid_element(node: HtmlElement, feature: str):
@@ -36,6 +37,22 @@ def remove_node(node: HtmlElement):
         parent.remove(node)
 
 
+def clean_html(html_str: str):
+    html_str = re.sub(r'<!--[\s\S]*?-->', '', html_str)
+    html_str = re.sub(r'<html>|<html [^>]*>|</html>', '', html_str)
+    html_str = re.sub(r'<head>[\s\S]*?</head>', '', html_str)
+    html_str = re.sub(r'<script[^<>]*>[\s\S]*?</script>|</script>', '', html_str)
+    html_str = re.sub(r'<link[^<>]*>', '', html_str)
+    html_str = re.sub(r'<style[^<>]*>[\s\S]*?</style>', '', html_str)
+    html_str = re.sub(r'<img[^>]*>', '', html_str)
+    return html_str
+
+
+def extract_text(html_str: str):
+    soup = BeautifulSoup(html_str, "lxml")
+    return soup.get_text()
+
+
 def verify_text(val: str, length=50):
     """检查数字、字母、中文的个数"""
     if val is None:
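
Note (reviewer sketch, not part of the commit): minimal usage of the new helpers; the page URL is hypothetical:

    from find_source.common.tools import clean_html, extract_text, html2element

    raw = "<html><head><title>t</title></head><body><p>hello</p><script>x()</script></body></html>"
    stripped = clean_html(raw)     # comments, head, script/link/style tags and images removed
    print(extract_text(stripped))  # plain text via BeautifulSoup: "hello"
    # base_url lets lxml resolve relative hrefs later (e.g. via make_links_absolute)
    root = html2element(raw, base_url="http://example.com/")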

find_source/crawler/analysis/DomAnalysis.py (+6 -6)

@@ -26,22 +26,22 @@ class DomAnalysis(FilterUrl):
     def _is_input_with_onclick(self, tag):
         return (tag.name == 'input') and (tag.get('type')=='button') and tag.has_attr('onclick')
 
-    def get_urls(self):
+    def get_urls(self, **kwargs):
         urls = []
         # parse links in the static page, plus JavaScript dynamic parsing
         for tag in self.soup.find_all('a'):
-            if self.judge(tag.get('href')):
+            if self.judge(tag.get('href'), **kwargs):
                 urls.append(self.filter(tag.get('href')))
 
         # automated interaction: use static parsing to extract interactively generated links
         for tag in self.soup.find_all():
             if self._is_input_with_onclick(tag):
                 for item in re.findall(self.pattern, tag.get('onclick')):
-                    if self.judge(self.onclick_filter(item)):
+                    if self.judge(self.onclick_filter(item), **kwargs):
                         urls.append(self.filter(self.onclick_filter(item)))
         return urls
 
-    def get_items(self):
+    def get_items(self, **kwargs):
         items = []
 
         def _extract():
@@ -60,12 +60,12 @@ class DomAnalysis(FilterUrl):
                 items.append(data)
 
         for tag in self.soup.find_all('a'):
-            if self.judge(tag.get('href')):
+            if self.judge(tag.get('href'), **kwargs):
                 _extract()
 
         for tag in self.soup.find_all():
             if self._is_input_with_onclick(tag):
                 for item in re.findall(self.pattern, tag.get('onclick')):
-                    if self.judge(self.onclick_filter(item)):
+                    if self.judge(self.onclick_filter(item), **kwargs):
                         _extract()
         return items
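
Note (reviewer sketch, not part of the commit): the **kwargs added here are forwarded straight to FilterUrl.judge(), so callers can opt into same-origin filtering; the DomAnalysis constructor arguments below are assumed from context:

    from find_source.crawler.analysis.DomAnalysis import DomAnalysis

    html = "<a href='http://example.com/page'>in</a><a href='http://other.org/x'>out</a>"
    dom = DomAnalysis(html, "http://example.com")  # assumed (html, url) signature
    print(dom.get_urls(isogeny=True))  # keeps only same-origin links
    print(dom.get_urls())              # default: same-origin links are dropped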

find_source/crawler/analysis/FilterUrl.py (+8 -6)

@@ -4,18 +4,20 @@ from urllib.parse import urljoin
 
 class FilterUrl:
 
-    def judge(self, link):
+    def judge(self, link, isogeny=False):
         if link == None:
             return False
         elif link == '/':
             return False
         elif link.find('javascript:') == 0:
             return False
-        # elif link.find('http') != -1 and link.find(self.url) == -1:
-        #     # drop links outside the same-origin scope
-        #     return False
-    elif link.find('http') != -1 and link.find(self.url) != -1:
-        # keep links outside the same-origin scope
+        elif re.match(r'.*\.(js|jpg|png|pdf|zip)$', link) is not None:
+            return False
+        elif isogeny and link.find('http') != -1 and link.find(self.url) == -1:
+            # keep only same-origin links
+            return False
+        elif not isogeny and link.find('http') != -1 and link.find(self.url) != -1:
+            # drop same-origin links (keep external ones)
             return False
         return True
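
Note (reviewer sketch, not part of the commit): exercising the reworked judge(). The new branch calls re.match, so FilterUrl.py must import re at the top; self.url is faked here for illustration:

    from find_source.crawler.analysis.FilterUrl import FilterUrl

    class Demo(FilterUrl):
        def __init__(self, url):
            self.url = url  # hypothetical; the real attribute is set elsewhere

    f = Demo("http://example.com")
    print(f.judge("http://example.com/a"))                # False: same-origin dropped by default
    print(f.judge("http://example.com/a", isogeny=True))  # True: same-origin kept
    print(f.judge("http://other.org/b", isogeny=True))    # False: external dropped
    print(f.judge("http://example.com/img.png"))          # False: static resource extension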