dongzhaorui 3 years ago
parent
commit
c3db8f5da8

+ 11 - 21
find_source/crawler/analysis/DomAnalysis.py

@@ -14,25 +14,11 @@ class DomAnalysis(FilterUrl):
         Comment
     """
 
-    def __init__(self, isogeny: bool, dom: str, host=None, request_url=None):
-        self.soup = BeautifulSoup(dom, "lxml")
+    def __init__(self, isogeny: bool, dom: str, addr: str):
         self.pattern = re.compile("href=([a-zA-Z0-9'\"+?=.%/_]*)")
         self.isogeny = isogeny
-        if self.isogeny:
-            if host is None:
-                raise TypeError(
-                    '{} missing 1 required positional argument: {}'.format(
-                        self.__class__.__name__, 'host')
-                )
-            self.host = host  # host address of the site
-        if not self.isogeny:
-            if request_url is None:
-                raise TypeError(
-                    '{} missing 1 required positional argument: {}'.format(
-                        self.__class__.__name__, 'request_url'
-                    )
-                )
-            self.request_url = request_url  # URL of the current request
+        self.soup = BeautifulSoup(dom, "lxml")
+        self.addr = addr  # request address
 
     def show_html(self):
         # https://www.crummy.com/software/BeautifulSoup/bs3/documentation.zh.html  prettify returns a unicode (u'') string
@@ -46,14 +32,18 @@ class DomAnalysis(FilterUrl):
         # parse links from the static page and links generated dynamically by JavaScript
         for tag in self.soup.find_all('a'):
             if self.judge(tag.get('href')):
-                urls.append(self.filter(tag.get('href')))
+                href = self.urljoin(tag.get('href'))
+                if self.filter(href) and href not in urls:
+                    urls.append(href)
 
         # automatic interaction: use static parsing to extract interactively generated links
         for tag in self.soup.find_all():
             if self._is_input_with_onclick(tag):
                 for item in re.findall(self.pattern, tag.get('onclick')):
                     if self.judge(self.onclick_filter(item)):
-                        urls.append(self.filter(self.onclick_filter(item)))
+                        href = self.urljoin(self.onclick_filter(item))
+                        if self.filter(href) and href not in urls:
+                            urls.append(href)
         return urls
 
     def get_items(self):
@@ -67,11 +57,11 @@ class DomAnalysis(FilterUrl):
             if tag.get('href') is None:
                 return
             try:
-                href = self.filter(tag.get('href'))
+                href = self.urljoin(tag.get('href'))
             except ValueError:
                 return
             data = {'title': name, 'href': href}
-            if data not in items:
+            if self.filter(href) and data not in items:
                 items.append(data)
 
         for tag in self.soup.find_all('a'):

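With this change DomAnalysis takes a single addr argument in place of the old host/request_url pair, and URL normalisation (urljoin) is separated from origin filtering (filter). A minimal usage sketch; get_items() and the {'title', 'href'} item shape come from the hunks above, while the import path and sample HTML are assumptions:

    from crawler.analysis.DomAnalysis import DomAnalysis

    html = '<a href="/news/list.html">News</a>'   # sample page fragment (assumption)
    dom = DomAnalysis(isogeny=True, dom=html, addr='http://www.example.gov.cn')
    for item in dom.get_items():                  # e.g. [{'title': 'News', 'href': 'http://www.example.gov.cn/news/list.html'}]
        print(item['title'], item['href'])
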
+ 17 - 15
find_source/crawler/analysis/FilterUrl.py

@@ -1,7 +1,7 @@
 import re
 from urllib.parse import urljoin
 
-from crawler.utils import extract_host
+from crawler.utils import extract_host, is_url
 
 
 class FilterUrl:
@@ -15,26 +15,28 @@ class FilterUrl:
             return False
         elif link.find('.htm') == -1 and re.search('([.][a-zA-Z]{3,5})$', link) is not None:
             return False
-        elif self.isogeny:
-            if link.find('http') != -1 and link.find(self.host) == -1:
-                # keep only links within the same-origin scope
-                return False
-        elif not self.isogeny:
-            host = extract_host(self.request_url)
-            if link.find(host) != -1:
-                # keep only links outside the same-origin scope
-                return False
         return True
 
-    def filter(self, link):
+    def urljoin(self, link):
         if link.find('http') != 0:
-            if self.isogeny:
-                return urljoin(self.host, link).rstrip(' ')
-            elif not self.isogeny:
-                return urljoin(self.request_url, link).rstrip(' ')
+            return urljoin(self.addr, link).rstrip(' ')
         else:
             return link.rstrip(' ')
 
+    def filter(self, link):
+        if not is_url(link):
+            return False
+
+        if self.isogeny:
+            # same-origin policy: drop absolute links that leave addr
+            if link.find('http') != -1 and link.find(self.addr) == -1:
+                return False
+        else:
+            # non-same-origin policy: drop links on the same host as addr
+            if link.find(extract_host(self.addr)) != -1:
+                return False
+        return True
+
     def onclick_filter(self, link):
         link_pattern = re.compile("[\"'][ ]*\+[ ]*[\"']")
         return re.sub(link_pattern, '', link)

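The old filter() both normalised and screened links; it is now split into urljoin() for normalisation and a boolean filter() for the origin check. A rough illustration of the two modes, reusing the dom instance from the sketch above (DomAnalysis inherits FilterUrl) and assuming is_url() accepts plain http(s) URLs and extract_host() returns the host part of addr:

    # isogeny=True: same-origin crawl, keep only links on the seed site
    dom.urljoin('/list.html')                           # -> 'http://www.example.gov.cn/list.html'
    dom.filter('http://www.example.gov.cn/list.html')   # -> True  (same origin, kept)
    dom.filter('http://other-site.com/page.html')       # -> False (off-origin, dropped)

    # isogeny=False: search-engine mode, keep only links pointing away from the seed site
    dom.isogeny = False
    dom.filter('http://www.example.gov.cn/list.html')   # -> False
    dom.filter('http://other-site.com/page.html')       # -> True
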
+ 2 - 1
find_source/crawler/analysis/TimeExtractor.py

@@ -13,6 +13,7 @@ DATETIME_PATTERN = [
     "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9])",
     "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9])",
     "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)",
+    "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2})",
     "(\d{1,2}[-|/|.]\d{1,2})",
     "(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
     "(\d{4}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
@@ -43,7 +44,7 @@ class TimeExtractor:
 
     def extractor(self, element: HtmlElement) -> str:
         # text = ''.join(element.xpath('.//text()'))
-        text = ''.join(element.xpath('string(.)'))
+        text = ''.join(element.xpath('string(.)').split())
         for dt in self.time_pattern:
             dt_obj = re.search(dt, text)
             if dt_obj:

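The added pattern covers bare yyyy-mm-dd dates, and collapsing whitespace with split()/join lets a date that is broken across text nodes still match. A quick sketch of the effect; the lxml import and example markup are assumptions:

    import re
    from lxml.html import fromstring

    element = fromstring('<div>Published: <span>2021-07</span> <span>-15</span></div>')
    text = ''.join(element.xpath('string(.)').split())   # 'Published:2021-07-15'
    print(re.search(r"(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2})", text).group(1))   # '2021-07-15'
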
+ 4 - 3
find_source/crawler/defaults.py

@@ -51,7 +51,8 @@ KEYWORDS = {
     '终止', '系统'
 }
 
-FOOTER_TEXTS = {'网安备', '关于我们', '地图', '建议意见', '法律声明', '信箱'}
-CATEGORY_TEXTS = {'政策', '办事指南', '首页', '党'}
+FOOTER_TEXTS = set()  # empty set ({} would create an empty dict)
 PAGE_TEXTS = {'尾页', '下页', '下一页'}
-LOGIN_TEXTS = {'忘记密码', '登录', '注册'}
+LOGIN_TEXTS = {'忘记密码', '登录', '注册'}
+NAV_TEXTS = {'政策', '办事指南', '首页', '党', '操作手册', '关于我们', '地图',
+             '建议意见', '法律声明', '信箱', '网安备', }

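The old FOOTER_TEXTS and CATEGORY_TEXTS keywords are consolidated into NAV_TEXTS. How these sets are consumed is not part of this commit; a purely illustrative membership check might look like:

    from crawler.defaults import NAV_TEXTS, PAGE_TEXTS, LOGIN_TEXTS

    def is_noise_anchor(text: str) -> bool:
        # drop anchors whose text matches a navigation, pagination or login keyword
        return any(keyword in text for keyword in NAV_TEXTS | PAGE_TEXTS | LOGIN_TEXTS)
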
+ 1 - 1
find_source/crawler/engines.py

@@ -52,7 +52,7 @@ class BingSearchEngine(JySearchEngine):
     def parser(self, response):
         urls = []
         if response.status_code == 200:
-            urls = parse_urls(response.text, self.site)
+            urls = parse_urls(response.text, addr=self.site, isogeny=False)
         return urls
 
     def percolator(self, urls, retrieve_urls):
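
parse_urls now receives the seed site as addr plus an explicit isogeny flag; for search-engine results the non-same-origin mode is used, so hits pointing back to the seed site itself are discarded. A minimal call sketch; the import path and exact signature are inferred from this call site:

    from crawler.analysis import parse_urls   # import path is an assumption

    html = '<a href="http://www.found-site.com/index.html">result</a>'
    urls = parse_urls(html, addr='http://www.seed-site.com', isogeny=False)
    # -> ['http://www.found-site.com/index.html']; links back to www.seed-site.com would be dropped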