dongzhaorui 3 years ago
parent
commit
c3db8f5da8

+ 11 - 21
find_source/crawler/analysis/DomAnalysis.py

@@ -14,25 +14,11 @@ class DomAnalysis(FilterUrl):
         Comment
     """
 
-    def __init__(self, isogeny: bool, dom: str, host=None, request_url=None):
-        self.soup = BeautifulSoup(dom, "lxml")
+    def __init__(self, isogeny: bool, dom: str, addr: str):
         self.pattern = re.compile("href=([a-zA-Z0-9'\"+?=.%/_]*)")
         self.isogeny = isogeny
-        if self.isogeny:
-            if host is None:
-                raise TypeError(
-                    '{} missing 1 required positional argument: {}'.format(
-                        self.__class__.__name__, 'host')
-                )
-            self.host = host  # host address of the site
-        if not self.isogeny:
-            if request_url is None:
-                raise TypeError(
-                    '{} missing 1 required positional argument: {}'.format(
-                        self.__class__.__name__, 'request_url'
-                    )
-                )
-            self.request_url = request_url  # URL of the current request
+        self.soup = BeautifulSoup(dom, "lxml")
+        self.addr = addr  # request address
 
     def show_html(self):
         # https://www.crummy.com/software/BeautifulSoup/bs3/documentation.zh.html  prettify returns a unicode (u'') string
@@ -46,14 +32,18 @@ class DomAnalysis(FilterUrl):
         # parse links from the static page and links generated dynamically by JavaScript
         for tag in self.soup.find_all('a'):
             if self.judge(tag.get('href')):
-                urls.append(self.filter(tag.get('href')))
+                href = self.urljoin(tag.get('href'))
+                if self.filter(href) and href not in urls:
+                    urls.append(href)
 
         # automatic interaction: use static parsing to extract interactively generated links
         for tag in self.soup.find_all():
             if self._is_input_with_onclick(tag):
                 for item in re.findall(self.pattern, tag.get('onclick')):
                     if self.judge(self.onclick_filter(item)):
-                        urls.append(self.filter(self.onclick_filter(item)))
+                        href = self.urljoin(self.onclick_filter(item))
+                        if self.filter(href) and href not in urls:
+                            urls.append(href)
         return urls
 
     def get_items(self):
@@ -67,11 +57,11 @@ class DomAnalysis(FilterUrl):
             if tag.get('href') is None:
                 return
             try:
-                href = self.filter(tag.get('href'))
+                href = self.urljoin(tag.get('href'))
             except ValueError:
                 return
             data = {'title': name, 'href': href}
-            if data not in items:
+            if self.filter(href) and data not in items:
                 items.append(data)
 
         for tag in self.soup.find_all('a'):

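With this change DomAnalysis takes a single addr argument in place of the old host/request_url pair, and URL normalisation (urljoin) is separated from origin filtering (filter). A minimal usage sketch; get_items() and the {'title', 'href'} item shape come from the hunks above, while the import path and sample HTML are assumptions:

    from crawler.analysis.DomAnalysis import DomAnalysis

    html = '<a href="/news/list.html">News</a>'   # sample page fragment (assumption)
    dom = DomAnalysis(isogeny=True, dom=html, addr='http://www.example.gov.cn')
    for item in dom.get_items():                  # e.g. [{'title': 'News', 'href': 'http://www.example.gov.cn/news/list.html'}]
        print(item['title'], item['href'])
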
+ 17 - 15
find_source/crawler/analysis/FilterUrl.py

@@ -1,7 +1,7 @@
 import re
 from urllib.parse import urljoin
 
-from crawler.utils import extract_host
+from crawler.utils import extract_host, is_url
 
 
 class FilterUrl:
@@ -15,26 +15,28 @@ class FilterUrl:
             return False
         elif link.find('.htm') == -1 and re.search('([.][a-zA-Z]{3,5})$', link) is not None:
             return False
-        elif self.isogeny:
-            if link.find('http') != -1 and link.find(self.host) == -1:
-                # keep only links within the same-origin scope
-                return False
-        elif not self.isogeny:
-            host = extract_host(self.request_url)
-            if link.find(host) != -1:
-                # keep only links outside the same-origin scope
-                return False
         return True
 
-    def filter(self, link):
+    def urljoin(self, link):
         if link.find('http') != 0:
-            if self.isogeny:
-                return urljoin(self.host, link).rstrip(' ')
-            elif not self.isogeny:
-                return urljoin(self.request_url, link).rstrip(' ')
+            return urljoin(self.addr, link).rstrip(' ')
         else:
             return link.rstrip(' ')
 
+    def filter(self, link):
+        if not is_url(link):
+            return False
+
+        if self.isogeny:
+            # same-origin policy: drop absolute links that leave addr
+            if link.find('http') != -1 and link.find(self.addr) == -1:
+                return False
+        else:
+            # non-same-origin policy: drop links on the same host as addr
+            if link.find(extract_host(self.addr)) != -1:
+                return False
+        return True
+
     def onclick_filter(self, link):
         link_pattern = re.compile("[\"'][ ]*\+[ ]*[\"']")
         return re.sub(link_pattern, '', link)

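The old filter() both normalised and screened links; it is now split into urljoin() for normalisation and a boolean filter() for the origin check. A rough illustration of the two modes, reusing the dom instance from the sketch above (DomAnalysis inherits FilterUrl) and assuming is_url() accepts plain http(s) URLs and extract_host() returns the host part of addr:

    # isogeny=True: same-origin crawl, keep only links on the seed site
    dom.urljoin('/list.html')                           # -> 'http://www.example.gov.cn/list.html'
    dom.filter('http://www.example.gov.cn/list.html')   # -> True  (same origin, kept)
    dom.filter('http://other-site.com/page.html')       # -> False (off-origin, dropped)

    # isogeny=False: search-engine mode, keep only links pointing away from the seed site
    dom.isogeny = False
    dom.filter('http://www.example.gov.cn/list.html')   # -> False
    dom.filter('http://other-site.com/page.html')       # -> True
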
+ 2 - 1
find_source/crawler/analysis/TimeExtractor.py

@@ -13,6 +13,7 @@ DATETIME_PATTERN = [
     "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9])",
     "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9])",
     "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)",
+    "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2})",
     "(\d{1,2}[-|/|.]\d{1,2})",
     "(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
     "(\d{4}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
@@ -43,7 +44,7 @@ class TimeExtractor:
 
     def extractor(self, element: HtmlElement) -> str:
         # text = ''.join(element.xpath('.//text()'))
-        text = ''.join(element.xpath('string(.)'))
+        text = ''.join(element.xpath('string(.)').split())
         for dt in self.time_pattern:
             dt_obj = re.search(dt, text)
             if dt_obj:

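The added pattern covers bare yyyy-mm-dd dates, and collapsing whitespace with split()/join lets a date that is broken across text nodes still match. A quick sketch of the effect; the lxml import and example markup are assumptions:

    import re
    from lxml.html import fromstring

    element = fromstring('<div>Published: <span>2021-07</span> <span>-15</span></div>')
    text = ''.join(element.xpath('string(.)').split())   # 'Published:2021-07-15'
    print(re.search(r"(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2})", text).group(1))   # '2021-07-15'
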
+ 4 - 3
find_source/crawler/defaults.py

@@ -51,7 +51,8 @@ KEYWORDS = {
     '终止', '系统'
 }
 
-FOOTER_TEXTS = {'网安备', '关于我们', '地图', '建议意见', '法律声明', '信箱'}
-CATEGORY_TEXTS = {'政策', '办事指南', '首页', '党'}
+FOOTER_TEXTS = set()  # empty set ({} would create an empty dict)
 PAGE_TEXTS = {'尾页', '下页', '下一页'}
-LOGIN_TEXTS = {'忘记密码', '登录', '注册'}
+LOGIN_TEXTS = {'忘记密码', '登录', '注册'}
+NAV_TEXTS = {'政策', '办事指南', '首页', '党', '操作手册', '关于我们', '地图',
+             '建议意见', '法律声明', '信箱', '网安备', }

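The old FOOTER_TEXTS and CATEGORY_TEXTS keywords are consolidated into NAV_TEXTS. How these sets are consumed is not part of this commit; a purely illustrative membership check might look like:

    from crawler.defaults import NAV_TEXTS, PAGE_TEXTS, LOGIN_TEXTS

    def is_noise_anchor(text: str) -> bool:
        # drop anchors whose text matches a navigation, pagination or login keyword
        return any(keyword in text for keyword in NAV_TEXTS | PAGE_TEXTS | LOGIN_TEXTS)
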
+ 1 - 1
find_source/crawler/engines.py

@@ -52,7 +52,7 @@ class BingSearchEngine(JySearchEngine):
     def parser(self, response):
         urls = []
         if response.status_code == 200:
-            urls = parse_urls(response.text, self.site)
+            urls = parse_urls(response.text, addr=self.site, isogeny=False)
         return urls
 
     def percolator(self, urls, retrieve_urls):
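
parse_urls now receives the seed site as addr plus an explicit isogeny flag; for search-engine results the non-same-origin mode is used, so hits pointing back to the seed site itself are discarded. A minimal call sketch; the import path and exact signature are inferred from this call site:

    from crawler.analysis import parse_urls   # import path is an assumption

    html = '<a href="http://www.found-site.com/index.html">result</a>'
    urls = parse_urls(html, addr='http://www.seed-site.com', isogeny=False)
    # -> ['http://www.found-site.com/index.html']; links back to www.seed-site.com would be dropped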