2 年前 · 6844f71a40
--- a/find_source/crawler/services/excavate.py
+++ b/find_source/crawler/services/excavate.py
@@ -16,7 +16,7 @@ from crawler.utils import (
 
				     html2element,
			
 
				     iter_node,
			
 
				     check_page_by_words,
			
 
				-    predict_bidding_model
			
 
				+    predict_bidding_model, is_contains
			
 
				 )
			
 
				 from settings import Dzr
			
 
				 
			
@@ -70,9 +70,11 @@ class DataExcavate(BasicService):
 
				         curr_depth = task['depth']
			
 
				         domain = extract_domain(url)
			
 
				         is_attachment_url = re.match(URL_SUFFIX_PATTERN, url) is not None
			
 
				+        is_login_url = is_contains(url, 'login')
			
 
				         if any([
			
 
				             curr_depth > self._max_depth,  # 检查任务层级
			
 
				-            is_attachment_url,  # 检查网址是否附件下载地址
			
 
				+            is_attachment_url,  # 判断是否附件下载地址
			
 
				+            is_login_url,  # 判断是否登录地址
			
 
				             self.validator.data(url),  # 垃圾池 - 判重任务请求网址
			
 
				             self.validator.data(domain),  # 垃圾池 - 过滤域名（一级）
			
 
				             self.collector.data(domain),  # 收录池 - 判重域名（一级）
			
--- a/find_source/crawler/utils.py
+++ b/find_source/crawler/utils.py
@@ -1,3 +1,4 @@
 
				+import operator
			
 
				 import re
			
 
				 import zlib
			
 
				 from html import unescape
			
@@ -86,6 +87,12 @@ def is_url(url):
 
				     return re.match(_regex, url) is not None
			
 
				 
			
 
				 
			
 
				+def is_contains(val: str, feature: str):
			
 
				+    if operator.contains(val, feature):
			
 
				+        return True
			
 
				+    return False
			
 
				+
			
 
				+
			
 
				 def is_domain(domain):
			
 
				     ext = tldextract.extract(domain)
			
 
				     if not ext.domain: