dongzhaorui 3 years ago
parent
commit
cae4797773

+ 56 - 1
find_source/crawler/download.py

@@ -72,9 +72,9 @@ class Downloader:
                     if 'verify' in request_params:
                         del request_params['verify']
                     url = url.replace('https', 'http')
-                retries += 1
             except requests.RequestException as e:
                 response.reason = e.__class__.__name__
+            finally:
                 retries += 1
         if not disable_debug_log:
             t_name = threading.currentThread().getName()
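Note on this hunk: the per-branch `retries += 1` increments are consolidated into a single `finally` clause, so the counter now advances exactly once per loop iteration no matter how the try block exits (success, the https->http fallback, or a RequestException). A minimal sketch of the resulting shape, assuming the surrounding loop looks roughly like the one below; `max_retries` and the SSLError branch are illustrative, not copied from the file:

    import requests

    def fetch(url, max_retries=3, **request_params):
        # Illustrative only: the real Downloader.get has more setup than this hunk shows.
        response = requests.models.Response()
        retries = 0
        while retries < max_retries:
            try:
                response = requests.get(url, **request_params)
                break
            except requests.exceptions.SSLError:
                # drop certificate verification and retry over plain http
                request_params.pop('verify', None)
                url = url.replace('https', 'http')
            except requests.RequestException as e:
                response.reason = e.__class__.__name__
            finally:
                # single per-iteration increment, regardless of which branch ran
                retries += 1
        return response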
@@ -102,3 +102,58 @@ class Downloader:
         session.mount('http://', adapter)
         session.mount('https://', adapter)
         return session
+
+
+class RenderDownloader(Downloader):
+
+    # def get(self, url, **kw):
+    #     splash_url = 'http://8.131.72.226:8998/render.html'
+    #     args = {
+    #         'url': url,
+    #         'timeout': 60,
+    #         'wait': 0.5,
+    #     }
+    #     resp = requests.get(splash_url, params=args, headers=headers)
+    #     return resp
+
+    def get(self, url, **kw):
+        splash_url = 'http://8.131.72.226:8998/render.json'
+        args = {
+            'url': url,
+            'html': 1,
+            'iframes': 1,
+        }
+        # splash_url = 'http://8.131.72.226:8998/render.html'
+        # args = {
+        #     'url': url,
+        #     'timeout': 60,
+        #     'wait': 0.5,
+        # }
+        resp = requests.get(splash_url, params=args, headers=headers)
+        return resp
+
+
+if __name__ == '__main__':
+    render = RenderDownloader()
+    href = 'http://113.230.236.116:5002/mvvm/src/ebid/gcjs/combine/jypt.html?type=%e6%8b%9b%e6%a0%87%e5%85%ac%e5%91%8a&tpid=62c2943104c74c0e34cacef9&tpTitle=%e5%bb%ba%e5%b9%b3%e5%8e%bf%e7%ac%ac%e5%9b%9b%e5%b0%8f%e5%ad%a6%e8%bf%90%e5%8a%a8%e5%9c%ba%e5%8d%87%e7%ba%a7%e6%94%b9%e9%80%a0%e9%a1%b9%e7%9b%ae'
+    resp = render.get(href)
+    resp_json = resp.json()
+
+    for k, val in resp_json.items():
+        print(f">> {k}", val)
+
+    childFrames = resp_json['childFrames']
+    index = 0
+    for child in childFrames:
+        print(child)
+        title = child['title']
+        if len(title) == 0:
+            title = index
+            index += 1
+
+        with open(f'{title}.html', 'w') as fp:
+            fp.write(child['html'])
+
+    html = resp_json['html']
+    with open('p1.html', 'w') as fp:
+        fp.write(html)

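The new RenderDownloader delegates rendering to a Splash instance. The active get() hits the /render.json endpoint with html=1 and iframes=1, so the JSON body carries both the rendered top-level HTML and a childFrames list with each iframe's own html and title; the __main__ block simply dumps those to files. (The `headers` used in the committed get() is not defined in this hunk and must come from elsewhere in download.py.) A hedged standalone sketch of consuming the same response; the `wait` parameter and the frames helper are illustrative additions, not part of the commit:

    import requests

    SPLASH_RENDER_JSON = 'http://8.131.72.226:8998/render.json'

    def render_with_iframes(url, wait=0.5):
        resp = requests.get(SPLASH_RENDER_JSON, params={
            'url': url,
            'html': 1,      # include the rendered top-level HTML in the JSON body
            'iframes': 1,   # include one entry per child frame under 'childFrames'
            'wait': wait,   # give page scripts a moment before the snapshot
        }, timeout=60)
        data = resp.json()
        frames = {}
        for i, child in enumerate(data.get('childFrames', [])):
            name = child.get('title') or f'frame_{i}'
            frames[name] = child['html']
        return data.get('html', ''), frames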
+ 1 - 1
find_source/crawler/services/excavate.py

@@ -78,7 +78,7 @@ class DataExcavate(BasicService):
 
     def process(self, t_name: str, task: Task):
         logger.info(f'<{t_name}> - 请求 - {task["url"]}')
-        response = self.downloader.get(task['url'])
+        response = self.downloader.get(task['url'], timeout=5)
         status_code = response.status_code
         page_source = response.text
         reason = response.reason

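The excavate.py change bounds each discovery request to five seconds by passing timeout=5 through Downloader.get's **kw. The forwarding itself is not visible in this diff; the sketch below shows the assumed shape, with the default value purely illustrative:

    import requests

    class Downloader:
        def get(self, url, **kw):
            # assumed: caller keywords (e.g. timeout=5) are merged into the
            # underlying requests call, so slow hosts fail fast instead of hanging
            kw.setdefault('timeout', 10)
            return requests.get(url, **kw)

    # DataExcavate.process then effectively does:
    # response = Downloader().get(task['url'], timeout=5)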
+ 10 - 5
find_source/crawler/utils.py

@@ -3,6 +3,7 @@ from html import unescape
 from urllib.parse import urlencode, urljoin
 
 from bs4 import BeautifulSoup
+from lxml.etree import ParseError
 from lxml.html import etree, HtmlElement, fromstring, tostring
 from urllib3 import get_host
 
@@ -47,11 +48,15 @@ def extract_domain(url):
 
 
 def extract_page_title(source):
-    element = html2element(source)
-    nodes = element.xpath('/html/head/title/text()|//title/text()')
-    if len(nodes) > 1:
-        return "".join(";".join(nodes).split())
-    return "".join("".join(nodes).split())
+    node = ''
+    try:
+        element = html2element(source)
+        node = element.xpath('/html/head/title/text()|//title/text()')
+    except ParseError:
+        pass
+    if len(node) > 1:
+        return "".join(";".join(node).split())
+    return "".join("".join(node).split())
 
 
 def is_url(url):
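The utils.py change guards extract_page_title against markup that lxml cannot parse (for example an empty document), returning an empty title instead of propagating the error. A hedged, self-contained sketch of the behaviour; the broad except below stands in for the ParseError the patch actually imports from lxml.etree:

    from lxml.html import fromstring

    def page_title(source: str) -> str:
        try:
            nodes = fromstring(source).xpath('/html/head/title/text()|//title/text()')
        except Exception:  # the patch catches lxml's ParseError specifically
            nodes = []
        if len(nodes) > 1:
            return "".join(";".join(nodes).split())
        return "".join("".join(nodes).split())

    print(page_title('<html><head><title> Hello World </title></head></html>'))  # HelloWorld
    print(page_title(''))  # '' rather than an exception from html2element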