dongzhaorui 3 years ago
parent
commit
cae4797773

+ 56 - 1
find_source/crawler/download.py

@@ -72,9 +72,9 @@ class Downloader:
                     if 'verify' in request_params:
                         del request_params['verify']
                     url = url.replace('https', 'http')
-                retries += 1
             except requests.RequestException as e:
                 response.reason = e.__class__.__name__
+            finally:
                 retries += 1
         if not disable_debug_log:
             t_name = threading.currentThread().getName()
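Note on this hunk: the per-branch `retries += 1` increments are consolidated into a single `finally` clause, so the counter now advances exactly once per loop iteration no matter how the try block exits (success, the https->http fallback, or a RequestException). A minimal sketch of the resulting shape, assuming the surrounding loop looks roughly like the one below; `max_retries` and the SSLError branch are illustrative, not copied from the file:

    import requests

    def fetch(url, max_retries=3, **request_params):
        # Illustrative only: the real Downloader.get has more setup than this hunk shows.
        response = requests.models.Response()
        retries = 0
        while retries < max_retries:
            try:
                response = requests.get(url, **request_params)
                break
            except requests.exceptions.SSLError:
                # drop certificate verification and retry over plain http
                request_params.pop('verify', None)
                url = url.replace('https', 'http')
            except requests.RequestException as e:
                response.reason = e.__class__.__name__
            finally:
                # single per-iteration increment, regardless of which branch ran
                retries += 1
        return response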
@@ -102,3 +102,58 @@ class Downloader:
         session.mount('http://', adapter)
         session.mount('https://', adapter)
         return session
+
+
+class RenderDownloader(Downloader):
+
+    # def get(self, url, **kw):
+    #     splash_url = 'http://8.131.72.226:8998/render.html'
+    #     args = {
+    #         'url': url,
+    #         'timeout': 60,
+    #         'wait': 0.5,
+    #     }
+    #     resp = requests.get(splash_url, params=args, headers=headers)
+    #     return resp
+
+    def get(self, url, **kw):
+        splash_url = 'http://8.131.72.226:8998/render.json'
+        args = {
+            'url': url,
+            'html': 1,
+            'iframes': 1,
+        }
+        # splash_url = 'http://8.131.72.226:8998/render.html'
+        # args = {
+        #     'url': url,
+        #     'timeout': 60,
+        #     'wait': 0.5,
+        # }
+        resp = requests.get(splash_url, params=args, headers=headers)
+        return resp
+
+
+if __name__ == '__main__':
+    render = RenderDownloader()
+    href = 'http://113.230.236.116:5002/mvvm/src/ebid/gcjs/combine/jypt.html?type=%e6%8b%9b%e6%a0%87%e5%85%ac%e5%91%8a&tpid=62c2943104c74c0e34cacef9&tpTitle=%e5%bb%ba%e5%b9%b3%e5%8e%bf%e7%ac%ac%e5%9b%9b%e5%b0%8f%e5%ad%a6%e8%bf%90%e5%8a%a8%e5%9c%ba%e5%8d%87%e7%ba%a7%e6%94%b9%e9%80%a0%e9%a1%b9%e7%9b%ae'
+    resp = render.get(href)
+    resp_json = resp.json()
+
+    for k, val in resp_json.items():
+        print(f">> {k}", val)
+
+    childFrames = resp_json['childFrames']
+    index = 0
+    for child in childFrames:
+        print(child)
+        title = child['title']
+        if len(title) == 0:
+            title = index
+            index += 1
+
+        with open(f'{title}.html', 'w') as fp:
+            fp.write(child['html'])
+
+    html = resp_json['html']
+    with open('p1.html', 'w') as fp:
+        fp.write(html)

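The new RenderDownloader delegates rendering to a Splash instance. The active get() hits the /render.json endpoint with html=1 and iframes=1, so the JSON body carries both the rendered top-level HTML and a childFrames list with each iframe's own html and title; the __main__ block simply dumps those to files. (The `headers` used in the committed get() is not defined in this hunk and must come from elsewhere in download.py.) A hedged standalone sketch of consuming the same response; the `wait` parameter and the frames helper are illustrative additions, not part of the commit:

    import requests

    SPLASH_RENDER_JSON = 'http://8.131.72.226:8998/render.json'

    def render_with_iframes(url, wait=0.5):
        resp = requests.get(SPLASH_RENDER_JSON, params={
            'url': url,
            'html': 1,      # include the rendered top-level HTML in the JSON body
            'iframes': 1,   # include one entry per child frame under 'childFrames'
            'wait': wait,   # give page scripts a moment before the snapshot
        }, timeout=60)
        data = resp.json()
        frames = {}
        for i, child in enumerate(data.get('childFrames', [])):
            name = child.get('title') or f'frame_{i}'
            frames[name] = child['html']
        return data.get('html', ''), frames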
+ 1 - 1
find_source/crawler/services/excavate.py

@@ -78,7 +78,7 @@ class DataExcavate(BasicService):
 
     def process(self, t_name: str, task: Task):
         logger.info(f'<{t_name}> - 请求 - {task["url"]}')
-        response = self.downloader.get(task['url'])
+        response = self.downloader.get(task['url'], timeout=5)
         status_code = response.status_code
         page_source = response.text
         reason = response.reason

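The excavate.py change bounds each discovery request to five seconds by passing timeout=5 through Downloader.get's **kw. The forwarding itself is not visible in this diff; the sketch below shows the assumed shape, with the default value purely illustrative:

    import requests

    class Downloader:
        def get(self, url, **kw):
            # assumed: caller keywords (e.g. timeout=5) are merged into the
            # underlying requests call, so slow hosts fail fast instead of hanging
            kw.setdefault('timeout', 10)
            return requests.get(url, **kw)

    # DataExcavate.process then effectively does:
    # response = Downloader().get(task['url'], timeout=5)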
+ 10 - 5
find_source/crawler/utils.py

@@ -3,6 +3,7 @@ from html import unescape
 from urllib.parse import urlencode, urljoin
 
 from bs4 import BeautifulSoup
+from lxml.etree import ParseError
 from lxml.html import etree, HtmlElement, fromstring, tostring
 from urllib3 import get_host
 
@@ -47,11 +48,15 @@ def extract_domain(url):
 
 
 def extract_page_title(source):
-    element = html2element(source)
-    nodes = element.xpath('/html/head/title/text()|//title/text()')
-    if len(nodes) > 1:
-        return "".join(";".join(nodes).split())
-    return "".join("".join(nodes).split())
+    node = ''
+    try:
+        element = html2element(source)
+        node = element.xpath('/html/head/title/text()|//title/text()')
+    except ParseError:
+        pass
+    if len(node) > 1:
+        return "".join(";".join(node).split())
+    return "".join("".join(node).split())
 
 
 def is_url(url):
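The utils.py change guards extract_page_title against markup that lxml cannot parse (for example an empty document), returning an empty title instead of propagating the error. A hedged, self-contained sketch of the behaviour; the broad except below stands in for the ParseError the patch actually imports from lxml.etree:

    from lxml.html import fromstring

    def page_title(source: str) -> str:
        try:
            nodes = fromstring(source).xpath('/html/head/title/text()|//title/text()')
        except Exception:  # the patch catches lxml's ParseError specifically
            nodes = []
        if len(nodes) > 1:
            return "".join(";".join(nodes).split())
        return "".join("".join(nodes).split())

    print(page_title('<html><head><title> Hello World </title></head></html>'))  # HelloWorld
    print(page_title(''))  # '' rather than an exception from html2element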