Jelajahi Sumber

修复内容超过3M的json文本序列化阻塞问题

dongzhaorui 1 tahun lalu
induk
melakukan
12e22334a8
1 mengubah file dengan 19 tambahan dan 8 penghapusan
  1. 19 8
      zgztb_cookie/detail_firefox.py

+ 19 - 8
zgztb_cookie/detail_firefox.py

@@ -144,13 +144,29 @@ class DetailSpider(feapder.AirSpider):
             self._proxies = proxies  # 全局代理
             break
 
+    def unpack_large_content(self, response):
+        if self._data_transmission_limit <= 3:
+            # 数据的内容越大(3M以上)首次解码耗时越长,且解码时会将无法识别的字符转换成替换字符
+            text = response.text
+        else:
+            text = response.content.decode(errors='ignore')
+        return text
+
+    def unpack_large_json(self, response):
+        if self._data_transmission_limit <= 3:
+            resp_json = response.json
+        else:
+            body = response.content.decode(errors='ignore')
+            resp_json = tools.get_json(body)
+        return resp_json
+
     def extract_html(self, request, response):
         business_keyword = request.item['businessKeyWord']
         content_length_limit = self._data_transmission_limit * MEGABYTES
         upper_limit = response.headers['content-length'] > content_length_limit
         if not upper_limit:
             # 情况2.1:结构化数据,直接提取数据
-            resp_json = response.json
+            resp_json = self.unpack_large_json(response)
             try:
                 data_lst = resp_json['object'][business_keyword]
                 if isinstance(data_lst, list) and len(data_lst) == 0:
@@ -161,16 +177,11 @@ class DetailSpider(feapder.AirSpider):
                 # 该项目发生变更,分类名称发生变更,不检查
                 pass
         else:
+            # 情况2.2:非结构化数据
             if business_keyword == 'openBidRecord':
                 return None, DataStreamReadStatus.LOSE
 
-            # 情况2.2:非结构化数据
-            if self._data_transmission_limit <= 3:
-                # 数据的内容越大(3M以上)首次解码耗时越长,且解码时会将无法识别的字符转换成替换字符
-                html = response.text
-            else:
-                html = response.content.decode(errors='ignore')
-
+            html = self.unpack_large_content(response)
             # 模糊查询结果,返回的数据内容是按照时间降序排列
             content = tools.get_info(html, '\"object\":({.*?}),', fetch_one=True)
             content = ":".join(content.split(':')[1:])[1:]  # [{...} -> {...}