1 tahun lalu · 12e22334a8
--- a/zgztb_cookie/detail_firefox.py
+++ b/zgztb_cookie/detail_firefox.py
@@ -144,13 +144,29 @@ class DetailSpider(feapder.AirSpider):
 
				             self._proxies = proxies  # 全局代理
			
 
				             break
			
 
				 
			
 
				+    def unpack_large_content(self, response):  # Return the body of `response` as text; the decoding path depends on self._data_transmission_limit.
			
 
				+        if self._data_transmission_limit <= 3:  # the limit is expressed in megabytes (extract_html multiplies it by MEGABYTES)
			
 
				+            # The larger the content (over 3 MB), the longer the first decode takes, and decoding turns unrecognizable characters into replacement characters
			
 
				+            text = response.text
			
 
				+        else:
			
 
				+            text = response.content.decode(errors='ignore')  # decode response.content with the default codec, dropping undecodable bytes instead of raising
			
 
				+        return text
			
 
				+
			
 
				+    def unpack_large_json(self, response):  # Return the JSON payload of `response`; the decoding path depends on self._data_transmission_limit.
			
 
				+        if self._data_transmission_limit <= 3:  # same threshold as unpack_large_content
			
 
				+            resp_json = response.json  # accessed as an attribute, not called
			
 
				+        else:
			
 
				+            body = response.content.decode(errors='ignore')  # decode response.content with the default codec, dropping undecodable bytes instead of raising
			
 
				+            resp_json = tools.get_json(body)  # presumably parses the JSON text into Python objects - TODO confirm against tools.get_json
			
 
				+        return resp_json
			
 
				+
			
 
				     def extract_html(self, request, response):
			
 
				         business_keyword = request.item['businessKeyWord']
			
 
				         content_length_limit = self._data_transmission_limit * MEGABYTES
			
 
				         upper_limit = response.headers['content-length'] > content_length_limit
			
 
				         if not upper_limit:
			
 
				             # 情况2.1：结构化数据，直接提取数据
			
 
				-            resp_json = response.json
			
 
				+            resp_json = self.unpack_large_json(response)
			
 
				             try:
			
 
				                 data_lst = resp_json['object'][business_keyword]
			
 
				                 if isinstance(data_lst, list) and len(data_lst) == 0:
			
@@ -161,16 +177,11 @@ class DetailSpider(feapder.AirSpider):
 
				                 # 该项目发生变更,分类名称发生变更,不检查
			
 
				                 pass
			
 
				         else:
			
 
				+            # 情况2.2：非结构化数据
			
 
				             if business_keyword == 'openBidRecord':
			
 
				                 return None, DataStreamReadStatus.LOSE
			
 
				 
			
 
				-            # 情况2.2：非结构化数据
			
 
				-            if self._data_transmission_limit <= 3:
			
 
				-                # 数据的内容越大（3M以上）首次解码耗时越长，且解码时会将无法识别的字符转换成替换字符
			
 
				-                html = response.text
			
 
				-            else:
			
 
				-                html = response.content.decode(errors='ignore')
			
 
				-
			
 
				+            html = self.unpack_large_content(response)
			
 
				             # 模糊查询结果，返回的数据内容是按照时间降序排列
			
 
				             content = tools.get_info(html, '\"object\":({.*?}),', fetch_one=True)
			
 
				             content = ":".join(content.split(':')[1:])[1:]  # [{...} -> {...}