|
@@ -144,13 +144,29 @@ class DetailSpider(feapder.AirSpider):
|
|
|
self._proxies = proxies # 全局代理
|
|
|
break
|
|
|
|
|
|
def unpack_large_content(self, response):
    """Return the body of ``response`` as text.

    The decoding strategy depends on ``self._data_transmission_limit``:

    * limit <= 3: use ``response.text``. Original author's note: the larger
      the payload (above 3M), the longer the first decode takes, and that
      decode turns unrecognised characters into replacement characters.
    * limit > 3: decode ``response.content`` directly with the default
      codec, silently dropping anything that cannot be decoded
      (``errors='ignore'``).

    :param response: object exposing ``text`` and ``content``
        (presumably a feapder response -- confirm against the caller).
    :return: the decoded body.
    """
    if self._data_transmission_limit <= 3:
        return response.text

    # Large transfers: decode the raw payload ourselves and discard
    # undecodable data instead of going through ``response.text``.
    return response.content.decode(errors='ignore')
|
|
|
+
|
|
|
def unpack_large_json(self, response):
    """Return the body of ``response`` parsed as JSON.

    The parsing strategy depends on ``self._data_transmission_limit``:

    * limit <= 3: return ``response.json`` as provided by the response
      object.
    * limit > 3: decode ``response.content`` with the default codec,
      dropping undecodable data (``errors='ignore'``), and hand the
      resulting string to ``tools.get_json``.

    :param response: object exposing ``json`` and ``content``
        (presumably a feapder response -- confirm against the caller).
    :return: the parsed JSON structure.
    """
    if self._data_transmission_limit <= 3:
        return response.json

    # Large transfers: decode the raw payload ourselves, then parse it.
    raw_text = response.content.decode(errors='ignore')
    return tools.get_json(raw_text)
|
|
|
+
|
|
|
def extract_html(self, request, response):
|
|
|
business_keyword = request.item['businessKeyWord']
|
|
|
content_length_limit = self._data_transmission_limit * MEGABYTES
|
|
|
upper_limit = response.headers['content-length'] > content_length_limit
|
|
|
if not upper_limit:
|
|
|
# 情况2.1:结构化数据,直接提取数据
|
|
|
- resp_json = response.json
|
|
|
+ resp_json = self.unpack_large_json(response)
|
|
|
try:
|
|
|
data_lst = resp_json['object'][business_keyword]
|
|
|
if isinstance(data_lst, list) and len(data_lst) == 0:
|
|
@@ -161,16 +177,11 @@ class DetailSpider(feapder.AirSpider):
|
|
|
# 该项目发生变更,分类名称发生变更,不检查
|
|
|
pass
|
|
|
else:
|
|
|
+ # 情况2.2:非结构化数据
|
|
|
if business_keyword == 'openBidRecord':
|
|
|
return None, DataStreamReadStatus.LOSE
|
|
|
|
|
|
- # 情况2.2:非结构化数据
|
|
|
- if self._data_transmission_limit <= 3:
|
|
|
- # 数据的内容越大(3M以上)首次解码耗时越长,且解码时会将无法识别的字符转换成替换字符
|
|
|
- html = response.text
|
|
|
- else:
|
|
|
- html = response.content.decode(errors='ignore')
|
|
|
-
|
|
|
+ html = self.unpack_large_content(response)
|
|
|
# 模糊查询结果,返回的数据内容是按照时间降序排列
|
|
|
content = tools.get_info(html, '\"object\":({.*?}),', fetch_one=True)
|
|
|
content = ":".join(content.split(':')[1:])[1:] # [{...} -> {...}
|