@@ -75,6 +75,98 @@ class DetailSpider(feapder.AirSpider):
     def proxy(self):
         return swordfish_proxy()

+    def get_response(self, request, response):
+        """
+        Receive the response body and record its size in ["content-length"]. If the data limit is exceeded, the receive loop is short-circuited.
+        """
+        title = request.item['title']
+        content_length = 0  # unit: bytes
+        limit = self._data_transmission_limit * MEGABYTES  # maximum amount of data to receive, in bytes (_data_transmission_limit is in MB)
+        obj = io.BytesIO()
+        with tqdm(desc=title, total=limit, unit='iB', unit_scale=True, unit_divisor=1024) as bar:
+            for r in response.iter_content(chunk_size=MEGABYTES):  # chunk_size unit: bytes
+                n = obj.write(r)
+                content_length += n
+                bar.update(n)
+                if content_length >= limit:
+                    # Receiving more data than the limit disrupts downstream processing, hence this circuit breaker
+                    break
+
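+        # Note: the streamed body has been consumed by iter_content above, so a new Response is
+        # assembled from the buffered bytes, carrying the actual received byte count in "content-length".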
+        response = Response.from_dict(
+            {
+                "url": request.url,
+                "cookies": response.cookies,
+                "_content": obj.getvalue(),
+                "status_code": response.status_code,
+                "elapsed": response.elapsed.microseconds,
+                "headers": {
+                    **response.headers,
+                    "content-length": content_length
+                }
+            }
+        )
+        return response
+
+    def ali_robots(self, request, limit=3):
+        """
+        Alibaba invisible bot verification
+
+        @param request
+        @param limit maximum number of retries
+        """
+        retries_count = 0
+        proxies = request.proxies
+        while retries_count < limit:
+            # Try to crawl with a cookie session that has passed the slider captcha
+            cookies = self.cookie_pool.create_cookies(proxy=proxies.get("http"))
+            if not cookies or len(cookies) <= 1:
+                proxies = self.proxy
+                retries_count += 1
+                continue
+
+            request.session.cookies.update(cookies)
+            self._cookies = request.session.cookies.get_dict()
+            request.proxies = proxies  # proxy for this request
+            self._proxies = proxies  # global proxy
+            break
+
+    def extract_html(self, request, response):
+        business_keyword = request.item['businessKeyWord']
+        content_length_limit = self._data_transmission_limit * MEGABYTES
+        upper_limit = response.headers['content-length'] > content_length_limit
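+        # "content-length" here is the byte count recorded by get_response, so upper_limit indicates
+        # that the body exceeded the configured limit and was likely truncated by the circuit breaker.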
+        if not upper_limit:
+            # Case 2.1: structured data, extract it directly
+            resp_json = response.json
+        else:
+            if business_keyword == 'openBidRecord':
+                return None, DataStreamReadStatus.LOSE
+
+            # Case 2.2: unstructured data
+            if self._data_transmission_limit <= 3:
+                # The larger the content (above 3 MB), the longer the first decode takes, and undecodable characters are turned into replacement characters
+                html = response.text
+            else:
+                html = response.content.decode(errors='ignore')
+
+            # Fuzzy query result; the returned data is sorted by time in descending order
+            content = tools.get_info(html, '\"object\":({.*?}),', fetch_one=True)
+            content = ":".join(content.split(':')[1:])[1:]  # [{...} -> {...}
+            if not content:
+                return None, DataStreamReadStatus.NULL
+            elif not content.endswith('}'):
+                # raise EOFError('content does not end with "}"; the content is incomplete, discard it')
+                return None, DataStreamReadStatus.LOSE
+            else:
+                ret = tools.repair_json(content)
+                resp_json = {
+                    "message": "",
+                    "success": True,
+                    "object": {business_keyword: [ret]}
+                }
+
+        html = splicing(business_keyword, resp_json)
+        return html, DataStreamReadStatus.NORMAL
+
     def start_callback(self):
         self._data_transmission_limit = 15  # upper limit on the amount of data received, unit: MB; it is recommended not to exceed 3 MB
         self._proxies = None  # global proxy

@@ -186,98 +278,6 @@ class DetailSpider(feapder.AirSpider):
         self.to_db.add("data_bak", insert)
         log.info(f"{msg}--crawled successfully, status: {state.value}")

-    def get_response(self, request, response):
-        """
-        Receive the response body and record its size in ["content-length"]. If the data limit is exceeded, the receive loop is short-circuited.
-        """
-        title = request.item['title']
-        content_length = 0  # unit: bytes
-        limit = self._data_transmission_limit * MEGABYTES  # maximum amount of data to receive, in bytes (_data_transmission_limit is in MB)
-        obj = io.BytesIO()
-        with tqdm(desc=title, total=limit, unit='iB', unit_scale=True, unit_divisor=1024) as bar:
-            for r in response.iter_content(chunk_size=MEGABYTES):  # chunk_size unit: bytes
-                n = obj.write(r)
-                content_length += n
-                bar.update(n)
-                if content_length >= limit:
-                    # Receiving more data than the limit disrupts downstream processing, hence this circuit breaker
-                    break
-
-        response = Response.from_dict(
-            {
-                "url": request.url,
-                "cookies": response.cookies,
-                "_content": obj.getvalue(),
-                "status_code": response.status_code,
-                "elapsed": response.elapsed.microseconds,
-                "headers": {
-                    **response.headers,
-                    "content-length": content_length
-                }
-            }
-        )
-        return response
-
-    def ali_robots(self, request, limit=3):
-        """
-        Alibaba invisible bot verification
-
-        @param request
-        @param limit maximum number of retries
-        """
-        retries_count = 0
-        proxies = request.proxies
-        while retries_count < limit:
-            # Try to crawl with a cookie session that has passed the slider captcha
-            cookies = self.cookie_pool.create_cookies(proxy=proxies.get("http"))
-            if not cookies or len(cookies) <= 1:
-                proxies = self.proxy
-                retries_count += 1
-                continue
-
-            request.session.cookies.update(cookies)
-            self._cookies = request.session.cookies.get_dict()
-            request.proxies = proxies  # proxy for this request
-            self._proxies = proxies  # global proxy
-            break
-
-    def extract_html(self, request, response):
-        business_keyword = request.item['businessKeyWord']
-        content_length_limit = self._data_transmission_limit * MEGABYTES
-        upper_limit = response.headers['content-length'] > content_length_limit
-        if not upper_limit:
-            # Case 2.1: structured data, extract it directly
-            resp_json = response.json
-        else:
-            if business_keyword == 'openBidRecord':
-                return None, DataStreamReadStatus.LOSE
-
-            # Case 2.2: unstructured data
-            if self._data_transmission_limit <= 3:
-                # The larger the content (above 3 MB), the longer the first decode takes, and undecodable characters are turned into replacement characters
-                html = response.text
-            else:
-                html = response.content.decode(errors='ignore')
-
-            # Fuzzy query result; the returned data is sorted by time in descending order
-            content = tools.get_info(html, '\"object\":({.*?}),', fetch_one=True)
-            content = ":".join(content.split(':')[1:])[1:]  # [{...} -> {...}
-            if not content:
-                return None, DataStreamReadStatus.NULL
-            elif not content.endswith('}'):
-                # raise EOFError('content does not end with "}"; the content is incomplete, discard it')
-                return None, DataStreamReadStatus.LOSE
-            else:
-                ret = tools.repair_json(content)
-                resp_json = {
-                    "message": "",
-                    "success": True,
-                    "object": {business_keyword: [ret]}
-                }
-
-        html = splicing(business_keyword, resp_json)
-        return html, DataStreamReadStatus.NORMAL
-

 if __name__ == "__main__":
     while True: