dongzhaorui 2 years ago
parent commit be90b8be69
1 changed file with 92 additions and 92 deletions

+ 92 - 92
zgztb_cookie/detail_firefox.py

@@ -75,6 +75,98 @@ class DetailSpider(feapder.AirSpider):
     def proxy(self):
         return swordfish_proxy()
 
+    def get_response(self, request, response):
+        """
+            Receive the response body and record its size in headers["content-length"]. If the size exceeds the data cap, break off the receiving process.
+        """
+        title = request.item['title']
+        content_length = 0  # unit: bytes
+        limit = self._data_transmission_limit * MEGABYTES  # cap on received data (_data_transmission_limit is in MB)
+        obj = io.BytesIO()
+        with tqdm(desc=title, total=limit, unit='iB', unit_scale=True, unit_divisor=1024) as bar:
+            for r in response.iter_content(chunk_size=MEGABYTES):  # chunk_size unit: bytes
+                n = obj.write(r)
+                content_length += n
+                bar.update(n)
+                if content_length >= limit:
+                    # Data received beyond the cap disrupts downstream processing, so break off here
+                    break
+
+        response = Response.from_dict(
+            {
+                "url": request.url,
+                "cookies": response.cookies,
+                "_content": obj.getvalue(),
+                "status_code": response.status_code,
+                "elapsed": response.elapsed.microseconds,
+                "headers": {
+                    **response.headers,
+                    "content-length": content_length
+                }
+            }
+        )
+        return response
+
+    def ali_robots(self, request, limit=3):
+        """
+        Alibaba frictionless (invisible) bot verification
+
+        @param request
+        @param limit maximum number of retries
+        """
+        retries_count = 0
+        proxies = request.proxies
+        while retries_count < limit:
+            # Try to crawl with the cookie session obtained by passing the slider verification
+            cookies = self.cookie_pool.create_cookies(proxy=proxies.get("http"))
+            if not cookies or len(cookies) <= 1:
+                proxies = self.proxy
+                retries_count += 1
+                continue
+
+            request.session.cookies.update(cookies)
+            self._cookies = request.session.cookies.get_dict()
+            request.proxies = proxies  # proxy bound to this request
+            self._proxies = proxies  # global proxy
+            break
+
+    def extract_html(self, request, response):
+        business_keyword = request.item['businessKeyWord']
+        content_length_limit = self._data_transmission_limit * MEGABYTES
+        upper_limit = response.headers['content-length'] > content_length_limit
+        if not upper_limit:
+            # Case 2.1: structured data, extract it directly
+            resp_json = response.json
+        else:
+            if business_keyword == 'openBidRecord':
+                return None, DataStreamReadStatus.LOSE
+
+            # Case 2.2: unstructured data
+            if self._data_transmission_limit <= 3:
+                # The larger the content (above 3 MB), the longer the first decode takes, and undecodable characters are converted into replacement characters
+                html = response.text
+            else:
+                html = response.content.decode(errors='ignore')
+
+            # Fuzzy-query result; the returned data is sorted by time in descending order
+            content = tools.get_info(html, '\"object\":({.*?}),', fetch_one=True)
+            content = ":".join(content.split(':')[1:])[1:]  # [{...} -> {...}
+            if not content:
+                return None, DataStreamReadStatus.NULL
+            elif not content.endswith('}'):
+                # raise EOFError('content does not end with "}", file content is incomplete, discard it')
+                return None, DataStreamReadStatus.LOSE
+            else:
+                ret = tools.repair_json(content)
+                resp_json = {
+                    "message": "",
+                    "success": True,
+                    "object": {business_keyword: [ret]}
+                }
+
+        html = splicing(business_keyword, resp_json)
+        return html, DataStreamReadStatus.NORMAL
+
     def start_callback(self):
         self._data_transmission_limit = 15  # upper limit on received data, in MB; recommended not to exceed 3 MB
         self._proxies = None  # global proxy
@@ -186,98 +278,6 @@ class DetailSpider(feapder.AirSpider):
             self.to_db.add("data_bak", insert)
             log.info(f"{msg}--crawl succeeded, state: {state.value}")
 
-    def get_response(self, request, response):
-        """
-            Receive the response body and record its size in headers["content-length"]. If the size exceeds the data cap, break off the receiving process.
-        """
-        title = request.item['title']
-        content_length = 0  # unit: bytes
-        limit = self._data_transmission_limit * MEGABYTES  # cap on received data (_data_transmission_limit is in MB)
-        obj = io.BytesIO()
-        with tqdm(desc=title, total=limit, unit='iB', unit_scale=True, unit_divisor=1024) as bar:
-            for r in response.iter_content(chunk_size=MEGABYTES):  # chunk_size unit: bytes
-                n = obj.write(r)
-                content_length += n
-                bar.update(n)
-                if content_length >= limit:
-                    # Data received beyond the cap disrupts downstream processing, so break off here
-                    break
-
-        response = Response.from_dict(
-            {
-                "url": request.url,
-                "cookies": response.cookies,
-                "_content": obj.getvalue(),
-                "status_code": response.status_code,
-                "elapsed": response.elapsed.microseconds,
-                "headers": {
-                    **response.headers,
-                    "content-length": content_length
-                }
-            }
-        )
-        return response
-
-    def ali_robots(self, request, limit=3):
-        """
-        Alibaba frictionless (invisible) bot verification
-
-        @param request
-        @param limit maximum number of retries
-        """
-        retries_count = 0
-        proxies = request.proxies
-        while retries_count < limit:
-            # Try to crawl with the cookie session obtained by passing the slider verification
-            cookies = self.cookie_pool.create_cookies(proxy=proxies.get("http"))
-            if not cookies or len(cookies) <= 1:
-                proxies = self.proxy
-                retries_count += 1
-                continue
-
-            request.session.cookies.update(cookies)
-            self._cookies = request.session.cookies.get_dict()
-            request.proxies = proxies  # proxy bound to this request
-            self._proxies = proxies  # global proxy
-            break
-
-    def extract_html(self, request, response):
-        business_keyword = request.item['businessKeyWord']
-        content_length_limit = self._data_transmission_limit * MEGABYTES
-        upper_limit = response.headers['content-length'] > content_length_limit
-        if not upper_limit:
-            # Case 2.1: structured data, extract it directly
-            resp_json = response.json
-        else:
-            if business_keyword == 'openBidRecord':
-                return None, DataStreamReadStatus.LOSE
-
-            # Case 2.2: unstructured data
-            if self._data_transmission_limit <= 3:
-                # The larger the content (above 3 MB), the longer the first decode takes, and undecodable characters are converted into replacement characters
-                html = response.text
-            else:
-                html = response.content.decode(errors='ignore')
-
-            # Fuzzy-query result; the returned data is sorted by time in descending order
-            content = tools.get_info(html, '\"object\":({.*?}),', fetch_one=True)
-            content = ":".join(content.split(':')[1:])[1:]  # [{...} -> {...}
-            if not content:
-                return None, DataStreamReadStatus.NULL
-            elif not content.endswith('}'):
-                # raise EOFError('content does not end with "}", file content is incomplete, discard it')
-                return None, DataStreamReadStatus.LOSE
-            else:
-                ret = tools.repair_json(content)
-                resp_json = {
-                    "message": "",
-                    "success": True,
-                    "object": {business_keyword: [ret]}
-                }
-
-        html = splicing(business_keyword, resp_json)
-        return html, DataStreamReadStatus.NORMAL
-
 
 if __name__ == "__main__":
     while True:
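
The get_response helper moved in this commit streams the response body in 1 MB chunks into an in-memory buffer, breaks off once the configured cap is reached, and then repacks the data into a new Response whose "content-length" header records how many bytes were actually kept. Below is a minimal standalone sketch of that capped streaming-read pattern using plain requests; the URL, the 15 MB default, and the read_capped name are illustrative assumptions, and the tqdm progress bar from the spider is omitted.

import io

import requests

MEGABYTES = 1024 * 1024  # bytes per MiB, mirroring the constant used in the diff


def read_capped(url, limit_mb=15):
    """Stream a response body into memory, stopping once limit_mb MiB have been read."""
    limit = limit_mb * MEGABYTES
    buf = io.BytesIO()
    received = 0
    with requests.get(url, stream=True, timeout=30) as resp:
        resp.raise_for_status()
        for chunk in resp.iter_content(chunk_size=MEGABYTES):
            received += buf.write(chunk)  # write() returns the number of bytes written
            if received >= limit:
                # An oversized body disrupts downstream parsing, so break off early,
                # just as the spider does before rebuilding its Response object.
                break
    return buf.getvalue()


if __name__ == "__main__":
    data = read_capped("https://example.com/large-response")  # placeholder URL
    print(f"received {len(data)} bytes")

Keeping the byte counter and the cap in the same unit (bytes) is what later lets extract_html compare headers['content-length'] against the same limit and treat an over-cap body as truncated, unstructured content.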