ソースを参照

添加删除页面源码中图片或者base64图片编码规则

dongzhaorui 2 年前
コミット
9989b4715a
2 ファイル変更、66 行追加、55 行削除
  1. +27 -22
      zgztb_cookie/detail_firefox.py
  2. +39 -33
      zgztb_cookie/detail_normol.py

+ 27 - 22
zgztb_cookie/detail_firefox.py

@@ -17,6 +17,7 @@ import feapder
 import feapder.utils.tools as tools
 from cookie_pool import WebCookiePool
 from encode_info import encode_info
+from feapder import Item
 from feapder.db.mongodb import MongoDB
 from feapder.network.proxy_pool import swordfish_proxy
 from feapder.network.request import requests
@@ -60,7 +61,6 @@ class DetailSpider(feapder.AirSpider):
     cookie_pool = WebCookiePool(redis_key='zgztbcookie',
                                 page_url="http://www.cebpubservice.com/ctpsp_iiss/SecondaryAction/findDetails.do")
     _to_db = None
-    db_name = 'zgzb_list'
 
     @property
     def to_db(self):
@@ -168,15 +168,15 @@ class DetailSpider(feapder.AirSpider):
         self._data_transmission_limit = 15  # 数据传输内容接收上限,单位:M, 建议不要超过3M
         self._proxies = None  # 全局代理
         self._cookies = None  # 全局浏览器信息
+        self._coll_name = 'data_bak'  # 生产表
+        self._task_coll_name = 'zgzb_list'  # 任务表
+        self._ignore = ['_id', 'type', 'businessKeyWord', 'rowGuid']
 
     def start_requests(self):
-        task_lst = self.to_db.find(self.db_name,
-                                   {"type": "0", "timeout": None},
-                                   sort={"_id": -1},
-                                   limit=100)
         self._proxies = self.proxy
+        q = {"type": "0", "timeout": None}
+        task_lst = self.to_db.find(self._task_coll_name, q, sort={"_id": -1}, limit=100)
         for item in task_lst:
-            item.pop("rowGuid", None)
             schemaversion = item.pop("schemaVersion")
             try:
                 businessid, tenderprojectcode, _ = item['href'].split("&")
@@ -221,11 +221,11 @@ class DetailSpider(feapder.AirSpider):
                 request.session.cookies.clear_session_cookies()
 
     def parse(self, request, response):
-        item = request.item
+        task_item = request.item
         meta = request.meta
         msg = meta['msg']
         href = f"http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/showDetails.do#uuid={meta['uuid']}"
-        item["href"] = href
+        task_item["href"] = href
         request = request.copy()  # 复制request实例,复用session会话
         response = self.get_response(request, response)  # 接收数据
         if response.is_html:
@@ -250,7 +250,9 @@ class DetailSpider(feapder.AirSpider):
                 yield request
         else:
             contenthtml, state = self.extract_html(request, response)
-            item["contenthtml"] = contenthtml or ''
+            # 删除页面中的图片或者base64
+            contenthtml = tools.re.sub('data:image/(.*?)["|\']', '', contenthtml)
+            task_item["contenthtml"] = contenthtml or ''
 
             # 源码清洗
             special = {
@@ -258,22 +260,25 @@ class DetailSpider(feapder.AirSpider):
                 '</body[^>]*>|]]>': '',
             }
             detail = cleaner(contenthtml, special=special) if contenthtml else None
-            item["detail"] = detail or ''
+            task_item["detail"] = detail or ''
 
             # 汉字数量检查
             flag = "false" if tools.chinese_character(detail).total >= 20 else "true"
-            item["sendflag"] = flag
-
-            # 更新采集任务状态
-            update_data = {"timeout": state.value}
-            self.to_db.update(self.db_name, update_data, {"_id": meta['_id']})
-
-            # 数据推送生产库
-            ignore = ['_id', 'type', 'businessKeyWord']
-            insert = {k: v for k, v in item.items() if k not in ignore}
-            insert['comeintime'] = tools.ensure_int64(int(time.time()))
-            self.to_db.add("data_bak", insert)
-            log.info(f"{msg}--采集成功,状态:{state.value}")
+            task_item["sendflag"] = flag
+
+            # 保存data_bak
+            data = {k: v for k, v in task_item.items() if k not in self._ignore}
+            data['comeintime'] = tools.ensure_int64(tools.get_current_timestamp())
+            item = Item(**data)
+            item.table_name = self._coll_name
+            yield item
+            log.info(f"{msg} --上传成功,状态:{state.value}")
+
+            # 更新任务表
+            item = Item(timeout=state.value, _id=meta['_id'])
+            item.table_name = self._task_coll_name
+            yield item.to_UpdateItem()
+            log.debug(f"{msg} --采集完成")
 
 
 if __name__ == "__main__":

+ 39 - 33
zgztb_cookie/detail_normol.py

@@ -6,21 +6,18 @@ Created on 2021-12-13 13:25:15
 ---------
 @author: 马国鹏
 """
-import time
-
 from lxml import etree
 
 import feapder
 import feapder.utils.tools as tools
+from feapder import Item
 from feapder.db.mongodb import MongoDB
 from feapder.utils.cleaner import cleaner
 from feapder.utils.log import log
 
 
-class Details(feapder.Spider):
+class DetailSpider(feapder.Spider):
     _to_db = None
-    db_name = 'zgzb_list'
-    send_list = []
 
     # 定义mongo链接
     @property
@@ -29,17 +26,19 @@ class Details(feapder.Spider):
             self._to_db = MongoDB()
         return self._to_db
 
+    def start_callback(self):
+        self._coll_name = 'data_bak'  # 生产表
+        self._task_coll_name = 'zgzb_list'  # 任务表
+        self._ignore = ['_id', 'type', 'schemaVersion', 'businessKeyWord']  # 上传MongoDB需要忽略的字段
+
     def start_requests(self):
-        list_page_datas = self.to_db.find(
-            self.db_name,
-            {"type": "1", "timeout": {"$exists": 0}},
-            sort={"_id": -1},
-            limit=100)
-
-        for item in list_page_datas:
-            item.pop("businessKeyWord", None)  # 删除字段businessKeyWord
-            rowguid = item.pop("rowGuid")
-            url = f'http://connect.cebpubservice.com/PSPFrame/infobasemis/socialpublic/publicyewu/Frame_yewuDetail?rowguid={rowguid}'
+        q = {"type": "1", "timeout": None}
+        task_lst = self.to_db.find(self._task_coll_name, q, sort={"_id": -1}, limit=100)
+        for item in task_lst:
+            row_guid = item.pop("rowGuid")
+            if not row_guid:
+                continue
+            url = f'http://connect.cebpubservice.com/PSPFrame/infobasemis/socialpublic/publicyewu/Frame_yewuDetail?rowguid={row_guid}'
             yield feapder.Request(url, splash=True, render_time=3, iframes=1, item=item)
 
     def validate(self, request, response):
@@ -52,9 +51,9 @@ class Details(feapder.Spider):
             # print(request.href)
             return
 
-        item = request.item
-        msg = f"{item['title']} - {item['publishtime']}"
-        taskid = item.get("_id")
+        task_item = request.item
+        msg = f"{task_item['channel']}-{task_item['title']}-{task_item['publishtime']}"
+        taskid = task_item.get("_id")
         if not taskid:
             # print(item)
             return
@@ -78,25 +77,32 @@ class Details(feapder.Spider):
         if len(htmls) > 0:
             html = '\n'.join([etree.tounicode(html) for html in htmls])
 
-        item["contenthtml"] = html
-        item["detail"] = cleaner(html)
-        item["href"] = request.url
-        if tools.chinese_character(item["detail"]).total == 0:
+        # 清洗图片或者base64编码
+        html = tools.re.sub('data:image/(.*?)["|\']', '', html)
+        task_item["contenthtml"] = html
+        task_item["detail"] = cleaner(html)
+        task_item["href"] = request.url
+        if tools.chinese_character(task_item["detail"]).total == 0:
             # 无正文内容时,该内容直接标记true, 不在被统计
-            item["sendflag"] = "true"
+            task_item["sendflag"] = "true"
 
-        # 更新任务表
-        self.to_db.update(self.db_name, {"timeout": 3}, {"_id": taskid})
         # 保存data_bak
-        ignore = ['_id', 'type', 'schemaVersion']
-        insert = {k: v for k, v in item.items() if k not in ignore}
-        insert['comeintime'] = tools.ensure_int64(int(time.time()))
-        self.to_db.add("data_bak", insert)
-        log.info(f'splash >>> {msg} --采集成功')
+        data = {k: v for k, v in task_item.items() if k not in self._ignore}
+        data['comeintime'] = tools.ensure_int64(tools.get_current_timestamp())
+        item = Item(**data)
+        item.table_name = self._coll_name
+        yield item
+        log.info(f'splash >>> {msg} --上传成功')
+
+        # 更新任务表
+        item = Item(timeout=2, _id=taskid)  # 2=正常数据
+        item.table_name = self._task_coll_name
+        yield item.to_UpdateItem()
+        log.debug(f'splash >>> {msg} --采集完成')
 
 
 if __name__ == "__main__":
     while True:
-        apider = Details(redis_key="splish:zgzb:detail")
-        apider.start()
-        apider.join()
+        spider = DetailSpider(redis_key="splish:zgzb:detail")
+        spider.start()
+        spider.join()