
Template update

lizongze 2 years ago
parent
commit
b9a23d39f0
2 changed files with 58 additions and 41 deletions
  1. FworkSpider/feapder/templates/detail_template.tmpl (+57 -40)
  2. FworkSpider/script_tools/create.py (+1 -1)

FworkSpider/feapder/templates/detail_template.tmpl (+57 -40)

@@ -6,14 +6,18 @@ Created on {DATE}
 ---------
 @author: {USER}
 """
-import re
 import sys
 sys.path.append('/app/spiders/sword_feapder/FworkSpider')
 from urllib.parse import urljoin
 import feapder
-from untils.attachment import AttachmentDownloader
 from items.spider_item import DataBakItem
+from untils.attachment import AttachmentDownloader
+from untils.tools import remove_htmldata
 from feapder.utils.log import log
+import time
+import json
+import re
+
 
 
 
@@ -21,57 +25,56 @@ class Details(feapder.BiddingDetailSpider):
 
     def start_requests(self):
         while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"${spider_name}"},limit=50)
+            data_lsit = self.to_db.find(self.db_name,{"parser_name":"${spider_name}"},sort={"item.publishtime":-1},limit=50)
             for item in data_lsit:
-                log.debug(item.get("item"))
+                log.debug(item)
                 request_params = item.get("request_params")
-                is_join_html = item.get("is_join_html")          # whether the body is joined from multiple xpaths
-                extra_html = item.get("extra_html")              # filter out invalid content
+                timeout = request_params.pop('timeout',10)
+                if item.get("js"):
+                    eval(item.get("js"))
+                if item.get("ex_python"):
+                    exec(item.get("ex_python"))
                 if item.get("proxies"):
-                    yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),files_info=item.get("files"),
-                                          deal_detail=item.get("deal_detail"),is_join_html=is_join_html,extra_html=extra_html,
-                                          callback=eval(item.get("parse")),base_info=item,**request_params)
+                    yield feapder.Request(url=item.get("parse_url"), item=item.get("item"),
+                                          files_info=item.get("files"),
+                                          deal_detail=item.get("deal_detail"),
+                                          callback=eval(item.get("parse")), base_info=item, **request_params,
+                                          timeout=timeout)
                 else:
-                    yield feapder.Request(url=item.get("parse_url"), item=item.get("item"), files_info=item.get("files"),
-                                          deal_detail=item.get("deal_detail"),is_join_html=is_join_html,extra_html=extra_html,
-                                          callback=eval(item.get("parse")), base_info=item,proxies=False,**request_params)
+                    yield feapder.Request(url=item.get("parse_url"), item=item.get("item"),
+                                          files_info=item.get("files"),
+                                          deal_detail=item.get("deal_detail"), timeout=timeout,
+                                          callback=eval(item.get("parse")), base_info=item, proxies=False,
+                                          **request_params)
+
                 self.to_db.delete(self.db_name, {"_id": item.get("_id")})
             break
 
-    def detail_get(self,request,response):
+    def detail_get(self, request, response):
 
         items = request.item
         list_item = DataBakItem()
         for key in items:
-            list_item.__setitem__(key,items[key])
+            list_item.__setitem__(key, items[key])
 
         html = ''
         for xpath in request.deal_detail:
            html = response.xpath(xpath).extract_first()  # tender detail content
-            if request.is_join_html:
-                if html is not None:
-                    html += html
-            else:
-                if html is not None:
-                    break
-
-        extra_html_info = request.extra_html
-        if html and extra_html_info:
-            for extra_item in extra_html_info:
-                if re.search('^//.*', extra_item):
-                    extra_html = response.xpath(extra_item).extract_first()
-                else:
-                    extra_html = extra_item
-                html = html.replace(extra_html,'')
+            if html is not None:
+                break
+
+        if request.to_dict.get('rm_list',None) and html:
+            rm_list = request.rm_list
+            html = remove_htmldata(rm_list,html,response)
 
         list_item.contenthtml = html
 
-        if request.files_info:      # attachment download
+        if request.files_info:
             files_info = request.files_info
             files = response.xpath(files_info.get("list_xpath"))
-            if len(files)>0:
+            if len(files) > 0:
                 attachments = {}
-                for info in files:
+                for index, info in enumerate(files):
                     file_url = info.xpath(files_info.get("url_xpath")).extract_first()
                     file_name = info.xpath(files_info.get("name_xpath")).extract_first()
                     if not file_name:
@@ -84,33 +87,47 @@ class Details(feapder.BiddingDetailSpider):
                             file_type = file_url.split("?")[0].split(".")[-1].lower()
                             if file_type not in files_info.get("files_type"):
                                 file_type = file_name.split("?")[0].split(".")[-1].lower()
+                        elif files_info.get("file_type") == "file_name":
+                            file_type = file_name.split("?")[0].split(".")[-1].lower()
                         else:
                             file_type = files_info.get("file_type")
 
+                        if request.proxies:
+                            fpx = request.proxies()
+                        else:
+                            fpx = False
+
                         if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
                             attachment = AttachmentDownloader().fetch_attachment(
-                                file_name=file_name,file_type=file_type,download_url=file_url,
-                                enable_proxy=False)
-                            attachments[str(len(attachments)+1)] = attachment
-                if len(attachments)==0:
+                                file_name=file_name, file_type=file_type, download_url=file_url,
+                                enable_proxy=False,proxies=fpx)
+                            attachments[str(len(attachments) + 1)] = attachment
+                if len(attachments) == 0:
                     pass
                 else:
-                    list_item.projectinfo={"attachments":attachments}
+                    list_item.projectinfo = {"attachments": attachments}
 
         yield list_item
 
+    def detail_json(self, request, response):
+        items = request.item
+        list_item = DataBakItem()
+        for key in items:
+            list_item.__setitem__(key, items[key])
+        exec(request.deal_detail)
 
-    def detail_json(self,request,response):
+        yield list_item
 
+    def detail_post(self, request, response):
         items = request.item
         list_item = DataBakItem()
         for key in items:
-            list_item.__setitem__(key,items[key])
-
+            list_item.__setitem__(key, items[key])
         exec(request.deal_detail)
 
         yield list_item
 
 
+
 if __name__ == "__main__":
     Details(redis_key="{USER}:${spider_name}").start()
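
Taken together, the rewritten start_requests now drives everything off fields stored on the queued document: request_params carries a per-item timeout that is popped before the Request is built, optional js / ex_python snippets are evaluated first, and rm_list replaces the old is_join_html / extra_html cleanup. Below is a minimal sketch of such a seed document; every key appears in the diff above, but the concrete values and the exact shape of the files block are illustrative assumptions, not the real production schema:

# Hypothetical seed document as start_requests appears to expect it.
# Field names come from the diff above; values are illustrative only.
seed = {
    "parser_name": "${spider_name}",                  # matched by the find() query
    "parse_url": "http://www.example.com/detail/1.html",
    "parse": "self.detail_get",                       # resolved via eval() into the callback
    "item": {"title": "...", "publishtime": "2023-01-01"},
    "deal_detail": ['//div[@class="detail-content"]'],  # xpaths tried until one matches
    "rm_list": ['//div[@class="advert"]', "some literal text"],  # handed to remove_htmldata
    "files": {
        "list_xpath": '//div[@class="attachment"]//a',
        "url_xpath": "./@href",
        "name_xpath": "./text()",
        "files_type": ["pdf", "doc", "docx"],
        "file_type": "file_name",                     # or a fixed extension string
        "url_key": "http",
    },
    "request_params": {"timeout": 30},                # timeout is popped off first
    "proxies": False,                                 # or a callable returning a proxies dict
}

Judging by the extra_html handling this commit removes, entries in rm_list that start with // are presumably applied as xpaths and everything else stripped as a literal substring; the actual behavior lives in untils.tools.remove_htmldata, which this diff does not show.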

FworkSpider/script_tools/create.py (+1 -1)

@@ -22,7 +22,7 @@ def create_spider(spider_name, spider_type):
 
 if __name__ == '__main__':
     # fire.Fire(create_spider('ztbpc_feapder', 4))
-    # fire.Fire(create_spider('ztbpc_feapder', 5))
+    # fire.Fire(create_spider('T_details', 5))
     # fire.Fire(create_spider('njpc_list', 6))
     # fire.Fire(create_spider('njpc_detail', 7))
     fire.Fire(create_spider('selenium_feapder', 2))
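
The create.py hunk only swaps one commented-out example ('ztbpc_feapder' becomes 'T_details'), but it hints at how these templates are stamped out. A hedged sketch of calling the generator directly, assuming FworkSpider is on sys.path and that spider_type 5 selects the detail template, as the surrounding commented examples suggest:

# Hypothetical direct call; the mapping of spider_type 5 to
# detail_template.tmpl is inferred from the commented examples in
# __main__ and is not confirmed by this diff.
from script_tools.create import create_spider

create_spider("T_details", 5)  # presumably renders detail_template.tmpl as a T_details spider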