فهرست منبع

Merge branch 'master' of https://jygit.jydev.jianyu360.cn/data_processing/crawlab_feader

dongzhaorui 1 سال پیش
والد
کامیت
785cf7ef0e

+ 2 - 2
FworkSpider/feapder/templates/detail_template.tmpl

@@ -12,7 +12,7 @@ from urllib.parse import urljoin
 import feapder
 from items.spider_item import DataBakItem
 from untils.attachment import AttachmentDownloader
-from untils.tools import remove_htmldata
+from untils.tools import remove_htmldata,extract_file_type
 from feapder.utils.log import log
 import time
 import json
@@ -25,7 +25,7 @@ class Details(feapder.BiddingDetailSpider):
 
     def start_requests(self):
         while True:
-            data_lsit = self.get_tasks_by_mongodb(limit=20)
+            data_lsit = self.get_tasks_by_rabbitmq(limit=20)
             for item in data_lsit:
                 log.debug(item)
                 request_params = item.get("request_params")

+ 1 - 1
FworkSpider/feapder/templates/njpc_detail_template.tmpl

@@ -64,7 +64,7 @@ file_types.append(file_type)
 class Details(feapder.PlanToBuildDetailSpider):
 
     def start_requests(self):
-        data_lsit = self.get_tasks_by_mongodb(limit=1)
+        data_lsit = self.get_tasks_by_rabbitmq(limit=1)
        for item in data_lsit:
            log.debug(item)
            request_params = item.get("request_params")

+ 2 - 3
FworkSpider/feapder/templates/spider_list_template.tmpl

@@ -65,12 +65,11 @@ class ${spider_name}(feapder.BiddingListSpider):
 
             list_item.unique_key = ('href',)
             list_item.parse = "self.detail_get"      # 详情页回调方法
-            list_item.item = data_item
             list_item.deal_detail = ['//div[@class="****"]']   # 抽取正文xpath
             list_item.proxies = False
             list_item.parse_url = href               # 详情页请求地址
-            list_item.is_delay = 1                   # 延时推送标识
-            list_item.if_es = 1                      # 查询es标识
+            # list_item.is_delay = 1                   # 延时推送标识
+            # list_item.if_es = 1                      # 查询es标识
 
             list_item.files={                        # 附件采集规则
                 "list_xpath":'//div[@class="***"]//a[@href]',