lizongze 1 year ago
parent
commit
7cc907fc7b

+ 21 - 21
FworkSpider/feapder/templates/njpc_detail_template.tmpl

@@ -16,6 +16,7 @@ from items.njpc_item import DataNjpcItem
 from untils.attachment import AttachmentDownloader as AD
 from untils.attachment_res import AttachmentDownloader as ADres
 from lxml.html import fromstring
+from untils.tools import remove_htmldata, extract_file_type
 from feapder.utils.log import log
 
 
@@ -24,39 +25,42 @@ redis_key = "njpc_details"
 
 # Download attachments for proposed-construction (njpc) spiders
 def njpc_get_files(html,file_type="",s_key="http",proxies=False):
+
+    def parse_filetype(response, filetypes):
+        val = response.headers.get("content-disposition") or ""
+        filetype = val.split('.')[-1].replace('"', '').replace("'", "")
+        filetypes.append(filetype)
+
     root = fromstring(html)
     file_info = root.xpath('//a[@href]')
     if file_info:
         attachments = {}
         for info in file_info:
             file_url = "".join(info.xpath('./@href'))
-            file_types = ['zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'jpg',
-                          'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg', 'wps']
+            file_types = ['zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'hzzbs',
+                          'jpg', 'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg']
             file_name = "".join(info.xpath('./@title') or info.xpath('.//text()'))
             if file_type.lower() == "res":
-                file_type_name = "content-disposition"
-                get_file_type = '''
-file_type = file_type_txt.split('.')[-1].replace('"','').replace("'","")
-file_types.append(file_type)
-'''
                 if s_key in file_url and file_name:
                     file_name = file_name.strip()
-                    attachment = ADres().fetch_attachment(get_file_type=get_file_type,file_type_name=file_type_name,
-                                    proxies=proxies,file_name=file_name,download_url=file_url,enable_proxy=False,)
+                    attachment = ADres().fetch_attachment(
+                        file_name=file_name,
+                        download_url=file_url,
+                        callback=parse_filetype,
+                        proxies=proxies,
+                    )
                     attachments[str(len(attachments) + 1)] = attachment
             else:
                 if file_type.lower() in file_types:
                     file_tp = file_type
                 else:
-                    file_tp = file_url.split(".")[-1].lower()
-                    if file_tp not in file_types and file_name:
-                        file_tp = file_name.strip().split(".")[-1].lower()
+                    file_tp = extract_file_type(file_name,file_url,[file_type])
 
-                if file_tp in file_types and s_key in file_url and file_name:
+                if file_tp and s_key in file_url and file_name:
                     file_name = file_name.strip()
                     attachment = AD().fetch_attachment(
                         file_name=file_name, file_type=file_tp, download_url=file_url,
-                        enable_proxy=False, proxies=proxies)
+                        proxies=proxies)
                     attachments[str(len(attachments) + 1)] = attachment
         return attachments
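The inline extension sniffing removed above is replaced by extract_file_type
from untils.tools, whose implementation is not part of this diff. Judging only
from the call site extract_file_type(file_name, file_url, [file_type]) and the
old logic it replaces, a minimal sketch of such a helper might look like this
(the whitelist contents and the name-before-url lookup order are assumptions):

    FILE_TYPES = {'zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'hzzbs',
                  'jpg', 'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg'}

    def extract_file_type(file_name, file_url, extra_types=None):
        """Return a recognized extension from the name or url, else None."""
        allowed = FILE_TYPES | {t.lower() for t in (extra_types or []) if t}
        for source in (file_name, file_url):
            suffix = (source or "").strip().split(".")[-1].lower()
            if suffix in allowed:
                return suffix
        return None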
 
@@ -64,7 +68,7 @@ file_types.append(file_type)
 class Details(feapder.PlanToBuildDetailSpider):
 
     def start_requests(self):
-        data_list = self.get_tasks_by_rabbitmq(limit=1)
+        data_list = self.get_tasks_by_rabbitmq(limit=100)
         for item in data_list:
             log.debug(item)
             request_params = item.get("request_params")
@@ -72,21 +76,17 @@ class Details(feapder.PlanToBuildDetailSpider):
             is_join_html = item.get("is_join_html")      # whether to join the body html via the xpath results
             extra_html = item.get("extra_html")          # fragments of invalid content to filter out
             title_xpath = item.get("title_xpath")        # detail-page (third-level) title
-            render = item.get("render") or False         # whether to enable browser rendering
-            render_time = item.get("render_time") or 3   # browser render time
             extra_activity = item.get("extra_activity")  # extra custom actions
             file_params = item.get("file_params")        # attachment download settings
             if item.get("proxies"):
                 yield feapder.Request(url=item.get("parser_url"), item=item, deal_detail=item.get("deal_detail"),
                                       is_join_html=is_join_html, extra_html=extra_html,title_xpath=title_xpath,
-                                      callback=item.get("parser"), render=render, render_time=render_time,
-                                      file_params=file_params,
+                                      callback=item.get("parser"), file_params=file_params,
                                       extra_activity=extra_activity, timeout=timeout, **request_params)
             else:
                 yield feapder.Request(url=item.get("parser_url"), item=item,deal_detail=item.get("deal_detail"),
                                       is_join_html=is_join_html, extra_html=extra_html,title_xpath=title_xpath,
-                                      callback=item.get("parser"), render=render, render_time=render_time,
-                                      file_params=file_params,
+                                      callback=item.get("parser"), file_params=file_params,
                                       extra_activity=extra_activity, proxies=False, timeout=timeout, **request_params)
 
     def detail_get(self,request,response):
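The commit also drops the exec-style get_file_type code string in favor of a
plain callback: ADres().fetch_attachment(...) now receives parse_filetype and,
presumably, invokes it as callback(response, filetypes) once the download
response is in hand, letting the closure append the extension parsed from the
content-disposition header. A standalone check of that parsing logic
(FakeResponse is a stand-in for the real response object, for illustration
only):

    class FakeResponse:
        headers = {"content-disposition": 'attachment; filename="report.pdf"'}

    filetypes = []
    val = FakeResponse.headers.get("content-disposition") or ""
    filetype = val.split('.')[-1].replace('"', '').replace("'", "")
    filetypes.append(filetype)
    assert filetypes == ["pdf"]

Note that the parse yields an empty string when the header is missing or has
no dot-suffix, so the downloader should treat a falsy extension as unknown.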

+ 9 - 11
FworkSpider/feapder/templates/spider_list_template.tmpl

@@ -18,22 +18,22 @@ class ${spider_name}(feapder.BiddingListSpider):
 
     def start_callback(self):
 
-         self.site = ""
+        self.site = ""
 
-         #               --- --- crawl_page is required and must be a plain integer (int) --- ---
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
+        #               --- --- crawl_page is required and must be a plain integer (int) --- ---
+        Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
 
-         self.menus = [
+        self.menus = [
              Menu('${spider_name} channel to crawl', '${spider_name} spider code', "custom params", 1),
              Menu('${spider_name} channel to crawl', '${spider_name} spider code', "custom params", 1),
-         ]
+        ]
 
-         self.headers = {}
+        self.headers = {}
 
     def start_requests(self):
-         for menu in self.menus:
-             start_url = ''
-             yield feapder.Request(url=start_url,item=menu._asdict(),page=1,proxies=False)
+        for menu in self.menus:
+            start_url = ''
+            yield feapder.Request(url=start_url,item=menu._asdict(),page=1,proxies=False)
 
     def download_midware(self, request):
         page = request.page
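(The hunk above is purely cosmetic: the 9-space body indentation is corrected
to the standard 8 spaces.) For context, each Menu namedtuple is serialized
onto the request via menu._asdict(); a quick illustration with placeholder
values:

    from collections import namedtuple

    Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
    menu = Menu('demo channel', 'demo_spider_code', 'custom', 1)
    print(menu._asdict())
    # {'channel': 'demo channel', 'code': 'demo_spider_code',
    #  'types': 'custom', 'crawl_page': 1}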
@@ -75,8 +75,6 @@ class ${spider_name}(feapder.BiddingListSpider):
                 "list_xpath":'//div[@class="***"]//a[@href]',
                 "url_xpath":'./@href',
                 "name_xpath":'./text()',
-                "files_type":('zip','docx','ftp','pdf','doc','rar','gzzb',
-                              'jpg','png','zbid','xls','xlsx','swp','dwg'), # 需要下载的附件类型
                 #"file_type":'pdf',                  # 默认的附件类型,用于url中未带附件类型的
                 "url_key":'http',                    # 用于区别连接是否为正常附件连接的url关键词 必须携带,如无可填http
                 "host":'',                           # 需要拼接url的host