
Remove obsolete legacy script templates

dongzhaorui 2 months ago
parent commit 7d419ad11a

+ 0 - 22
FworkSpider/feapder/templates/air_spider_template.tmpl

@@ -1,22 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on {DATE}
----------
-@summary:
----------
-@author: {USER}
-"""
-
-import feapder
-
-
-class ${spider_name}(feapder.AirSpider):
-    def start_requests(self):
-        yield feapder.Request("https://www.baidu.com")
-
-    def parse(self, request, response):
-        print(response)
-
-
-if __name__ == "__main__":
-    ${spider_name}().start()

+ 0 - 121
FworkSpider/feapder/templates/detail_template.tmpl

@@ -1,121 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on {DATE}
----------
-@summary:  ${spider_name}
----------
-@author: {USER}
-"""
-from urllib.parse import urljoin
-import feapder
-from items.spider_item import DataBakItem
-from untils.attachment import AttachmentDownloader
-from untils.tools import remove_htmldata, extract_file_type
-from feapder.utils.log import log
-import time
-import json
-import re
-
-
-class Details(feapder.BiddingDetailSpider):
-
-    def start_requests(self):
-        while True:
-            data_list = self.get_tasks_by_rabbitmq(limit=20)
-            for item in data_list:
-                # log.debug(item)
-                request_params = item.get("request_params")
-                timeout = request_params.get('timeout', 10)
-                request_params.pop('timeout', None)
-                if item.get("js"):
-                    eval(item.get("js"))
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-                if item.get("proxies"):
-                    yield feapder.Request(url=item.get("parse_url"), item=item, files_info=item.get("files"),
-                                          deal_detail=item.get("deal_detail"), callback=eval(item.get("parse")),
-                                          **request_params, timeout=timeout)
-                else:
-                    yield feapder.Request(url=item.get("parse_url"), item=item, files_info=item.get("files"),
-                                          deal_detail=item.get("deal_detail"), timeout=timeout,
-                                          callback=eval(item.get("parse")), proxies=False, **request_params)
-
-            break
-
-    def detail_get(self, request, response):
-
-        items = request.item
-        list_item = DataBakItem(**items)
-
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # bid detail content
-            if html is not None:
-                break
-
-        if request.to_dict.get('rm_list', None) and html:
-            rm_list = request.rm_list
-            html = remove_htmldata(rm_list, html, response)
-
-        if request.to_dict.get('title_xpath', None):
-            for sxpath in request.title_xpath:
-                title = response.xpath(sxpath).extract_first("").strip()  # detail-page title
-                if title:
-                    list_item.title = title
-                    break
-
-        list_item.contenthtml = html
-
-        if request.files_info:
-            files_info = request.files_info
-            files = response.xpath(files_info.get("list_xpath"))
-            if len(files) > 0:
-                attachments = {}
-                for index, info in enumerate(files):
-                    file_url = info.xpath(files_info.get("url_xpath")).extract_first()
-                    file_name = info.xpath(files_info.get("name_xpath")).extract_first()
-                    if not file_name:
-                        file_name = info.xpath(files_info.get("name_xpath")).extract()
-                    if file_name:
-                        file_name = "".join("".join(file_name).split()).strip()
-                        if files_info.get("host"):
-                            file_url = urljoin(files_info.get("host"), file_url)
-                        if not files_info.get("file_type"):
-                            file_type = extract_file_type(file_name, file_url)
-                        else:
-                            file_type = files_info.get("file_type")
-
-                        if request.proxies:
-                            fpx = request.proxies()
-                        else:
-                            fpx = False
-
-                        if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
-                            attachment = AttachmentDownloader().fetch_attachment(
-                                file_name=file_name, file_type=file_type, download_url=file_url,
-                                proxies=fpx, headers=request.to_dict.get('headers', None))
-                            attachments[str(len(attachments) + 1)] = attachment
-                if attachments:
-                    list_item.projectinfo = {"attachments": attachments}
-
-        yield list_item
-
-    def detail_json(self, request, response):
-        items = request.item
-        list_item = DataBakItem(**items)
-
-        exec(request.deal_detail)
-
-        yield list_item
-
-    def detail_post(self, request, response):
-        items = request.item
-        list_item = DataBakItem(**items)
-
-        exec(request.deal_detail)
-
-        yield list_item
-
-
-if __name__ == "__main__":
-    Details(redis_key="detail:normal_details").start()

+ 0 - 22
FworkSpider/feapder/templates/item_template.tmpl

@@ -1,22 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on {DATE}
----------
-@summary:
----------
-@author: {USER}
-"""
-
-from feapder import Item
-
-
-class ${item_name}Item(Item):
-    """
-    This class was generated by feapder.
-    command: feapder create -i ${table_name}.
-    """
-
-    __table_name__ = "${table_name}"
-
-    def __init__(self, *args, **kwargs):
-        ${propertys}

+ 0 - 146
FworkSpider/feapder/templates/njpc_detail_template.tmpl

@@ -1,146 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on {DATE}
----------
-@summary: generic snapshot (detail) page for planned-construction (njpc) spiders
----------
-@author: njpc_feapder
-"""
-import feapder
-import re
-import json
-import time, random
-from items.njpc_item import DataNjpcItem
-from untils.attachment import AttachmentDownloader as AD
-from untils.attachment_res import AttachmentDownloader as ADres
-from lxml.html import fromstring
-from untils.tools import remove_htmldata, extract_file_type
-from feapder.utils.log import log
-
-redis_key = "njpc_details"
-
-
-# Download attachments for planned-construction spiders
-def njpc_get_files(html, headers, file_type="", s_key="http", proxies=False):
-    def parse_filetype(response, filetypes):
-        val = response.headers.get("content-disposition")
-        filetype = val.split('.')[-1].replace('"', '').replace("'", "")
-        filetypes.append(filetype)
-
-    root = fromstring(html)
-    file_info = root.xpath('//a[@href]')
-    if file_info:
-        attachments = {}
-        for info in file_info:
-            file_url = "".join(info.xpath('./@href'))
-            file_types = ['zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'hzzbs',
-                          'jpg', 'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg']
-            file_name = "".join(info.xpath('./@title') or info.xpath('.//text()'))
-            if file_type.lower() == "res":
-                if s_key in file_url and file_name:
-                    file_name = file_name.strip()
-                    attachment = ADres().fetch_attachment(
-                        file_name=file_name,
-                        download_url=file_url,
-                        callback=parse_filetype,
-                        proxies=proxies,
-                        headers=headers,
-                    )
-                    attachments[str(len(attachments) + 1)] = attachment
-            else:
-                if file_type.lower() in file_types:
-                    file_tp = file_type
-                else:
-                    file_tp = extract_file_type(file_name, file_url, [file_type])
-
-                if file_tp and s_key in file_url and file_name:
-                    file_name = file_name.strip()
-                    attachment = AD().fetch_attachment(
-                        file_name=file_name, file_type=file_tp, download_url=file_url,
-                        proxies=proxies, headers=headers,)
-                    attachments[str(len(attachments) + 1)] = attachment
-        return attachments
-
-
-class Details(feapder.PlanToBuildDetailSpider):
-
-    def start_requests(self):
-        data_list = self.get_tasks_by_rabbitmq(limit=100)
-        for item in data_list:
-            # log.debug(item)
-            request_params = item.get("request_params")
-            timeout = request_params.get('timeout', 10)
-            request_params.pop('timeout', None)
-            is_join_html = item.get("is_join_html")  # whether to join the body HTML from multiple xpaths
-            extra_html = item.get("extra_html")  # content to strip from the body
-            title_xpath = item.get("title_xpath")  # detail-page title xpaths
-            extra_activity = item.get("extra_activity")  # extra custom actions
-            file_params = item.get("file_params")  # attachment download config
-            if item.get("proxies"):
-                yield feapder.Request(url=item.get("parser_url"), item=item, deal_detail=item.get("deal_detail"),
-                                      is_join_html=is_join_html, extra_html=extra_html, title_xpath=title_xpath,
-                                      callback=item.get("parser"), file_params=file_params,
-                                      extra_activity=extra_activity, timeout=timeout, **request_params)
-            else:
-                yield feapder.Request(url=item.get("parser_url"), item=item, deal_detail=item.get("deal_detail"),
-                                      is_join_html=is_join_html, extra_html=extra_html, title_xpath=title_xpath,
-                                      callback=item.get("parser"), file_params=file_params,
-                                      extra_activity=extra_activity, proxies=False, timeout=timeout, **request_params)
-
-    def detail_get(self, request, response):
-        items = request.item
-        data_item = DataNjpcItem(**items)
-
-        html = ''
-        for xpath in request.deal_detail:
-            htmls = response.xpath(xpath).extract_first()  # bid detail content
-            if request.is_join_html:
-                if htmls is not None:
-                    html += htmls
-            else:
-                if htmls is not None:
-                    html = htmls
-                    break
-
-        if request.title_xpath:
-            for sxpath in request.title_xpath:
-                title = response.xpath(sxpath).extract_first()  # detail-page title
-                if title:
-                    data_item.title = title.strip()
-                    if "..." in data_item.projectname or "…" in data_item.projectname:
-                        data_item.projectname = title.strip()
-                    break
-
-        try:
-            if request.extra_activity:
-                from untils.tools import njpc_fields_extract, njpc_fields_extract_special
-                exec(request.extra_activity)
-        except:
-            pass
-
-        data_item.contenthtml = remove_htmldata(request.extra_html, html, response)
-
-        fp = request.file_params or {}
-        attachments = njpc_get_files(
-            html,
-            file_type=fp.get("file_type", ""),
-            s_key=fp.get("s_key", "http"),
-            proxies=fp.get("proxies", False),
-            headers=fp.get('headers', {}),
-        )
-        if attachments:
-            data_item.projectinfo = {"attachments": attachments}
-
-        yield data_item
-
-    def detail_json(self, request, response):
-        items = request.item
-        data_item = DataNjpcItem(**items)
-
-        exec(request.deal_detail)
-
-        yield data_item
-
-
-if __name__ == '__main__':
-    Details(redis_key="detail:njpc_details").start()

+ 0 - 88
FworkSpider/feapder/templates/njpc_list_template.tmpl

@@ -1,88 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on {DATE}
----------
-@summary: ${spider_name}
----------
-@author: {USER}
-"""
-import feapder
-from items.njpc_item import NjpcListItem
-from collections import namedtuple
-import time, random
-
-
-class Njpc_Feapder(feapder.PlanToBuildListSpider):
-
-    def start_callback(self):
-
-        self.site = ""
-
-        #               --- --- crawl_page is required and must be a plain integer (int) --- ---
-        Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-
-        self.menus = [
-            Menu('${spider_name} crawl channel', '${spider_name} spider code', "custom parameter", 1),
-            Menu('${spider_name} crawl channel', '${spider_name} spider code', "custom parameter", 1),
-        ]
-
-        self.headers = {}
-
-    def start_requests(self):
-        start_url = ''
-        for menu in self.menus:
-            yield feapder.Request(url=start_url, item=menu._asdict(), page=1, proxies=False)
-
-    def download_midware(self, request):
-        page = request.page
-        request.headers = self.headers
-
-    def parse(self, request, response):
-        menu = request.item
-        info_list = response.xpath('')  # list of HTML nodes
-        for info in info_list:
-            detail_href = info.xpath('').extract_first().strip()
-            projectname = info.xpath('').extract_first().strip()
-            publish_time = info.xpath('').extract_first().strip()
-
-            area = ""  # province
-            city = ""  # city
-            district = ""  # district/county
-
-            data_item = NjpcListItem()          # item pipeline for storing data
-            data_item.unique_key = ("href", publish_time)  # dedup key
-            data_item.channel = menu.get("channel")  # crawl channel defined above (set by the editor)
-            data_item.spidercode = menu.get("code")  # spider code defined above (set by the editor)
-            data_item.projectname = projectname      # project name
-            data_item.publishtime = publish_time     # publish time
-
-            data_item.site = self.site
-            data_item.area = area or "全国"                   # province, defaults to "全国" (nationwide)
-            data_item.city = city                            # city, empty by default
-            data_item.district = district                    # district, empty by default
-            data_item.parser_url = detail_href               # detail page data URL
-            data_item.href = detail_href                     # detail page link
-            data_item.request_params = {"headers": self.headers}
-            data_item.parser = "detail_get"                  # callback method used by the snapshot (detail) spider
-            data_item.deal_detail = ['//div[@class="***"]']  # body extraction xpath rules
-
-            # data_item.proxies = True               # whether to enable a proxy for the snapshot page
-            # data_item.is_join_html = True          # whether to join the body HTML from multiple xpaths
-            # data_item.extra_html = []              # invalid data to strip from the body (list of xpaths or literal content)
-            # data_item.title_xpath = []             # detail-page title xpath list
-            # data_item.file_params = {"file_type":"", "s_key":"http", "proxies":False}
-                                                     # attachment download config
-            # data_item.render = True                # whether to enable browser rendering
-            # data_item.render_time = 3              # render time (seconds)
-            # data_item.extra_activity = '''***'''   # extra custom actions (statements flush-left inside the triple quotes)
-
-            yield data_item
-
-        # pagination
-        time.sleep(random.randint(2, 5))
-        request = self.infinite_pages(request, response)
-        yield request
-
-
-if __name__ == "__main__":
-    Njpc_Feapder(redis_key="detail:njpc_details").start()

+ 0 - 88
FworkSpider/feapder/templates/spider_list_template.tmpl

@@ -1,88 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on {DATE}
----------
-@summary: ${spider_name}
----------
-@author: {USER}
-"""
-import feapder
-from items.spider_item import BidingListItem
-from collections import namedtuple
-
-
-class ${spider_name}(feapder.BiddingListSpider):
-
-    def start_callback(self):
-
-        self.site = ""
-
-        #               --- --- crawl_page is required and must be a plain integer (int) --- ---
-        Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-
-        self.menus = [
-            Menu('${spider_name} crawl channel', '${spider_name} spider code', "custom parameter", 1),
-            Menu('${spider_name} crawl channel', '${spider_name} spider code', "custom parameter", 1),
-        ]
-
-        self.headers = {}
-
-    def start_requests(self):
-        for menu in self.menus:
-            start_url = ''
-            yield feapder.Request(url=start_url, item=menu._asdict(), page=1, proxies=False)
-
-    def download_midware(self, request):
-        page = request.page
-        request.headers = self.headers
-
-    def parse(self, request, response):
-
-        menu = request.item
-        info_list = response.xpath('')  # list of HTML nodes
-        for info in info_list:
-            href = info.xpath('').extract_first().strip()
-            title = info.xpath('').extract_first().strip()
-            publish_time = info.xpath('').extract_first().strip()
-
-            area = ""  # province
-            city = ""  # city
-            district = ""  # district/county
-
-            list_item = BidingListItem()     # item pipeline for storing data
-            list_item.href = href            # bid detail link
-            list_item.channel = menu.get("channel")  # crawl channel defined above (set by the editor)
-            list_item.spidercode = menu.get("code")  # spider code defined above (set by the editor)
-            list_item.title = title                  # title
-            list_item.publishtime = publish_time     # bid publish time
-            list_item.site = self.site
-            list_item.area = area or "全国"  # province, defaults to "全国" (nationwide)
-            list_item.city = city           # city, empty by default
-            list_item.district = district   # district, empty by default
-
-            list_item.unique_key = ('href',)
-            list_item.parse = "self.detail_get"        # detail page callback method
-            list_item.deal_detail = ['//div[@class="****"]']  # body extraction xpath
-            list_item.proxies = False
-            list_item.parse_url = href                 # detail page request URL
-            # list_item.is_delay = 1                   # delayed-push flag
-            # list_item.if_es = 1                      # ES lookup flag
-
-            list_item.files = {                       # attachment collection rules
-                "list_xpath": '//div[@class="***"]//a[@href]',
-                "url_xpath": './@href',
-                "name_xpath": './text()',
-                # "file_type":'pdf',                  # default attachment type, used when the URL carries no file extension
-                "url_key": 'http',    # URL keyword that marks a valid attachment link; required, use 'http' if none
-                "host": '',           # host to prepend when joining relative attachment URLs
-            }
-
-            yield list_item
-
-        # pagination
-        request = self.infinite_pages(request, response)
-        yield request
-
-
-if __name__ == "__main__":
-    ${spider_name}(redis_key="detail:normal_details").start()

+ 0 - 108
FworkSpider/feapder/templates/spider_template.tmpl

@@ -1,108 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on {DATE}
----------
-@summary: use Selenium when the href cannot be extracted directly from the list page
----------
-@author: {USER}
-"""
-import feapder
-from items.spider_item import DataBakItem
-from feapder.network.selector import Selector
-from collections import namedtuple
-import time
-
-
-class ${spider_name}(feapder.BiddingListSpider):
-
-    def start_callback(self):
-
-        self.site = ""
-
-        #   --- --- crawl_page is required and must be a plain integer (int) --- ---
-        Menu = namedtuple('Menu', ['channel', 'code', 'crawl_page'])
-
-        self.menus = [
-            Menu('${spider_name} crawl channel', '${spider_name} spider code', 1),
-        ]
-
-        self.headers = {}
-
-    def start_requests(self):
-        for menu in self.menus:
-            start_url = ''
-            yield feapder.Request(url=start_url, item=menu._asdict(), page=1,
-                                  render=True, render_time=3, proxies=False)
-
-    def download_midware(self, request):
-        page = request.page
-        request.headers = self.headers
-
-    def parse(self, request, response):
-        driver = response.browser
-        menu = request.item
-        info_list = response.xpath('')
-        for info in info_list:
-            # href = info.xpath('').extract_first().strip()
-            title = info.xpath('').extract_first().strip()
-            publish_time = info.xpath('').extract_first().strip()
-
-            area = ""  # province
-            city = ""  # city
-            district = ""  # district/county
-
-            try:
-                next_page = driver.find_element_by_xpath(f'//a[contains(text(),"{title}")]')
-            except:
-                try:
-                    next_page = driver.find_element_by_xpath(f'//a[contains(text(),"{title[:10]}")]')  # title too long; match a prefix
-                except:
-                    continue
-
-            driver.execute_script("arguments[0].click();", next_page)  # click via JS
-            time.sleep(3)
-
-            # a new window opens after clicking the detail-page title
-            # handles = driver.window_handles
-            # driver.switch_to.window(handles[-1])
-
-            href = driver.current_url
-
-            data_item = DataBakItem()         # item pipeline for storing data
-            data_item.href = href             # bid detail link
-            data_item.unique_key = ('title', 'href')  # dedup key
-            data_item.channel = menu.get("channel")   # crawl channel defined above (set by the editor)
-            data_item.spidercode = menu.get("code")   # spider code defined above (set by the editor)
-            data_item.title = title                   # title
-            data_item.publishtime = publish_time      # bid publish time
-            data_item.site = self.site
-            data_item.area = area or "全国"  # province, defaults to "全国" (nationwide)
-            data_item.city = city           # city, empty by default
-            data_item.district = district   # district, empty by default
-
-            detail_html = Selector(text=driver.page_source)
-            html = ""
-            dx_list = ['//div[@class="***"]', ]
-            for dx in dx_list:
-                html = detail_html.xpath(dx).extract_first()
-                if html:
-                    break
-
-            data_item.contenthtml = html
-
-            # (for separate windows) switch back to the main window
-            # driver.close()
-            # driver.switch_to.window(handles[0])
-
-            driver.back()
-            time.sleep(3)
-
-            yield data_item
-
-        # pagination
-        request = self.infinite_pages(request, response)
-        yield request
-
-
-if __name__ == "__main__":
-    ${spider_name}(redis_key="{USER}:${spider_name}").start()