# -*- coding: utf-8 -*-
"""
Created on 2021-12-13 13:25:15
---------
@summary: Detail-page spider for the details_ztlbw tasks stored in MongoDB; pages are fetched with browser rendering.
---------
@author: 马国鹏
"""
from urllib.parse import unquote

import feapder
from feapder.utils.tools import wechat_warning
from items.spider_item import DataBakItem, MgpListItem
from feapder.db.mongodb import MongoDB
from login_pool.zglbw import ZglbwPool
from untils.attachment import AttachmentDownloader


class FirefoxDetails(feapder.Spider):
    _to_db = None
    db_name = 'mgp_list'
    send_list = []

    # MongoDB connection, created lazily on first use
    @property
    def to_db(self):
        if not self._to_db:
            self._to_db = MongoDB()
        return self._to_db
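
    # Pull the most recent pending task for this parser from the `mgp_list`
    # collection, attach a login cookie from the zglbw cookie pool and issue a
    # browser-rendered request for the detail page; the task document is
    # removed from MongoDB once the request has been queued.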
    def start_requests(self):
        while True:
            data_list = self.to_db.find(
                self.db_name,
                {"parser_name": "details_ztlbw", "item.spidercode": "a_ztlbsww_jzxtp"},
                sort={"date": -1}, limit=1)
            print(data_list)
            for item in data_list:
                url = item.get("parse_url")
                # url = "https://eproport.crecgec.com/#/notice/notice-detail?projectId=1484412339522916354&tenantId=1&indexnumber=0"  # fixed test URL, left disabled
                cookie_pool = ZglbwPool(table_userbase='zglbw', redis_key='zglbw')
                cookie = cookie_pool.get_cookie().cookie
                yield feapder.Request(url=url, item=item.get("item"),
                                      callback=self.detail_get, base_info=item, render=True,
                                      render_time=3, proxies=False, cookies=cookie)
                self.to_db.delete(self.db_name, item)
            break
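
    # Copy the stored list-page fields into a DataBakItem, stitch the detail
    # HTML together from several candidate page sections, then download the
    # PDF embedded in the page's <iframe> and attach it to the item.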
    def detail_get(self, request, response):
        items = request.item
        # print(items)
        list_item = DataBakItem()
        for key in items:
            list_item.__setitem__(key, items[key])
        html = ''
        xpath_list = ['//div[@class="ant-col ant-col-xs-6 ant-col-sm-6 ant-col-lg-12"][1]',
                      '//div[@class="luban-bid-details ant-row ng-star-inserted"][2]',
                      '//div[@class="login ng-star-inserted"]']
        for xpath in xpath_list:
            html_one = response.xpath(xpath).extract_first()
            if html_one is not None:
                html += '\n'        # bid detail content
                html += html_one    # concatenate the HTML fragments
        print(html)
        list_item.contenthtml = html
        iframe_src = response.xpath("//iframe/@src").extract_first()
        # The PDF address is passed as a percent-encoded `file=` query parameter.
        file_url = unquote(iframe_src.split("file=")[-1])
        attachments = {}
        file_name = list_item.title
        attachment = AttachmentDownloader().fetch_attachment(
            file_name=file_name, file_type='pdf', download_url=file_url,
            enable_proxy=False)
        attachments["0"] = attachment
        list_item.projectinfo = {"attachments": attachments}
        yield list_item
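
    # Invoked by feapder once a request has exhausted its retry limit. The
    # original task is written back to MongoDB with an error tag and an
    # incremented failure counter, and a WeChat alert is sent when a
    # high-priority spider's failure count crosses the configured thresholds.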
    def failed_request(self, request, response):
        """After the request/parse retry limit is exceeded, save the original record back to MongoDB and update its failed field."""
        if response is None:
            code = 0
        else:
            code = response.status_code
        if 200 <= code < 300:
            err = 'analysis'
        elif 300 <= code < 400:
            err = 'download'
        elif 400 <= code < 500:
            err = 'download'
        elif 500 <= code:
            err = "servers"
        else:
            err = "timeout"
        mgp = MgpListItem()
        mgp.code = code
        mgp.error = err
        items = request.base_info
        for key in items:
            mgp.__setitem__(key, items[key])
        mgp.failed += 1
        if mgp.pri is None:
            mgp.pri = 0
        if mgp.pri > 5:
            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
                    # Send an alert scaled to the spider's priority.
                    info = f'''`
Your spider has tasks whose request/parse failures have exceeded <font color="#FF0000">{mgp.failed}</font> attempts.
> **Spider name:** {mgp.item.get("site")}
> **Channel name:** {mgp.item.get("channel")}
> **Spider code:** {mgp.item.get("spidercode")}
> **Spider level:** {mgp.pri}
> **Responsible administrator:** {mgp.author}
Please log in to the 剑鱼 spider management platform to view details.
`'''
                    wechat_warning(info)
                    self.send_list.append(mgp.item.get("site"))
        yield mgp
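
    # Hook that feapder runs once after the spider has finished.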
    def end_callback(self):
        print("Spider finished")
        # wechat_warning(f"<spider name> finished\nFetched {self.count} detail pages in total")

    # def download_midware(self, request):
    #     request.proxies = self.prox_pool.get()
    #     return request

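
# feapder uses redis_key as the prefix for this spider's Redis keys
# (task queue, request de-duplication, etc.).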
if __name__ == "__main__":
    FirefoxDetails(redis_key="magp:details:ztlbw").start()