import feapder.utils.tools as tools
from feapder.utils.log import log
from items.base_item import SwordFishProjectItem
from untils.check_data import CheckData
from untils.tools import (
    int2long,
    substitute,
    text_search,
)


class DataBakItem(SwordFishProjectItem):
    """Bid announcement data"""

    def __init__(self):
        super(DataBakItem, self).__init__()

        self.spidercode = ""  # Spider code (defined on the editor / spider platform)
        self.site = ""  # Site being collected (defined on the editor / spider platform)
        self.channel = ""  # Channel / section being collected (defined on the editor / spider platform)
        self.title = ""  # Article title
        self.s_title = ""  # Detail-page title (fill when available); defaults to the list-page title
        self.area = "全国"  # Province; defaults to "全国" (nationwide)
        self.city = ""  # City
        self.district = ""  # District / county
        self.publishtime = ""  # Publish time of the article (from the list page or the detail page)
        self.l_np_publishtime = ""  # Publish-time timestamp (seconds); must be stored as a long
        self.comeintime = ""  # Ingestion timestamp (seconds); must be stored as a long
        self.contenthtml = ""  # Raw HTML of the detail page
        self.detail = ""  # Text extracted from the cleaned detail-page HTML
        self.href = ""  # Detail-page URL (non-competitor site)
        self.competehref = None  # Detail-page URL (competitor site)
        self.projectinfo = None  # Attachment info; see the SwordFish (剑鱼) bidding specification
        self.iscompete = True  # New-spider flag

        self.sendflag = "false"
        self.T = "bidding"
        self.infoformat = 1

        # Default settings
        self.type = ""
        self.publishdept = ""
        self._d = "comeintime"

    def pre_to_db(self):
        if not self.s_title:
            self.s_title = self.title
            log.debug("Please check that < s_title > is correct!")

        # Ingestion timestamp (seconds), stored as a long
        self.comeintime = int2long(tools.get_current_timestamp())

        # Publish time may include a time of day or be date-only
        if ":" in self.publishtime:
            self.l_np_publishtime = int2long(tools.date_to_timestamp(self.publishtime))
        else:
            self.l_np_publishtime = int2long(tools.date_to_timestamp(self.publishtime, "%Y-%m-%d"))

        # Clean the detail-page HTML into plain text
        if not self.contenthtml:
            log.warning(f"Empty content:\n url: {self.href}\n publish time: {self.publishtime}\n title: {self.title}")
            self.save = False
        else:
            if not self.detail:
                self.detail = substitute(self.contenthtml)

            if text_search(self.detail).total == 0:
                self.sendflag = "true"  # No usable text; do not push the record to the save service

        if not self.title or not self.publishtime or not self.href:
            log.warning(f"Missing base fields:\n url: {self.href}\n publish time: {self.publishtime}\n title: {self.title}")
            self.save = False

        # Competitor-site detail-page URL field
        if not self.competehref:
            del self.competehref

        # Detail page has no attachments, so the projectinfo field is not needed
        if not self.projectinfo:
            del self.projectinfo
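
# Usage sketch (illustrative only, not part of the project API): a spider would
# normally fill in at least title, publishtime, href and contenthtml before
# yielding a DataBakItem; pre_to_db() then derives comeintime and
# l_np_publishtime, extracts detail from contenthtml, and drops the optional
# competehref / projectinfo fields when they are empty. The values below are
# made-up placeholders.
#
#   data_item = DataBakItem()
#   data_item.title = "某某项目招标公告"
#   data_item.publishtime = "2024-01-01 12:00:00"
#   data_item.href = "http://www.example.com/detail/1.html"
#   data_item.contenthtml = "<div>正文内容</div>"
#   yield data_item  # pre_to_db() is expected to run in the item pipeline before saving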


class ExamineAndApproveItem(DataBakItem):
    """Examination and approval data"""

    def __init__(self):
        super(ExamineAndApproveItem, self).__init__()

        self.table_name = "data_bak"

        self.T = "bidding"
        self.infoformat = 2


class PropertyRightItem(DataBakItem):
    """Property rights (asset transfer) data"""

    def __init__(self):
        super(PropertyRightItem, self).__init__()

        self.table_name = "data_bak"

        self.T = "bidding_other"
        self.infoformat = 3


class MgpListItem(SwordFishProjectItem):

    def __init__(self):
        super(MgpListItem, self).__init__()
        self.spidercode = ""  # Spider code (defined on the editor / spider platform)
        self.parse_url = ""  # URL the detail spider will request
        self.parser_name = ""  # Unique key the detail spider uses to pull tasks from MongoDB; naming it after spidercode is recommended
        self.parse = ""  # Name of the detail spider's parse callback method
        self.request_params = {}  # Parameters needed by the callback (render, headers, method, data, params, ...); names must match the requests argument names or they will not be recognized
        self.proxies = True  # Whether to use a proxy
        self.comeintime = int2long(tools.get_current_timestamp())  # Ingestion timestamp
        self.deal_detail = []  # List of XPath expressions for extracting the main detail-page content
        self.ex_js = ""  # JavaScript to execute (a script string, a file path, etc.)
        self.ex_python = None  # Python code to execute to build params/data; for special cases such as headers and cookies, prefer the dedicated definition style
        self.files = False  # Attachment collection settings

    @property
    def item(self) -> dict:
        return self.__dict__["item"]

    @item.setter
    def item(self, data_item: DataBakItem):
        # Store the attached DataBakItem as a plain dict
        self.__dict__["item"] = data_item.to_dict

    def pre_to_db(self):
        self.spidercode = self.item["spidercode"]

        title = self.item.get("title")
        channel = self.item["channel"]
        if CheckData.channel(channel):
            code, reason = CheckData.title(title)
            if code == 10106:
                log.warning(f"{title} -- cannot be stored, reason: {reason}")
                self.save = False
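

# ---------------------------------------------------------------------------
# Minimal usage sketch (hedged, illustrative only): how a list-page spider
# might wire a DataBakItem into an MgpListItem so a detail spider can later
# pull the task from MongoDB. Field values, the callback name and the XPath
# are made-up placeholders, not project conventions.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    data_item = DataBakItem()
    data_item.site = "示例站点"                # placeholder site name
    data_item.channel = "招标公告"             # placeholder channel name
    data_item.spidercode = "a_example_zbgg"    # placeholder spider code
    data_item.title = "某某项目招标公告"
    data_item.publishtime = "2024-01-01 12:00:00"
    data_item.href = "http://www.example.com/detail/1.html"

    list_item = MgpListItem()
    list_item.parse = "self.detail_get"        # assumed callback name on the detail spider
    list_item.parser_name = data_item.spidercode
    list_item.parse_url = data_item.href
    list_item.deal_detail = ['//div[@class="content"]']  # placeholder XPath
    list_item.item = data_item                 # stored as a dict via DataBakItem.to_dict

    print(list_item.item["title"])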