# Source location (from page breadcrumb): data_spider/match_spider
							import re

from utils.databases import es_query
from utils.execptions import (
    CustomAccountPrivilegeError,
    CustomCheckError
)

# Public API: only the two ready-made validator instances are exported;
# the classes that implement them are not part of ``import *``.
__all__ = ['CheckText', 'CheckTask']


class CheckContent:
    """Validate the text of a crawled detail page.

    Calling an instance runs four checks in a fixed order: sensitive words,
    text length, unsupported-file notice, account privilege. The first
    failing check raises and the remaining checks are skipped.

    NOTE(review): both triggers of ``check_account_privilege`` contain an
    entry of ``sensitive_words`` ('高级会员' / '正式会员'), and the
    sensitive-word check runs first, so through ``__call__`` the privilege
    error looks unreachable -- confirm the intended order with the callers.
    """

    def __init__(self):
        # Every entry is passed to ``re.search`` as a pattern, so plain
        # words and real regular expressions live side by side here.
        self.sensitive_words = {
            '正式会员',
            '账户充值',
            'VIP会员查阅',
            '>(注册)<',
            '>(登录)<',
            '高级会员',
            '标准会员',
            '点击支付',
            '隐私政策及用户服务协议',
            '.*<a href=\"(.*?)\">点击查看内容',
        }

    @staticmethod
    def check_text_length(val: str):
        """Reject text that is empty, has no Chinese, or is too short.

        :param val: page text, possibly still containing HTML tags
        :raises CustomCheckError: code 10101 when ``val`` is empty; code
            10102 when it holds no character in the U+4E00..U+9FA5 range,
            and also code 10102 when fewer than 50 characters remain after
            cleaning
        """
        if not len(val):
            raise CustomCheckError(code=10101, reason='文本内容为空')

        if re.search(r'[\u4e00-\u9fa5]+', val, re.S) is None:
            raise CustomCheckError(code=10102, reason='不存在中文字符')

        # Strip noise: HTML tags first, then everything that is not a
        # digit, an ASCII letter or a character in the Chinese range.
        without_tags = re.sub('<[^>]+>', '', val)
        cleaned = re.sub('[^0-9a-zA-Z\u4e00-\u9fa5]+', '', without_tags)

        # Fewer than 50 remaining characters is treated as a page without
        # any real detail content.
        if len(cleaned) < 50:
            raise CustomCheckError(code=10102, reason='页面无有效内容')

    @staticmethod
    def check_content(val: str):
        """Reject pages that carry the "cannot be viewed online" notice.

        :param val: page text
        :raises CustomCheckError: code 10103 when the notice is present
        """
        if "部分文件可能不支持在线浏览" in val:
            raise CustomCheckError(code=10103, reason='文件不支持在线浏览')

    @staticmethod
    def check_account_privilege(val: str):
        """Reject pages that are only visible to higher-tier accounts.

        :param val: page text
        :raises CustomAccountPrivilegeError: when either membership marker
            occurs in ``val``
        """
        markers = ("高级会员", "本招标项目仅供正式会员查阅")
        if markers[0] in val or markers[1] in val:
            raise CustomAccountPrivilegeError

    def check_sensitive_word(self, val: str):
        """Reject text matching any pattern in ``self.sensitive_words``.

        :param val: page text
        :raises CustomCheckError: code 10104 when at least one pattern
            matches
        """
        hits = {
            pattern
            for pattern in self.sensitive_words
            if re.search(pattern, val) is not None
        }
        if hits:
            raise CustomCheckError(code=10104, reason='详情内容包含敏感词')

    def __check(self, text):
        """Run every check against ``text``; the order is significant."""
        pipeline = (
            self.check_sensitive_word,
            self.check_text_length,
            self.check_content,
            self.check_account_privilege,
        )
        for run_check in pipeline:
            run_check(text)

    def __call__(self, text: str, *args, **kwargs):
        """Validate ``text``; extra positional/keyword arguments are ignored."""
        self.__check(text)


class CheckPrePareRequest:
    """Validate a crawl task row before the detail request is made.

    Calling an instance first checks that the title contains a crawl
    keyword, then runs the ES lookup via ``es_query``.
    """

    def __init__(self):
        # Each entry is used as a ``re.search`` pattern against the title.
        self.crawl_keywords = {
            '招标', '流标', '评标', '询价', '中标候选人', '抽签', '谈判', '中选',
            '意见征询', '更正公告', '废标', '补遗', '议价', '邀请', '资格预审',
            '竞标', '变更', '遴选', '磋商', '项目', '评审', '询比', '开标', '澄清',
            '比选', '中止', '采购', '竟价', '招投标', '拟建', '成交', '中标',
            '竞争性谈判', '工程', '验收公告', '更正', '单一来源', '变更公告',
            '合同', '违规', '评判', '监理', '竞价', '答疑', '终止', '系统',
        }

    @staticmethod
    def check_es_cache(title: str, publish_time: int, rows: dict):
        """Reject the row when ``es_query`` reports an existing entry.

        :param title: title
        :param publish_time: publish-time timestamp (``l_np_publishtime``)
        :param rows: crawled record; receives the query result under the
            ``count`` key before the error is raised
        :raises CustomCheckError: code 10105 when the result of
            ``es_query`` is not 0 (presumably a hit count -- confirm
            against ``es_query``)
        """
        es_result = es_query(title, publish_time)
        if es_result == 0:
            return

        # Keep the ES query result on the record for the caller.
        rows['count'] = es_result
        raise CustomCheckError(code=10105, reason='标题内容已存在es')

    def check_crawl_title(self, title: str):
        """Require at least one crawl keyword in ``title``.

        :param title: title
        :raises CustomCheckError: code 10106 when no keyword matches
        """
        has_keyword = any(
            re.search(keyword, title) is not None
            for keyword in self.crawl_keywords
        )
        if not has_keyword:
            raise CustomCheckError(code=10106, reason='标题未检索到采集关键词', title=title)

    def __check(self, rows: dict):
        """Read both fields from ``rows`` up front, then run the checks."""
        title = rows['title']
        publish_time = rows['l_np_publishtime']
        self.check_crawl_title(title)
        self.check_es_cache(title, publish_time, rows)

    def __call__(self, rows: dict, *args, **kwargs):
        """Validate ``rows``; extra positional/keyword arguments are ignored."""
        self.__check(rows)


# Shared, ready-to-call validator instances; these are the names listed in
# ``__all__``. Use them as ``CheckText(text)`` and ``CheckTask(rows)``.
CheckText = CheckContent()
CheckTask = CheckPrePareRequest()