from abc import abstractmethod, ABCMeta

from common.execptions import QccError
from constants import (
    ORGANIZATION,
    KEYWORD
)
from crawler.analysis import parse_urls
from crawler.download import Downloader
from crawler.utils import join_url, html2element
from settings import ENGINE_FEATURE_RETRIEVES


class JySearchEngine(Downloader, metaclass=ABCMeta):
    """Abstract base class for external search engines.

    Inherits HTTP access (``self.get``) from ``Downloader``; concrete
    subclasses implement :meth:`search`.
    """

    def __init__(self):
        super(JySearchEngine, self).__init__()

    @staticmethod
    def rubbish_url(url: str) -> bool:
        """Return ``True`` if *url* contains any blacklisted substring.

        The blacklist comes from ``settings.ENGINE_FEATURE_RETRIEVES``.
        """
        # any() short-circuits on the first matching feature,
        # equivalent to the explicit loop-and-return-True form.
        return any(feature in url for feature in ENGINE_FEATURE_RETRIEVES)

    @abstractmethod
    def search(self, keyword: str, page: int):
        """Run one search request; must be implemented by subclasses."""
        raise NotImplementedError


class BingSearchEngine(JySearchEngine):
    """Keyword search backed by cn.bing.com."""

    site = "https://cn.bing.com/"
    usage = KEYWORD

    def __init__(self):
        super(BingSearchEngine, self).__init__()
        # Browser-like headers to avoid trivial bot filtering.
        self.headers = {
            "authority": "cn.bing.com",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "referer": "https://cn.bing.com/?scope=web",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"
        }

    def downloader(self, url, **kwargs):
        """Fetch *url* with the engine's standard headers; return the response."""
        response = self.get(url, headers=self.headers, **kwargs)
        return response

    def parser(self, response):
        """Extract result URLs from a Bing results page.

        Returns an empty list on any non-200 response.
        """
        urls = []
        if response.status_code == 200:
            urls = parse_urls(response.text, url=self.site, mode=2)
        return urls

    def percolator(self, urls, retrieve_urls):
        """URL filter: append non-rubbish, not-yet-seen URLs to *retrieve_urls*.

        Mutates *retrieve_urls* in place, preserving first-seen order.
        """
        for url in urls:
            if not self.rubbish_url(url) and url not in retrieve_urls:
                retrieve_urls.append(url)

    def search(self, keyword, page):
        """Search Bing for *keyword* (title-restricted) and return result URLs.

        :param keyword: search term; wrapped with the ``intitle:`` operator.
        :param page: 1-based page index; mapped to Bing's ``first`` offset.
        :return: de-duplicated, filtered list of result URLs.
        """
        retrieve_urls = []
        base_url = "https://cn.bing.com/search"
        # Bing paginates via an absolute 1-based result offset;
        # NOTE(review): step of 5 per page looks small for Bing — confirm intended.
        first = (page - 1) * 5 + 1
        params = {
            "q": f'intitle:{keyword}',
            "first": first,
            'FORM': 'PERE',
            'pq': 'intitle:'
        }
        url = join_url(base_url, params)
        # download
        response = self.downloader(url)
        # parse
        urls = self.parser(response)
        # filter
        self.percolator(urls, retrieve_urls)
        return retrieve_urls


class QccSearchEngine(JySearchEngine):
    """Organization lookup backed by qcc.com (企查查)."""

    site = "https://www.qcc.com/"
    usage = ORGANIZATION

    def search(self, name: str, page=None):
        """Look up an organization's website on qcc.com.

        :param name: organization name; surrounding whitespace is stripped.
        :param page: unused; kept for interface compatibility with the base class.
        :return: the organization's site string scraped from the result table,
                 or ``'-'`` when no matching node is found.
        :raises QccError: when the search endpoint returns a non-200 status.
        """
        site = '-'
        headers = {
            "authority": "www.qcc.com",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
        }
        cookies = {
            "CNZZDATA1254842228": "1319079473-1648702017-https%253A%252F%252Fwww.google.com%252F%7C1650962497",
        }
        url = "https://www.qcc.com/web/search"
        params = {"key": name.strip()}
        response = self.get(url, headers=headers, cookies=cookies, params=params)
        if response.status_code != 200:
            raise QccError(reason='企查查搜索接口调用失败', code=response.status_code)
        element = html2element(response.text)
        # Positional xpath into the first result row's detail panel;
        # brittle against qcc.com layout changes.
        nodes = element.xpath('//table[@class="ntable ntable-list"]//tr[1]/td[3]/div[1]/div[4]/div[2]/span[3]/span/child::*')
        if nodes:
            sub_node = nodes[0]
            # Join text fragments and strip ALL internal whitespace.
            site = "".join("".join(sub_node.xpath('./text()')).split())
        return site