123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112 |
- from abc import abstractmethod, ABCMeta
- from common.execptions import QccError
- from constants import (
- ORGANIZATION,
- KEYWORD
- )
- from crawler.analysis import parse_urls
- from crawler.download import Downloader
- from crawler.utils import join_url, html2element
- from settings import ENGINE_FEATURE_RETRIEVES
class JySearchEngine(Downloader, metaclass=ABCMeta):
    """Abstract base class for search engines built on ``Downloader``.

    Concrete engines must implement :meth:`search`.
    """

    def __init__(self):
        super().__init__()

    @staticmethod
    def rubbish_url(url: str):
        """Tell whether ``url`` should be discarded.

        Returns:
            True if any entry of ``ENGINE_FEATURE_RETRIEVES`` occurs as a
            substring of ``url``; False otherwise.
        """
        return any(marker in url for marker in ENGINE_FEATURE_RETRIEVES)

    @abstractmethod
    def search(self, keyword: str, page: int):
        """Run a search for ``keyword`` on result page ``page``.

        Raises:
            NotImplementedError: always, unless overridden by a subclass.
        """
        raise NotImplementedError
class BingSearchEngine(JySearchEngine):
    """Search engine that queries cn.bing.com with an ``intitle:`` filter."""

    # Base URL handed to ``parse_urls`` when extracting links from a result page.
    site = "https://cn.bing.com/"
    # Usage tag of this engine (the ``KEYWORD`` constant).
    usage = KEYWORD

    def __init__(self):
        super().__init__()
        # Browser-like request headers sent with every Bing request.
        self.headers = {
            "authority": "cn.bing.com",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "referer": "https://cn.bing.com/?scope=web",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"
        }

    def downloader(self, url, **kwargs):
        """Fetch ``url`` with this engine's headers and return the response.

        Extra keyword arguments are forwarded unchanged to ``self.get``.
        """
        return self.get(url, headers=self.headers, **kwargs)

    def parser(self, response):
        """Extract URLs from ``response``.

        Returns:
            The result of ``parse_urls`` on the response text when the status
            code is 200; an empty list for any other status code.
        """
        if response.status_code != 200:
            return []
        return parse_urls(response.text, url=self.site, mode=2)

    def percolator(self, urls, retrieve_urls):
        """URL filter.

        Appends to ``retrieve_urls`` (in place) every entry of ``urls`` that
        is not flagged by ``rubbish_url`` and is not already in the list.
        """
        for candidate in urls:
            if self.rubbish_url(candidate) or candidate in retrieve_urls:
                continue
            retrieve_urls.append(candidate)

    def search(self, keyword, page):
        """Search Bing for pages whose title contains ``keyword``.

        Args:
            keyword: text placed after the ``intitle:`` operator.
            page: page number; the ``first`` query parameter is computed as
                ``(page - 1) * 5 + 1``.

        Returns:
            A list of filtered, de-duplicated URLs from the result page.
        """
        offset = (page - 1) * 5 + 1
        query = {
            "q": 'intitle:{}'.format(keyword),
            "first": offset,
            'FORM': 'PERE',
            'pq': 'intitle:'
        }
        target = join_url("https://cn.bing.com/search", query)
        # Download
        response = self.downloader(target)
        # Parse
        found = self.parser(response)
        # Filter
        collected = []
        self.percolator(found, collected)
        return collected
class QccSearchEngine(JySearchEngine):
    """Search engine that looks up an organization on www.qcc.com."""

    site = "https://www.qcc.com/"
    # Usage tag of this engine (the ``ORGANIZATION`` constant).
    usage = ORGANIZATION

    def search(self, name: str, page=None):
        """Query the qcc.com web search for ``name`` and extract one text field.

        Args:
            name: search key; surrounding whitespace is stripped before sending.
            page: accepted for interface compatibility; not used here.

        Returns:
            The text of the first node matched by the XPath below with all
            whitespace removed, or ``'-'`` when nothing matches.
            NOTE(review): presumably this field is the organization's website;
            confirm against the live page layout.

        Raises:
            QccError: if the response status code is not 200.
        """
        request_headers = {
            "authority": "www.qcc.com",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
        }
        request_cookies = {
            "CNZZDATA1254842228": "1319079473-1648702017-https%253A%252F%252Fwww.google.com%252F%7C1650962497",
        }
        response = self.get(
            "https://www.qcc.com/web/search",
            headers=request_headers,
            cookies=request_cookies,
            params={"key": name.strip()},
        )
        if response.status_code != 200:
            raise QccError(reason='企查查搜索接口调用失败', code=response.status_code)

        root = html2element(response.text)
        matches = root.xpath('//table[@class="ntable ntable-list"]//tr[1]/td[3]/div[1]/div[4]/div[2]/span[3]/span/child::*')
        if not matches:
            # Placeholder returned when the expected node is absent.
            return '-'

        # Join the node's direct text pieces, then drop every whitespace run.
        raw_text = "".join(matches[0].xpath('./text()'))
        return "".join(raw_text.split())
|