|
@@ -0,0 +1,92 @@
|
|
|
+from abc import abstractmethod, ABCMeta
|
|
|
+
|
|
|
+from crawler.analysis import parse_urls
|
|
|
+from crawler.download import Downloader
|
|
|
+from crawler.qcc import QccService
|
|
|
+from crawler.utils import get_url
|
|
|
+from settings import ENGINE_FEATURE_RETRIEVES
|
|
|
+
|
|
|
+
|
|
|
class JySearchEngine(Downloader, metaclass=ABCMeta):
    """Abstract base class for search-engine crawlers.

    Extends ``Downloader`` and holds a ``QccService`` instance on
    ``self.qcc``. Concrete engines must implement ``search``,
    ``downloader``, ``parser`` and ``percolator``.
    """

    def __init__(self):
        super().__init__()
        self.qcc = QccService()

    @staticmethod
    def rubbish_url(url: str):
        """Return True if *url* contains any ENGINE_FEATURE_RETRIEVES entry.

        Each entry is tested with the ``in`` operator against *url*;
        False is returned when none of them match.
        """
        return any(feature in url for feature in ENGINE_FEATURE_RETRIEVES)

    @abstractmethod
    def search(self, keyword: str, page: int):
        """Run a search for *keyword* on result page *page*.

        Must be overridden by subclasses.
        """
        raise NotImplementedError

    def by_org_get_site(self, name: str):
        """Return the result of ``self.qcc.get_site(name)``."""
        return self.qcc.get_site(name)

    @abstractmethod
    def downloader(self, url: str, **kwargs):
        """Fetch *url*; extra keyword arguments are implementation-defined.

        Must be overridden by subclasses.
        """
        raise NotImplementedError

    @abstractmethod
    def parser(self, response):
        """Extract data from a downloaded *response*.

        Must be overridden by subclasses.
        """
        raise NotImplementedError

    @abstractmethod
    def percolator(self, urls: list, retrieve_urls: list):
        """Filter *urls* with respect to *retrieve_urls*.

        Must be overridden by subclasses.
        """
        raise NotImplementedError
|
|
|
+
|
|
|
+
|
|
|
class BingSearchEngine(JySearchEngine):
    """Search engine implementation that queries cn.bing.com."""

    # Passed to parse_urls() as its second argument when parsing responses.
    site = "https://cn.bing.com/"

    def __init__(self):
        super().__init__()
        # Request headers sent with every download issued by this engine.
        self.headers = {
            "authority": "cn.bing.com",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "referer": "https://cn.bing.com/?scope=web",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"
        }

    def downloader(self, url, **kwargs):
        """Return ``self.get(url, ...)`` called with this engine's headers.

        Any extra keyword arguments are forwarded to ``self.get``.
        """
        return self.get(url, headers=self.headers, **kwargs)

    def parser(self, response):
        """Return the URLs parsed from *response*.

        An empty list is returned when the status code is not 200;
        otherwise the result of ``parse_urls(response.text, self.site)``.
        """
        if response.status_code != 200:
            return []
        return parse_urls(response.text, self.site)

    def percolator(self, urls, retrieve_urls):
        """URL filter.

        Appends to *retrieve_urls* (in place) every entry of *urls* that
        is not flagged by ``rubbish_url`` and is not already present.
        """
        for candidate in urls:
            if self.rubbish_url(candidate) or candidate in retrieve_urls:
                continue
            retrieve_urls.append(candidate)

    def search(self, keyword, page):
        """Search Bing for *keyword* and return the filtered result URLs.

        *page* is converted to the ``first`` request parameter as
        ``(page - 1) * 10 + 1``.
        """
        offset = (page - 1) * 10 + 1
        query = 'intitle:{}'.format(keyword)
        params = {
            "q": query,
            "first": offset,
            'FORM': 'PERE',
            'pq': query
        }
        target = get_url("https://cn.bing.com/search", params)
        # Download
        response = self.downloader(target)
        # Parse
        found = self.parser(response)
        # Filter
        collected = []
        self.percolator(found, collected)
        return collected
|