engines.py

from abc import abstractmethod, ABCMeta

from crawler.analysis import parse_urls
from crawler.download import Downloader
from crawler.qcc import QccService
from crawler.utils import get_url
from settings import ENGINE_FEATURE_RETRIEVES


class JySearchEngine(Downloader, metaclass=ABCMeta):
    """Abstract base class for search engines built on top of Downloader."""

    def __init__(self):
        super(JySearchEngine, self).__init__()
        self.qcc = QccService()

    @staticmethod
    def rubbish_url(url: str):
        """Return True if the URL contains any blacklisted feature substring."""
        for feature in ENGINE_FEATURE_RETRIEVES:
            if feature in url:
                return True
        return False

    @abstractmethod
    def search(self, keyword: str, page: int):
        raise NotImplementedError

    def by_org_get_site(self, name: str):
        """Look up an organization's site via the QCC service."""
        return self.qcc.get_site(name)

    @abstractmethod
    def downloader(self, url: str, **kwargs):
        raise NotImplementedError

    @abstractmethod
    def parser(self, response):
        raise NotImplementedError

    @abstractmethod
    def percolator(self, urls: list, retrieve_urls: list):
        raise NotImplementedError


class BingSearchEngine(JySearchEngine):
    site = "https://cn.bing.com/"

    def __init__(self):
        super(BingSearchEngine, self).__init__()
        self.headers = {
            "authority": "cn.bing.com",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "referer": "https://cn.bing.com/?scope=web",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"
        }

    def downloader(self, url, **kwargs):
        response = self.get(url, headers=self.headers, **kwargs)
        return response

    def parser(self, response):
        urls = []
        if response.status_code == 200:
            urls = parse_urls(response.text, self.site)
        return urls

    def percolator(self, urls, retrieve_urls):
        """URL filter: keep URLs that are neither blacklisted nor already collected."""
        for url in urls:
            if not self.rubbish_url(url) and url not in retrieve_urls:
                retrieve_urls.append(url)

    def search(self, keyword, page):
        retrieve_urls = []
        base_url = "https://cn.bing.com/search"
        first = (page - 1) * 10 + 1
        params = {
            "q": "intitle:{}".format(keyword),
            "first": first,
            "FORM": "PERE",
            "pq": "intitle:{}".format(keyword)
        }
        url = get_url(base_url, params)
        # Download
        response = self.downloader(url)
        # Parse
        urls = self.parser(response)
        # Filter
        self.percolator(urls, retrieve_urls)
        return retrieve_urls
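
A minimal usage sketch, assuming the crawler package and settings module shown in the imports are available on the path and that Downloader.get() performs a requests-style HTTP GET; the keyword and page number below are placeholders.

if __name__ == "__main__":
    # Instantiate the concrete Bing engine and fetch one page of results.
    engine = BingSearchEngine()
    # search() downloads the result page, parses outgoing URLs,
    # filters them through percolator(), and returns the survivors.
    results = engine.search("example keyword", page=1)
    for url in results:
        print(url)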