engines.py 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112
  1. from abc import abstractmethod, ABCMeta
  2. from common.execptions import QccError
  3. from common.tools import html2element
  4. from crawler.analysis import parse_urls
  5. from crawler.download import Downloader
  6. from constants import (
  7. ORGANIZATION,
  8. KEYWORD
  9. )
  10. from crawler.utils import get_url
  11. from settings import ENGINE_FEATURE_RETRIEVES
  class JySearchEngine(Downloader, metaclass=ABCMeta):
      """Abstract search-engine base class layered on ``Downloader``.

      Concrete engines must implement :meth:`search`.
      """

      def __init__(self):
          super().__init__()

      @staticmethod
      def rubbish_url(url: str):
          """Tell whether ``url`` contains any entry of ``ENGINE_FEATURE_RETRIEVES``.

          Returns:
              ``True`` as soon as one entry is found inside ``url``,
              ``False`` when none of them is.
          """
          return any(feature in url for feature in ENGINE_FEATURE_RETRIEVES)

      @abstractmethod
      def search(self, keyword: str, page: int):
          """Run a search for ``keyword`` on result page ``page``.

          Subclasses provide the actual implementation.
          """
          raise NotImplementedError
  class BingSearchEngine(JySearchEngine):
      """Search engine that queries cn.bing.com with an ``intitle:`` keyword."""

      site = "https://cn.bing.com/"
      usage = KEYWORD

      def __init__(self):
          super().__init__()
          # Request headers sent with every call made through ``downloader``.
          self.headers = {
              "authority": "cn.bing.com",
              "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
              "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
              "cache-control": "no-cache",
              "pragma": "no-cache",
              "referer": "https://cn.bing.com/?scope=web",
              "upgrade-insecure-requests": "1",
              "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"
          }

      def downloader(self, url, **kwargs):
          """Fetch ``url`` via ``self.get`` using this engine's headers.

          Extra keyword arguments are forwarded to ``self.get`` unchanged.
          """
          return self.get(url, headers=self.headers, **kwargs)

      def parser(self, response):
          """Extract URLs from ``response``.

          Returns:
              The result of ``parse_urls`` on the response text when the
              status code is 200, otherwise an empty list.
          """
          if response.status_code != 200:
              return []
          return parse_urls(response.text, self.site)

      def percolator(self, urls, retrieve_urls):
          """URL filter.

          Appends to ``retrieve_urls`` (in place) every item of ``urls`` that
          is not flagged by ``rubbish_url`` and is not already present.
          """
          for candidate in urls:
              if self.rubbish_url(candidate):
                  continue
              if candidate in retrieve_urls:
                  continue
              retrieve_urls.append(candidate)

      def search(self, keyword, page):
          """Search Bing for pages whose title contains ``keyword``.

          Args:
              keyword: Text placed after the ``intitle:`` operator.
              page: Result page number; converted to Bing's ``first`` offset
                  as ``(page - 1) * 10 + 1``.

          Returns:
              A list of the filtered, de-duplicated URLs found on that page.
          """
          collected = []
          endpoint = "https://cn.bing.com/search"
          offset = (page - 1) * 10 + 1
          query = 'intitle:{}'.format(keyword)
          request_url = get_url(
              endpoint,
              {
                  "q": query,
                  "first": offset,
                  'FORM': 'PERE',
                  'pq': query
              },
          )
          # Download
          page_response = self.downloader(request_url)
          # Parse
          found = self.parser(page_response)
          # Filter
          self.percolator(found, collected)
          return collected
  class QccSearchEngine(JySearchEngine):
      """Search engine that looks an organization up on www.qcc.com."""

      usage = ORGANIZATION

      def search(self, name: str, page=None):
          """Look up ``name`` on qcc.com and return the site text of the first hit.

          Args:
              name: Organization name; surrounding whitespace is stripped
                  before it is sent as the ``key`` query parameter.
              page: Not used by this engine.

          Returns:
              The text of the first node matched by the XPath below with all
              whitespace removed, or ``'-'`` when nothing matches.

          Raises:
              QccError: When the HTTP status code is not 200.
          """
          request_headers = {
              "authority": "www.qcc.com",
              "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
              "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
              "cache-control": "no-cache",
              "pragma": "no-cache",
              "upgrade-insecure-requests": "1",
              "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
          }
          request_cookies = {
              "CNZZDATA1254842228": "1319079473-1648702017-https%253A%252F%252Fwww.google.com%252F%7C1650962497",
          }
          response = self.get(
              "https://www.qcc.com/web/search",
              headers=request_headers,
              cookies=request_cookies,
              params={"key": name.strip()},
          )
          if response.status_code != 200:
              # Message text: "Qichacha search API call failed".
              raise QccError(reason='企查查搜索接口调用失败', code=response.status_code)

          document = html2element(response.text)
          matches = document.xpath('//table[@class="ntable ntable-list"]//tr[1]/td[3]/div[1]/div[4]/div[2]/span[3]/span/child::*')
          if not matches:
              # No matching node: return the placeholder value.
              return '-'

          # Join the text fragments, then drop every whitespace character.
          raw_text = "".join(matches[0].xpath('./text()'))
          return "".join(raw_text.split())