engines.py 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113
  1. from abc import abstractmethod, ABCMeta
  2. from common.execptions import QccError
  3. from common.tools import html2element
  4. from constants import (
  5. ORGANIZATION,
  6. KEYWORD
  7. )
  8. from crawler.analysis import parse_urls
  9. from crawler.download import Downloader
  10. from crawler.utils import get_url
  11. from settings import ENGINE_FEATURE_RETRIEVES
  12. class JySearchEngine(Downloader, metaclass=ABCMeta):
  13. def __init__(self):
  14. super(JySearchEngine, self).__init__()
  15. @staticmethod
  16. def rubbish_url(url: str):
  17. for feature in ENGINE_FEATURE_RETRIEVES:
  18. if feature in url:
  19. return True
  20. return False
  21. @abstractmethod
  22. def search(self, keyword: str, page: int):
  23. raise NotImplementedError
  24. class BingSearchEngine(JySearchEngine):
  25. site = "https://cn.bing.com/"
  26. usage = KEYWORD
  27. def __init__(self):
  28. super(BingSearchEngine, self).__init__()
  29. self.headers = {
  30. "authority": "cn.bing.com",
  31. "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
  32. "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
  33. "cache-control": "no-cache",
  34. "pragma": "no-cache",
  35. "referer": "https://cn.bing.com/?scope=web",
  36. "upgrade-insecure-requests": "1",
  37. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"
  38. }
  39. def downloader(self, url, **kwargs):
  40. response = self.get(url, headers=self.headers, **kwargs)
  41. return response
  42. def parser(self, response):
  43. urls = []
  44. if response.status_code == 200:
  45. urls = parse_urls(response.text, self.site)
  46. return urls
  47. def percolator(self, urls, retrieve_urls):
  48. """url过滤器"""
  49. for url in urls:
  50. if not self.rubbish_url(url) and url not in retrieve_urls:
  51. retrieve_urls.append(url)
  52. def search(self, keyword, page):
  53. retrieve_urls = []
  54. base_url = "https://cn.bing.com/search"
  55. first = (page - 1) * 10 + 1
  56. params = {
  57. "q": 'intitle:{}'.format(keyword),
  58. "first": first,
  59. 'FORM': 'PERE',
  60. 'pq': 'intitle:{}'.format(keyword)
  61. }
  62. url = get_url(base_url, params)
  63. # 下载
  64. response = self.downloader(url)
  65. # 解析
  66. urls = self.parser(response)
  67. # 过滤
  68. self.percolator(urls, retrieve_urls)
  69. return retrieve_urls
  70. class QccSearchEngine(JySearchEngine):
  71. site = "https://www.qcc.com/"
  72. usage = ORGANIZATION
  73. def search(self, name: str, page=None):
  74. site = '-'
  75. headers = {
  76. "authority": "www.qcc.com",
  77. "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
  78. "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
  79. "cache-control": "no-cache",
  80. "pragma": "no-cache",
  81. "upgrade-insecure-requests": "1",
  82. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
  83. }
  84. cookies = {
  85. "CNZZDATA1254842228": "1319079473-1648702017-https%253A%252F%252Fwww.google.com%252F%7C1650962497",
  86. }
  87. url = "https://www.qcc.com/web/search"
  88. params = {"key": name.strip()}
  89. response = self.get(url, headers=headers, cookies=cookies, params=params)
  90. if response.status_code != 200:
  91. raise QccError(reason='企查查搜索接口调用失败', code=response.status_code)
  92. element = html2element(response.text)
  93. nodes = element.xpath('//table[@class="ntable ntable-list"]//tr[1]/td[3]/div[1]/div[4]/div[2]/span[3]/span/child::*')
  94. if len(nodes) > 0:
  95. sub_node = nodes[0]
  96. site = "".join("".join(sub_node.xpath('./text()')).split())
  97. return site