engines.py 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112
  1. from abc import abstractmethod, ABCMeta
  2. from common.execptions import QccError
  3. from constants import (
  4. ORGANIZATION,
  5. KEYWORD
  6. )
  7. from crawler.analysis import parse_urls
  8. from crawler.download import Downloader
  9. from crawler.utils import join_url, html2element
  10. from settings import ENGINE_FEATURE_RETRIEVES
  11. class JySearchEngine(Downloader, metaclass=ABCMeta):
  12. def __init__(self):
  13. super(JySearchEngine, self).__init__()
  14. @staticmethod
  15. def rubbish_url(url: str):
  16. for feature in ENGINE_FEATURE_RETRIEVES:
  17. if feature in url:
  18. return True
  19. return False
  20. @abstractmethod
  21. def search(self, keyword: str, page: int):
  22. raise NotImplementedError
  23. class BingSearchEngine(JySearchEngine):
  24. site = "https://cn.bing.com/"
  25. usage = KEYWORD
  26. def __init__(self):
  27. super(BingSearchEngine, self).__init__()
  28. self.headers = {
  29. "authority": "cn.bing.com",
  30. "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
  31. "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
  32. "cache-control": "no-cache",
  33. "pragma": "no-cache",
  34. "referer": "https://cn.bing.com/?scope=web",
  35. "upgrade-insecure-requests": "1",
  36. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"
  37. }
  38. def downloader(self, url, **kwargs):
  39. response = self.get(url, headers=self.headers, **kwargs)
  40. return response
  41. def parser(self, response):
  42. urls = []
  43. if response.status_code == 200:
  44. urls = parse_urls(response.text, url=self.site, mode=2)
  45. return urls
  46. def percolator(self, urls, retrieve_urls):
  47. """url过滤器"""
  48. for url in urls:
  49. if not self.rubbish_url(url) and url not in retrieve_urls:
  50. retrieve_urls.append(url)
  51. def search(self, keyword, page):
  52. retrieve_urls = []
  53. base_url = "https://cn.bing.com/search"
  54. first = (page - 1) * 5 + 1
  55. params = {
  56. "q": 'intitle:{}'.format(keyword),
  57. "first": first,
  58. 'FORM': 'PERE',
  59. 'pq': 'intitle:'
  60. }
  61. url = join_url(base_url, params)
  62. # 下载
  63. response = self.downloader(url)
  64. # 解析
  65. urls = self.parser(response)
  66. # 过滤
  67. self.percolator(urls, retrieve_urls)
  68. return retrieve_urls
  69. class QccSearchEngine(JySearchEngine):
  70. site = "https://www.qcc.com/"
  71. usage = ORGANIZATION
  72. def search(self, name: str, page=None):
  73. site = '-'
  74. headers = {
  75. "authority": "www.qcc.com",
  76. "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
  77. "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
  78. "cache-control": "no-cache",
  79. "pragma": "no-cache",
  80. "upgrade-insecure-requests": "1",
  81. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
  82. }
  83. cookies = {
  84. "CNZZDATA1254842228": "1319079473-1648702017-https%253A%252F%252Fwww.google.com%252F%7C1650962497",
  85. }
  86. url = "https://www.qcc.com/web/search"
  87. params = {"key": name.strip()}
  88. response = self.get(url, headers=headers, cookies=cookies, params=params)
  89. if response.status_code != 200:
  90. raise QccError(reason='企查查搜索接口调用失败', code=response.status_code)
  91. element = html2element(response.text)
  92. nodes = element.xpath('//table[@class="ntable ntable-list"]//tr[1]/td[3]/div[1]/div[4]/div[2]/span[3]/span/child::*')
  93. if len(nodes) > 0:
  94. sub_node = nodes[0]
  95. site = "".join("".join(sub_node.xpath('./text()')).split())
  96. return site