# search_engines.py (1.7 KB)
  1. from abc import abstractmethod, ABCMeta
  2. from common.analysis import parse_urls
  3. from crawler.constants import RBF, MGO_VISIT
  4. from crawler.downloader import Downloader
  class JySearchEngine(Downloader, metaclass=ABCMeta):
      """Abstract base class for search-engine crawlers built on ``Downloader``.

      Concrete engines must implement :meth:`search`. Every instance keeps
      references to the ``RBF`` and ``MGO_VISIT`` objects imported from
      ``crawler.constants``.
      """

      def __init__(self):
          # NOTE(review): the parent ``Downloader`` initialiser is not invoked
          # here -- confirm that skipping it is intentional.
          # Presumably a visit-record store handle -- TODO confirm.
          self.mgo_instance = MGO_VISIT
          # Presumably a de-duplication filter -- TODO confirm.
          self.filter_instance = RBF

      def is_exists(self, val):
          """Return what ``filter_instance.is_exists`` reports for ``val``."""
          seen_filter = self.filter_instance
          return seen_filter.is_exists(val)

      @abstractmethod
      def search(self, keyword: str):
          """Search for ``keyword``; every concrete engine must override this."""
          raise NotImplementedError
  class BaiDuSearchEngine(JySearchEngine):
      """Baidu search engine placeholder; the actual query is not implemented."""

      def search(self, keyword: str):
          """Return the result URLs found for ``keyword``.

          No request is sent to Baidu yet, so the result is always an empty
          list. An empty list (rather than an implicit ``None``) keeps the
          return shape consistent with ``BingSearchEngine.search``, so callers
          can iterate the result of any engine without a ``None`` check.

          :param keyword: the search term (currently unused).
          :return: an empty list.
          """
          # TODO: implement the Baidu query and URL extraction.
          return []
  class BingSearchEngine(JySearchEngine):
      """Search engine that queries ``cn.bing.com`` and extracts result URLs."""

      def search(self, keyword: str):
          """Query Bing for ``keyword`` and return the URLs found on the page.

          The request goes through the inherited ``self.get`` with
          browser-like headers. On an HTTP 200 response the page text is handed
          to ``parse_urls`` together with the Bing base URL and its result is
          returned; for any other status code an empty list is returned.

          :param keyword: the search term, sent as the ``q`` query parameter.
          :return: the value produced by ``parse_urls``, or ``[]`` on a
              non-200 response.
          """
          endpoint = "https://cn.bing.com/search"
          query = {"q": keyword}
          # Header set mimicking a desktop Chrome browser.
          request_headers = {
              "authority": "cn.bing.com",
              "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
              "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
              "cache-control": "no-cache",
              "pragma": "no-cache",
              "referer": "https://cn.bing.com/?scope=web",
              "upgrade-insecure-requests": "1",
              "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"
          }

          page = self.get(endpoint, headers=request_headers, params=query)
          # Adopt the response's own encoding guess before ``.text`` is read.
          # NOTE(review): assumes ``self.get`` returns a requests-style
          # response object (never ``None``) -- confirm against Downloader.
          page.encoding = page.apparent_encoding

          if page.status_code != 200:
              return []
          return parse_urls(page.text, 'https://cn.bing.com/')
  # Example usage (manual smoke test; the keyword '招标' means "tender / bidding"):
  # if __name__ == '__main__':
  #     b = BingSearchEngine()
  #     for i in b.search('招标'):
  #         print(i)