# search_engine.py
  1. from abc import abstractmethod, ABCMeta
  2. from crawler.analysis import parse_urls
  3. from crawler.constants import MGO_VISIT
  4. from crawler.download import Downloader
  5. class JySearchEngine(Downloader, metaclass=ABCMeta):
  6. def __init__(self):
  7. self.mgo_instance = MGO_VISIT
  8. @abstractmethod
  9. def search(self, keyword: str):
  10. raise NotImplementedError
  11. class BaiDuSearchEngine(JySearchEngine):
  12. def search(self, keyword: str):
  13. pass
  14. class BingSearchEngine(JySearchEngine):
  15. def search(self, keyword: str):
  16. urls = []
  17. headers = {
  18. "authority": "cn.bing.com",
  19. "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
  20. "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
  21. "cache-control": "no-cache",
  22. "pragma": "no-cache",
  23. "referer": "https://cn.bing.com/?scope=web",
  24. "upgrade-insecure-requests": "1",
  25. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"
  26. }
  27. url = "https://cn.bing.com/search"
  28. params = {
  29. "q": keyword,
  30. }
  31. response = self.get(url, headers=headers, params=params)
  32. response.encoding = response.apparent_encoding
  33. if response.status_code == 200:
  34. urls = parse_urls(response.text, 'https://cn.bing.com/')
  35. return urls
# Manual smoke test, kept disabled:
# if __name__ == '__main__':
#     b = BingSearchEngine()
#     for i in b.search('招标'):
#         print(i)