dongzhaorui 3 سال پیش
والد
کامیت
2a0bb61f12
1 فایلهای تغییر یافته به همراه 56 افزوده شده و 0 حذف شده
  1. 56 0
      find_source/crawler/search_engines.py

+ 56 - 0
find_source/crawler/search_engines.py

@@ -0,0 +1,56 @@
+from abc import abstractmethod, ABCMeta
+
+from common.analysis import parse_urls
+from crawler.constants import RBF, MGO_VISIT
+from crawler.downloader import Downloader
+
+
+class JySearchEngine(Downloader, metaclass=ABCMeta):
+    """Abstract base for keyword search engines built on ``Downloader``.
+
+    Stores the ``RBF`` and ``MGO_VISIT`` objects imported from
+    ``crawler.constants`` on the instance and leaves :meth:`search` to be
+    provided by concrete subclasses.
+    """
+
+    def __init__(self):
+        # NOTE(review): Downloader.__init__ is not invoked here -- confirm
+        # that the base class needs no initialisation of its own.
+        self.filter_instance, self.mgo_instance = RBF, MGO_VISIT
+
+    def is_exists(self, val):
+        """Return the result of the filter's own ``is_exists`` for *val*."""
+        found = self.filter_instance.is_exists(val)
+        return found
+
+    @abstractmethod
+    def search(self, keyword: str):
+        """Search for *keyword*; concrete engines must override this."""
+        raise NotImplementedError
+
+
+class BaiDuSearchEngine(JySearchEngine):
+    """Placeholder engine whose search is not implemented yet."""
+
+    def search(self, keyword: str):
+        """Do nothing for *keyword* and return ``None``.
+
+        NOTE(review): ``BingSearchEngine.search`` returns a list, so callers
+        that iterate the result cannot treat the two engines alike yet.
+        """
+        return None
+
+
+class BingSearchEngine(JySearchEngine):
+    """Search engine that queries ``https://cn.bing.com/search``."""
+
+    def search(self, keyword: str):
+        """Fetch the Bing result page for *keyword* and extract its URLs.
+
+        Returns whatever ``parse_urls`` yields for the page text when the
+        response status is 200, and an empty list for any other status.
+        """
+        endpoint = "https://cn.bing.com/search"
+        query = dict(q=keyword)
+        # Browser-like request headers; long values are split across adjacent
+        # literals purely for line length -- the resulting strings are
+        # unchanged.
+        request_headers = {
+            "authority": "cn.bing.com",
+            "accept": (
+                "text/html,application/xhtml+xml,application/xml;q=0.9,"
+                "image/avif,image/webp,image/apng,*/*;q=0.8,"
+                "application/signed-exchange;v=b3;q=0.9"
+            ),
+            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "cache-control": "no-cache",
+            "pragma": "no-cache",
+            "referer": "https://cn.bing.com/?scope=web",
+            "upgrade-insecure-requests": "1",
+            "user-agent": (
+                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+                "AppleWebKit/537.36 (KHTML, like Gecko) "
+                "Chrome/100.0.4896.75 Safari/537.36"
+            ),
+        }
+        page = self.get(endpoint, headers=request_headers, params=query)
+        # Decode with the detected encoding rather than the declared one.
+        page.encoding = page.apparent_encoding
+        if page.status_code != 200:
+            return []
+        return parse_urls(page.text, 'https://cn.bing.com/')
+
+
+# if __name__ == '__main__':
+#     b = BingSearchEngine()
+#     for i in b.search('招标'):
+#         print(i)