dongzhaorui 3 سال پیش
والد
کامیت
6fcf088a48
1 فایل تغییر یافته به همراه 37 افزوده شده و 17 حذف شده
  1. 37 17
      find_source/crawler/engines.py

+ 37 - 17
find_source/crawler/engines.py

@@ -1,8 +1,13 @@
 from abc import abstractmethod, ABCMeta
 
+from common.execptions import QccError
+from common.tools import html2element
 from crawler.analysis import parse_urls
 from crawler.download import Downloader
-from crawler.qcc import QccService
+from constants import (
+    ORGANIZATION,
+    KEYWORD
+)
 from crawler.utils import get_url
 from settings import ENGINE_FEATURE_RETRIEVES
 
@@ -11,7 +16,6 @@ class JySearchEngine(Downloader, metaclass=ABCMeta):
 
     def __init__(self):
         super(JySearchEngine, self).__init__()
-        self.qcc = QccService()
 
     @staticmethod
     def rubbish_url(url: str):
@@ -24,24 +28,10 @@ class JySearchEngine(Downloader, metaclass=ABCMeta):
     def search(self, keyword: str, page: int):
         raise NotImplementedError
 
-    def by_org_get_site(self, name: str):
-        return self.qcc.get_site(name)
-
-    @abstractmethod
-    def downloader(self, url: str, **kwargs):
-        raise NotImplementedError
-
-    @abstractmethod
-    def parser(self, response):
-        raise NotImplementedError
-
-    @abstractmethod
-    def percolator(self, urls: list, retrieve_urls: list):
-        raise NotImplementedError
-
 
 class BingSearchEngine(JySearchEngine):
     site = "https://cn.bing.com/"
+    usage = KEYWORD
 
     def __init__(self):
         super(BingSearchEngine, self).__init__()
@@ -90,3 +80,33 @@ class BingSearchEngine(JySearchEngine):
         # 过滤
         self.percolator(urls, retrieve_urls)
         return retrieve_urls
+
+
+class QccSearchEngine(JySearchEngine):  # engine that queries the qcc.com web search page for a name
+    usage = ORGANIZATION  # this engine is tagged ORGANIZATION, whereas BingSearchEngine is tagged KEYWORD
+
+    def search(self, name: str, page=None):  # returns the extracted site text, or '-' if nothing matched; `page` is not used in this body
+        site = '-'  # placeholder result returned when the XPath below matches no node
+        headers = {  # browser-like request headers sent with the search request
+            "authority": "www.qcc.com",
+            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "cache-control": "no-cache",
+            "pragma": "no-cache",
+            "upgrade-insecure-requests": "1",
+            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
+        }
+        cookies = {  # a single hard-coded cookie value is sent on every call
+            "CNZZDATA1254842228": "1319079473-1648702017-https%253A%252F%252Fwww.google.com%252F%7C1650962497",
+        }
+        url = "https://www.qcc.com/web/search"
+        params = {"key": name.strip()}  # the name is stripped of surrounding whitespace and sent as the `key` query parameter
+        response = self.get(url, headers=headers, cookies=cookies, params=params)  # NOTE(review): `get` is presumably inherited via Downloader — confirm
+        if response.status_code != 200:  # any non-200 status is treated as a failed lookup
+            raise QccError(reason='企查查搜索接口调用失败', code=response.status_code)  # message reads "Qcc search API call failed"; the HTTP status is passed as `code`
+        element = html2element(response.text)  # NOTE(review): presumably parses the HTML into an element supporting .xpath() — confirm in common.tools
+        nodes = element.xpath('//table[@class="ntable ntable-list"]//tr[1]/td[3]/div[1]/div[4]/div[2]/span[3]/span/child::*')  # position-based path into the first row of the result table; sensitive to page layout changes
+        if len(nodes) > 0:
+            sub_node = nodes[0]  # only the first matched child element is used
+            site = "".join("".join(sub_node.xpath('./text()')).split())  # concatenate the node's direct text and remove all whitespace
+        return site