qcc.py 1.8 KB

123456789101112131415161718192021222324252627282930313233343536
  1. from common.execptions import QccError
  2. from common.tools import html2element
  3. from crawler.download import Downloader
  4. class QccService(Downloader):
  5. def get_site(self, name: str):
  6. site = '-'
  7. headers = {
  8. "authority": "www.qcc.com",
  9. "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
  10. "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
  11. "cache-control": "no-cache",
  12. "pragma": "no-cache",
  13. "upgrade-insecure-requests": "1",
  14. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
  15. }
  16. cookies = {
  17. "qcc_did": "d6af577d-b7e1-455b-a02d-bdfbd681d0db",
  18. "UM_distinctid": "17fde8eb058bcc-0f0dac963d063e-1c3a645d-1aeaa0-17fde8eb059107d",
  19. "QCCSESSID": "19da28dabd9a95bfa3354284a3",
  20. "CNZZDATA1254842228": "1319079473-1648702017-https%253A%252F%252Fwww.google.com%252F%7C1650861011",
  21. "acw_tc": "74d3df2616508705426597994e18592eed62a8f8e51fb14fafa552e27d"
  22. }
  23. url = "https://www.qcc.com/web/search"
  24. params = {"key": name.strip()}
  25. response = self.get(url, headers=headers, cookies=cookies, params=params)
  26. if response.status_code != 200:
  27. raise QccError(reason='企查查搜索接口调用失败', code=response.status_code)
  28. element = html2element(response.text)
  29. nodes = element.xpath('//table[@class="ntable ntable-list"]//tr[1]/td[3]/div[1]/div[4]/div[2]/span[3]/span/child::*')
  30. if len(nodes) > 0:
  31. sub_node = nodes[0]
  32. site = "".join("".join(sub_node.xpath('./text()')).split())
  33. return site