crawl_spiders.py

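"""Spiders for the creditchina.gov.cn public search API.

CreditChinaListSpider queries the catalogSearchHome endpoint for entities
matching a keyword; CreditChinaDetailSpider fetches the detail record of one
entity by uuid and unified social credit code. crawl_spider() chains the two.
The module-level QueryList/QueryDetail instances are shared singletons, so
crawl_spider() serializes access with a lock.
"""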
import threading

import requests

from exceptions import InvalidProxiesException
from utils.log import logger

__all__ = ['QueryList', 'QueryDetail', 'crawl_spider']

Lock = threading.Lock()
class CreditChinaListSpider:

    def __init__(self, keyword: str = '', proxies: dict = None):
        self.proxies = proxies
        self.url = "https://public.creditchina.gov.cn/private-api/catalogSearchHome"
        self.headers = {
            "Host": "public.creditchina.gov.cn",
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
            "Origin": "https://www.creditchina.gov.cn",
            "Referer": "https://www.creditchina.gov.cn/",
            "Accept-Language": "zh-CN,zh;q=0.9"
        }
        self.keyword = keyword
        self.params = {
            "keyword": self.keyword,
            "scenes": "defaultScenario",
            "tableName": "credit_xyzx_tyshxydm",
            "searchState": "2",
            "entityType": "1,2,4,5,6,7,8",
            "templateId": "",
            "page": "1",
            "pageSize": "10"
        }
        self.results = []

    def set_results(self, val: list):
        self.results = val

    def get_results(self):
        return self.results
    def crawl_request(self):
        request_params = {
            'headers': self.headers,
            'proxies': self.proxies,
            'timeout': 60
        }
        try:
            r = requests.get(self.url, params=self.params, **request_params)
            logger.info(f"[crawl] {self.keyword} list query status: {r.status_code}")
            return r
        except (requests.exceptions.ConnectTimeout, requests.exceptions.ReadTimeout):
            # A timed-out request usually means the proxy has gone bad;
            # let the caller rotate proxies and retry.
            raise InvalidProxiesException()
        except requests.RequestException as e:
            logger.error(e.__class__.__name__)
            return None
    def crawl_response(self, response):
        results = []
        if response is None:
            # crawl_request returns None on a non-timeout request error.
            return results
        data_json = response.json()
        if len(data_json) > 0:
            data_list = (data_json.get('data') or {}).get('list') or []
            logger.info('[crawl] list query: {} returned {} item(s)'.format(self.keyword, len(data_list)))
            for item in data_list:
                results.append({
                    'entity_uuid': item['uuid'],
                    'entity_name': item['accurate_entity_name'],
                    'entity_code': item['accurate_entity_code'],
                    'entity_type': item['entityType'],
                    'entity_name_query': item['accurate_entity_name_query'],
                    'recid': item['recid'],
                })
        return results
    def crawl_spider(self):
        response = self.crawl_request()
        results = self.crawl_response(response)
        self.set_results(results)

    def start(self):
        self.crawl_spider()

    def __iter__(self):
        return iter(self.get_results())

    def __call__(self, *args, **kwargs):
        self.proxies = kwargs.get('proxies')
        if kwargs.get('keyword') is not None:
            self.keyword = kwargs['keyword']
            self.params.update({'keyword': self.keyword})
        if len(self.keyword) > 0:
            self.start()
        return self
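# A minimal usage sketch for the list spider on its own; the keyword is a
# placeholder, and proxies may be a requests-style mapping such as
# {'https': 'http://host:port'} or None for a direct connection:
#
#     spider = CreditChinaListSpider()
#     for row in spider(keyword='<entity name>', proxies=None):
#         print(row['entity_name'], row['entity_code'])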
class CreditChinaDetailSpider:

    def __init__(
            self,
            entity_uuid: str = '',
            entity_code: str = '',
            entity_name: str = '',
            entity_type: str = '',
            proxies: dict = None):
        self.uuid = entity_uuid
        self.social_id = entity_code
        self.keyword = entity_name
        self.entity_type = entity_type
        self.proxies = proxies
        self.url = "https://public.creditchina.gov.cn/private-api/getTyshxydmDetailsContent"
        self.headers = {
            "Host": "public.creditchina.gov.cn",
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
            "Origin": "https://www.creditchina.gov.cn",
            "Referer": "https://www.creditchina.gov.cn/",
            "Accept-Language": "zh-CN,zh;q=0.9"
        }
        self.params = {
            "keyword": self.keyword,
            "scenes": "defaultscenario",
            "entityType": self.entity_type,
            "searchState": "1",
            "uuid": self.uuid,
            "tyshxydm": self.social_id
        }
        self.results = {}
    def crawl_request(self):
        request_params = {
            'headers': self.headers,
            'proxies': self.proxies,
            'timeout': 60
        }
        try:
            r = requests.get(self.url, params=self.params, **request_params)
            # logger.info(f"[crawl] {self.keyword} detail query status: {r.status_code}")
            return r
        except (requests.exceptions.ConnectTimeout, requests.exceptions.ReadTimeout):
            # A timed-out request usually means the proxy has gone bad;
            # let the caller rotate proxies and retry.
            raise InvalidProxiesException()
        except requests.RequestException as e:
            logger.error(e.__class__.__name__)
            return None
    def crawl_response(self, response):
        if response is None:
            # crawl_request returns None on a non-timeout request error.
            return {}
        data_json = response.json()
        if len(data_json) > 0:
            # message = data_json.get('message')
            # logger.info('[crawl] detail query: {} result: {}'.format(self.keyword, message))
            try:
                data = data_json.get('data').get('data')
                entity = data.get('entity')
                head_entity = data_json.get('data').get('headEntity')
                results = {
                    'entity_type': data.get('data_catalog', ''),                  # entity category
                    'social_id': head_entity.get('tyshxydm') or self.social_id,   # unified social credit code
                    'status': head_entity.get('status'),                          # operating status
                    'entity': head_entity.get('dymc') or entity.get('dymc', ''),  # primary (first) name
                    'entity_1': entity.get('demc', ''),                           # second name
                    'entity_2': entity.get('dsmc', ''),                           # third name
                    'entity_other': entity.get('qtmc', ''),                       # other names
                    'legal_person': entity.get('fddbr', ''),                      # legal representative
                    'capital_origin': entity.get('jfly', ''),                     # funding source
                    'capital': entity.get('kbzj', '') + '万元',                    # start-up capital, in units of 10,000 CNY
                    'jbdw': entity.get('jbdw', ''),                               # sponsoring organization
                    'spjg': entity.get('spjg', ''),                               # approving authority
                    'zsyxqz1': entity.get('zsyxqz1', ''),                         # certificate valid from
                    'zsyxqz2': entity.get('zsyxqz2', ''),                         # certificate valid to
                    'address': entity.get('dz', ''),                              # address
                    'purpose_and_business': entity.get('zzhywfw', ''),            # purpose and business scope
                }
                return results
            except Exception as e:
                logger.error(e.__class__.__name__)
        return {}
    def set_result(self, val: dict):
        self.results = val

    def get_result(self):
        return self.results

    def crawl_spider(self):
        response = self.crawl_request()
        results = self.crawl_response(response)
        self.set_result(results)

    def start(self):
        self.crawl_spider()

    def __call__(self, *args, **kwargs):
        self.proxies = kwargs.get('proxies')
        if kwargs.get('entity_uuid') is not None:
            self.uuid = kwargs['entity_uuid']
        if kwargs.get('entity_code') is not None:
            self.social_id = kwargs['entity_code']
        if kwargs.get('entity_name') is not None:
            self.keyword = kwargs['entity_name']
        if kwargs.get('entity_type') is not None:
            self.entity_type = kwargs['entity_type']
        self.params.update({
            'keyword': self.keyword,
            "entityType": self.entity_type,
            "uuid": self.uuid,
            "tyshxydm": self.social_id
        })
        # Only query when every identifying field is present.
        conditions = [
            len(self.uuid) > 0,
            len(self.social_id) > 0,
            len(self.keyword) > 0,
            len(self.entity_type) > 0
        ]
        if all(conditions):
            self.start()
        return self
QueryList = CreditChinaListSpider()
QueryDetail = CreditChinaDetailSpider()


def crawl_spider(keyword: str, proxies: dict = None):
    # QueryList and QueryDetail are shared module-level singletons, so the
    # whole list-then-detail pass runs under one lock; `with` guarantees the
    # lock is released even if InvalidProxiesException is raised mid-crawl.
    with Lock:
        results = []
        for items in QueryList(keyword=keyword, proxies=proxies):
            # Forward the proxies so detail queries use the same egress as
            # the list query.
            detail = QueryDetail(**items, proxies=proxies)
            results.append(detail.get_result())
        return results
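

if __name__ == '__main__':
    # Smoke-test sketch only: assumes direct network access (proxies=None)
    # and a placeholder keyword that should be replaced with a real entity
    # name before running.
    for record in crawl_spider('<entity name>', proxies=None):
        print(record)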