import threading

import requests

from exceptions import InvalidProxiesException
from utils.log import logger

__all__ = ['QueryList', 'QueryDetail', 'crawl_spider']

# Serializes access to the module-level spider singletons defined below.
Lock = threading.Lock()


class CreditChinaListSpider:
    """Keyword search against the Credit China catalogSearchHome list API."""

    def __init__(self, keyword: str = '', proxies: dict = None):
        self.proxies = proxies
        self.url = "https://public.creditchina.gov.cn/private-api/catalogSearchHome"
        self.headers = {
            "Host": "public.creditchina.gov.cn",
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
            "Origin": "https://www.creditchina.gov.cn",
            "Referer": "https://www.creditchina.gov.cn/",
            "Accept-Language": "zh-CN,zh;q=0.9"
        }
        self.keyword = keyword
        self.params = {
            "keyword": self.keyword,
            "scenes": "defaultScenario",
            "tableName": "credit_xyzx_tyshxydm",
            "searchState": "2",
            "entityType": "1,2,4,5,6,7,8",
            "templateId": "",
            "page": "1",
            "pageSize": "10"
        }
        self.results = []

    def set_results(self, val: list):
        self.results = val

    def get_results(self):
        return self.results

    def crawl_request(self):
        request_params = {
            'headers': self.headers,
            'proxies': self.proxies,
            'timeout': 60
        }
        try:
            r = requests.get(self.url, params=self.params, **request_params)
            logger.info(f"[crawl] {self.keyword} list query status: {r.status_code}")
            return r
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectTimeout):
            # Timeouts are treated as a dead proxy so the caller can rotate it.
            raise InvalidProxiesException()
        except requests.RequestException as e:
            logger.error(e.__class__.__name__)

    def crawl_response(self, response):
        results = []
        data_json = response.json()
        if len(data_json) > 0:
            # Guard against a missing 'data' object so a malformed payload
            # yields an empty result set instead of an AttributeError.
            data_list = (data_json.get('data') or {}).get('list') or []
            logger.info('[crawl] list query: {} results: {} rows'.format(self.keyword, len(data_list)))
            for item in data_list:
                results.append({
                    'entity_uuid': item['uuid'],
                    'entity_name': item['accurate_entity_name'],
                    'entity_code': item['accurate_entity_code'],
                    'entity_type': item['entityType'],
                    'entity_name_query': item['accurate_entity_name_query'],
                    'recid': item['recid'],
                })
        return results

    def crawl_spider(self):
        response = self.crawl_request()
        # crawl_request returns None when a RequestException was swallowed
        # and logged; treat that as an empty result set.
        results = self.crawl_response(response) if response is not None else []
        self.set_results(results)

    def start(self):
        self.crawl_spider()

    def __iter__(self):
        return iter(self.get_results())

    def __call__(self, *args, **kwargs):
        self.proxies = kwargs.get('proxies')
        if kwargs.get('keyword') is not None:
            self.keyword = kwargs['keyword']
            self.params.update({'keyword': self.keyword})
        if len(self.keyword) > 0:
            self.start()
        return self


class CreditChinaDetailSpider:
    """Entity detail lookup against the getTyshxydmDetailsContent API."""

    def __init__(
            self,
            entity_uuid: str = '',
            entity_code: str = '',
            entity_name: str = '',
            entity_type: str = '',
            proxies: dict = None):
        self.uuid = entity_uuid
        self.social_id = entity_code
        self.keyword = entity_name
        self.entity_type = entity_type
        self.proxies = proxies
        self.url = "https://public.creditchina.gov.cn/private-api/getTyshxydmDetailsContent"
        self.headers = {
            "Host": "public.creditchina.gov.cn",
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
            "Origin": "https://www.creditchina.gov.cn",
            "Referer": "https://www.creditchina.gov.cn/",
            "Accept-Language": "zh-CN,zh;q=0.9"
        }
        self.params = {
            "keyword": self.keyword,
            "scenes": "defaultscenario",
            "entityType": self.entity_type,
            "searchState": "1",
            "uuid": self.uuid,
            "tyshxydm": self.social_id
        }
        self.results = {}

    def crawl_request(self):
        request_params = {
            'headers': self.headers,
            'proxies': self.proxies,
            'timeout': 60
        }
        try:
            r = requests.get(self.url, params=self.params, **request_params)
            # logger.info(f"[crawl] {self.keyword} detail query status: {r.status_code}")
            return r
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectTimeout):
            # Timeouts are treated as a dead proxy so the caller can rotate it.
            raise InvalidProxiesException()
        except requests.RequestException as e:
            logger.error(e.__class__.__name__)

    def crawl_response(self, response):
        data_json = response.json()
        if len(data_json) > 0:
            # message = data_json.get('message')
            # logger.info('[crawl] detail query: {} result: {}'.format(self.keyword, message))
            try:
                data = data_json.get('data').get('data')
                entity = data.get('entity')
                head_entity = data_json.get('data').get('headEntity')
                results = {
                    'entity_type': data.get('data_catalog', ''),  # entity type
                    'social_id': head_entity.get('tyshxydm') or self.social_id,  # unified social credit code
                    'status': head_entity.get('status'),  # operating status
                    'entity': head_entity.get('dymc') or entity.get('dymc', ''),  # first (primary) name
                    'entity_1': entity.get('demc', ''),  # second name
                    'entity_2': entity.get('dsmc', ''),  # third name
                    'entity_other': entity.get('qtmc', ''),  # other names
                    'legal_person': entity.get('fddbr', ''),  # legal representative
                    'capital_origin': entity.get('jfly', ''),  # funding source
                    'capital': entity.get('kbzj', '') + '万元',  # start-up capital, in units of 10,000 CNY
                    'jbdw': entity.get('jbdw', ''),  # sponsoring organization
                    'spjg': entity.get('spjg', ''),  # approving authority
                    'zsyxqz1': entity.get('zsyxqz1', ''),  # certificate valid from
                    'zsyxqz2': entity.get('zsyxqz2', ''),  # certificate valid to
                    'address': entity.get('dz', ''),  # address
                    'purpose_and_business': entity.get('zzhywfw', ''),  # purpose and business scope
                }
                return results
            except Exception as e:
                logger.error(e.__class__.__name__)

    def set_result(self, val: dict):
        self.results = val

    def get_result(self):
        return self.results

    def crawl_spider(self):
        response = self.crawl_request()
        # crawl_request returns None on a logged RequestException, and
        # crawl_response returns None on a malformed payload; fall back to {}.
        results = self.crawl_response(response) if response is not None else None
        self.set_result(results or {})

    def start(self):
        self.crawl_spider()

    def __call__(self, *args, **kwargs):
        self.proxies = kwargs.get('proxies')
        # Accept the field names produced by CreditChinaListSpider; extra
        # keys such as 'entity_name_query' and 'recid' are ignored.
        if kwargs.get('entity_uuid') is not None:
            self.uuid = kwargs['entity_uuid']
        if kwargs.get('entity_code') is not None:
            self.social_id = kwargs['entity_code']
        if kwargs.get('entity_name') is not None:
            self.keyword = kwargs['entity_name']
        if kwargs.get('entity_type') is not None:
            self.entity_type = kwargs['entity_type']
        self.params.update({
            'keyword': self.keyword,
            "entityType": self.entity_type,
            "uuid": self.uuid,
            "tyshxydm": self.social_id
        })
        # Only fire the request once every identifying field is present.
        conditions = [
            len(self.uuid) > 0,
            len(self.social_id) > 0,
            len(self.keyword) > 0,
            len(self.entity_type) > 0
        ]
        if all(conditions):
            self.start()
        return self


QueryList = CreditChinaListSpider()
QueryDetail = CreditChinaDetailSpider()


def crawl_spider(keyword: str, proxies: dict = None):
    # `with` guarantees the lock is released even if a query raises
    # (a bare acquire/release pair would leave it held forever after an
    # InvalidProxiesException, deadlocking every later caller).
    with Lock:
        results = []
        for items in QueryList(keyword=keyword, proxies=proxies):
            detail = QueryDetail(**items)
            results.append(detail.results)
        return results
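
# Usage sketch (keywords below are hypothetical placeholders, and proxies
# defaults to a direct connection): crawl_spider holds the module lock for
# the whole list+detail round trip, so concurrent callers are serialized
# rather than racing on the shared QueryList / QueryDetail singletons.
if __name__ == '__main__':
    from concurrent.futures import ThreadPoolExecutor

    keywords = ['某某科技有限公司', '某某大学']  # hypothetical search terms
    with ThreadPoolExecutor(max_workers=2) as pool:
        for rows in pool.map(crawl_spider, keywords):
            for row in rows:
                print(row)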