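"""Spiders for the Credit China (creditchina.gov.cn) public search API.

CreditChinaListSpider queries the keyword-search (list) endpoint and
CreditChinaDetailSpider fetches the detail record for a single entity;
crawl_spider() chains the two behind a module lock so the shared
singletons below are safe to call from multiple threads.
"""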
import threading

import requests

from exceptions import InvalidProxiesException
from utils.log import logger

__all__ = ['QueryList', 'QueryDetail', 'crawl_spider']

# Serializes access to the shared QueryList/QueryDetail singletons below.
_lock = threading.Lock()


class CreditChinaListSpider:
    """Queries the Credit China keyword-search (list) endpoint."""

    def __init__(self, keyword: str = '', proxies: dict = None):
        self.proxies = proxies
        self.url = "https://public.creditchina.gov.cn/private-api/catalogSearchHome"
        self.headers = {
            "Host": "public.creditchina.gov.cn",
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
            "Origin": "https://www.creditchina.gov.cn",
            "Referer": "https://www.creditchina.gov.cn/",
            "Accept-Language": "zh-CN,zh;q=0.9"
        }
        self.keyword = keyword
        self.params = {
            "keyword": self.keyword,
            "scenes": "defaultScenario",
            "tableName": "credit_xyzx_tyshxydm",
            "searchState": "2",
            "entityType": "1,2,4,5,6,7,8",
            "templateId": "",
            "page": "1",
            "pageSize": "10"
        }
        self.results = []

    def set_results(self, val: list):
        self.results = val

    def get_results(self):
        return self.results

    def crawl_request(self):
        request_params = {
            'headers': self.headers,
            'proxies': self.proxies,
            'timeout': 60
        }
        try:
            r = requests.get(self.url, params=self.params, **request_params)
            logger.info(f"[crawl] {self.keyword} list query status: {r.status_code}")
            return r
        except requests.exceptions.Timeout:
            # Covers both ConnectTimeout and ReadTimeout: a timed-out
            # proxy is treated as unusable.
            raise InvalidProxiesException()
        except requests.RequestException as e:
            logger.error(e.__class__.__name__)
            return None

    def crawl_response(self, response):
        results = []
        if response is None:
            return results
        data_json = response.json()
        if len(data_json) > 0:
            data_list = data_json.get('data', {}).get('list') or []
            logger.info('[crawl] list query: {} results: {}'.format(self.keyword, len(data_list)))
            for item in data_list:
                results.append({
                    'entity_uuid': item['uuid'],
                    'entity_name': item['accurate_entity_name'],
                    'entity_code': item['accurate_entity_code'],
                    'entity_type': item['entityType'],
                    'entity_name_query': item['accurate_entity_name_query'],
                    'recid': item['recid'],
                })
        return results

    def crawl_spider(self):
        response = self.crawl_request()
        results = self.crawl_response(response)
        self.set_results(results)

    def start(self):
        self.crawl_spider()

    def __iter__(self):
        return iter(self.get_results())

    def __call__(self, *args, **kwargs):
        self.proxies = kwargs.get('proxies')
        if kwargs.get('keyword') is not None:
            self.keyword = kwargs['keyword']
            self.params['keyword'] = self.keyword
        if self.keyword:
            self.start()
        return self


class CreditChinaDetailSpider:
    """Fetches the detail record of a single entity from Credit China."""

    def __init__(
            self,
            entity_uuid: str = '',
            entity_code: str = '',
            entity_name: str = '',
            entity_type: str = '',
            proxies: dict = None):
        self.uuid = entity_uuid
        self.social_id = entity_code
        self.keyword = entity_name
        self.entity_type = entity_type
        self.proxies = proxies
        self.url = "https://public.creditchina.gov.cn/private-api/getTyshxydmDetailsContent"
        self.headers = {
            "Host": "public.creditchina.gov.cn",
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
            "Origin": "https://www.creditchina.gov.cn",
            "Referer": "https://www.creditchina.gov.cn/",
            "Accept-Language": "zh-CN,zh;q=0.9"
        }
        self.params = {
            "keyword": self.keyword,
            "scenes": "defaultscenario",
            "entityType": self.entity_type,
            "searchState": "1",
            "uuid": self.uuid,
            "tyshxydm": self.social_id
        }
        self.results = {}

    def crawl_request(self):
        request_params = {
            'headers': self.headers,
            'proxies': self.proxies,
            'timeout': 60
        }
        try:
            r = requests.get(self.url, params=self.params, **request_params)
            return r
        except requests.exceptions.Timeout:
            # Covers both ConnectTimeout and ReadTimeout.
            raise InvalidProxiesException()
        except requests.RequestException as e:
            logger.error(e.__class__.__name__)
            return None

    def crawl_response(self, response):
        if response is None:
            return {}
        data_json = response.json()
        if len(data_json) > 0:
            try:
                data = data_json.get('data').get('data')
                entity = data.get('entity')
                head_entity = data_json.get('data').get('headEntity')
                capital = entity.get('kbzj', '')
                results = {
                    'entity_type': data.get('data_catalog', ''),  # entity type
                    'social_id': head_entity.get('tyshxydm') or self.social_id,  # unified social credit code
                    'status': head_entity.get('status'),  # operating status
                    'entity': head_entity.get('dymc') or entity.get('dymc', ''),  # first (primary) name
                    'entity_1': entity.get('demc', ''),  # second name
                    'entity_2': entity.get('dsmc', ''),  # third name
                    'entity_other': entity.get('qtmc', ''),  # other names
                    'legal_person': entity.get('fddbr', ''),  # legal representative
                    'capital_origin': entity.get('jfly', ''),  # source of funds
                    'capital': capital + '万元' if capital else '',  # start-up capital, in units of 10,000 CNY
                    'jbdw': entity.get('jbdw', ''),  # sponsoring organization
                    'spjg': entity.get('spjg', ''),  # approval authority
                    'zsyxqz1': entity.get('zsyxqz1', ''),  # certificate valid from
                    'zsyxqz2': entity.get('zsyxqz2', ''),  # certificate valid to
                    'address': entity.get('dz', ''),  # address
                    'purpose_and_business': entity.get('zzhywfw', ''),  # purpose and scope of business
                }
                return results
            except Exception as e:
                logger.error(e.__class__.__name__)
        return {}

    def set_result(self, val: dict):
        self.results = val

    def get_result(self):
        return self.results

    def crawl_spider(self):
        response = self.crawl_request()
        results = self.crawl_response(response)
        self.set_result(results)

    def start(self):
        self.crawl_spider()

    def __call__(self, *args, **kwargs):
        self.proxies = kwargs.get('proxies')
        if kwargs.get('entity_uuid') is not None:
            self.uuid = kwargs['entity_uuid']
        if kwargs.get('entity_code') is not None:
            self.social_id = kwargs['entity_code']
        if kwargs.get('entity_name') is not None:
            self.keyword = kwargs['entity_name']
        if kwargs.get('entity_type') is not None:
            self.entity_type = kwargs['entity_type']
        self.params.update({
            "keyword": self.keyword,
            "entityType": self.entity_type,
            "uuid": self.uuid,
            "tyshxydm": self.social_id
        })
        # Only query once every identifying field is present.
        if all((self.uuid, self.social_id, self.keyword, self.entity_type)):
            self.start()
        return self


QueryList = CreditChinaListSpider()
QueryDetail = CreditChinaDetailSpider()


def crawl_spider(keyword: str, proxies: dict = None):
    """Run a list query for ``keyword`` and fetch the detail record of each hit.

    The shared QueryList/QueryDetail singletons are not thread-safe on their
    own, so the whole list-then-detail pass runs under the module lock.
    """
    with _lock:
        results = []
        for items in QueryList(keyword=keyword, proxies=proxies):
            detail = QueryDetail(**items, proxies=proxies)
            if detail.results:
                results.append(detail.results)
        return results
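

# A minimal usage sketch: the keyword and proxy mapping below are
# placeholders, not values taken from this project, and running the module
# directly assumes the local `exceptions` and `utils.log` packages resolve.
if __name__ == '__main__':
    sample_proxies = None  # placeholder; requests-style mapping, e.g. {"https": "http://host:port"}
    for record in crawl_spider('示例单位', proxies=sample_proxies):  # '示例单位' = placeholder keyword ("sample organization")
        print(record)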