|
@@ -147,7 +147,7 @@ def geetest_panel(driver: Chrome, save_img_to_local=False):
|
|
|
|
|
|
'''识别验证码'''
|
|
|
captcha_result = chaojiying.PostPic(bytes_array.getvalue(), 9004)
|
|
|
- logger.info(captcha_result)
|
|
|
+ logger.info(f'[识别结果]{captcha_result}')
|
|
|
pic_id = captcha_result['pic_id']
|
|
|
|
|
|
'''解析识别结果'''
|
|
@@ -155,12 +155,11 @@ def geetest_panel(driver: Chrome, save_img_to_local=False):
|
|
|
locations = [[int(number) for number in group.split(',')] for group in groups]
|
|
|
|
|
|
'''点击验证图片'''
|
|
|
- for location in locations:
|
|
|
- # logger.info(location)
|
|
|
+ for index, location in enumerate(locations):
|
|
|
ActionChains(driver).move_to_element_with_offset(
|
|
|
touclick_element,
|
|
|
location[0] + 10,
|
|
|
- location[1] + 47
|
|
|
+ location[1] + 53,
|
|
|
).click().perform()
|
|
|
time.sleep(1)
|
|
|
|
|
@@ -178,6 +177,7 @@ def geetest_panel(driver: Chrome, save_img_to_local=False):
|
|
|
def check_page(driver: Chrome, **kwargs):
    """Check the page: handle the prompt popup, then the captcha panel.

    Args:
        driver: The Chrome WebDriver instance, passed through unchanged to
            both helpers.
        **kwargs: Optional settings. Only ``save_img_to_local`` is read; it
            is forwarded to ``geetest_panel``. When the key is absent,
            ``None`` is forwarded (not the helper's own default).
    """
    prompt_popup(driver)
    # Pause for one second between the popup step and the captcha step.
    # NOTE(review): presumably this gives the page time to settle - confirm.
    time.sleep(1)
    save_flag = kwargs.get('save_img_to_local')
    geetest_panel(driver, save_img_to_local=save_flag)
|
|
|
|
|
|
|
|
@@ -272,7 +272,7 @@ def crawl_spider(driver: Chrome, handler):
|
|
|
logger.info(f"[重复数据]{title} - 丢弃")
|
|
|
continue
|
|
|
button = td_element.find_element_by_class_name("link")
|
|
|
- click(driver, button, wait_time=2)
|
|
|
+ click(driver, button, wait_time=10)
|
|
|
for current_handler in driver.window_handles:
|
|
|
if current_handler == handler:
|
|
|
continue
|
|
@@ -306,67 +306,82 @@ def crawl_spider(driver: Chrome, handler):
|
|
|
# logger.info(item)
|
|
|
|
|
|
'''企业资质'''
|
|
|
- element = html2element(driver.page_source)
|
|
|
- node = element.xpath('//div[@class="panel-container"]')[0]
|
|
|
- company_quality_html = element2html(node)
|
|
|
- company_quality = extract_content(company_quality_html)
|
|
|
- company['company_quality'] = company_quality
|
|
|
- company['company_quality_html'] = {'html': company_quality_html}
|
|
|
+ try:
|
|
|
+ element = html2element(driver.page_source)
|
|
|
+ node = element.xpath('//div[@class="panel-container"]')[0]
|
|
|
+ company_quality_html = element2html(node)
|
|
|
+ company_quality = extract_content(company_quality_html)
|
|
|
+ company['company_quality'] = company_quality
|
|
|
+ company['company_quality_html'] = {'html': company_quality_html}
|
|
|
+ except IndexError:
|
|
|
+ pass
|
|
|
|
|
|
'''注册人员'''
|
|
|
- company_staff = driver.find_element_by_id("tab-companyStaff")
|
|
|
- click(driver, company_staff, allow_check_page=True)
|
|
|
- registrar = []
|
|
|
- reg_buttons = driver.find_elements(by=By.XPATH, value='//div[contains(@id, "tab-")]/span')
|
|
|
- for btn in reg_buttons:
|
|
|
- '''点击分类'''
|
|
|
- driver.execute_script("arguments[0].click();", btn)
|
|
|
- element = html2element(driver.page_source)
|
|
|
- nodes = element.xpath('//div[@class="el-table__body-wrapper is-scrolling-none"]/table//tr')
|
|
|
- for node in nodes:
|
|
|
- name = "".join(node.xpath('./td[2]//span/text()')).strip()
|
|
|
- id_no = "".join(node.xpath('./td[3]/div/text()')).strip()
|
|
|
- reg_type = "".join(node.xpath('./td[4]/div/text()')).strip()
|
|
|
- reg_no = "".join(node.xpath('./td[5]/div/text()')).strip()
|
|
|
- reg_major = "".join(node.xpath('./td[6]/div/text()')).strip()
|
|
|
- registrar.append({
|
|
|
- 'name': name, # 姓名
|
|
|
- 'id_no': id_no, # 身份证号
|
|
|
- 'reg_type': reg_type, # 注册类别
|
|
|
- 'reg_no': reg_no, # 注册号(执业印章号)
|
|
|
- 'reg_major': reg_major, # 注册专业
|
|
|
- })
|
|
|
- company['company_staff'] = registrar
|
|
|
+ try:
|
|
|
+ company_staff = driver.find_element_by_id("tab-companyStaff")
|
|
|
+ click(driver, company_staff, allow_check_page=True)
|
|
|
+ registrar = []
|
|
|
+ reg_buttons = driver.find_elements(by=By.XPATH, value='//div[contains(@id, "tab-")]/span')
|
|
|
+ for btn in reg_buttons:
|
|
|
+ '''点击分类'''
|
|
|
+ driver.execute_script("arguments[0].click();", btn)
|
|
|
+ element = html2element(driver.page_source)
|
|
|
+ nodes = element.xpath('//div[@class="el-table__body-wrapper is-scrolling-none"]/table//tr')
|
|
|
+ for node in nodes:
|
|
|
+ name = "".join(node.xpath('./td[2]//span/text()')).strip()
|
|
|
+ id_no = "".join(node.xpath('./td[3]/div/text()')).strip()
|
|
|
+ reg_type = "".join(node.xpath('./td[4]/div/text()')).strip()
|
|
|
+ reg_no = "".join(node.xpath('./td[5]/div/text()')).strip()
|
|
|
+ reg_major = "".join(node.xpath('./td[6]/div/text()')).strip()
|
|
|
+ registrar.append({
|
|
|
+ 'name': name, # 姓名
|
|
|
+ 'id_no': id_no, # 身份证号
|
|
|
+ 'reg_type': reg_type, # 注册类别
|
|
|
+ 'reg_no': reg_no, # 注册号(执业印章号)
|
|
|
+ 'reg_major': reg_major, # 注册专业
|
|
|
+ })
|
|
|
+ company['company_staff'] = registrar
|
|
|
+ except IndexError:
|
|
|
+ pass
|
|
|
|
|
|
'''不良行为'''
|
|
|
- bad_behavior = driver.find_element_by_id('tab-badBehavior')
|
|
|
- click(driver, bad_behavior, allow_check_page=True)
|
|
|
- element = html2element(driver.page_source)
|
|
|
- node = element.xpath('//div[@aria-labelledby="tab-badBehavior"]/div')[0]
|
|
|
- bad_behavior_html = element2html(node)
|
|
|
- bad_behaviors = extract_content(bad_behavior_html)
|
|
|
- company['bad_behavior'] = bad_behaviors
|
|
|
- company['bad_behavior_html'] = {'html': bad_behavior_html}
|
|
|
+ try:
|
|
|
+ bad_behavior = driver.find_element_by_id('tab-badBehavior')
|
|
|
+ click(driver, bad_behavior, allow_check_page=True)
|
|
|
+ element = html2element(driver.page_source)
|
|
|
+ node = element.xpath('//div[@aria-labelledby="tab-badBehavior"]/div')[0]
|
|
|
+ bad_behavior_html = element2html(node)
|
|
|
+ bad_behaviors = extract_content(bad_behavior_html)
|
|
|
+ company['bad_behavior'] = bad_behaviors
|
|
|
+ company['bad_behavior_html'] = {'html': bad_behavior_html}
|
|
|
+ except IndexError:
|
|
|
+ pass
|
|
|
|
|
|
'''黑名单记录'''
|
|
|
- black_list = driver.find_element_by_id('tab-blackList')
|
|
|
- click(driver, black_list, allow_check_page=True)
|
|
|
- element = html2element(driver.page_source)
|
|
|
- node = element.xpath('//div[@id="pane-blackList"]/div')[0]
|
|
|
- black_list_html = element2html(node)
|
|
|
- black_list_array = extract_content(black_list_html)
|
|
|
- company['black_list'] = black_list_array
|
|
|
- company['black_list_html'] = {'html': black_list_html}
|
|
|
+ try:
|
|
|
+ black_list = driver.find_element_by_id('tab-blackList')
|
|
|
+ click(driver, black_list, allow_check_page=True)
|
|
|
+ element = html2element(driver.page_source)
|
|
|
+ node = element.xpath('//div[@id="pane-blackList"]/div')[0]
|
|
|
+ black_list_html = element2html(node)
|
|
|
+ black_list_array = extract_content(black_list_html)
|
|
|
+ company['black_list'] = black_list_array
|
|
|
+ company['black_list_html'] = {'html': black_list_html}
|
|
|
+ except IndexError:
|
|
|
+ pass
|
|
|
|
|
|
'''失信联合惩戒记录'''
|
|
|
- punish = driver.find_element_by_id('tab-punishLog')
|
|
|
- click(driver, punish, allow_check_page=True)
|
|
|
- element = html2element(driver.page_source)
|
|
|
- node = element.xpath('//div[@id="pane-punishLog"]/div')[0]
|
|
|
- punish_html = element2html(node)
|
|
|
- punish_array = extract_content(punish_html)
|
|
|
- company['punish'] = punish_array
|
|
|
- company['punish_html'] = {'html': punish_html}
|
|
|
+ try:
|
|
|
+ punish = driver.find_element_by_id('tab-punishLog')
|
|
|
+ click(driver, punish, allow_check_page=True)
|
|
|
+ element = html2element(driver.page_source)
|
|
|
+ node = element.xpath('//div[@id="pane-punishLog"]/div')[0]
|
|
|
+ punish_html = element2html(node)
|
|
|
+ punish_array = extract_content(punish_html)
|
|
|
+ company['punish'] = punish_array
|
|
|
+ company['punish_html'] = {'html': punish_html}
|
|
|
+ except IndexError:
|
|
|
+ pass
|
|
|
|
|
|
'''保存企业数据'''
|
|
|
if len(company['credit_no']) > 0:
|