|
@@ -1,4 +1,5 @@
|
|
import io
|
|
import io
|
|
|
|
+import random
|
|
import time
|
|
import time
|
|
|
|
|
|
import pandas as pd
|
|
import pandas as pd
|
|
@@ -177,8 +178,9 @@ def geetest_panel(driver: Chrome, save_img_to_local=False):
|
|
def check_page(driver: Chrome, **kwargs):
|
|
def check_page(driver: Chrome, **kwargs):
|
|
"""检查页面"""
|
|
"""检查页面"""
|
|
prompt_popup(driver)
|
|
prompt_popup(driver)
|
|
- time.sleep(1)
|
|
|
|
|
|
+ time.sleep(3)
|
|
geetest_panel(driver, save_img_to_local=kwargs.get('save_img_to_local'))
|
|
geetest_panel(driver, save_img_to_local=kwargs.get('save_img_to_local'))
|
|
|
|
+ time.sleep(3)
|
|
|
|
|
|
|
|
|
|
def click(driver: Chrome, button, allow_check_page=False, wait_time=1):
|
|
def click(driver: Chrome, button, allow_check_page=False, wait_time=1):
|
|
@@ -272,7 +274,7 @@ def crawl_spider(driver: Chrome, handler):
|
|
logger.info(f"[重复数据]{title} - 丢弃")
|
|
logger.info(f"[重复数据]{title} - 丢弃")
|
|
continue
|
|
continue
|
|
button = td_element.find_element_by_class_name("link")
|
|
button = td_element.find_element_by_class_name("link")
|
|
- click(driver, button, wait_time=10)
|
|
|
|
|
|
+ click(driver, button, wait_time=random.randint(5, 10))
|
|
for current_handler in driver.window_handles:
|
|
for current_handler in driver.window_handles:
|
|
if current_handler == handler:
|
|
if current_handler == handler:
|
|
continue
|
|
continue
|
|
@@ -320,26 +322,27 @@ def crawl_spider(driver: Chrome, handler):
|
|
try:
|
|
try:
|
|
company_staff = driver.find_element_by_id("tab-companyStaff")
|
|
company_staff = driver.find_element_by_id("tab-companyStaff")
|
|
click(driver, company_staff, allow_check_page=True)
|
|
click(driver, company_staff, allow_check_page=True)
|
|
- registrar = []
|
|
|
|
reg_buttons = driver.find_elements(by=By.XPATH, value='//div[contains(@id, "tab-")]/span')
|
|
reg_buttons = driver.find_elements(by=By.XPATH, value='//div[contains(@id, "tab-")]/span')
|
|
for btn in reg_buttons:
|
|
for btn in reg_buttons:
|
|
- '''点击分类'''
|
|
|
|
- driver.execute_script("arguments[0].click();", btn)
|
|
|
|
- element = html2element(driver.page_source)
|
|
|
|
- nodes = element.xpath('//div[@class="el-table__body-wrapper is-scrolling-none"]/table//tr')
|
|
|
|
- for node in nodes:
|
|
|
|
- name = "".join(node.xpath('./td[2]//span/text()')).strip()
|
|
|
|
- id_no = "".join(node.xpath('./td[3]/div/text()')).strip()
|
|
|
|
- reg_type = "".join(node.xpath('./td[4]/div/text()')).strip()
|
|
|
|
- reg_no = "".join(node.xpath('./td[5]/div/text()')).strip()
|
|
|
|
- reg_major = "".join(node.xpath('./td[6]/div/text()')).strip()
|
|
|
|
- registrar.append({
|
|
|
|
- 'name': name, # 姓名
|
|
|
|
- 'id_no': id_no, # 身份证号
|
|
|
|
- 'reg_type': reg_type, # 注册类别
|
|
|
|
- 'reg_no': reg_no, # 注册号(执业印章号)
|
|
|
|
- 'reg_major': reg_major, # 注册专业
|
|
|
|
- })
|
|
|
|
|
|
+ logger.info(f'[{btn.text}]')
|
|
|
|
+ click(driver, btn)
|
|
|
|
+
|
|
|
|
+ registrar = []
|
|
|
|
+ element = html2element(driver.page_source)
|
|
|
|
+ nodes = element.xpath('//div[@id="pane-companyStaff"]//table[@class="el-table__body"]//tr')
|
|
|
|
+ for node in nodes:
|
|
|
|
+ name = "".join(node.xpath('./td[2]//span/text()')).strip()
|
|
|
|
+ id_no = "".join(node.xpath('./td[3]/div/text()')).strip()
|
|
|
|
+ reg_type = "".join(node.xpath('./td[4]/div/text()')).strip()
|
|
|
|
+ reg_no = "".join(node.xpath('./td[5]/div/text()')).strip()
|
|
|
|
+ reg_major = "".join(node.xpath('./td[6]/div/text()')).strip()
|
|
|
|
+ registrar.append({
|
|
|
|
+ 'name': name, # 姓名
|
|
|
|
+ 'id_no': id_no, # 身份证号
|
|
|
|
+ 'reg_type': reg_type, # 注册类别
|
|
|
|
+ 'reg_no': reg_no, # 注册号(执业印章号)
|
|
|
|
+ 'reg_major': reg_major, # 注册专业
|
|
|
|
+ })
|
|
company['company_staff'] = registrar
|
|
company['company_staff'] = registrar
|
|
except IndexError:
|
|
except IndexError:
|
|
pass
|
|
pass
|
|
@@ -396,6 +399,8 @@ def crawl_spider(driver: Chrome, handler):
|
|
driver.close()
|
|
driver.close()
|
|
'''返回列表页'''
|
|
'''返回列表页'''
|
|
driver.switch_to.window(handler)
|
|
driver.switch_to.window(handler)
|
|
|
|
+ '''下一条执行时间'''
|
|
|
|
+ time.sleep(2)
|
|
else:
|
|
else:
|
|
return True
|
|
return True
|
|
|
|
|
|
@@ -421,11 +426,17 @@ def start(enable_remote_driver=False):
|
|
options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
|
|
options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
|
|
options.add_argument("--disable-gpu")
|
|
options.add_argument("--disable-gpu")
|
|
chrome_driver = webdriver.Chrome(options=options)
|
|
chrome_driver = webdriver.Chrome(options=options)
|
|
- main_handler = chrome_driver.current_window_handle # 获取操作句柄
|
|
|
|
|
|
+ main_handler = chrome_driver.current_window_handle # 获取句柄
|
|
|
|
+ '''清除多窗口'''
|
|
|
|
+ for handler in chrome_driver.window_handles:
|
|
|
|
+ if handler != main_handler:
|
|
|
|
+ chrome_driver.switch_to.window(handler)
|
|
|
|
+ chrome_driver.close()
|
|
|
|
+ chrome_driver.switch_to.window(main_handler)
|
|
|
|
+
|
|
chrome_driver.get(CRAWL_SITE)
|
|
chrome_driver.get(CRAWL_SITE)
|
|
time.sleep(3)
|
|
time.sleep(3)
|
|
'''采集记录'''
|
|
'''采集记录'''
|
|
- # records = ['全部', '勘察企业', '监理企业', '设计与施工一体化企业', '建筑业企业']
|
|
|
|
records = ['全部', '造价咨询企业']
|
|
records = ['全部', '造价咨询企业']
|
|
while True:
|
|
while True:
|
|
'''选择资质类别'''
|
|
'''选择资质类别'''
|
|
@@ -443,4 +454,11 @@ def start(enable_remote_driver=False):
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
if __name__ == '__main__':
|
|
- start(enable_remote_driver=True)
|
|
|
|
|
|
+ while True:
|
|
|
|
+ try:
|
|
|
|
+ start(enable_remote_driver=True)
|
|
|
|
+ except:
|
|
|
|
+ logger.info("等待100秒")
|
|
|
|
+ time.sleep(100)
|
|
|
|
+
|
|
|
|
+ # start(enable_remote_driver=True)
|