|
@@ -101,21 +101,24 @@ def display_geetest_panel(html):
|
|
|
return False
|
|
|
|
|
|
|
|
|
-def prompt_popup(driver: Chrome):
|
|
|
+def prompt_popup(driver: Chrome, wait_time=None):
|
|
|
while True:
|
|
|
if not display_prompt_popup(driver.page_source):
|
|
|
break
|
|
|
- logger.info("处理提示框")
|
|
|
+ logger.info(">>> 点击提示弹框")
|
|
|
driver.find_element_by_xpath('//div[@class="el-dialog captchaDilaog"]/div[3]/div/button[1]').click()
|
|
|
- time.sleep(2)
|
|
|
+ time.sleep(1)
|
|
|
+ '''流程之间的间隔时间'''
|
|
|
+ _wait_time = (wait_time or 1)
|
|
|
+ time.sleep(_wait_time)
|
|
|
|
|
|
|
|
|
-def geetest_panel(driver: Chrome, save_img_to_local=False):
|
|
|
+def geetest_panel(driver: Chrome, save_img_to_local=False, wait_time=None):
|
|
|
pic_id = None
|
|
|
while True:
|
|
|
if not display_geetest_panel(driver.page_source):
|
|
|
break
|
|
|
- logger.info("处理验证码")
|
|
|
+ logger.info(">>> 验证码检测")
|
|
|
if pic_id is not None:
|
|
|
'''打码平台失败'''
|
|
|
captcha_result = chaojiying.ReportError(pic_id)
|
|
@@ -173,21 +176,34 @@ def geetest_panel(driver: Chrome, save_img_to_local=False):
|
|
|
commit_element = wait.until(EC.presence_of_element_located(locator))
|
|
|
ActionChains(driver).click(commit_element).perform()
|
|
|
time.sleep(5)
|
|
|
+ _wait_time = (wait_time or 1)
|
|
|
+ time.sleep(_wait_time)
|
|
|
|
|
|
|
|
|
-def check_page(driver: Chrome, **kwargs):
|
|
|
+def check_page(driver: Chrome, wait_time=None, **kwargs):
|
|
|
"""检查页面"""
|
|
|
- prompt_popup(driver)
|
|
|
- time.sleep(3)
|
|
|
- geetest_panel(driver, save_img_to_local=kwargs.get('save_img_to_local'))
|
|
|
- time.sleep(3)
|
|
|
+ _wait_time = (wait_time or 1)
|
|
|
+ prompt_popup(driver, wait_time=_wait_time)
|
|
|
+ geetest_panel(
|
|
|
+ driver,
|
|
|
+ wait_time=_wait_time,
|
|
|
+ save_img_to_local=kwargs.get('save_img_to_local'),
|
|
|
+ )
|
|
|
|
|
|
|
|
|
-def click(driver: Chrome, button, allow_check_page=False, wait_time=1):
|
|
|
+def click(driver: Chrome, button, wait_time=None, allow_check_page=False):
|
|
|
driver.execute_script("arguments[0].click();", button)
|
|
|
- time.sleep(wait_time)
|
|
|
+ _wait_time = (wait_time or 1)
|
|
|
+ time.sleep(_wait_time)
|
|
|
if allow_check_page:
|
|
|
- check_page(driver)
|
|
|
+ check_page(driver, wait_time=_wait_time)
|
|
|
+
|
|
|
+
|
|
|
+def click_query(driver: Chrome, wait_time=None):
|
|
|
+ """查询按钮"""
|
|
|
+ button = driver.find_element_by_class_name("ssButton")
|
|
|
+ _wait_time = (wait_time or 1)
|
|
|
+ click(driver, button, wait_time=_wait_time)
|
|
|
|
|
|
|
|
|
def next_page(driver: Chrome):
|
|
@@ -239,27 +255,6 @@ def extract_content(html):
|
|
|
return results
|
|
|
|
|
|
|
|
|
-def click_query(driver: Chrome):
|
|
|
- """查询按钮"""
|
|
|
- button = driver.find_element_by_class_name("ssButton")
|
|
|
- click(driver, button)
|
|
|
- time.sleep(1)
|
|
|
-
|
|
|
-
|
|
|
-def select_qualify_category(driver: Chrome, records):
|
|
|
- span_elements = driver.find_elements(by=By.XPATH, value='//div[@class="labelInPut labelInPutRadio"]/span')
|
|
|
- for span_element in span_elements:
|
|
|
- qualify_category = span_element.text
|
|
|
- if qualify_category not in records:
|
|
|
- logger.info(f'>>资质类别:{qualify_category} <<')
|
|
|
- click(driver, span_element)
|
|
|
- click_query(driver)
|
|
|
- records.append(span_element.text)
|
|
|
- return False
|
|
|
- else:
|
|
|
- return True
|
|
|
-
|
|
|
-
|
|
|
def crawl_spider(driver: Chrome, handler):
|
|
|
"""采集爬虫"""
|
|
|
exception_count = 0
|
|
@@ -274,17 +269,17 @@ def crawl_spider(driver: Chrome, handler):
|
|
|
logger.info(f"[重复数据]{title} - 丢弃")
|
|
|
continue
|
|
|
button = td_element.find_element_by_class_name("link")
|
|
|
- click(driver, button, wait_time=random.randint(5, 10))
|
|
|
+ click(driver, button, wait_time=random.randint(3, 10))
|
|
|
for current_handler in driver.window_handles:
|
|
|
if current_handler == handler:
|
|
|
continue
|
|
|
'''切换到弹出页面'''
|
|
|
driver.switch_to.window(current_handler)
|
|
|
current_url = driver.current_url
|
|
|
- '''首次进入详情页,检查页面弹框和验证码面板'''
|
|
|
- check_page(driver)
|
|
|
+ '''首次进入详情页,检查提示弹框和验证码面板'''
|
|
|
+ check_page(driver, wait_time=random.randint(2, 6))
|
|
|
+ '''企业数据处理'''
|
|
|
company = {}
|
|
|
-
|
|
|
'''企业基础数据'''
|
|
|
element = html2element(driver.page_source)
|
|
|
nodes = element.xpath('//div[@class="detaile-header__info--table"]')
|
|
@@ -323,9 +318,11 @@ def crawl_spider(driver: Chrome, handler):
|
|
|
company_staff = driver.find_element_by_id("tab-companyStaff")
|
|
|
click(driver, company_staff, allow_check_page=True)
|
|
|
reg_buttons = driver.find_elements(by=By.XPATH, value='//div[contains(@id, "tab-")]/span')
|
|
|
- for btn in reg_buttons:
|
|
|
- logger.info(f'[{btn.text}]')
|
|
|
- click(driver, btn)
|
|
|
+ logger.info(f'>>> 人员注册类别 <<<')
|
|
|
+ for element in reg_buttons:
|
|
|
+ # TODO 页面需翻页的逻辑未添加
|
|
|
+ logger.info(f'[{element.text}]')
|
|
|
+ click(driver, element, wait_time=random.randint(1, 3))
|
|
|
|
|
|
registrar = []
|
|
|
element = html2element(driver.page_source)
|
|
@@ -407,7 +404,7 @@ def crawl_spider(driver: Chrome, handler):
|
|
|
|
|
|
def downloader(driver: Chrome, handler):
|
|
|
while True:
|
|
|
- logger.info(f">>> 第{current_page(driver.page_source)}页<<<")
|
|
|
+ logger.info(f">>> 第{current_page(driver.page_source)}页 <<<")
|
|
|
allow_crawl = crawl_spider(driver, handler)
|
|
|
'''是否继续采集'''
|
|
|
if not allow_crawl:
|
|
@@ -420,6 +417,44 @@ def downloader(driver: Chrome, handler):
|
|
|
return True
|
|
|
|
|
|
|
|
|
+def select_province(driver: Chrome, records):
|
|
|
+ """选择注册属地"""
|
|
|
+ '''点击省份下拉框'''
|
|
|
+ drop_down_button = driver.find_element_by_xpath('//div[@class="region-select"]/div[1]/div[1]/span[1]/span[1]/i[contains(@class,"el-select__caret el-input__icon el-icon-arrow-up")]')
|
|
|
+ click(driver, drop_down_button, wait_time=1)
|
|
|
+ '''选择省份'''
|
|
|
+ li_elements = driver.find_elements(by=By.XPATH, value='/html/body/div[@class="el-select-dropdown el-popper"][1]/div[1]/div[1]/ul/li')
|
|
|
+ for element in li_elements:
|
|
|
+ province = element.text
|
|
|
+ if province not in records:
|
|
|
+ logger.info(f'>> 企业注册属地省份:{province} <<')
|
|
|
+ click(driver, element, wait_time=1.5)
|
|
|
+ records.append(province)
|
|
|
+ return False
|
|
|
+ else:
|
|
|
+ return True
|
|
|
+
|
|
|
+
|
|
|
+def select_categories(driver: Chrome, records):
|
|
|
+ span_elements = driver.find_elements(by=By.XPATH, value='//div[@class="labelInPut labelInPutRadio"]/span')
|
|
|
+ for element in span_elements:
|
|
|
+ qualification = element.text
|
|
|
+ if qualification not in records:
|
|
|
+ logger.info(f'>> 企业资质类别:{qualification} <<')
|
|
|
+ records.setdefault(qualification, [])
|
|
|
+
|
|
|
+ provinces = records.get(qualification)
|
|
|
+ if provinces is not None:
|
|
|
+ if len(provinces) < 32:
|
|
|
+ click(driver, element, wait_time=1.5)
|
|
|
+ crawl_finished = select_province(driver, provinces)
|
|
|
+ if not crawl_finished:
|
|
|
+ click_query(driver, wait_time=2)
|
|
|
+ return False
|
|
|
+ else:
|
|
|
+ return True
|
|
|
+
|
|
|
+
|
|
|
def start(enable_remote_driver=False):
|
|
|
options = webdriver.ChromeOptions()
|
|
|
if enable_remote_driver:
|
|
@@ -427,7 +462,7 @@ def start(enable_remote_driver=False):
|
|
|
options.add_argument("--disable-gpu")
|
|
|
chrome_driver = webdriver.Chrome(options=options)
|
|
|
main_handler = chrome_driver.current_window_handle # 获取句柄
|
|
|
- '''清除多窗口'''
|
|
|
+ '''清除其余窗口'''
|
|
|
for handler in chrome_driver.window_handles:
|
|
|
if handler != main_handler:
|
|
|
chrome_driver.switch_to.window(handler)
|
|
@@ -437,10 +472,12 @@ def start(enable_remote_driver=False):
|
|
|
chrome_driver.get(CRAWL_SITE)
|
|
|
time.sleep(3)
|
|
|
'''采集记录'''
|
|
|
- records = ['全部', '造价咨询企业']
|
|
|
+ records = {
|
|
|
+ '全部': None,
|
|
|
+ '造价咨询企业': None,
|
|
|
+ }
|
|
|
while True:
|
|
|
- '''选择资质类别'''
|
|
|
- crawl_finished = select_qualify_category(chrome_driver, records)
|
|
|
+ crawl_finished = select_categories(chrome_driver, records)
|
|
|
if crawl_finished:
|
|
|
logger.info('任务结束')
|
|
|
break
|
|
@@ -454,11 +491,11 @@ def start(enable_remote_driver=False):
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
- while True:
|
|
|
- try:
|
|
|
- start(enable_remote_driver=True)
|
|
|
- except:
|
|
|
- logger.info("等待100秒")
|
|
|
- time.sleep(100)
|
|
|
-
|
|
|
- # start(enable_remote_driver=True)
|
|
|
+ # while True:
|
|
|
+ # try:
|
|
|
+ # start(enable_remote_driver=True)
|
|
|
+ # except:
|
|
|
+ # logger.info("等待100秒")
|
|
|
+ # time.sleep(100)
|
|
|
+
|
|
|
+ start(enable_remote_driver=True)
|