
Add selection of the enterprise registration region (province); add an adjustable wait interval between each workflow step
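
The wait-interval half of this change threads an optional wait_time through prompt_popup, geetest_panel, check_page and click, with each helper falling back to a one-second pause. A minimal sketch of that fallback pattern, separate from the repository code (the pause helper name is illustrative only):

    import time

    def pause(wait_time=None):
        # Default to a 1-second pause between crawl steps.
        # Note that `wait_time or 1` also turns an explicit 0 into 1;
        # use `1 if wait_time is None else wait_time` if a zero-length
        # pause should be allowed.
        time.sleep(wait_time or 1)

    pause()       # sleeps 1 second
    pause(2.5)    # sleeps 2.5 seconds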

萤火也是火 3 years ago
parent
commit cbc434c136
1 changed file with 91 additions and 54 deletions:
  1. jzsc/spider.py (+91, -54)

jzsc/spider.py  +91 -54

@@ -101,21 +101,24 @@ def display_geetest_panel(html):
         return False
 
 
-def prompt_popup(driver: Chrome):
+def prompt_popup(driver: Chrome, wait_time=None):
     while True:
         if not display_prompt_popup(driver.page_source):
             break
-        logger.info("处理提示框")
+        logger.info(">>> 点击提示弹框")
         driver.find_element_by_xpath('//div[@class="el-dialog captchaDilaog"]/div[3]/div/button[1]').click()
-        time.sleep(2)
+        time.sleep(1)
+    '''Wait interval between workflow steps'''
+    _wait_time = (wait_time or 1)
+    time.sleep(_wait_time)
 
 
-def geetest_panel(driver: Chrome, save_img_to_local=False):
+def geetest_panel(driver: Chrome, save_img_to_local=False, wait_time=None):
     pic_id = None
     while True:
         if not display_geetest_panel(driver.page_source):
             break
-        logger.info("处理验证码")
+        logger.info(">>> 验证码检测")
         if pic_id is not None:
             '''Previous captcha answer failed; report it to the captcha platform'''
             captcha_result = chaojiying.ReportError(pic_id)
@@ -173,21 +176,34 @@ def geetest_panel(driver: Chrome, save_img_to_local=False):
         commit_element = wait.until(EC.presence_of_element_located(locator))
         ActionChains(driver).click(commit_element).perform()
         time.sleep(5)
+    _wait_time = (wait_time or 1)
+    time.sleep(_wait_time)
 
 
-def check_page(driver: Chrome, **kwargs):
+def check_page(driver: Chrome, wait_time=None, **kwargs):
     """检查页面"""
-    prompt_popup(driver)
-    time.sleep(3)
-    geetest_panel(driver, save_img_to_local=kwargs.get('save_img_to_local'))
-    time.sleep(3)
+    _wait_time = (wait_time or 1)
+    prompt_popup(driver, wait_time=_wait_time)
+    geetest_panel(
+        driver,
+        wait_time=_wait_time,
+        save_img_to_local=kwargs.get('save_img_to_local'),
+    )
 
 
-def click(driver: Chrome, button, allow_check_page=False, wait_time=1):
+def click(driver: Chrome, button, wait_time=None, allow_check_page=False):
     driver.execute_script("arguments[0].click();", button)
-    time.sleep(wait_time)
+    _wait_time = (wait_time or 1)
+    time.sleep(_wait_time)
     if allow_check_page:
-        check_page(driver)
+        check_page(driver, wait_time=_wait_time)
+
+
+def click_query(driver: Chrome, wait_time=None):
+    """查询按钮"""
+    button = driver.find_element_by_class_name("ssButton")
+    _wait_time = (wait_time or 1)
+    click(driver, button, wait_time=_wait_time)
 
 
 def next_page(driver: Chrome):
@@ -239,27 +255,6 @@ def extract_content(html):
     return results
 
 
-def click_query(driver: Chrome):
-    """查询按钮"""
-    button = driver.find_element_by_class_name("ssButton")
-    click(driver, button)
-    time.sleep(1)
-
-
-def select_qualify_category(driver: Chrome, records):
-    span_elements = driver.find_elements(by=By.XPATH, value='//div[@class="labelInPut labelInPutRadio"]/span')
-    for span_element in span_elements:
-        qualify_category = span_element.text
-        if qualify_category not in records:
-            logger.info(f'>> Qualification category: {qualify_category} <<')
-            click(driver, span_element)
-            click_query(driver)
-            records.append(span_element.text)
-            return False
-    else:
-        return True
-
-
 def crawl_spider(driver: Chrome, handler):
     """采集爬虫"""
     exception_count = 0
@@ -274,17 +269,17 @@ def crawl_spider(driver: Chrome, handler):
             logger.info(f"[重复数据]{title} - 丢弃")
             continue
         button = td_element.find_element_by_class_name("link")
-        click(driver, button, wait_time=random.randint(5, 10))
+        click(driver, button, wait_time=random.randint(3, 10))
         for current_handler in driver.window_handles:
             if current_handler == handler:
                 continue
             '''Switch to the popup window'''
             driver.switch_to.window(current_handler)
             current_url = driver.current_url
-            '''On first entering the detail page, check for page dialogs and the captcha panel'''
-            check_page(driver)
+            '''On first entering the detail page, check for the prompt dialog and the captcha panel'''
+            check_page(driver, wait_time=random.randint(2, 6))
+            '''Process the company data'''
             company = {}
-
             '''Basic company data'''
             element = html2element(driver.page_source)
             nodes = element.xpath('//div[@class="detaile-header__info--table"]')
@@ -323,9 +318,11 @@ def crawl_spider(driver: Chrome, handler):
                 company_staff = driver.find_element_by_id("tab-companyStaff")
                 click(driver, company_staff, allow_check_page=True)
                 reg_buttons = driver.find_elements(by=By.XPATH, value='//div[contains(@id, "tab-")]/span')
-                for btn in reg_buttons:
-                    logger.info(f'[{btn.text}]')
-                    click(driver, btn)
+                logger.info('>>> Personnel registration categories <<<')
+                for element in reg_buttons:
+                    # TODO: pagination handling for this panel is not implemented yet
+                    logger.info(f'[{element.text}]')
+                    click(driver, element, wait_time=random.randint(1, 3))
 
                 registrar = []
                 element = html2element(driver.page_source)
@@ -407,7 +404,7 @@ def crawl_spider(driver: Chrome, handler):
 
 def downloader(driver: Chrome, handler):
     while True:
-        logger.info(f">>> 第{current_page(driver.page_source)}页<<<")
+        logger.info(f">>> 第{current_page(driver.page_source)}页 <<<")
         allow_crawl = crawl_spider(driver, handler)
         '''Whether to continue crawling'''
         if not allow_crawl:
@@ -420,6 +417,44 @@ def downloader(driver: Chrome, handler):
     return True
 
 
+def select_province(driver: Chrome, records):
+    """选择注册属地"""
+    '''点击省份下拉框'''
+    drop_down_button = driver.find_element_by_xpath('//div[@class="region-select"]/div[1]/div[1]/span[1]/span[1]/i[contains(@class,"el-select__caret el-input__icon el-icon-arrow-up")]')
+    click(driver, drop_down_button, wait_time=1)
+    '''Select a province'''
+    li_elements = driver.find_elements(by=By.XPATH, value='/html/body/div[@class="el-select-dropdown el-popper"][1]/div[1]/div[1]/ul/li')
+    for element in li_elements:
+        province = element.text
+        if province not in records:
+            logger.info(f'>> Enterprise registration province: {province} <<')
+            click(driver, element, wait_time=1.5)
+            records.append(province)
+            return False
+    else:
+        return True
+
+
+def select_categories(driver: Chrome, records):
+    span_elements = driver.find_elements(by=By.XPATH, value='//div[@class="labelInPut labelInPutRadio"]/span')
+    for element in span_elements:
+        qualification = element.text
+        if qualification not in records:
+            logger.info(f'>> Enterprise qualification category: {qualification} <<')
+            records.setdefault(qualification, [])
+
+        provinces = records.get(qualification)
+        if provinces is not None:
+            if len(provinces) < 32:
+                click(driver, element, wait_time=1.5)
+                crawl_finished = select_province(driver, provinces)
+                if not crawl_finished:
+                    click_query(driver, wait_time=2)
+                    return False
+    else:
+        return True
+
+
 def start(enable_remote_driver=False):
     options = webdriver.ChromeOptions()
     if enable_remote_driver:
@@ -427,7 +462,7 @@ def start(enable_remote_driver=False):
     options.add_argument("--disable-gpu")
     chrome_driver = webdriver.Chrome(options=options)
     main_handler = chrome_driver.current_window_handle  # get the main window handle
-    '''Close windows'''
+    '''Close the remaining windows'''
     for handler in chrome_driver.window_handles:
         if handler != main_handler:
             chrome_driver.switch_to.window(handler)
@@ -437,10 +472,12 @@ def start(enable_remote_driver=False):
     chrome_driver.get(CRAWL_SITE)
     time.sleep(3)
     '''Crawl records'''
-    records = ['全部', '造价咨询企业']
+    records = {
+        '全部': None,
+        '造价咨询企业': None,
+    }
     while True:
-        '''Select a qualification category'''
-        crawl_finished = select_qualify_category(chrome_driver, records)
+        crawl_finished = select_categories(chrome_driver, records)
         if crawl_finished:
             logger.info('Task finished')
             break
@@ -454,11 +491,11 @@ def start(enable_remote_driver=False):
 
 
 if __name__ == '__main__':
-    while True:
-        try:
-            start(enable_remote_driver=True)
-        except:
-            logger.info("等待100秒")
-            time.sleep(100)
-
-    # start(enable_remote_driver=True)
+    # while True:
+    #     try:
+    #         start(enable_remote_driver=True)
+    #     except:
+    #         logger.info("等待100秒")
+    #         time.sleep(100)
+
+    start(enable_remote_driver=True)
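
Note on the new bookkeeping: records now maps each qualification category to the list of provinces already crawled (previously it was a flat list of category names), a category mapped to None is skipped, and a category counts as finished once its province list reaches the limit of 32 entries hard-coded in select_categories. A Selenium-free sketch of that bookkeeping, using hypothetical helper names and sample data:

    PROVINCE_LIMIT = 32  # mirrors the hard-coded limit in select_categories

    def next_target(records, categories, provinces):
        """Return the next (category, province) pair to crawl, or None when done."""
        for category in categories:
            if category not in records:
                records[category] = []
            crawled = records[category]
            if crawled is None or len(crawled) >= PROVINCE_LIMIT:
                continue  # category is skipped, or every province is already covered
            for province in provinces:
                if province not in crawled:
                    crawled.append(province)
                    return category, province
        return None

    records = {'全部': None, '造价咨询企业': None}
    print(next_target(records, ['全部', '工程勘察'], ['北京市', '天津市']))
    # prints ('工程勘察', '北京市')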