萤火也是火 committed 3 years ago
commit dbd5d3d8b5
1 changed file with 41 additions and 23 deletions

jzsc/spider.py  +41 -23

@@ -1,4 +1,5 @@
 import io
+import random
 import time
 
 import pandas as pd
@@ -177,8 +178,9 @@ def geetest_panel(driver: Chrome, save_img_to_local=False):
 def check_page(driver: Chrome, **kwargs):
     """检查页面"""
     prompt_popup(driver)
-    time.sleep(1)
+    time.sleep(3)
     geetest_panel(driver, save_img_to_local=kwargs.get('save_img_to_local'))
+    time.sleep(3)
 
 
 def click(driver: Chrome, button, allow_check_page=False, wait_time=1):
@@ -272,7 +274,7 @@ def crawl_spider(driver: Chrome, handler):
             logger.info(f"[重复数据]{title} - 丢弃")
             continue
         button = td_element.find_element_by_class_name("link")
-        click(driver, button, wait_time=10)
+        click(driver, button, wait_time=random.randint(5, 10))
         for current_handler in driver.window_handles:
             if current_handler == handler:
                 continue
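
The change from a fixed `wait_time=10` to `wait_time=random.randint(5, 10)` jitters the delay between detail-page clicks so requests are not issued on a fixed cadence. A minimal standalone sketch of the same idea (the helper name `human_pause` is hypothetical, not part of this commit):

import random
import time


def human_pause(low: int = 5, high: int = 10) -> None:
    # Sleep for a random whole number of seconds in [low, high], so that
    # successive page interactions do not happen on a bot-like fixed interval.
    time.sleep(random.randint(low, high))
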
@@ -320,26 +322,27 @@ def crawl_spider(driver: Chrome, handler):
             try:
                 company_staff = driver.find_element_by_id("tab-companyStaff")
                 click(driver, company_staff, allow_check_page=True)
-                registrar = []
                 reg_buttons = driver.find_elements(by=By.XPATH, value='//div[contains(@id, "tab-")]/span')
                 for btn in reg_buttons:
-                    '''Click the category'''
-                    driver.execute_script("arguments[0].click();", btn)
-                    element = html2element(driver.page_source)
-                    nodes = element.xpath('//div[@class="el-table__body-wrapper is-scrolling-none"]/table//tr')
-                    for node in nodes:
-                        name = "".join(node.xpath('./td[2]//span/text()')).strip()
-                        id_no = "".join(node.xpath('./td[3]/div/text()')).strip()
-                        reg_type = "".join(node.xpath('./td[4]/div/text()')).strip()
-                        reg_no = "".join(node.xpath('./td[5]/div/text()')).strip()
-                        reg_major = "".join(node.xpath('./td[6]/div/text()')).strip()
-                        registrar.append({
-                            'name': name,  # name
-                            'id_no': id_no,  # ID card number
-                            'reg_type': reg_type,  # registration category
-                            'reg_no': reg_no,  # registration no. (practice seal no.)
-                            'reg_major': reg_major,  # registered specialty
-                        })
+                    logger.info(f'[{btn.text}]')
+                    click(driver, btn)
+
+                registrar = []
+                element = html2element(driver.page_source)
+                nodes = element.xpath('//div[@id="pane-companyStaff"]//table[@class="el-table__body"]//tr')
+                for node in nodes:
+                    name = "".join(node.xpath('./td[2]//span/text()')).strip()
+                    id_no = "".join(node.xpath('./td[3]/div/text()')).strip()
+                    reg_type = "".join(node.xpath('./td[4]/div/text()')).strip()
+                    reg_no = "".join(node.xpath('./td[5]/div/text()')).strip()
+                    reg_major = "".join(node.xpath('./td[6]/div/text()')).strip()
+                    registrar.append({
+                        'name': name,  # name
+                        'id_no': id_no,  # ID card number
+                        'reg_type': reg_type,  # registration category
+                        'reg_no': reg_no,  # registration no. (practice seal no.)
+                        'reg_major': reg_major,  # registered specialty
+                    })
                 company['company_staff'] = registrar
             except IndexError:
                 pass
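
The rewritten staff block now clicks every registration-category tab first and then parses the whole `pane-companyStaff` table from the final page source in one pass. A hedged standalone sketch of that extraction, assuming `html2element` behaves like lxml's `html.fromstring` (the function name `parse_company_staff` is mine, useful for testing the new XPath against a saved `page_source`):

from lxml import html


def parse_company_staff(page_source: str):
    # Parse the staff table under the "pane-companyStaff" tab and return one
    # dict per row, mirroring the fields collected in the commit above.
    element = html.fromstring(page_source)
    nodes = element.xpath('//div[@id="pane-companyStaff"]//table[@class="el-table__body"]//tr')
    registrar = []
    for node in nodes:
        registrar.append({
            'name': "".join(node.xpath('./td[2]//span/text()')).strip(),
            'id_no': "".join(node.xpath('./td[3]/div/text()')).strip(),
            'reg_type': "".join(node.xpath('./td[4]/div/text()')).strip(),
            'reg_no': "".join(node.xpath('./td[5]/div/text()')).strip(),
            'reg_major': "".join(node.xpath('./td[6]/div/text()')).strip(),
        })
    return registrar
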
@@ -396,6 +399,8 @@ def crawl_spider(driver: Chrome, handler):
             driver.close()
             '''Return to the list page'''
             driver.switch_to.window(handler)
+            '''Pause before processing the next record'''
+            time.sleep(2)
     else:
         return True
 
@@ -421,11 +426,17 @@ def start(enable_remote_driver=False):
         options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
     options.add_argument("--disable-gpu")
     chrome_driver = webdriver.Chrome(options=options)
-    main_handler = chrome_driver.current_window_handle  # get the active window handle
+    main_handler = chrome_driver.current_window_handle  # get the window handle
+    '''Close any extra windows'''
+    for handler in chrome_driver.window_handles:
+        if handler != main_handler:
+            chrome_driver.switch_to.window(handler)
+            chrome_driver.close()
+            chrome_driver.switch_to.window(main_handler)
+
     chrome_driver.get(CRAWL_SITE)
     time.sleep(3)
     '''Categories to collect'''
-    # records = ['全部', '勘察企业', '监理企业', '设计与施工一体化企业', '建筑业企业']
     records = ['全部', '造价咨询企业']
     while True:
         '''Select the qualification category'''
@@ -443,4 +454,11 @@ def start(enable_remote_driver=False):
 
 
 if __name__ == '__main__':
-    start(enable_remote_driver=True)
+    while True:
+        try:
+            start(enable_remote_driver=True)
+        except:
+            logger.info("Waiting 100 seconds")
+            time.sleep(100)
+
+    # start(enable_remote_driver=True)
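
The new entry point restarts the crawl after any failure. A hedged variation of that loop (the `except Exception`, the `logger.exception` call, and the jittered back-off are illustrative additions, not part of this commit) that records the traceback before sleeping:

if __name__ == '__main__':
    while True:
        try:
            start(enable_remote_driver=True)
        except Exception as exc:
            # Log the full traceback so the cause of the restart is visible.
            logger.exception(exc)
            # Back off for roughly 100 seconds, with a little jitter, then retry.
            time.sleep(100 + random.randint(0, 30))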