Procházet zdrojové kódy

对lxml解析代码进行边界错误捕获,出现索引超标异常时,抛出不再处理

萤火也是火 před 3 roky
rodič
revize
bb585f01cb
1 změnil soubory, kde provedl 73 přidání a 58 odebrání
  1. 73 58
      jzsc/spider.py

+ 73 - 58
jzsc/spider.py

@@ -147,7 +147,7 @@ def geetest_panel(driver: Chrome, save_img_to_local=False):
 
         '''识别验证码'''
         captcha_result = chaojiying.PostPic(bytes_array.getvalue(), 9004)
-        logger.info(captcha_result)
+        logger.info(f'[识别结果]{captcha_result}')
         pic_id = captcha_result['pic_id']
 
         '''解析识别结果'''
@@ -155,12 +155,11 @@ def geetest_panel(driver: Chrome, save_img_to_local=False):
         locations = [[int(number) for number in group.split(',')] for group in groups]
 
         '''点击验证图片'''
-        for location in locations:
-            # logger.info(location)
+        for index, location in enumerate(locations):
             ActionChains(driver).move_to_element_with_offset(
                 touclick_element,
                 location[0] + 10,
-                location[1] + 47
+                location[1] + 53,
             ).click().perform()
             time.sleep(1)
 
@@ -178,6 +177,7 @@ def geetest_panel(driver: Chrome, save_img_to_local=False):
 def check_page(driver: Chrome, **kwargs):
     """检查页面"""
     prompt_popup(driver)
+    time.sleep(1)
     geetest_panel(driver, save_img_to_local=kwargs.get('save_img_to_local'))
 
 
@@ -272,7 +272,7 @@ def crawl_spider(driver: Chrome, handler):
             logger.info(f"[重复数据]{title} - 丢弃")
             continue
         button = td_element.find_element_by_class_name("link")
-        click(driver, button, wait_time=2)
+        click(driver, button, wait_time=10)
         for current_handler in driver.window_handles:
             if current_handler == handler:
                 continue
@@ -306,67 +306,82 @@ def crawl_spider(driver: Chrome, handler):
                 # logger.info(item)
 
             '''企业资质'''
-            element = html2element(driver.page_source)
-            node = element.xpath('//div[@class="panel-container"]')[0]
-            company_quality_html = element2html(node)
-            company_quality = extract_content(company_quality_html)
-            company['company_quality'] = company_quality
-            company['company_quality_html'] = {'html': company_quality_html}
+            try:
+                element = html2element(driver.page_source)
+                node = element.xpath('//div[@class="panel-container"]')[0]
+                company_quality_html = element2html(node)
+                company_quality = extract_content(company_quality_html)
+                company['company_quality'] = company_quality
+                company['company_quality_html'] = {'html': company_quality_html}
+            except IndexError:
+                pass
 
             '''注册人员'''
-            company_staff = driver.find_element_by_id("tab-companyStaff")
-            click(driver, company_staff, allow_check_page=True)
-            registrar = []
-            reg_buttons = driver.find_elements(by=By.XPATH, value='//div[contains(@id, "tab-")]/span')
-            for btn in reg_buttons:
-                '''点击分类'''
-                driver.execute_script("arguments[0].click();", btn)
-                element = html2element(driver.page_source)
-                nodes = element.xpath('//div[@class="el-table__body-wrapper is-scrolling-none"]/table//tr')
-                for node in nodes:
-                    name = "".join(node.xpath('./td[2]//span/text()')).strip()
-                    id_no = "".join(node.xpath('./td[3]/div/text()')).strip()
-                    reg_type = "".join(node.xpath('./td[4]/div/text()')).strip()
-                    reg_no = "".join(node.xpath('./td[5]/div/text()')).strip()
-                    reg_major = "".join(node.xpath('./td[6]/div/text()')).strip()
-                    registrar.append({
-                        'name': name,  # 姓名
-                        'id_no': id_no,  # 身份证号
-                        'reg_type': reg_type,  # 注册类别
-                        'reg_no': reg_no,  # 注册号(执业印章号)
-                        'reg_major': reg_major,  # 注册专业
-                    })
-            company['company_staff'] = registrar
+            try:
+                company_staff = driver.find_element_by_id("tab-companyStaff")
+                click(driver, company_staff, allow_check_page=True)
+                registrar = []
+                reg_buttons = driver.find_elements(by=By.XPATH, value='//div[contains(@id, "tab-")]/span')
+                for btn in reg_buttons:
+                    '''点击分类'''
+                    driver.execute_script("arguments[0].click();", btn)
+                    element = html2element(driver.page_source)
+                    nodes = element.xpath('//div[@class="el-table__body-wrapper is-scrolling-none"]/table//tr')
+                    for node in nodes:
+                        name = "".join(node.xpath('./td[2]//span/text()')).strip()
+                        id_no = "".join(node.xpath('./td[3]/div/text()')).strip()
+                        reg_type = "".join(node.xpath('./td[4]/div/text()')).strip()
+                        reg_no = "".join(node.xpath('./td[5]/div/text()')).strip()
+                        reg_major = "".join(node.xpath('./td[6]/div/text()')).strip()
+                        registrar.append({
+                            'name': name,  # 姓名
+                            'id_no': id_no,  # 身份证号
+                            'reg_type': reg_type,  # 注册类别
+                            'reg_no': reg_no,  # 注册号(执业印章号)
+                            'reg_major': reg_major,  # 注册专业
+                        })
+                company['company_staff'] = registrar
+            except IndexError:
+                pass
 
             '''不良行为'''
-            bad_behavior = driver.find_element_by_id('tab-badBehavior')
-            click(driver, bad_behavior, allow_check_page=True)
-            element = html2element(driver.page_source)
-            node = element.xpath('//div[@aria-labelledby="tab-badBehavior"]/div')[0]
-            bad_behavior_html = element2html(node)
-            bad_behaviors = extract_content(bad_behavior_html)
-            company['bad_behavior'] = bad_behaviors
-            company['bad_behavior_html'] = {'html': bad_behavior_html}
+            try:
+                bad_behavior = driver.find_element_by_id('tab-badBehavior')
+                click(driver, bad_behavior, allow_check_page=True)
+                element = html2element(driver.page_source)
+                node = element.xpath('//div[@aria-labelledby="tab-badBehavior"]/div')[0]
+                bad_behavior_html = element2html(node)
+                bad_behaviors = extract_content(bad_behavior_html)
+                company['bad_behavior'] = bad_behaviors
+                company['bad_behavior_html'] = {'html': bad_behavior_html}
+            except IndexError:
+                pass
 
             '''黑名单记录'''
-            black_list = driver.find_element_by_id('tab-blackList')
-            click(driver, black_list, allow_check_page=True)
-            element = html2element(driver.page_source)
-            node = element.xpath('//div[@id="pane-blackList"]/div')[0]
-            black_list_html = element2html(node)
-            black_list_array = extract_content(black_list_html)
-            company['black_list'] = black_list_array
-            company['black_list_html'] = {'html': black_list_html}
+            try:
+                black_list = driver.find_element_by_id('tab-blackList')
+                click(driver, black_list, allow_check_page=True)
+                element = html2element(driver.page_source)
+                node = element.xpath('//div[@id="pane-blackList"]/div')[0]
+                black_list_html = element2html(node)
+                black_list_array = extract_content(black_list_html)
+                company['black_list'] = black_list_array
+                company['black_list_html'] = {'html': black_list_html}
+            except IndexError:
+                pass
 
             '''失信联合惩戒记录'''
-            punish = driver.find_element_by_id('tab-punishLog')
-            click(driver, punish, allow_check_page=True)
-            element = html2element(driver.page_source)
-            node = element.xpath('//div[@id="pane-punishLog"]/div')[0]
-            punish_html = element2html(node)
-            punish_array = extract_content(punish_html)
-            company['punish'] = punish_array
-            company['punish_html'] = {'html': punish_html}
+            try:
+                punish = driver.find_element_by_id('tab-punishLog')
+                click(driver, punish, allow_check_page=True)
+                element = html2element(driver.page_source)
+                node = element.xpath('//div[@id="pane-punishLog"]/div')[0]
+                punish_html = element2html(node)
+                punish_array = extract_content(punish_html)
+                company['punish'] = punish_array
+                company['punish_html'] = {'html': punish_html}
+            except IndexError:
+                pass
 
             '''保存企业数据'''
             if len(company['credit_no']) > 0: