萤火也是火 3 سال پیش
والد
کامیت
991683d955
1 فایلهای تغییر یافته به همراه 7 افزوده شده و 5 حذف شده
  1. 7 5
      jzsc/spider.py

+ 7 - 5
jzsc/spider.py

@@ -99,6 +99,7 @@ def prompt_popup(driver: Chrome):
     while True:
         if not display_prompt_popup(driver.page_source):
             break
+        logger.info("处理提示框")
         driver.find_element_by_xpath('//div[@class="el-dialog captchaDilaog"]/div[3]/div/button[1]').click()
         time.sleep(2)
 
@@ -108,7 +109,7 @@ def geetest_panel(driver: Chrome, save_img_to_local=False):
     while True:
         if not display_geetest_panel(driver.page_source):
             break
-
+        logger.info("处理验证码")
         if pic_id is not None:
             '''打码平台失败'''
             captcha_result = chaojiying.ReportError(pic_id)
@@ -283,6 +284,7 @@ def crawl_spider(driver: Chrome, handler):
                 address = "".join(node.xpath('./div[2]/div[2]/div[2]/text()')).strip()
                 business_address = "".join(node.xpath('./div[3]/div[1]/div[2]/text()')).strip()
                 company = {
+                    'company_name': title,  # 企业名称
                     'credit_no': credit_no,  # 统一社会信用代码
                     'legal_person': legal_person,  # 企业法定代表人
                     'company_type': company_type,  # 企业登记注册类型
@@ -330,7 +332,7 @@ def crawl_spider(driver: Chrome, handler):
             element = html2element(driver.page_source)
             node = element.xpath('//div[@aria-labelledby="tab-badBehavior"]/div')[0]
             bad_behavior_html = element2html(node)
-            bad_behaviors = extract_content(company_quality_html)
+            bad_behaviors = extract_content(bad_behavior_html)
             company['bad_behavior'] = bad_behaviors
             company['bad_behavior_html'] = {'html': bad_behavior_html}
 
@@ -340,7 +342,7 @@ def crawl_spider(driver: Chrome, handler):
             element = html2element(driver.page_source)
             node = element.xpath('//div[@id="pane-blackList"]/div')[0]
             black_list_html = element2html(node)
-            black_list_array = extract_content(company_quality_html)
+            black_list_array = extract_content(black_list_html)
             company['black_list'] = black_list_array
             company['black_list_html'] = {'html': black_list_html}
 
@@ -350,7 +352,7 @@ def crawl_spider(driver: Chrome, handler):
             element = html2element(driver.page_source)
             node = element.xpath('//div[@id="pane-punishLog"]/div')[0]
             punish_html = element2html(node)
-            punish_array = extract_content(company_quality_html)
+            punish_array = extract_content(punish_html)
             company['punish'] = punish_array
             company['punish_html'] = {'html': punish_html}
 
@@ -396,7 +398,7 @@ def start(enable_remote_driver=False):
     time.sleep(3)
     '''采集记录'''
     # records = ['全部', '勘察企业', '监理企业', '设计与施工一体化企业', '建筑业企业']
-    records = ['全部']
+    records = ['全部', '造价咨询企业']
     while True:
         '''选择资质类别'''
         crawl_finished = select_qualify_category(chrome_driver, records)