|
@@ -99,6 +99,7 @@ def prompt_popup(driver: Chrome):
|
|
|
while True:
|
|
|
if not display_prompt_popup(driver.page_source):
|
|
|
break
|
|
|
+ logger.info("处理提示框")
|
|
|
driver.find_element_by_xpath('//div[@class="el-dialog captchaDilaog"]/div[3]/div/button[1]').click()
|
|
|
time.sleep(2)
|
|
|
|
|
@@ -108,7 +109,7 @@ def geetest_panel(driver: Chrome, save_img_to_local=False):
|
|
|
while True:
|
|
|
if not display_geetest_panel(driver.page_source):
|
|
|
break
|
|
|
-
|
|
|
+ logger.info("处理验证码")
|
|
|
if pic_id is not None:
|
|
|
'''打码平台失败'''
|
|
|
captcha_result = chaojiying.ReportError(pic_id)
|
|
@@ -283,6 +284,7 @@ def crawl_spider(driver: Chrome, handler):
|
|
|
address = "".join(node.xpath('./div[2]/div[2]/div[2]/text()')).strip()
|
|
|
business_address = "".join(node.xpath('./div[3]/div[1]/div[2]/text()')).strip()
|
|
|
company = {
|
|
|
+ 'company_name': title, # 企业名称
|
|
|
'credit_no': credit_no, # 统一社会信用代码
|
|
|
'legal_person': legal_person, # 企业法定代表人
|
|
|
'company_type': company_type, # 企业登记注册类型
|
|
@@ -330,7 +332,7 @@ def crawl_spider(driver: Chrome, handler):
|
|
|
element = html2element(driver.page_source)
|
|
|
node = element.xpath('//div[@aria-labelledby="tab-badBehavior"]/div')[0]
|
|
|
bad_behavior_html = element2html(node)
|
|
|
- bad_behaviors = extract_content(company_quality_html)
|
|
|
+ bad_behaviors = extract_content(bad_behavior_html)
|
|
|
company['bad_behavior'] = bad_behaviors
|
|
|
company['bad_behavior_html'] = {'html': bad_behavior_html}
|
|
|
|
|
@@ -340,7 +342,7 @@ def crawl_spider(driver: Chrome, handler):
|
|
|
element = html2element(driver.page_source)
|
|
|
node = element.xpath('//div[@id="pane-blackList"]/div')[0]
|
|
|
black_list_html = element2html(node)
|
|
|
- black_list_array = extract_content(company_quality_html)
|
|
|
+ black_list_array = extract_content(black_list_html)
|
|
|
company['black_list'] = black_list_array
|
|
|
company['black_list_html'] = {'html': black_list_html}
|
|
|
|
|
@@ -350,7 +352,7 @@ def crawl_spider(driver: Chrome, handler):
|
|
|
element = html2element(driver.page_source)
|
|
|
node = element.xpath('//div[@id="pane-punishLog"]/div')[0]
|
|
|
punish_html = element2html(node)
|
|
|
- punish_array = extract_content(company_quality_html)
|
|
|
+ punish_array = extract_content(punish_html)
|
|
|
company['punish'] = punish_array
|
|
|
company['punish_html'] = {'html': punish_html}
|
|
|
|
|
@@ -396,7 +398,7 @@ def start(enable_remote_driver=False):
|
|
|
time.sleep(3)
|
|
|
'''采集记录'''
|
|
|
# records = ['全部', '勘察企业', '监理企业', '设计与施工一体化企业', '建筑业企业']
|
|
|
- records = ['全部']
|
|
|
+ records = ['全部', '造价咨询企业']
|
|
|
while True:
|
|
|
'''选择资质类别'''
|
|
|
crawl_finished = select_qualify_category(chrome_driver, records)
|