|
@@ -1,362 +0,0 @@
|
|
|
-import time
|
|
|
-
|
|
|
-from selenium.common.exceptions import (
|
|
|
- WebDriverException,
|
|
|
- TimeoutException,
|
|
|
- InvalidSessionIdException,
|
|
|
- NoSuchElementException
|
|
|
-)
|
|
|
-from selenium.webdriver import ActionChains
|
|
|
-from selenium.webdriver.common.by import By
|
|
|
-
|
|
|
-from common.clean_html import cleaner
|
|
|
-from common.databases import int2long
|
|
|
-from common.log import logger
|
|
|
-from common.tools import html2element, element2html, verify_text, remove_node
|
|
|
-from common.webdriver import until_wait
|
|
|
-from crawler.params import (
|
|
|
- CRAWL_RECORDS,
|
|
|
- SETUP_TIME,
|
|
|
- SETUP_MAPS,
|
|
|
- CATEGORY_MAPS,
|
|
|
- CRAWL_MENU
|
|
|
-)
|
|
|
-
|
|
|
-
|
|
|
def get_crawl_menu(category: str):
    """Return the crawl menu registered for ``category``.

    The lookup goes through ``CRAWL_MENU.get``, so a category without an
    entry yields ``None`` rather than raising.
    """
    menu = CRAWL_MENU.get(category)
    return menu
|
|
|
-
|
|
|
-
|
|
|
def get_category_id(category: str):
    """Return the id mapped to ``category`` in ``CATEGORY_MAPS``.

    This is a plain subscript lookup: a category missing from the mapping
    propagates the mapping's lookup error to the caller.
    """
    category_id = CATEGORY_MAPS[category]
    return category_id
|
|
|
-
|
|
|
-
|
|
|
def extract_text(html: str, feature: str):
    """Evaluate an XPath expression against an HTML string.

    Args:
        html: Markup handed to ``html2element`` for parsing.
        feature: XPath expression evaluated on the parsed tree.

    Returns:
        Whatever the tree's ``xpath`` call returns for ``feature``.
    """
    return html2element(html).xpath(feature)
|
|
|
-
|
|
|
-
|
|
|
def extract_page_html(html: str, feature: str):
    """Serialize the first node matching ``feature`` back to HTML.

    Args:
        html: Page source to parse with ``html2element``.
        feature: XPath expression selecting the node to extract.

    Returns:
        The result of ``element2html`` for the first match, or ``None`` when
        an ``IndexError`` is raised (for example, nothing matched).
    """
    tree = html2element(html)
    # Drop the empty-attachment block (first <div id="isshow">) before extracting.
    for stale in tree.xpath('//div[@id="isshow"]')[:1]:
        remove_node(stale)
    try:
        return element2html(tree.xpath(feature)[0])
    except IndexError:
        return None
|
|
|
-
|
|
|
-
|
|
|
def init_crawl_records(driver, web_element, category: str):
    """Open a category tab unless it is already recorded as finished.

    Args:
        driver: WebDriver passed through to ``goto``.
        web_element: The category element to click.
        category: Key used in ``CRAWL_RECORDS``.

    Returns:
        ``False`` when the category has a record whose ``'finished'`` flag is
        truthy (nothing is clicked); ``True`` otherwise, after clicking the
        element and, for a first visit, creating an empty record.
    """
    known = category in CRAWL_RECORDS
    if known and CRAWL_RECORDS[category]['finished']:
        return False

    goto(driver, web_element)
    if not known:
        # First visit: start with no crawled pages.
        CRAWL_RECORDS.setdefault(category, {'finished': False, 'pages': []})
    return True
|
|
|
-
|
|
|
-
|
|
|
def update_crawl_records(category: str, finished: bool):
    """Set the ``'finished'`` flag on an existing crawl record.

    Args:
        category: Key of the record in ``CRAWL_RECORDS``.
        finished: New value for the record's ``'finished'`` field.

    Categories without a record are left untouched; no record is created.
    """
    if category not in CRAWL_RECORDS:
        return
    # Mutate the record in place. The earlier ``CRAWL_RECORDS.update(_record)``
    # merged the record's own keys ('finished', 'pages') into the top-level
    # registry, adding bogus "categories" next to the real ones.
    CRAWL_RECORDS[category]['finished'] = finished
|
|
|
-
|
|
|
-
|
|
|
def write_crawl_records(category: str, page_num: int):
    """Append a crawled page number to the category's record.

    Args:
        category: Key of the record in ``CRAWL_RECORDS``.
        page_num: Page number just crawled; stored as a string.

    Nothing happens when the category has no record yet.
    """
    if category not in CRAWL_RECORDS:
        return

    record = CRAWL_RECORDS[category]
    # Pages listed here are treated as already crawled and are not revisited.
    pages = record['pages']
    pages.append(str(page_num))
    record['pages'] = pages
    CRAWL_RECORDS[category] = record
|
|
|
-
|
|
|
-
|
|
|
def robots_alert(driver):
    """Classify the anti-bot state of the current page.

    The page source is re-parsed up to 20 times, sleeping 0.5s between
    attempts while the verification widget still reports ``_Loading``.

    Returns:
        A ``(alert, alert_type)`` tuple:
        ``(False, '0')``  no challenge, or the challenge is already passed;
        ``(True, '1')``   verification widget loaded and waiting;
        ``(True, '2')``   widget reports a network error and asks for a refresh;
        ``(True, '3')``   page asks to click to verify or contact support;
        ``(True, '999')`` widget was still loading after every attempt.
    """
    # Outcome keyed by the first ``data-nc-lang`` value when several are present.
    verdicts = {
        '_yesTEXT': (False, '0'),
        '_startTEXT': (True, '1'),
        '_errorNetwork': (True, '2'),
    }
    for _ in range(20):
        tree = html2element(driver.page_source)
        states = tree.xpath('//span[@class="nc-lang-cnt"]/@data-nc-lang')
        click_text = tree.xpath('//div[@id="text"]/text()')

        if len(states) == 1 and "".join(states).strip() == '_Loading':
            # Widget still loading: wait and look again.
            time.sleep(0.5)
            continue
        if len(states) > 1 and states[0] in verdicts:
            return verdicts[states[0]]
        if len(click_text) > 0 and "".join(click_text) == '请点击此处完成验证或咨询客服':
            return True, '3'
        return False, '0'

    return True, '999'
|
|
|
-
|
|
|
-
|
|
|
def check_robots_alert(driver):
    """Detect anti-bot challenges and try to clear them until none remain.

    Each pass asks ``robots_alert`` for the page state and reacts to the
    reported alert type:

    * ``'1'``: drag the slider button to the right.
    * ``'2'``: click the refresh link inside the widget.
    * ``'3'``: cannot be handled automatically; log and raise.
    * anything else: dump the page source to ``robot.html``, log and raise.

    After a handled challenge the loop sleeps 2s and checks again.

    Raises:
        ValueError: For alert type ``'3'`` or any unrecognised alert type.
    """
    slider_xpath = '//span[contains(@class, "nc_iconfont btn_slide")]'
    refresh_xpath = '//span[contains(@class, "nc-lang-cnt")]/a[1]'

    while True:
        alert, alert_type = robots_alert(driver)
        if not alert:
            break

        if alert_type == '1':
            until_wait(driver, xpath=slider_xpath)
            # find_element(by=..., value=...) matches the rest of this module;
            # the find_element_by_* helpers no longer exist in Selenium 4.
            element = driver.find_element(by=By.XPATH, value=slider_xpath)
            if element.is_displayed():
                # Press and hold the slider, move it 258 to the right, release.
                ActionChains(driver).click_and_hold(element).move_by_offset(xoffset=258, yoffset=0).release().perform()

        elif alert_type == '2':
            until_wait(driver, xpath=refresh_xpath)
            element = driver.find_element(by=By.XPATH, value=refresh_xpath)
            if element.is_displayed():
                goto(driver, element, wait_time=2)

        elif alert_type == '3':
            logger.error("[机器人验证]触发浏览器指纹检测,无法自动处理.")
            raise ValueError('robot verification cannot be handled automatically')

        else:
            # Keep the offending page for inspection; write it as UTF-8 so
            # non-ASCII page text does not depend on the platform default encoding.
            with open('robot.html', 'w', encoding='utf-8') as wp:
                wp.write(driver.page_source)
            logger.error("[未知异常网页]页面源码保存在robot.html")
            raise ValueError(f'unknown robot alert type: {alert_type}')

        time.sleep(2)
|
|
|
-
|
|
|
-
|
|
|
def refresh_page(driver):
    """Reload the detail page when it shows the "no detail data" placeholder.

    When the placeholder text is present, the page is refreshed, the browser
    alert that follows is accepted, and the function then waits for the
    detail content and re-checks for anti-bot challenges. Otherwise it does
    nothing.
    """
    tree = html2element(driver.page_source)
    placeholder = tree.xpath('//div[@id="xxnrList"]/div[1]/div[2]/div[2]/div[1]/text()')
    if "".join(placeholder) != "暂无详细数据":
        return

    driver.refresh()
    time.sleep(1)
    # Confirm the alert dialog raised by the page.
    driver.switch_to.alert.accept()
    time.sleep(1.5)
    wait_load_detail(driver)
    check_robots_alert(driver)
|
|
|
-
|
|
|
-
|
|
|
def goto(driver, web_element, wait_time=None, allow_check_page=False):
    """Click an element through injected JavaScript, then pause.

    Args:
        driver: WebDriver used to run the click script.
        web_element: Element passed to the script as ``arguments[0]``.
        wait_time: Seconds to sleep after the click; any falsy value
            (``None``, ``0``) falls back to 1 second.
        allow_check_page: When true, run ``check_robots_alert`` after the pause.
    """
    driver.execute_script("arguments[0].click();", web_element)
    pause = wait_time if wait_time else 1
    time.sleep(pause)
    if not allow_check_page:
        return
    check_robots_alert(driver)
|
|
|
-
|
|
|
-
|
|
|
def next_page(driver, category):
    """Advance the paginator to the next page not yet recorded as crawled.

    The last link of the pager is clicked repeatedly while it reads
    '下一页' ("next page"). After each click the page number shown as
    current is compared with the pages stored for ``category`` in
    ``CRAWL_RECORDS``.

    Args:
        driver: WebDriver positioned on a list page with a pager.
        category: Key of the crawl record whose ``'pages'`` list is consulted.

    Returns:
        The current page number as an ``int`` once a page missing from the
        record is reached, or ``None`` when the pager has no "next page" link.
    """
    finished_pages = CRAWL_RECORDS[category]['pages']
    while True:
        # find_element(by=..., value=...) matches the rest of this module;
        # the find_element_by_* helpers no longer exist in Selenium 4.
        next_element = driver.find_element(
            by=By.XPATH,
            value='//div[@id="Pagination"]/div[1]/child::a[last()]'
        )
        if next_element.text != '下一页':
            # No further page to move to.
            return None

        goto(driver, next_element, wait_time=1.2)
        current_element = driver.find_element(
            by=By.XPATH,
            value='//div[@id="Pagination"]/div[1]/child::span[@class="current"]'
        )
        val = current_element.text
        if val not in finished_pages:
            return int(val)
        # Page already crawled: pause briefly, then keep paging forward.
        time.sleep(1)
|
|
|
-
|
|
|
-
|
|
|
def _select_category(driver, custom_category=None):
    """Pick a category tab and initialise its crawl record.

    Args:
        driver: WebDriver positioned on the page holding the ``myTab3`` list.
        custom_category: Tab text to look for; ``None`` takes the first tab.

    Returns:
        The tab text when ``init_crawl_records`` reports success, otherwise
        ``None`` (also when no tab matches).
    """
    tabs = driver.find_elements(by=By.XPATH, value='//ul[@id="myTab3"]/child::li')
    for tab in tabs:
        label = tab.text
        if custom_category is not None and label != custom_category:
            continue
        opened = init_crawl_records(driver, tab, label)
        if opened:
            return label
        return None
|
|
|
-
|
|
|
-
|
|
|
def select_category(driver, category: str):
    """Select a category tab, shutting the driver down on timeout.

    Returns:
        Whatever ``_select_category`` returns, or ``None`` after a
        ``TimeoutException`` (the driver is quit and the error is logged).
    """
    try:
        return _select_category(driver, category)
    except TimeoutException:
        driver.quit()
        logger.error('[访问超时]选择分类')
    return None
|
|
|
-
|
|
|
-
|
|
|
def _select_date(driver, category: str, setup_time: str):
    """Click the "setup time" filter link configured for a category.

    The link's element id is read from ``SETUP_TIME[category]`` using the
    key that ``SETUP_MAPS`` maps ``setup_time`` to.

    Raises:
        KeyError: When a ``KeyError`` occurs while resolving or clicking the
            link; it is re-raised with a message asking to configure
            ``SETUP_TIME`` for the category.
    """
    logger.info(f"[建立时间]{setup_time}")
    try:
        link_ids = SETUP_TIME[category]
        link_id = link_ids[SETUP_MAPS[setup_time]]
        link = driver.find_element(by=By.XPATH, value=f'//a[@id="{link_id}"]')
        goto(driver, link)
    except KeyError:
        raise KeyError(f'请设置"SETUP_TIME"中{category}对应的建立时间')
|
|
|
-
|
|
|
-
|
|
|
def select_date(driver, category, setup_time):
    """Apply the setup-time filter, shutting the driver down on timeout.

    Returns:
        ``True`` when ``_select_date`` completes; ``False`` after a
        ``TimeoutException`` (the driver is quit and the error is logged).
    """
    try:
        _select_date(driver, category, setup_time)
    except TimeoutException:
        driver.quit()
        logger.error('[访问超时]选择建立时间')
        return False
    else:
        return True
|
|
|
-
|
|
|
-
|
|
|
def wait_load_detail(driver, check_feature=None, check_timeout=None):
    """Poll the page until the detail content appears, then pause 1s.

    Args:
        driver: WebDriver whose ``page_source`` is re-parsed on every poll.
        check_feature: XPath that must match at least one node. When ``None``,
            the default check requires the ``xxnrList`` container to exist,
            to have descendants, to have children under its content block,
            and not to show the "no detail data" placeholder.
        check_timeout: Polling budget in seconds; any falsy value falls back
            to 10.

    The page is polled every 0.5s. Running out of budget is not an error:
    the function simply returns after the final 1s pause.
    """
    def feature_present(tree):
        return len(tree.xpath(check_feature)) > 0

    def detail_populated(tree):
        if len(tree.xpath('//div[@id="xxnrList"]')) == 0:
            return False
        if len(tree.xpath('//div[@id="xxnrList"]/descendant::*')) == 0:
            return False
        text = tree.xpath('//div[@id="xxnrList"]/div[1]/div[2]/div[2]/div[1]/text()')
        children = tree.xpath('//div[@id="xxnrList"]/div[1]/div[2]/child::*')
        return "".join(text) != '暂无详细数据' and len(children) > 0

    is_ready = detail_populated if check_feature is None else feature_present
    poll_interval = 0.5
    attempts = int((check_timeout or 10) / poll_interval)

    for _ in range(attempts):
        if is_ready(html2element(driver.page_source)):
            break
        time.sleep(poll_interval)
    time.sleep(1)
|
|
|
-
|
|
|
-
|
|
|
def wait_load_list(driver):
    """Block until the list table contains at least one row.

    The page source is re-parsed every 0.5s. There is no timeout: the call
    returns only once a ``<tr>`` shows up under the list container.
    """
    rows_xpath = '//div[@id="myTab_Contenta0"]/div[2]/table//tr'
    while len(html2element(driver.page_source).xpath(rows_xpath)) == 0:
        time.sleep(0.5)
|
|
|
-
|
|
|
-
|
|
|
def _handler_page_html(html, item):
    """Attach extracted page HTML to ``item`` when it passes verification.

    Args:
        html: Extracted HTML fragment, or ``None`` when extraction found nothing.
        item: Record dict; on the failure path it must provide the
            ``'channel'``, ``'title'`` and ``'publishtime'`` keys used in the
            log message.

    Returns:
        The same ``item``. On success it gains ``'contenthtml'`` (raw HTML),
        ``'detail'`` (output of ``cleaner``) and ``'comeintime'`` (current
        Unix time passed through ``int2long``). On failure it is unchanged
        and an error is logged.
    """
    # Short-circuit on None. The earlier ``all([...])`` built its list eagerly,
    # so ``verify_text`` was still called with ``None`` when extraction failed.
    if html is not None and verify_text(html):
        item['contenthtml'] = html
        item['detail'] = cleaner(html)
        item['comeintime'] = int2long(int(time.time()))
    else:
        logger.error(
            f'[文本异常-{item["channel"]}]{item["title"]} - {item["publishtime"]}')
    return item
|
|
|
-
|
|
|
-
|
|
|
def crawl_show_details(driver, handler, item):
    """Scrape the detail page opened in every window other than ``handler``.

    Args:
        driver: WebDriver with the list window and the opened detail window(s).
        handler: Window handle of the main (list) window.
        item: Record dict to enrich through ``_handler_page_html``.

    Returns:
        ``item`` after processing; unchanged when no other window is open.
    """
    detail_windows = (h for h in driver.window_handles if h != handler)
    for window in detail_windows:
        driver.switch_to.window(window)
        # Wait for the page to load and for the expected container to appear.
        wait_load_detail(driver, check_feature='//div[@id="xxnrList"]/div[1]/div[2]/div[2]')
        # Deal with any anti-bot challenge.
        check_robots_alert(driver)
        # Reload if the page came up with the "no data" placeholder.
        refresh_page(driver)
        # Wait for the detail content again.
        wait_load_detail(driver)
        # Pull out the detail markup and store it on the item.
        page_html = extract_page_html(driver.page_source, feature='//div[@id="xxnrList"]')
        item = _handler_page_html(page_html, item)
        # Close the detail window and go back to the main one.
        driver.close()
        driver.switch_to.window(handler)
    return item
|
|
|
-
|
|
|
-
|
|
|
def crawl_psp_frame(driver, handler, item):
    """Scrape a detail page whose content lives inside the ``mini-iframe-6`` frame.

    Args:
        driver: WebDriver with the list window and the opened detail window(s).
        handler: Window handle of the main (list) window.
        item: Record dict to enrich through ``_handler_page_html``; the
            error path reads its ``'channel'``, ``'title'`` and
            ``'competehref'`` keys for the log message.

    Returns:
        ``item`` after processing; unchanged when no other window is open.

    Raises:
        NoSuchElementException: When the frame cannot be found. The driver
            is quit and the failure is logged before raising.
    """
    # Local import keeps this fix self-contained: switching to a frame by
    # name/id reports a missing frame as NoSuchFrameException, which is not a
    # subclass of NoSuchElementException, so the old handler never ran.
    from selenium.common.exceptions import NoSuchFrameException

    for current_handler in driver.window_handles:
        if current_handler == handler:
            continue
        driver.switch_to.window(current_handler)
        wait_load_detail(
            driver,
            check_feature='//div[contains(@id, "mini-1")]',
            check_timeout=15
        )
        # Enter the frame that hosts the content.
        try:
            driver.switch_to.frame('mini-iframe-6')
        except (NoSuchElementException, NoSuchFrameException) as err:
            driver.quit()
            logger.error(f'[未检测到iframe-{item["channel"]}]{item["title"]} - {item["competehref"]}')
            # Keep raising the exception type this function has always declared.
            raise NoSuchElementException() from err
        # Wait for the frame's data to load.
        wait_load_detail(driver, check_feature='//div[contains(@role, "accordion")]')
        content_html = extract_page_html(driver.page_source, feature='//div[@class="fui-accordions"]')
        item = _handler_page_html(content_html, item)
        # Close the detail window and go back to the main one.
        driver.close()
        driver.switch_to.window(handler)
    return item
|
|
|
-
|
|
|
-
|
|
|
def crawl_request(driver, url):
    """Navigate the driver to ``url``.

    Returns:
        ``True`` when ``driver.get`` completes; ``False`` after a
        ``WebDriverException``, in which case the driver is quit.
    """
    try:
        driver.get(url)
    except WebDriverException:
        driver.quit()
        return False
    return True
|
|
|
-
|
|
|
-
|
|
|
def parser_list_elements(driver, category):
    """Collect the table-row elements of a category's list.

    The rows are looked up under the element whose id is
    ``CATEGORY_MAPS[category]``.

    Returns:
        The list returned by ``driver.find_elements``, or ``None`` after an
        ``InvalidSessionIdException`` (the driver is quit and the error is
        logged).
    """
    try:
        rows_xpath = f'//*[@id="{CATEGORY_MAPS[category]}"]//tr'
        return driver.find_elements(by=By.XPATH, value=rows_xpath)
    except InvalidSessionIdException:
        driver.quit()
        logger.error('[数据解析]获取列表失败')
        return None
|