@@ -0,0 +1,356 @@
+import time
+
+from selenium.common.exceptions import (
+    WebDriverException,
+    TimeoutException,
+    InvalidSessionIdException
+)
+from selenium.webdriver import ActionChains
+from selenium.webdriver.common.by import By
+
+from common.clean_html import cleaner
+from common.databases import int2long
+from common.log import logger
+from common.tools import html2element, element2html, verify_text, remove_node
+from common.webdriver import until_wait
+from crawler.params import (
+    CRAWL_RECORDS,
+    SETUP_TIME,
+    SETUP_MAPS,
+    CATEGORY_MAPS,
+    CRAWL_MENU
+)
+
+
+def get_crawl_menu(category: str):
+    """Return the crawl menu for a category"""
+    return CRAWL_MENU.get(category)
+
+
+def get_category_id(category: str):
+    """Return the category id"""
+    return CATEGORY_MAPS[category]
+
+
+def extract_text(html: str, feature: str):
+    """Extract text matched by an XPath feature"""
+    element = html2element(html)
+    return element.xpath(feature)
+
+
+def extract_page_html(html: str, feature: str):
+    """Extract the page source fragment matched by an XPath feature"""
+    element = html2element(html)
+    '''remove the empty attachment info node'''
+    remove_target = element.xpath('//div[@id="isshow"]')
+    if len(remove_target) > 0:
+        remove_node(remove_target[0])
+    try:
+        node = element.xpath(feature)[0]
+        return element2html(node)
+    except IndexError:
+        pass
+
+
+def init_crawl_records(driver, web_element, category: str):
+    """Initialize the crawl record"""
+    if category not in CRAWL_RECORDS:
+        goto(driver, web_element)
+        # init_config = {'finished': False, 'pages': ['1']}
+        init_config = {'finished': False, 'pages': []}
+        CRAWL_RECORDS.setdefault(category, init_config)
+        return True
+    else:
+        _record = CRAWL_RECORDS[category]
+        if not _record['finished']:
+            goto(driver, web_element)
+            return True
+        else:
+            return False
+
+
+def update_crawl_records(category: str, finished: bool):
+    """Update the crawl record"""
+    if category in CRAWL_RECORDS:
+        _record = CRAWL_RECORDS[category]
+        _record['finished'] = finished
+        CRAWL_RECORDS.update({category: _record})
+
+
+def write_crawl_records(category: str, page_num: int):
+    """Write the crawl record"""
+    if category in CRAWL_RECORDS:
+        _record = CRAWL_RECORDS[category]
+        '''record crawled page numbers; recorded pages are not visited again'''
+        finished_pages = _record['pages']
+        finished_pages.append(str(page_num))
+        _record.update({'pages': finished_pages})
+        CRAWL_RECORDS.update({category: _record})
+
+
+def robots_alert(driver):
+    """Detect the anti-robot alert"""
+    wait = 0
+    while wait < 20:
+        '''wait for the verification widget to load'''
+        element = html2element(driver.page_source)
+        robot_alert = element.xpath('//span[@class="nc-lang-cnt"]/@data-nc-lang')
+        click_alert = element.xpath('//div[@id="text"]/text()')
+        if len(robot_alert) == 1 and "".join(robot_alert).strip() == '_Loading':
+            time.sleep(0.5)
+        elif len(robot_alert) > 1 and robot_alert[0] in ['_yesTEXT']:
+            '''robot verification passed'''
+            return False, '0'
+        elif len(robot_alert) > 1 and robot_alert[0] in ['_startTEXT']:
+            '''robot verification widget finished loading'''
+            return True, '1'
+        elif len(robot_alert) > 1 and robot_alert[0] in ['_errorNetwork']:
+            '''network error: "please click refresh or submit feedback (00)"'''
+            return True, '2'
+        elif len(click_alert) > 0 and "".join(click_alert) == '请点击此处完成验证或咨询客服':
+            return True, '3'
+        else:
+            return False, '0'
+
+        wait += 1
+    return True, '999'
+
+
+def check_robots_alert(driver):
+    """Check for and handle the robot alert"""
+    while True:
+        alert, alert_type = robots_alert(driver)
+        if not alert:
+            break
+
+        if alert_type == '1':
+            until_wait(driver, xpath='//span[contains(@class, "nc_iconfont btn_slide")]')
+            element = driver.find_element(by=By.XPATH, value='//span[contains(@class, "nc_iconfont btn_slide")]')
+            if element.is_displayed():
+                # click and hold the mouse, drag 258 px to the right, then release
+                ActionChains(driver).click_and_hold(element).move_by_offset(xoffset=258, yoffset=0).release().perform()
+
+        elif alert_type == '2':
+            until_wait(driver, xpath='//span[contains(@class, "nc-lang-cnt")]/a[1]')
+            element = driver.find_element(by=By.XPATH, value='//span[contains(@class, "nc-lang-cnt")]/a[1]')
+            if element.is_displayed():
+                goto(driver, element, wait_time=2)
+
+        elif alert_type == '3':
+            until_wait(driver, xpath='//div[@id="container"]')
+            element = driver.find_element(by=By.XPATH, value='//div[@id="container"]')
+            if element.is_displayed():
+                goto(driver, element, wait_time=3)
+                driver.switch_to.alert.accept()
+            logger.error("[Robot verification] Browser fingerprint check triggered; cannot be handled automatically, waiting to retry")
+            # raise ValueError()
+
+        else:
+            with open('robot.html', 'w') as wp:
+                wp.write(driver.page_source)
+            logger.error("[Unknown page error] page source saved to robot.html")
+            raise ValueError()
+        time.sleep(2)
+
+
+def refresh_page(driver):
+    """Refresh the page when it shows no detail data"""
+    element = html2element(driver.page_source)
+    node = element.xpath('//div[@id="xxnrList"]/div[1]/div[2]/div[2]/div[1]/text()')
+    if "".join(node) == "暂无详细数据":
+        driver.refresh()
+        time.sleep(1)
+        '''accept the page alert dialog'''
+        driver.switch_to.alert.accept()
+        time.sleep(1.5)
+        wait_load_detail(driver)
+        check_robots_alert(driver)
+
+
+def goto(driver, web_element, wait_time=None, allow_check_page=False):
+    """Click an element via a JavaScript click event"""
+    driver.execute_script("arguments[0].click();", web_element)
+    _wait_time = (wait_time or 1)
+    time.sleep(_wait_time)
+    if allow_check_page:
+        check_robots_alert(driver)
+
+
+def next_page(driver, category):
+    """Turn to the next page"""
+    _finished_pages = CRAWL_RECORDS[category]['pages']
+    # web_elements = driver.find_elements(by=By.XPATH, value='//div[@id="Pagination"]/div[1]/child::*')
+    # for element in web_elements[1:-1]:
+    #     val = element.text
+    #     if val not in _finished_pages:
+    #         goto(driver, element, wait_time=1.2)
+    #         return int(val)
+    # else:
+    while True:
+        next_element = driver.find_element(by=By.XPATH, value='//div[@id="Pagination"]/div[1]/child::a[last()]')
+        if next_element.text == '下一页':
+            goto(driver, next_element, wait_time=1.2)
+            current_element = driver.find_element(by=By.XPATH, value='//div[@id="Pagination"]/div[1]/child::span[@class="current"]')
+            val = current_element.text
+            if val not in _finished_pages:
+                return int(val)
+            else:
+                break
+        time.sleep(1)
+
+
+def _select_category(driver, custom_category=None):
+    """Pick the crawl category"""
+    web_elements = driver.find_elements(by=By.XPATH, value='//ul[@id="myTab3"]/child::li')
+    for element in web_elements:
+        val = element.text
+        if custom_category is None:
+            success = init_crawl_records(driver, element, val)
+            return val if success else None
+        else:
+            if val == custom_category:
+                success = init_crawl_records(driver, element, custom_category)
+                return val if success else None
+
+
+def select_category(driver, category: str):
+    """Select a category"""
+    try:
+        _category = _select_category(driver, category)
+        return _category
+    except TimeoutException:
+        driver.quit()
+        logger.error('[Timeout] selecting category')
+        return None
+
+
+def _select_date(driver, category: str, setup_time: str):
+    """Select the setup time"""
+    logger.info(f"[Setup time]{setup_time}")
+    try:
+        attr = SETUP_TIME[category][SETUP_MAPS[setup_time]]
+        element = driver.find_element(by=By.XPATH, value=f'//a[@id="{attr}"]')
+        goto(driver, element)
+    except KeyError:
+        raise KeyError(f'Please configure the setup time for {category} in "SETUP_TIME"')
+
+
+def select_date(driver, category, setup_time):
+    try:
+        _select_date(driver, category, setup_time)
+        return True
+    except TimeoutException:
+        driver.quit()
+        logger.error('[Timeout] selecting setup time')
+        return False
+
+
+def wait_load_detail(driver, check_feature=None):
+    """Wait for the page's secondary load and check for element changes"""
+    if check_feature is not None:
+        check_count = 0
+        while check_count < 20:
+            element = html2element(driver.page_source)
+            check_node = element.xpath(check_feature)
+            if len(check_node) > 0:
+                break
+            time.sleep(0.5)
+            check_count += 1
+    else:
+        check_count = 0
+        while check_count < 20:
+            element = html2element(driver.page_source)
+            root = element.xpath('//div[@id="xxnrList"]')
+            if len(root) > 0:
+                descendant = element.xpath('//div[@id="xxnrList"]/descendant::*')
+                if len(descendant) > 0:
+                    text = element.xpath('//div[@id="xxnrList"]/div[1]/div[2]/div[2]/div[1]/text()')
+                    children = element.xpath('//div[@id="xxnrList"]/div[1]/div[2]/child::*')
+                    if "".join(text) != '暂无详细数据' and len(children) > 0:
+                        break
+            time.sleep(0.5)
+            check_count += 1
+    time.sleep(1)
+
+
+def wait_load_list(driver):
+    while True:
+        element = html2element(driver.page_source)
+        node = element.xpath('//div[@id="myTab_Contenta0"]/div[2]/table//tr')
+        if len(node) > 0:
+            break
+        time.sleep(0.5)
+
+
+def _handler_page_html(html, item):
+    """Process the page source into the item"""
+    if all([html is not None, verify_text(html)]):
+        item['contenthtml'] = html
+        item['detail'] = cleaner(html)
+        item['comeintime'] = int2long(int(time.time()))
+    else:
+        logger.error(
+            f'[Abnormal text-{item["channel"]}]{item["title"]} - {item["publishtime"]}')
+    return item
+
+
+def crawl_show_details(driver, handler, item):
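+    """Crawl the detail page opened in a new browser window"""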
+    for current_handler in driver.window_handles:
+        if current_handler == handler:
+            continue
+        driver.switch_to.window(current_handler)
+        '''wait for load and check the given page feature'''
+        wait_load_detail(driver, check_feature='//div[@id="xxnrList"]/div[1]/div[2]/div[2]')
+        '''check for and handle the robot alert'''
+        check_robots_alert(driver)
+        '''secondary load'''
+        refresh_page(driver)
+        '''wait for load'''
+        wait_load_detail(driver)
+        '''extract page source'''
+        content_html = extract_page_html(driver.page_source, feature='//div[@id="xxnrList"]')
+        item = _handler_page_html(content_html, item)
+        '''close the current tab'''
+        driver.close()
+        '''switch back to the main window'''
+        driver.switch_to.window(handler)
+    return item
+
+
+def crawl_psp_frame(driver, handler, item):
+    """Crawl a Frame-based detail page"""
+    for current_handler in driver.window_handles:
+        if current_handler == handler:
+            continue
+        driver.switch_to.window(current_handler)
+        wait_load_detail(driver, check_feature='//div[contains(@id, "mini-1")]')
+        '''switch into the frame'''
+        driver.switch_to.frame('mini-iframe-6')
+        '''wait for the data to load'''
+        wait_load_detail(driver, check_feature='//div[contains(@role, "accordion")]')
+        content_html = extract_page_html(driver.page_source, feature='//div[@class="fui-accordions"]')
+        item = _handler_page_html(content_html, item)
+        '''close the current tab'''
+        driver.close()
+        '''switch back to the main window'''
+        driver.switch_to.window(handler)
+    return item
+
+
+def crawl_request(driver, url):
+    try:
+        driver.get(url)
+        return True
+    except WebDriverException:
+        driver.quit()
+        return False
+
+
+def parser_list_elements(driver, category):
+    try:
+        web_elements = driver.find_elements(by=By.XPATH, value=f'//*[@id="{CATEGORY_MAPS[category]}"]//tr')
+        return web_elements
+    except InvalidSessionIdException:
+        driver.quit()
+        logger.error('[Data parsing] failed to fetch the list')
+        return None