dongzhaorui 3 yıl önce
ebeveyn
işleme
b32a05f18d
2 değiştirilmiş dosya ile 426 ekleme ve 0 silme
  1. 356 0
      zgzb/crawler/defaults.py
  2. 70 0
      zgzb/crawler/params.py

+ 356 - 0
zgzb/crawler/defaults.py

@@ -0,0 +1,356 @@
+import time
+
+from selenium.common.exceptions import (
+    WebDriverException,
+    TimeoutException,
+    InvalidSessionIdException
+)
+from selenium.webdriver import ActionChains
+from selenium.webdriver.common.by import By
+
+from common.clean_html import cleaner
+from common.databases import int2long
+from common.log import logger
+from common.tools import html2element, element2html, verify_text, remove_node
+from common.webdriver import until_wait
+from crawler.params import (
+    CRAWL_RECORDS,
+    SETUP_TIME,
+    SETUP_MAPS,
+    CATEGORY_MAPS,
+    CRAWL_MENU
+)
+
+
def get_crawl_menu(category: str):
    """Return the crawl-menu entry registered for *category*, or None if unknown."""
    return CRAWL_MENU.get(category, None)
+
+
def get_category_id(category: str):
    """Look up the tab element id for *category*; raises KeyError when unmapped."""
    category_id = CATEGORY_MAPS[category]
    return category_id
+
+
def extract_text(html: str, feature: str):
    """Parse *html* and return whatever the XPath *feature* matches."""
    return html2element(html).xpath(feature)
+
+
def extract_page_html(html: str, feature: str):
    """Return the serialized html of the first node matching *feature*.

    The empty-attachment container (div#isshow) is stripped first.
    Returns None when the feature matches nothing.
    """
    root = html2element(html)
    # Remove the empty attachment block before serializing.
    empty_attachments = root.xpath('//div[@id="isshow"]')
    if empty_attachments:
        remove_node(empty_attachments[0])
    matches = root.xpath(feature)
    if not matches:
        return None
    return element2html(matches[0])
+
+
def init_crawl_records(driver, web_element, category: str):
    """Create or resume the crawl record for *category*.

    Clicks the category tab and returns True when crawling should proceed;
    returns False when the category is already marked finished.
    """
    record = CRAWL_RECORDS.get(category)
    if record is None:
        # First visit: click the tab and register a fresh record.
        goto(driver, web_element)
        CRAWL_RECORDS[category] = {'finished': False, 'pages': []}
        return True
    if record['finished']:
        return False
    # Known but unfinished category: re-enter it.
    goto(driver, web_element)
    return True
+
+
def update_crawl_records(category: str, finished: bool):
    """Mark the crawl record of *category* as finished/unfinished.

    Bug fix: the original called ``CRAWL_RECORDS.update(_record)``, which
    spread the record's own keys ('finished', 'pages') into the top level
    of CRAWL_RECORDS instead of storing the record under *category*
    (compare write_crawl_records, which stores it correctly).
    """
    if category in CRAWL_RECORDS:
        _record = CRAWL_RECORDS[category]
        _record['finished'] = finished
        CRAWL_RECORDS[category] = _record
+
+
def write_crawl_records(category: str, page_num: int):
    """Append *page_num* to the pages already crawled for *category*.

    Recorded page numbers are never visited again by next_page().
    """
    record = CRAWL_RECORDS.get(category)
    if record is not None:
        record['pages'].append(str(page_num))
        CRAWL_RECORDS[category] = record
+
+
def robots_alert(driver):
    """Inspect the page for the anti-robot verification widget.

    Polls up to 20 times (0.5s apart) while the widget reports "_Loading".
    Returns (alert_present, alert_type):
      '0'   no alert / verification already passed
      '1'   slider captcha is ready to be dragged
      '2'   "network error, click to refresh" state
      '3'   manual click-verification page
      '999' widget never finished loading
    """
    for _ in range(20):
        page = html2element(driver.page_source)
        nc_langs = page.xpath('//span[@class="nc-lang-cnt"]/@data-nc-lang')
        click_texts = page.xpath('//div[@id="text"]/text()')
        if len(nc_langs) == 1 and "".join(nc_langs).strip() == '_Loading':
            # Widget still loading; poll again shortly.
            time.sleep(0.5)
            continue
        if len(nc_langs) > 1:
            state = nc_langs[0]
            if state == '_yesTEXT':
                # Verification already passed.
                return False, '0'
            if state == '_startTEXT':
                # Slider captcha fully loaded.
                return True, '1'
            if state == '_errorNetwork':
                # Network error banner with a refresh link.
                return True, '2'
        if len(click_texts) > 0 and "".join(click_texts) == '请点击此处完成验证或咨询客服':
            return True, '3'
        return False, '0'
    return True, '999'
+
+
def check_robots_alert(driver):
    """Poll for the anti-robot widget and try to clear it automatically.

    Loops until robots_alert() reports no alert. Raises ValueError on an
    unrecognised page, after dumping its source to robot.html.

    Changes: deprecated ``find_element_by_xpath`` replaced with
    ``find_element(by=By.XPATH, ...)`` (consistent with the rest of this
    module); robot.html is written with an explicit utf-8 encoding since
    the page source contains non-ASCII text; dead commented-out code removed.
    """
    while True:
        alert, alert_type = robots_alert(driver)
        if not alert:
            break

        if alert_type == '1':
            # Slider captcha: press, drag 258px to the right, release.
            slider_xpath = '//span[contains(@class, "nc_iconfont btn_slide")]'
            until_wait(driver, xpath=slider_xpath)
            element = driver.find_element(by=By.XPATH, value=slider_xpath)
            if element.is_displayed():
                ActionChains(driver).click_and_hold(element).move_by_offset(xoffset=258, yoffset=0).release().perform()

        elif alert_type == '2':
            # Network-error banner: click its embedded refresh link.
            refresh_xpath = '//span[contains(@class, "nc-lang-cnt")]/a[1]'
            until_wait(driver, xpath=refresh_xpath)
            element = driver.find_element(by=By.XPATH, value=refresh_xpath)
            if element.is_displayed():
                goto(driver, element, wait_time=2)

        elif alert_type == '3':
            # Manual verification page: click the container, accept the popup.
            container_xpath = '//div[@id="container"]'
            until_wait(driver, xpath=container_xpath)
            element = driver.find_element(by=By.XPATH, value=container_xpath)
            if element.is_displayed():
                goto(driver, element, wait_time=3)
                driver.switch_to.alert.accept()
            logger.error("[机器人验证]触发浏览器指纹检测,无法自动处理.等待重试")

        else:
            # Unknown page state: dump the source for offline analysis and abort.
            with open('robot.html', 'w', encoding='utf-8') as wp:
                wp.write(driver.page_source)
            logger.error("[未知异常网页]页面源码保存在robot.html")
            raise ValueError()
        time.sleep(2)
+
+
def refresh_page(driver):
    """Reload the detail page when it only rendered the "no data yet" placeholder."""
    page = html2element(driver.page_source)
    placeholder = page.xpath('//div[@id="xxnrList"]/div[1]/div[2]/div[2]/div[1]/text()')
    if "".join(placeholder) == "暂无详细数据":
        driver.refresh()
        time.sleep(1)
        # The refresh pops a javascript alert; dismiss it before continuing.
        driver.switch_to.alert.accept()
    time.sleep(1.5)
    wait_load_detail(driver)
    check_robots_alert(driver)
+
+
def goto(driver, web_element, wait_time=None, allow_check_page=False):
    """Click *web_element* via javascript, then pause.

    wait_time defaults to 1 second. When allow_check_page is True the
    robot-alert check also runs after the click.
    """
    driver.execute_script("arguments[0].click();", web_element)
    time.sleep(wait_time or 1)
    if allow_check_page:
        check_robots_alert(driver)
+
+
def next_page(driver, category):
    """Advance the pager to the next unvisited results page.

    Clicks the "下一页" link repeatedly until the current page number is one
    not yet stored in the crawl record, and returns it as an int. Returns
    None (after a short pause) once the pager offers no "下一页" link.

    Changes: removed the large block of commented-out dead code and replaced
    the deprecated ``find_element_by_xpath`` with ``find_element(by=By.XPATH,
    ...)`` to match the rest of this module.
    """
    finished_pages = CRAWL_RECORDS[category]['pages']
    next_xpath = '//div[@id="Pagination"]/div[1]/child::a[last()]'
    current_xpath = '//div[@id="Pagination"]/div[1]/child::span[@class="current"]'
    while True:
        next_element = driver.find_element(by=By.XPATH, value=next_xpath)
        if next_element.text != '下一页':
            break
        goto(driver, next_element, wait_time=1.2)
        current_element = driver.find_element(by=By.XPATH, value=current_xpath)
        val = current_element.text
        if val not in finished_pages:
            return int(val)
    time.sleep(1)
+
+
def _select_category(driver, custom_category=None):
    """Pick a category tab from the #myTab3 list.

    With custom_category=None the first tab is taken; otherwise the tab
    whose label equals *custom_category*. Returns the label when crawling
    should proceed, else None.
    """
    tabs = driver.find_elements(by=By.XPATH, value='//ul[@id="myTab3"]/child::li')
    for tab in tabs:
        label = tab.text
        if custom_category is None:
            # No preference: take the first tab.
            return label if init_crawl_records(driver, tab, label) else None
        if label == custom_category:
            return label if init_crawl_records(driver, tab, custom_category) else None
+
+
def select_category(driver, category: str):
    """Select *category*; on timeout the driver is shut down and None returned."""
    try:
        return _select_category(driver, category)
    except TimeoutException:
        driver.quit()
        logger.error('[访问超时]选择分类')
        return None
+
+
def _select_date(driver, category: str, setup_time: str):
    """Click the publish-date filter link for *category*.

    setup_time is a human label from SETUP_MAPS (e.g. '今天', '1周内').
    Raises KeyError (chained from the original lookup failure) when
    SETUP_TIME has no entry for the category/period combination.

    Changes: the ``try`` now covers only the dict lookups that can raise
    KeyError, so element-lookup failures are no longer masked as a missing
    configuration; the re-raise chains the original error with ``from``.
    """
    logger.info(f"[建立时间]{setup_time}")
    try:
        attr = SETUP_TIME[category][SETUP_MAPS[setup_time]]
    except KeyError as e:
        raise KeyError(f'请设置"SETUP_TIME"中{category}对应的建立时间') from e
    element = driver.find_element(by=By.XPATH, value=f'//a[@id="{attr}"]')
    goto(driver, element)
+
+
def select_date(driver, category, setup_time):
    """Apply the date filter; returns False (after quitting the driver) on timeout."""
    try:
        _select_date(driver, category, setup_time)
    except TimeoutException:
        driver.quit()
        logger.error('[访问超时]选择建立时间')
        return False
    return True
+
+
def wait_load_detail(driver, check_feature=None):
    """Poll (at most 20 x 0.5s) until the detail page's second-stage load is done.

    With *check_feature*, waits for that XPath to match something.
    Otherwise waits until the #xxnrList container exists, has descendants,
    and shows real children rather than the "暂无详细数据" placeholder.
    Always sleeps one extra second before returning.
    """
    def _loaded(element):
        # Predicate deciding whether the page content has arrived.
        if check_feature is not None:
            return len(element.xpath(check_feature)) > 0
        if not element.xpath('//div[@id="xxnrList"]'):
            return False
        if not element.xpath('//div[@id="xxnrList"]/descendant::*'):
            return False
        text = element.xpath('//div[@id="xxnrList"]/div[1]/div[2]/div[2]/div[1]/text()')
        children = element.xpath('//div[@id="xxnrList"]/div[1]/div[2]/child::*')
        return "".join(text) != '暂无详细数据' and len(children) > 0

    for _ in range(20):
        if _loaded(html2element(driver.page_source)):
            break
        time.sleep(0.5)
    time.sleep(1)
+
+
def wait_load_list(driver):
    """Block until the listing table rows appear.

    NOTE(review): unlike wait_load_detail this loop has no upper bound and
    will spin forever if the table never renders — confirm that is intended.
    """
    while True:
        page = html2element(driver.page_source)
        rows = page.xpath('//div[@id="myTab_Contenta0"]/div[2]/table//tr')
        if rows:
            break
        time.sleep(0.5)
+
+
def _handler_page_html(html, item):
    """Attach cleaned detail html to *item* when the text passes verification.

    On success sets item['contenthtml'], item['detail'] (cleaned html) and
    item['comeintime'] (current unix time as long); otherwise logs the
    anomaly. Returns *item* either way.

    Bug fix: the original used ``all([html is not None, verify_text(html)])``,
    which evaluates ``verify_text(html)`` eagerly even when *html* is None;
    the short-circuit ``and`` avoids passing None into verify_text.
    """
    if html is not None and verify_text(html):
        item['contenthtml'] = html
        item['detail'] = cleaner(html)
        item['comeintime'] = int2long(int(time.time()))
    else:
        logger.error(
            f'[文本异常-{item["channel"]}]{item["title"]} - {item["publishtime"]}')
    return item
+
+
def crawl_show_details(driver, handler, item):
    """Scrape the detail page opened in a secondary window into *item*.

    Iterates every window except *handler* (the listing page): waits for the
    expected page feature, clears any robot check, forces the second-stage
    load, extracts the #xxnrList html and merges it into *item*. Finally
    closes the extra window and switches back to the listing page.
    """
    for window in driver.window_handles:
        if window == handler:
            continue
        driver.switch_to.window(window)
        # Wait for the page feature, then handle robot verification.
        wait_load_detail(driver, check_feature='//div[@id="xxnrList"]/div[1]/div[2]/div[2]')
        check_robots_alert(driver)
        # Force the second-stage load and wait for it to settle.
        refresh_page(driver)
        wait_load_detail(driver)
        content_html = extract_page_html(driver.page_source, feature='//div[@id="xxnrList"]')
        item = _handler_page_html(content_html, item)
    # Close the detail window and return to the main listing page.
    driver.close()
    driver.switch_to.window(handler)
    return item
+
+
def crawl_psp_frame(driver, handler, item):
    """Scrape a frame-based (psp) detail window into *item*.

    Same window dance as crawl_show_details, but the content lives inside
    the 'mini-iframe-6' iframe and is extracted from the fui-accordions div.
    """
    for window in driver.window_handles:
        if window == handler:
            continue
        driver.switch_to.window(window)
        wait_load_detail(driver, check_feature='//div[contains(@id, "mini-1")]')
        # The detail content is rendered inside a mini-ui iframe.
        driver.switch_to.frame('mini-iframe-6')
        wait_load_detail(driver, check_feature='//div[contains(@role, "accordion")]')
        content_html = extract_page_html(driver.page_source, feature='//div[@class="fui-accordions"]')
        item = _handler_page_html(content_html, item)
    # Close the detail window and switch back to the main page.
    driver.close()
    driver.switch_to.window(handler)
    return item
+
+
def crawl_request(driver, url):
    """Navigate the driver to *url*; True on success, False (after quitting) on failure."""
    try:
        driver.get(url)
    except WebDriverException:
        driver.quit()
        return False
    return True
+
+
def parser_list_elements(driver, category):
    """Return the listing <tr> elements for *category*.

    Returns None (after quitting the driver) when the browser session has
    already been invalidated.
    """
    rows_xpath = f'//*[@id="{CATEGORY_MAPS[category]}"]//tr'
    try:
        return driver.find_elements(by=By.XPATH, value=rows_xpath)
    except InvalidSessionIdException:
        driver.quit()
        logger.error('[数据解析]获取列表失败')
        return None

+ 70 - 0
zgzb/crawler/params.py

@@ -0,0 +1,70 @@
from collections import namedtuple

# Public API of this constants module.
__all__ = [
    'CRAWL_RECORDS',
    'CATEGORY_MAPS',
    'SETUP_MAPS',
    'SETUP_TIME',
    'CRAWL_MENU'
]

'''采集记录'''
# Crawl bookkeeping shared across the crawler, keyed by category label:
# {category: {'finished': bool, 'pages': [str, ...]}}
CRAWL_RECORDS = {}
'''分类'''
# Category label -> tab element id on the listing page.
CATEGORY_MAPS = {
    '招标项目': 'tenderProjectTab',
    '招标公告': 'tenderBulletin',
    '开标记录': 'openBidRecord',
    '评标公示': 'bidEvaluation',
    '中标公告': 'winBidBulletin',
    # '签约履行': '',
}
'''建立时间'''
# Human-readable date-range label -> short key used in SETUP_TIME below
# (today / last 2 days / last 3 days / last week).
SETUP_MAPS = {
    '今天': 'jt',
    '2天内': '2tq',
    '3天内': '3tq',
    '1周内': '1zn',
}
# Category label -> {date-range key -> element id of the date-filter link}.
SETUP_TIME = {
    '招标项目': {
        'jt': 'tenderProject_begin1',
        '2tq': 'tenderProject_begin2',
        '3tq': 'tenderProject_begin3',
        '1zn': 'tenderProject_begin7'
    },
    '招标公告': {
        'jt': 'tenderBulletin_begin1',
        '2tq': 'tenderBulletin_begin2',
        '3tq': 'tenderBulletin_begin3',
        '1zn': 'tenderBulletin_begin7'
    },
    '开标记录': {
        'jt': 'openBidRecord_1',
        '2tq': 'openBidRecord_2',
        '3tq': 'openBidRecord_3',
        '1zn': 'openBidRecord_7'
    },
    '评标公示': {
        'jt': 'bidEvaluation_1',
        '2tq': 'bidEvaluation_2',
        '3tq': 'bidEvaluation_3',
        '1zn': 'bidEvaluation_7'
    },
    '中标公告': {
        'jt': 'winBidBulletin_1',
        '2tq': 'winBidBulletin_2',
        '3tq': 'winBidBulletin_3',
        '1zn': 'winBidBulletin_7'
    }
}
'''爬虫清单'''
# Per-category channel name and spider code used when emitting items.
CrawlMenu = namedtuple('CrawlMenu', ['channel', 'spidercode'])
CRAWL_MENU = {
    '招标项目': CrawlMenu('未按数据规范-招标项目', 'a_zgzbtbggfwpt_wasjgf_zbxm'),
    '招标公告': CrawlMenu('未按数据规范-招标公告', 'a_zgzbtbggfwpt_wasjgf_zbgg'),
    '开标记录': CrawlMenu('未按数据规范-开标记录', 'a_zgzbtbggfwpt_wasjgf_kbjl'),
    '评标公示': CrawlMenu('未按数据规范-评标公示', 'a_zgzbtbggfwpt_wasjgf_pbgs'),
    '中标公告': CrawlMenu('未按数据规范-中标公告', 'a_zgzbtbggfwpt_wasjgf_zhbgg'),
    '签约履行': CrawlMenu('未按数据规范-签约履行', 'a_zgzbtbggfwpt_wasjgf_qylx'),
}