@@ -0,0 +1,434 @@
+import time
+from collections import namedtuple
+
+from selenium.webdriver import ActionChains
+from selenium.webdriver.common.by import By
+
+from common.clean_html import cleaner
+from common.databases import mongo_table, int2long, redis_client
+from common.log import logger
+from common.socks5 import Proxy
+from common.tools import html2element, element2html, verify_text, sha1
+from common.webdriver import WebDriver, until_wait
+
+crawl_tab = mongo_table(db='py_spider', coll='zgzb_wagf_list')
+save_tab = mongo_table(db='py_spider', coll='data_bak')
+redis_key = 'zgzb_wagf_2022'
+r = redis_client()
+'''Crawl progress records'''
+CRAWL_RECORDS = {}
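+# Runtime shape, as a sketch (see init_crawl_records / next_page below):
+# CRAWL_RECORDS = {'招标公告': {'finished': False, 'pages': ['1', '2']}}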
+'''Category tabs'''
+CATEGORY_MAPS = {
+    '招标项目': 'tenderProjectTab',
+    '招标公告': 'tenderBulletin',
+    '开标记录': 'openBidRecord',
+    '评标公示': 'bidEvaluation',
+    '中标公告': 'winBidBulletin',
+    # '签约履行': '',
+}
+'''Setup-time filters'''
+SETUP_MAPS = {
+    '今天': 'jt',
+    '2天内': '2tq',
+    '3天内': '3tq',
+    '1周内': '1zn',
+}
+SETUP_TIME = {
+    '招标项目': {
+        'jt': 'tenderProject_begin1',
+        '2tq': 'tenderProject_begin2',
+        '3tq': 'tenderProject_begin3',
+        '1zn': 'tenderProject_begin7'
+    },
+    '招标公告': {
+        'jt': 'tenderBulletin_begin1',
+        '2tq': 'tenderBulletin_begin2',
+        '3tq': 'tenderBulletin_begin3',
+        '1zn': 'tenderBulletin_begin7'
+    },
+    '开标记录': {
+        'jt': 'openBidRecord_1',
+        '2tq': 'openBidRecord_2',
+        '3tq': 'openBidRecord_3',
+        '1zn': 'openBidRecord_7'
+    },
+    '评标公示': {
+        'jt': 'bidEvaluation_1',
+        '2tq': 'bidEvaluation_2',
+        '3tq': 'bidEvaluation_3',
+        '1zn': 'bidEvaluation_7'
+    },
+    '中标公告': {
+        'jt': 'winBidBulletin_1',
+        '2tq': 'winBidBulletin_2',
+        '3tq': 'winBidBulletin_3',
+        '1zn': 'winBidBulletin_7'
+    }
+}
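+# Example lookup (used by select_date): the date-filter link id for
+# 招标公告 + 今天 is SETUP_TIME['招标公告'][SETUP_MAPS['今天']], i.e. 'tenderBulletin_begin1'.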
+'''Spider menu'''
+CrawlMenu = namedtuple('CrawlMenu', ['channel', 'spidercode'])
+CRAWL_MENU = {
+    '招标项目': CrawlMenu('未按数据规范-招标项目', 'a_zgzbtbggfwpt_wasjgf_zbxm'),
+    '招标公告': CrawlMenu('未按数据规范-招标公告', 'a_zgzbtbggfwpt_wasjgf_zbgg'),
+    '开标记录': CrawlMenu('未按数据规范-开标记录', 'a_zgzbtbggfwpt_wasjgf_kbjl'),
+    '评标公示': CrawlMenu('未按数据规范-评标公示', 'a_zgzbtbggfwpt_wasjgf_pbgs'),
+    '中标公告': CrawlMenu('未按数据规范-中标公告', 'a_zgzbtbggfwpt_wasjgf_zhbgg'),
+    '签约履行': CrawlMenu('未按数据规范-签约履行', 'a_zgzbtbggfwpt_wasjgf_qylx'),
+}
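+# Example: CRAWL_MENU['招标项目'].spidercode evaluates to 'a_zgzbtbggfwpt_wasjgf_zbxm'.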
+
+
+def robots_alert(driver):
+    """Detect the anti-robot check.
+
+    Returns (alert, alert_type): (False, '0') no alert or verification passed;
+    (True, '1') slider verification loaded; (True, '2') network-error prompt;
+    (True, '3') click-to-verify page; (True, '999') still loading after ~10s.
+    """
+    wait = 0
+    while wait < 20:
+        '''Wait for the verification widget to load'''
+        element = html2element(driver.page_source)
+        robot_alert = element.xpath('//span[@class="nc-lang-cnt"]/@data-nc-lang')
+        click_alert = element.xpath('//div[@id="text"]/text()')
+        if len(robot_alert) == 1 and "".join(robot_alert).strip() == '_Loading':
+            time.sleep(0.5)
+        elif len(robot_alert) > 1 and robot_alert[0] in ['_yesTEXT']:
+            '''Robot verification passed'''
+            return False, '0'
+        elif len(robot_alert) > 1 and robot_alert[0] in ['_startTEXT']:
+            '''Robot verification finished loading'''
+            return True, '1'
+        elif len(robot_alert) > 1 and robot_alert[0] in ['_errorNetwork']:
+            '''"Network error, please refresh or submit feedback (00)"'''
+            return True, '2'
+        elif len(click_alert) > 0 and "".join(click_alert) == '请点击此处完成验证或咨询客服':
+            return True, '3'
+        else:
+            return False, '0'
+
+        wait += 1
+    return True, '999'
+
+
+def check_robots_alert(driver):
+    """Check for the anti-robot alert and handle it"""
+    while True:
+        alert, alert_type = robots_alert(driver)
+        if not alert:
+            break
+
+        if alert_type == '1':
+            until_wait(driver, xpath='//span[contains(@class, "nc_iconfont btn_slide")]')
+            element = driver.find_element(By.XPATH, '//span[contains(@class, "nc_iconfont btn_slide")]')
+            if element.is_displayed():
+                # Press and hold the slider, drag it 258px to the right, then release
+                ActionChains(driver).click_and_hold(element).move_by_offset(xoffset=258, yoffset=0).release().perform()
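+                # A more human-like drag, as a sketch (assumes the w3c actions
+                # API, where ActionChains supports pause()):
+                #   ac = ActionChains(driver).click_and_hold(element)
+                #   for _ in range(6):
+                #       ac = ac.move_by_offset(43, 0).pause(0.1)
+                #   ac.release().perform()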
+
+        elif alert_type == '2':
+            until_wait(driver, xpath='//span[contains(@class, "nc-lang-cnt")]/a[1]')
+            element = driver.find_element(By.XPATH, '//span[contains(@class, "nc-lang-cnt")]/a[1]')
+            if element.is_displayed():
+                goto(driver, element, wait_time=2)
+
+        elif alert_type == '3':
+            # until_wait(driver, xpath='//div[@id="container"]')
+            # element = driver.find_element(By.XPATH, '//div[@id="container"]')
+            # if element.is_displayed():
+            # goto(driver, element, wait_time=2)
+            # driver.switch_to.alert.accept()
+            raise ValueError('Browser fingerprint was flagged; waiting to retry')
+
+        else:
+            with open('robot.html', 'w', encoding='utf-8') as wp:
+                wp.write(driver.page_source)
+            raise ValueError('Unknown abnormal page, saved to robot.html')
+        time.sleep(2)
+
+
+def refresh_page(driver):
+    """Refresh the detail page when it still shows no data"""
+    element = html2element(driver.page_source)
+    node = element.xpath('//div[@id="xxnrList"]/div[1]/div[2]/div[2]/div[1]/text()')
+    if "".join(node) == "暂无详细数据":
+        driver.refresh()
+        time.sleep(1)
+        '''Confirm the page alert'''
+        driver.switch_to.alert.accept()
+        time.sleep(1.5)
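+        # Note: switch_to.alert.accept() raises NoAlertPresentException if no
+        # alert pops up after the refresh; wrapping it in try/except would be a
+        # possible hardening (assumption: the site normally shows an alert here).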
+
+
+def goto(driver, web_element, wait_time=None, allow_check_page=False):
+    """Trigger the element's click event via JavaScript"""
+    driver.execute_script("arguments[0].click();", web_element)
+    _wait_time = (wait_time or 1)
+    time.sleep(_wait_time)
+    if allow_check_page:
+        check_robots_alert(driver)
+
+
+def extract_text(html: str, feature: str):
+    """Extract nodes matching an xpath feature"""
+    element = html2element(html)
+    return element.xpath(feature)
+
+
+def extract_page_html(html: str, feature: str):
+    """Extract the outer html of the first node matching an xpath feature"""
+    element = html2element(html)
+    try:
+        node = element.xpath(feature)[0]
+        return element2html(node)
+    except IndexError:
+        return None
+
+
+def wait_load_detail(driver, check_feature=None):
+    """Wait for the page's second-stage load and watch for element changes"""
+    if check_feature is not None:
+        check_count = 0
+        while check_count < 10:
+            element = html2element(driver.page_source)
+            check_node = element.xpath(check_feature)
+            if len(check_node) > 0:
+                break
+            time.sleep(0.5)
+            check_count += 1
+    else:
+        check_count = 0
+        while check_count < 10:
+            element = html2element(driver.page_source)
+            root = element.xpath('//div[@id="xxnrList"]')
+            if len(root) > 0:
+                descendant = element.xpath('//div[@id="xxnrList"]/descendant::*')
+                if len(descendant) > 0:
+                    text = element.xpath('//div[@id="xxnrList"]/div[1]/div[2]/div[2]/div[1]/text()')
+                    children = element.xpath('//div[@id="xxnrList"]/div[1]/div[2]/child::*')
+                    if "".join(text) != '暂无详细数据' and len(children) > 0:
+                        break
+            time.sleep(0.5)
+            check_count += 1
+    time.sleep(1)
+
+
+def wait_load_list(driver):
+    """Block until the result list table has rendered"""
+    while True:
+        element = html2element(driver.page_source)
+        node = element.xpath('//div[@id="myTab_Contenta0"]/div[2]/table//tr')
+        if len(node) > 0:
+            break
+        time.sleep(0.5)
+
+
+def next_page(driver, category):
+    """Open the next unvisited page and return its number, or None when done"""
+    _finished_pages = CRAWL_RECORDS[category]['pages']
+    web_elements = driver.find_elements(by=By.XPATH, value='//div[@id="Pagination"]/div[1]/child::*')
+    for element in web_elements[1:-1]:
+        val = element.text
+        if val not in _finished_pages:
+            goto(driver, element, wait_time=1.2)
+            return int(val)
+    time.sleep(1)
+
+
+def update_crawl_records(category: str, finished: bool):
+    """Mark whether a category has been fully crawled"""
+    if category in CRAWL_RECORDS:
+        CRAWL_RECORDS[category]['finished'] = finished
+
+
+def init_crawl_records(driver, web_element, category: str):
+    """Open a category tab and register it; return False once it is finished"""
+    if category not in CRAWL_RECORDS:
+        goto(driver, web_element)
+        init_config = {'finished': False, 'pages': ['1']}
+        CRAWL_RECORDS.setdefault(category, init_config)
+        return True
+    else:
+        _category = CRAWL_RECORDS[category]
+        if not _category['finished']:
+            goto(driver, web_element)
+            return True
+        else:
+            return False
+
+
+def select_category(driver, custom_category=None):
+    """Select the category tab to crawl"""
+    web_elements = driver.find_elements(by=By.XPATH, value='//ul[@id="myTab3"]/child::li')
+    for element in web_elements:
+        val = element.text
+        if custom_category is None:
+            success = init_crawl_records(driver, element, val)
+            return val if success else None
+        else:
+            if val == custom_category:
+                success = init_crawl_records(driver, element, custom_category)
+                return val if success else None
+
+
+def select_date(driver, category: str, setup_time: str):
+    """Select the setup-time filter"""
+    logger.info(f"[setup time]{setup_time}")
+    try:
+        attr = SETUP_TIME[category][SETUP_MAPS[setup_time]]
+        element = driver.find_element(by=By.XPATH, value=f'//a[@id="{attr}"]')
+        goto(driver, element)
+    except KeyError:
+        raise KeyError(f'Please add a "{category}" entry with this setup time to SETUP_TIME')
+
+
+def crawl_detail(driver, handler, item):
+    """Crawl the detail page opened in a new window, then return to the main one"""
+    for current_handler in driver.window_handles:
+        if current_handler == handler:
+            continue
+        driver.switch_to.window(current_handler)
+        '''Wait for loading and check the expected page feature'''
+        wait_load_detail(driver, check_feature='//div[@id="xxnrList"]/div[1]/div[2]/div[2]')
+        '''Check for the anti-robot alert and handle it'''
+        check_robots_alert(driver)
+        '''Second-stage load'''
+        refresh_page(driver)
+        '''Wait for loading'''
+        wait_load_detail(driver)
+        '''Extract the page source'''
+        content_html = extract_page_html(driver.page_source, feature='//div[@id="xxnrList"]')
+        if content_html is not None and verify_text(content_html):
+            item['contenthtml'] = content_html
+            item['detail'] = cleaner(content_html)
+            item['comeintime'] = int2long(int(time.time()))
+            '''Save the detail'''
+            save_tab.insert_one(item)
+        else:
+            logger.error(f'[text error]{item["channel"]} - {item["title"]}')
+        '''Close the current tab'''
+        driver.close()
+        '''Switch back to the main window'''
+        driver.switch_to.window(handler)
+
+
+def crawl_spider(
+        crawl_max_page=1,
+        enable_proxy=False,
+        max_request_times=15,
+        **kw
+):
+    proxy = Proxy(enable_proxy)
+    crawl_category = kw.get('crawl_category')
+    cache_cookies = {}
+    current_request_time = 0
+    headless = kw.get('headless', True)
+    while True:
+        proxies = proxy.proxies
+        logger.info(f"[crawl proxy]{proxies}")
+        list_page_url = 'http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/getSearch.do?tabledivIds=searchTabLi2'
+        with WebDriver(load_images=False, proxy=proxies, headless=headless) as browser:
+            browser.get(list_page_url)
+            '''Restore cached cookies'''
+            if len(cache_cookies) > 0:
+                browser.cookies = cache_cookies
+                browser.get(list_page_url)
+            '''Wait for the main page to load'''
+            wait_load_list(browser)
+            '''Keep the main window handle'''
+            main_handler = browser.current_window_handle
+            '''Select the category'''
+            category = select_category(browser, crawl_category)
+            crawl_menu = CRAWL_MENU.get(category)
+            if crawl_menu is None:
+                logger.info("task finished")
+                break
+            logger.info(f"[category]{category}")
+            '''Select the setup time'''
+            select_date(browser, category, '今天')
+            while True:
+                _allow_next_page = True
+                '''Detail rows'''
+                web_elements = browser.find_elements(by=By.XPATH, value=f'//*[@id="{CATEGORY_MAPS[category]}"]//tr')
+                for index, element in enumerate(web_elements, start=1):
+                    item = {
+                        "site": "中国招标投标公共服务平台",
+                        "channel": crawl_menu.channel,
+                        "spidercode": crawl_menu.spidercode,
+                        "T": "bidding",
+                        "sendflag": "false",
+                        "iscompete": "true",
+                        "_d": "comeintime",
+                        "comeintime": '',
+                        "area": '',
+                        "city": '',
+                        "publishdept": "",
+                        "title": "",
+                        "href": "",
+                        "publishtime": "",
+                        "l_np_publishtime": "",
+                    }
+                    html = browser.page_source
+                    detail_js = "".join(extract_text(html, feature=f'//*[@id="{CATEGORY_MAPS[category]}"]//tr[{index}]/td[1]/a/@onclick')).strip()
+                    sign = sha1(detail_js)
+                    if r.hexists(redis_key, sign):
+                        continue
+                    item['href'] = detail_js
+
+                    node1 = element.find_element(By.XPATH, './td[1]/a')
+                    title = node1.text
+                    item['title'] = title
+
+                    node2 = element.find_element(By.XPATH, './td[3]/span')
+                    region = str(node2.text).replace('【', '').replace('】', '')
+                    if region.find(" ") > 0:
+                        province, city = region.split(' ', 1)
+                    else:
+                        province = region
+                        city = ''
+                    item['area'] = str(province).replace('省', '').replace('市', '').replace('自治区', '')
+                    item['city'] = city
+
+                    node3 = element.find_element(By.XPATH, './td[5]')
+                    publish_time = node3.text
+                    item['publishtime'] = publish_time
+                    item['l_np_publishtime'] = int2long(int(time.mktime(time.strptime(publish_time, "%Y-%m-%d"))))
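+                    # e.g. '2022-03-01' -> 1646064000 (epoch seconds in local
+                    # time; this example value assumes a UTC+8 host clock)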
+                    item['comeintime'] = int2long(int(time.time()))
+                    '''Save the list row'''
+                    crawl_tab.insert_one(item)
+                    '''Open the detail page'''
+                    goto(browser, node1, wait_time=2)
+                    '''Crawl the detail page'''
+                    crawl_detail(browser, main_handler, item)
+                    '''Record the data fingerprint'''
+                    r.hset(redis_key, sign, '')
+                    logger.info(f'[crawled]{title} - {publish_time}')
+                    '''Request counter; caps how many requests one proxy ip serves'''
+                    current_request_time += 1
+                    if current_request_time > max_request_times:
+                        '''Switch the proxy and clear cookies'''
+                        proxy.switch()
+                        cache_cookies.clear()
+                        current_request_time = 0
+                        _allow_next_page = False
+                        break
+                    else:
+                        cache_cookies.update(browser.cookies)
+
+                if _allow_next_page:
+                    page_num = next_page(browser, category)
+                    if page_num is None:
+                        update_crawl_records(category, True)
+                        break
+                    logger.info(f"[{category}] page {page_num} opened")
+                    if page_num > crawl_max_page:
+                        update_crawl_records(category, True)
+                        break
+                    '''Record visited pages; recorded pages are not visited again'''
+                    CRAWL_RECORDS[category]['pages'].append(str(page_num))
+                else:
+                    break
+
+
+# if __name__ == '__main__':
+#     crawl_spider(
+#         crawl_category='',
+#         crawl_max_page=1,
+#         max_request_times=40,
+#         enable_proxy=True,
+#     )
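+# Usage sketch (assumption: crawl_category must be one of the CRAWL_MENU keys,
+# e.g. '招标公告'; headless=False can be passed through **kw to watch the run):
+# crawl_spider(crawl_category='招标公告', crawl_max_page=3, enable_proxy=True)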