@@ -44,158 +44,158 @@ def crawl_spider(crawl_max_page=1, enable_proxy=False, **kw):
         proxies = proxy.proxies
         logger.info(f"[采集代理]{proxies}")
         list_page_url = 'http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/getSearch.do?tabledivIds=searchTabLi2'
-        browser = WebDriver(load_images=False, proxy=proxies, headless=headless)
-        ua = browser.execute_script('return navigator.userAgent')
-        print('>>> ', ua)
-        success_reqeust = crawl_request(browser, list_page_url)
-        if not success_reqeust:
-            proxy.switch()
-            logger.error('[访问超时]请求列表页')
-            continue
-
-        '''等待加载主页'''
-        wait_load_list(browser)
-        '''获取主页句柄'''
-        main_handler = browser.current_window_handle
-        '''选择分类'''
-        category = select_category(browser, crawl_category)
-        '''分类栏目列表'''
-        crawl_menu = get_crawl_menu(category)
-        if crawl_menu is None:
-            browser.quit()
-            logger.info("任务结束")
-            break
+        with WebDriver(load_images=False, proxy=proxies, headless=headless) as browser:
+            ua = browser.execute_script('return navigator.userAgent')
+            print('>>> ', ua)
+            success_reqeust = crawl_request(browser, list_page_url)
+            if not success_reqeust:
+                proxy.switch()
+                logger.error('[访问超时]请求列表页')
+                continue

-        logger.info(f"[分类栏目]{category}")
+            '''等待加载主页'''
+            wait_load_list(browser)
+            '''获取主页句柄'''
+            main_handler = browser.current_window_handle
+            '''选择分类'''
+            category = select_category(browser, crawl_category)
+            '''分类栏目列表'''
+            crawl_menu = get_crawl_menu(category)
+            if crawl_menu is None:
+                browser.quit()
+                logger.info("任务结束")
+                break

-        '''选择建立时间'''
-        success_select_date = select_date(browser, category, crawl_date)
-        if not success_select_date:
-            proxy.switch()
-            continue
+            logger.info(f"[分类栏目]{category}")

-        exit_crawl = False
-        allow_next_page = False
-        while True:
-            if exit_crawl:
+            '''选择建立时间'''
+            success_select_date = select_date(browser, category, crawl_date)
+            if not success_select_date:
                 proxy.switch()
-                break
+                continue

-            if allow_next_page:
-                allow_next_page = True
-                try:
-                    page_num = next_page(browser, category)
-                    if page_num is None or (page_num > crawl_max_page):
-                        browser.quit()
-                        proxy.switch()
-                        update_crawl_records(category, True)
-                        break
-                    elif page_num != prev_num and page_num % 2 == 0:
-                        '''每个代理IP仅采集2页,轮询使用代理'''
-                        browser.quit()
-                        proxy.switch()
-                        prev_num = page_num
-                        break
-                except TimeoutException:
-                    browser.quit()
+            exit_crawl = False
+            allow_next_page = False
+            while True:
+                if exit_crawl:
                     proxy.switch()
-                    logger.error('[访问超时]请求翻页')
                     break
-            else:
-                allow_next_page = True
-
-            '''详情页'''
-            web_elements = parser_list_elements(browser, category)
-            if web_elements is None:
-                proxy.switch()
-                break

-            for index, element in enumerate(web_elements):
-                index += 1
-                item = {
-                    "site": "中国招标投标公共服务平台",
-                    "channel": crawl_menu.channel,
-                    "spidercode": crawl_menu.spidercode,
-                    "T": "bidding",
-                    "sendflag": "false",
-                    "_d": "comeintime",
-                    "comeintime": '',
-                    "area": '',
-                    "city": '',
-                    "publishdept": "",
-                    "title": "",
-                    "href": "",
-                    "publishtime": "",
-                    "l_np_publishtime": "",
-                }
-                html = browser.page_source
-                category_id = get_category_id(category)
-                click_detail_js = "".join(extract_text(html, feature=f'//*[@id="{category_id}"]//tr[{index}]/td[1]/a/@onclick')).strip()
-                href_ = "".join(extract_text(html, feature=f'//*[@id="{category_id}"]//tr[{index}]/td[1]/a/@href')).strip()
-                detail_js = (click_detail_js or href_)
-                sign = sha1(detail_js)
-                print(f'>>> {sign}')
-                if r.hexists(redis_key, sign):
-                    continue
-                '''发布标题'''
-                node1 = element.find_element_by_xpath('./td[1]/a')
-                title = node1.text
-                item['title'] = title
-                '''省市'''
-                node2 = element.find_element_by_xpath('./td[3]/span')
-                region = str(node2.text).replace('【', '').replace('】', '')
-                if region.find(" ") > 0:
-                    province, city = region.split(' ')
-                else:
-                    province = region
-                    city = ''
-                item['area'] = str(province).replace('省', '').replace('市', '').replace('自治区', '')
-                item['city'] = city
-                '''发布时间'''
-                node3 = element.find_element_by_xpath('./td[5]')
-                publish_time = node3.text
-                item['publishtime'] = publish_time
-                item['l_np_publishtime'] = int2long(date2ts(publish_time))
-                item['comeintime'] = int2long(int(time.time()))
-                '''访问详情页'''
-                goto(browser, node1, wait_time=2)
-                '''详情页'''
-                item['href'] = '#'
-                detail_url = 'http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/showDetails.do'
-                if detail_js.startswith('showDetails') is False:
-                    item['competehref'] = detail_url
-                    try:
-                        item = crawl_psp_frame(browser, main_handler, item)
-                    except NoSuchElementException:
-                        exit_crawl = True
-                        break
-                else:
-                    item['competehref'] = '{}/{}'.format(detail_url, sign)
+                if allow_next_page:
+                    allow_next_page = True
                     try:
-                        item = crawl_show_details(browser, main_handler, item)
-                    except (ValueError, WebDriverException) as e:
+                        page_num = next_page(browser, category)
+                        if page_num is None or (page_num > crawl_max_page):
+                            browser.quit()
+                            proxy.switch()
+                            update_crawl_records(category, True)
+                            break
+                        elif page_num != prev_num and page_num % 2 == 0:
+                            '''每个代理IP仅采集2页,轮询使用代理'''
+                            browser.quit()
+                            proxy.switch()
+                            prev_num = page_num
+                            break
+                    except TimeoutException:
                         browser.quit()
-                        exit_crawl = True
-                        if e.__class__.__name__ == 'ValueError':
-                            logger.error("[机器人验证]验证失败")
+                        proxy.switch()
+                        logger.error('[访问超时]请求翻页')
                         break
-                '''入库处理'''
-                if 'contenthtml' not in item:
-                    item['crawl_status'] = 'detail_err'
                 else:
-                    item['crawl_status'] = 'success'
-                    '''保存详情'''
-                    save_tab.insert_one(item)
-                    del item['contenthtml'], item['detail']
-                    if '_id' in item:
-                        del item['_id']
-                    logger.info(f'[采集成功-{item["channel"]}]{title} - {publish_time}')
-                '''备注:详情页访问参数'''
-                item['remark'] = detail_js
-                '''添加数据指纹'''
-                r.hset(redis_key, sign, '')
-                '''保存列表'''
-                crawl_tab.insert_one(item)
+                    allow_next_page = True
+
+                '''详情页'''
+                web_elements = parser_list_elements(browser, category)
+                if web_elements is None:
+                    proxy.switch()
+                    break
+
+                for index, element in enumerate(web_elements):
+                    index += 1
+                    item = {
+                        "site": "中国招标投标公共服务平台",
+                        "channel": crawl_menu.channel,
+                        "spidercode": crawl_menu.spidercode,
+                        "T": "bidding",
+                        "sendflag": "false",
+                        "_d": "comeintime",
+                        "comeintime": '',
+                        "area": '',
+                        "city": '',
+                        "publishdept": "",
+                        "title": "",
+                        "href": "",
+                        "publishtime": "",
+                        "l_np_publishtime": "",
+                    }
+                    html = browser.page_source
+                    category_id = get_category_id(category)
+                    click_detail_js = "".join(extract_text(html, feature=f'//*[@id="{category_id}"]//tr[{index}]/td[1]/a/@onclick')).strip()
+                    href_ = "".join(extract_text(html, feature=f'//*[@id="{category_id}"]//tr[{index}]/td[1]/a/@href')).strip()
+                    detail_js = (click_detail_js or href_)
+                    sign = sha1(detail_js)
+                    print(f'>>> {sign}')
+                    if r.hexists(redis_key, sign):
+                        continue
+                    '''发布标题'''
+                    node1 = element.find_element_by_xpath('./td[1]/a')
+                    title = node1.text
+                    item['title'] = title
+                    '''省市'''
+                    node2 = element.find_element_by_xpath('./td[3]/span')
+                    region = str(node2.text).replace('【', '').replace('】', '')
+                    if region.find(" ") > 0:
+                        province, city = region.split(' ')
+                    else:
+                        province = region
+                        city = ''
+                    item['area'] = str(province).replace('省', '').replace('市', '').replace('自治区', '')
+                    item['city'] = city
+                    '''发布时间'''
+                    node3 = element.find_element_by_xpath('./td[5]')
+                    publish_time = node3.text
+                    item['publishtime'] = publish_time
+                    item['l_np_publishtime'] = int2long(date2ts(publish_time))
+                    item['comeintime'] = int2long(int(time.time()))
+                    '''访问详情页'''
+                    goto(browser, node1, wait_time=2)
+                    '''详情页'''
+                    item['href'] = '#'
+                    detail_url = 'http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/showDetails.do'
+                    if detail_js.startswith('showDetails') is False:
+                        item['competehref'] = detail_url
+                        try:
+                            item = crawl_psp_frame(browser, main_handler, item)
+                        except NoSuchElementException:
+                            exit_crawl = True
+                            break
+                    else:
+                        item['competehref'] = '{}/{}'.format(detail_url, sign)
+                        try:
+                            item = crawl_show_details(browser, main_handler, item)
+                        except (ValueError, WebDriverException) as e:
+                            browser.quit()
+                            exit_crawl = True
+                            if e.__class__.__name__ == 'ValueError':
+                                logger.error("[机器人验证]验证失败")
+                            break
+                    '''入库处理'''
+                    if 'contenthtml' not in item:
+                        item['crawl_status'] = 'detail_err'
+                    else:
+                        item['crawl_status'] = 'success'
+                        '''保存详情'''
+                        save_tab.insert_one(item)
+                        del item['contenthtml'], item['detail']
+                        if '_id' in item:
+                            del item['_id']
+                        logger.info(f'[采集成功-{item["channel"]}]{title} - {publish_time}')
+                    '''备注:详情页访问参数'''
+                    item['remark'] = detail_js
+                    '''添加数据指纹'''
+                    r.hset(redis_key, sign, '')
+                    '''保存列表'''
+                    crawl_tab.insert_one(item)

-            logger.info(f"[{category}-第{page_num}页]采集完成")
-            write_crawl_records(category, page_num)
+                logger.info(f"[{category}-第{page_num}页]采集完成")
+                write_crawl_records(category, page_num)
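
The substantive change in this hunk is that the hand-managed `browser = WebDriver(...)` is replaced by `with WebDriver(...) as browser:`, so the browser is torn down by the context manager rather than only by the scattered `browser.quit()` calls. The wrapper class itself is not part of the hunk; the sketch below is only an assumption of the minimal contract the new call site relies on (a Selenium Chrome subclass whose `__exit__` quits the driver), not the project's actual implementation.

    # Minimal sketch, NOT the project's real WebDriver: it only illustrates the
    # context-manager behaviour that `with WebDriver(...) as browser:` depends on.
    from selenium.webdriver import Chrome, ChromeOptions

    class WebDriver(Chrome):
        def __init__(self, load_images=True, proxy=None, headless=True):
            options = ChromeOptions()
            if headless:
                options.add_argument('--headless')
            if proxy:
                # Assumes `proxy` is a "host:port" string; the shape of the object
                # returned by proxy.proxies is not shown in this hunk.
                options.add_argument(f'--proxy-server={proxy}')
            if not load_images:
                # Skip image downloads to speed up list-page rendering.
                options.add_argument('--blink-settings=imagesEnabled=false')
            super().__init__(options=options)

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            # Quit the browser on normal exit, on break/continue out of the block,
            # or on an exception, which is what the refactored call site relies on.
            self.quit()
            return False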
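Unchanged by this diff but useful when reading it: each list row is de-duplicated by hashing the detail link's onclick/href payload (`detail_js`), checking the Redis hash with `r.hexists(redis_key, sign)` before visiting the detail page, and recording it with `r.hset(redis_key, sign, '')` once the row is stored. The `sha1` helper is defined outside this hunk; a plausible reconstruction, assuming a plain hex digest:

    import hashlib

    def sha1(text: str) -> str:
        # Hypothetical stand-in for the sha1() helper used above to build the
        # Redis dedup fingerprint; the real helper lives outside this hunk.
        return hashlib.sha1(text.encode('utf-8')).hexdigest()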