|
@@ -7,34 +7,22 @@ Created on 2025-03-19
|
|
|
@author: lzz
|
|
|
"""
|
|
|
import re
|
|
|
+import time
|
|
|
+from collections import namedtuple
|
|
|
|
|
|
import feapder
|
|
|
-from items.njpc_item import NjpcListItem
|
|
|
from feapder.network.selector import Selector
|
|
|
-from collections import namedtuple
|
|
|
-import time
|
|
|
+from items.njpc_item import DataNjpcItem
|
|
|
|
|
|
|
|
|
-class SeleniumFeapder(feapder.BiddingListSpider):
|
|
|
- __custom_setting__ = dict(
|
|
|
- WEBDRIVER=dict(
|
|
|
- driver_type="FIREFOX",
|
|
|
- pool_size=1,
|
|
|
- usages_local_driver=True,
|
|
|
- headless=False,
|
|
|
- )
|
|
|
- )
|
|
|
+class Spider(feapder.BiddingListSpider):
|
|
|
|
|
|
def start_callback(self):
|
|
|
-
|
|
|
- self.site = "全国建设项目竣工环境保护验收项目公开信息"
|
|
|
-
|
|
|
Menu = namedtuple('Menu', ['channel', 'code', 'crawl_page'])
|
|
|
-
|
|
|
self.menus = [
|
|
|
- Menu('项目公开信息', 'a_qgjsxmjghjbhysxmgkxx_xmgkxx', 1),
|
|
|
+ Menu('项目公开信息', 'a_qgjsxmjghjbhysxmgkxx_xmgkxx', 10),
|
|
|
]
|
|
|
-
|
|
|
+ self.site = "全国建设项目竣工环境保护验收项目公开信息"
|
|
|
self.headers = {
|
|
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
|
|
|
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
|
|
@@ -45,23 +33,26 @@ class SeleniumFeapder(feapder.BiddingListSpider):
|
|
|
}
|
|
|
|
|
|
def start_requests(self):
|
|
|
+ url = 'https://cepc.lem.org.cn/#/publicityProject'
|
|
|
for menu in self.menus:
|
|
|
- start_url = 'https://cepc.lem.org.cn/#/publicityProject'
|
|
|
- yield feapder.Request(url=start_url, item=menu._asdict(), page=1,
|
|
|
- render=True, render_time=5, proxies=False)
|
|
|
+ yield feapder.Request(url,
|
|
|
+ render=True,
|
|
|
+ render_time=5,
|
|
|
+ proxies=False,
|
|
|
+ item=menu._asdict(),
|
|
|
+ page=1)
|
|
|
|
|
|
def download_midware(self, request):
|
|
|
- page = request.page
|
|
|
request.headers = self.headers
|
|
|
|
|
|
def parse(self, request, response):
|
|
|
+ page = request.page
|
|
|
driver = response.browser
|
|
|
- driver.maximize_window()
|
|
|
- time.sleep(1)
|
|
|
+ host = driver.tab.url
|
|
|
menu = request.item
|
|
|
info_list = response.xpath('//table[@class="vxe-table--body"]/tbody/tr')
|
|
|
for info in info_list:
|
|
|
- # href = info.xpath('').extract_first().strip()
|
|
|
+ ys_href = info.xpath('./td[6]/div/text()').extract_first("").strip().replace('网站', '')
|
|
|
title = info.xpath('./td[2]/div/a/span/text()').extract_first().strip()
|
|
|
projectaddr = info.xpath('./td[3]/div/span/text()').extract_first("").strip()
|
|
|
owner = info.xpath('./td[4]/div/span/text()').extract_first("").strip()
|
|
@@ -72,25 +63,17 @@ class SeleniumFeapder(feapder.BiddingListSpider):
|
|
|
city = "" # 城市
|
|
|
district = "" # 区县
|
|
|
|
|
|
- try:
|
|
|
- next_page = driver.find_element_by_xpath(f'//a/span[contains(text(),"{title[:20]}")]')
|
|
|
- except:
|
|
|
- try:
|
|
|
- next_page = driver.find_element_by_xpath(f'//a/span[contains(text(),"{title[:10]}")]') # 标题过长
|
|
|
- except:
|
|
|
- continue
|
|
|
+ ele = driver.tab.ele(f'x://a/span[contains(text(),"{title}")]')
|
|
|
+ ele.click()
|
|
|
|
|
|
- next_page.click()
|
|
|
- time.sleep(3)
|
|
|
+ href = host + f'?file={ys_href}' + f'&t={int(time.time())}'
|
|
|
|
|
|
- href = driver.current_url
|
|
|
-
|
|
|
- data_item = NjpcListItem() # 存储数据的管道
|
|
|
+ data_item = DataNjpcItem() # 存储数据的管道
|
|
|
data_item.href = href # 标书链接
|
|
|
data_item.unique_key = ('title', 'href') # 去重
|
|
|
data_item.channel = menu.get("channel") # 最上方定义的抓取栏目 (编辑器定的)
|
|
|
data_item.spidercode = menu.get("code") # 最上方定义的爬虫code(编辑器定的)
|
|
|
- data_item.title = title # 标题
|
|
|
+ data_item.projectname = title # 标题
|
|
|
data_item.publishtime = publish_time # 标书发布时间
|
|
|
data_item.site = self.site
|
|
|
data_item.area = area or "全国" # 省份 默认:全国
|
|
@@ -100,7 +83,7 @@ class SeleniumFeapder(feapder.BiddingListSpider):
|
|
|
data_item.owner = owner
|
|
|
data_item.projectaddr = projectaddr
|
|
|
|
|
|
- detail_html = Selector(text=driver.page_source)
|
|
|
+ detail_html = Selector(text=driver.tab.html)
|
|
|
html = ""
|
|
|
dx_list = ['//div[@class="el-dialog__body"]', ]
|
|
|
for dx in dx_list:
|
|
@@ -108,18 +91,36 @@ class SeleniumFeapder(feapder.BiddingListSpider):
|
|
|
if html:
|
|
|
break
|
|
|
|
|
|
- data_item.contenthtml = html.replace('<span>关闭</span>','')
|
|
|
+ html = html.replace('<span>关闭</span>', '')
|
|
|
+ data_item.contenthtml = html
|
|
|
|
|
|
- cl = driver.find_element_by_xpath('//span[text()="关闭"]')
|
|
|
- cl.click()
|
|
|
- time.sleep(3)
|
|
|
+ button = driver.tab.ele('x://span[text()="关闭"]')
|
|
|
+ if button.states.is_clickable:
|
|
|
+ button.click()
|
|
|
|
|
|
yield data_item
|
|
|
|
|
|
# 翻页
|
|
|
request = self.infinite_pages(request, response)
|
|
|
+ if request is not None and request.page > page:
|
|
|
+ request.callback = self.turn_page
|
|
|
+ yield request
|
|
|
+
|
|
|
+    def turn_page(self, request, response):
|
|
|
+        page = request.page
|
|
|
+        tab = response.browser.tab
|
|
|
+
|
|
|
+        # Locate the pager's "go to page" input box
|
|
|
+        ele = tab.ele('x://input[@class="vxe-pager--goto"]')
|
|
|
+        # Left-click the input box (moves the pointer onto the element)
|
|
|
+        tab.actions.click(ele)
|
|
|
+        # Clear the text box, then type the target page number
|
|
|
+        ele.input(page, clear=True)
|
|
|
+        # Turn the page by clicking the element matched by "页"
|
|
|
+        tab.actions.click("页")
|
|
|
+        request.callback = self.parse
|
|
|
         yield request
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
- SeleniumFeapder(redis_key="jy:SeleniumFeapder").start()
|
|
|
+ Spider(redis_key="jy:SeleniumFeapder").start()
|