浏览代码

更新自动化浏览器采集

dzr 1 月之前
父节点
当前提交
f70dc63065

+ 45 - 44
a_qgjsxmjghjbhysxmgkxx_xmgkxx/全国建设项目竣工环境保护验收项目公开信息.py

@@ -7,34 +7,22 @@ Created on 2025-03-19
 @author: lzz
 """
 import re
+import time
+from collections import namedtuple
 
 import feapder
-from items.njpc_item import NjpcListItem
 from feapder.network.selector import Selector
-from collections import namedtuple
-import time
+from items.njpc_item import DataNjpcItem
 
 
-class SeleniumFeapder(feapder.BiddingListSpider):
-    __custom_setting__ = dict(
-        WEBDRIVER=dict(
-            driver_type="FIREFOX",
-            pool_size=1,
-            usages_local_driver=True,
-            headless=False,
-        )
-    )
+class Spider(feapder.BiddingListSpider):
 
     def start_callback(self):
-
-        self.site = "全国建设项目竣工环境保护验收项目公开信息"
-
         Menu = namedtuple('Menu', ['channel', 'code', 'crawl_page'])
-
         self.menus = [
-            Menu('项目公开信息', 'a_qgjsxmjghjbhysxmgkxx_xmgkxx', 1),
+            Menu('项目公开信息', 'a_qgjsxmjghjbhysxmgkxx_xmgkxx', 10),
         ]
-
+        self.site = "全国建设项目竣工环境保护验收项目公开信息"
         self.headers = {
             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
             "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
@@ -45,23 +33,26 @@ class SeleniumFeapder(feapder.BiddingListSpider):
         }
 
     def start_requests(self):
+        url = 'https://cepc.lem.org.cn/#/publicityProject'
         for menu in self.menus:
-            start_url = 'https://cepc.lem.org.cn/#/publicityProject'
-            yield feapder.Request(url=start_url, item=menu._asdict(), page=1,
-                                  render=True, render_time=5, proxies=False)
+            yield feapder.Request(url,
+                                  render=True,
+                                  render_time=5,
+                                  proxies=False,
+                                  item=menu._asdict(),
+                                  page=1)
 
     def download_midware(self, request):
-        page = request.page
         request.headers = self.headers
 
     def parse(self, request, response):
+        page = request.page
         driver = response.browser
-        driver.maximize_window()
-        time.sleep(1)
+        host = driver.tab.url
         menu = request.item
         info_list = response.xpath('//table[@class="vxe-table--body"]/tbody/tr')
         for info in info_list:
-            # href = info.xpath('').extract_first().strip()
+            ys_href = info.xpath('./td[6]/div/text()').extract_first("").strip().replace('网站', '')
             title = info.xpath('./td[2]/div/a/span/text()').extract_first().strip()
             projectaddr = info.xpath('./td[3]/div/span/text()').extract_first("").strip()
             owner = info.xpath('./td[4]/div/span/text()').extract_first("").strip()
@@ -72,25 +63,17 @@ class SeleniumFeapder(feapder.BiddingListSpider):
             city = ""  # 城市
             district = ""  # 区县
 
-            try:
-                next_page = driver.find_element_by_xpath(f'//a/span[contains(text(),"{title[:20]}")]')
-            except:
-                try:
-                    next_page = driver.find_element_by_xpath(f'//a/span[contains(text(),"{title[:10]}")]')  # 标题过长
-                except:
-                    continue
+            ele = driver.tab.ele(f'x://a/span[contains(text(),"{title}")]')
+            ele.click()
 
-            next_page.click()
-            time.sleep(3)
+            href = host + f'?file={ys_href}' + f'&t={int(time.time())}'
 
-            href = driver.current_url
-
-            data_item = NjpcListItem()  # 存储数据的管道
+            data_item = DataNjpcItem()  # 存储数据的管道
             data_item.href = href  # 标书链接
             data_item.unique_key = ('title', 'href')  # 去重
             data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
             data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title  # 标题
+            data_item.projectname = title  # 标题
             data_item.publishtime = publish_time  # 标书发布时间
             data_item.site = self.site
             data_item.area = area or "全国"  # 省份 默认:全国
@@ -100,7 +83,7 @@ class SeleniumFeapder(feapder.BiddingListSpider):
             data_item.owner = owner
             data_item.projectaddr = projectaddr
 
-            detail_html = Selector(text=driver.page_source)
+            detail_html = Selector(text=driver.tab.html)
             html = ""
             dx_list = ['//div[@class="el-dialog__body"]', ]
             for dx in dx_list:
@@ -108,18 +91,36 @@ class SeleniumFeapder(feapder.BiddingListSpider):
                 if html:
                     break
 
-            data_item.contenthtml = html.replace('<span>关闭</span>','')
+            html = html.replace('<span>关闭</span>', '')
+            data_item.contenthtml = html
 
-            cl = driver.find_element_by_xpath('//span[text()="关闭"]')
-            cl.click()
-            time.sleep(3)
+            button = driver.tab.ele('x://span[text()="关闭"]')
+            if button.states.is_clickable:
+                button.click()
 
             yield data_item
 
         # 翻页
         request = self.infinite_pages(request, response)
+        if request is not None and request.page > page:
+            request.callback = self.turn_page
+            yield request
+
+    def turn_page(self, request, response):
+        page = request.page
+        tab = response.browser.tab
+
+        # Locate the pager's go-to-page input element
+        ele = tab.ele('x://input[@class="vxe-pager--goto"]')
+        # Left-click the element (moves the pointer onto it)
+        tab.actions.click(ele)
+        # Clear the text box before typing the page number
+        ele.input(page, clear=True)
+        # Turn the page by clicking the "页" target
+        tab.actions.click("页")
+        request.callback = self.parse
+        yield request
 
 
 if __name__ == "__main__":
-    SeleniumFeapder(redis_key="jy:SeleniumFeapder").start()
+    Spider(redis_key="jy:SeleniumFeapder").start()