
Code commit: 6.12

dzr committed 1 month ago
a4d0843c2f

+ 44 - 32
通用采集/detail_chrome.py

@@ -6,9 +6,6 @@ Created on 2023-08-07
 ---------
 @author:
 """
-import time
-import json
-import re
 
 from urllib.parse import urljoin, quote
 
@@ -16,6 +13,12 @@ import feapder
 from items.spider_item import DataBakItem
 from untils.attachment import AttachmentDownloader
 from untils.tools import remove_htmldata, extract_file_type
+try:
+    import time
+    import json
+    import re
+except ImportError:
+    raise
 
 
 headers = {
@@ -29,12 +32,33 @@ headers = {
 }
 
 
-class Details(feapder.BiddingDetailSpider):
-
+DRISSIONPAGE = dict(
+    pool_size=1,  # number of browser tabs
+    user_agent=None,  # UA string
+    load_images=False,  # whether to load images
+    proxy=None,  # xxx.xxx.xxx.xxx:xxxx
+    headless=True,  # run the browser headless
+    timeout=30,  # request timeout (seconds)
+    retry=1,  # retries on connection failure
+    interval=0.5,  # retry interval on connection failure (seconds)
+    page_load=30,
+    render_time=0,  # render wait, i.e. page-load timeout after opening a page
+    window_size=(1024, 800),  # window size
+    driver_type="chromium",
+    load_mode="normal",  # page load strategy: "normal", "eager" or "none"
+    download_path=None,  # directory for downloaded files
+    custom_argument=[
+        "--no-sandbox",
+        "--ignore-certificate-errors"
+    ]
+)
+
+
+class Spider(feapder.BiddingDetailSpider):
     __custom_setting__ = dict(
-        WEBDRIVER=dict(
-            driver_type="CHROME"
-        )
+        PROXY_EXTRACT_API="http://172.17.162.28:16001/sam",
+        PROXY_POOL="feapder.network.proxy_pool.SpringBoardProxyPool",
+        DRISSIONPAGE=DRISSIONPAGE
     )
 
     def start_requests(self):
@@ -45,27 +69,15 @@ class Details(feapder.BiddingDetailSpider):
             if item.get('ex_python'):
                 exec(item.get('ex_python'))
 
-            if item.get('proxies'):
-                yield feapder.Request(url=item.get('parse_url'),
-                                      timeout=timeout,
-                                      render=True,
-                                      render_time=item.get('render_time', 5),
-                                      callback=eval(item.get('parse')),
-                                      item=item,
-                                      files_info=item.get('files'),
-                                      deal_detail=item.get('deal_detail'),
-                                      **request_params)
-            else:
-                yield feapder.Request(url=item.get('parse_url'),
-                                      proxies=False,
-                                      timeout=timeout,
-                                      render=True,
-                                      render_time=item.get('render_time', 5),
-                                      callback=eval(item.get('parse')),
-                                      item=item,
-                                      files_info=item.get('files'),
-                                      deal_detail=item.get('deal_detail'),
-                                      **request_params)
+            yield feapder.Request(url=item.get('parse_url'),
+                                  timeout=timeout,
+                                  render=True,
+                                  render_time=item.get('render_time', 5),
+                                  callback=eval(item.get('parse')),
+                                  item=item,
+                                  files_info=item.get('files'),
+                                  deal_detail=item.get('deal_detail'),
+                                  **request_params)
 
     def detail_get(self, request, response):
         items = request.item
@@ -107,8 +119,8 @@ class Details(feapder.BiddingDetailSpider):
                 else:
                     file_type = files_info.get("file_type")
 
-                if request.proxies:
-                    fpx = request.proxies()
+                if request.get_proxies() is not None:
+                    fpx = request.get_proxies()
                 else:
                     fpx = False
 
@@ -133,4 +145,4 @@ class Details(feapder.BiddingDetailSpider):
 
 
 if __name__ == "__main__":
-    Details(redis_key="detail:chrome").start()
+    Spider(redis_key="detail:chrome").start()
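
Note: detail_chrome.py swaps the Selenium WEBDRIVER setting for DrissionPage rendering plus a springboard proxy pool, which is why the old `if item.get('proxies')` fork collapses into a single `yield`. A rough standalone equivalent of what the DRISSIONPAGE dict configures, assuming the DrissionPage library (feapder wires this up internally, so the exact mapping is an assumption):

```python
from DrissionPage import ChromiumOptions, ChromiumPage

# Approximate the DRISSIONPAGE settings above with raw Chromium flags.
co = ChromiumOptions()
co.set_argument("--headless=new")                        # headless=True
co.set_argument("--blink-settings=imagesEnabled=false")  # load_images=False
co.set_argument("--window-size=1024,800")                # window_size=(1024, 800)
co.set_argument("--no-sandbox")                          # custom_argument
co.set_argument("--ignore-certificate-errors")           # custom_argument

page = ChromiumPage(addr_or_opts=co)
page.get("https://example.com", timeout=30)              # timeout / page_load
print(page.title)
page.quit()
```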

+ 45 - 33
通用采集/detail_firefox.py

@@ -6,15 +6,19 @@ Created on 2023-08-07
 ---------
 @author:
 """
-import time
-import json
-import re
+
 from urllib.parse import urljoin, quote
 
 import feapder
 from items.spider_item import DataBakItem
 from untils.attachment import AttachmentDownloader
 from untils.tools import remove_htmldata, extract_file_type
+try:
+    import time
+    import json
+    import re
+except ImportError:
+    raise
 
 headers = {
     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
@@ -26,13 +30,33 @@ headers = {
     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
 }
 
-
-class Details(feapder.BiddingDetailSpider):
-
+DRISSIONPAGE = dict(
+    pool_size=10,  # number of browser tabs
+    user_agent=None,  # UA string
+    load_images=False,  # whether to load images
+    proxy=None,  # xxx.xxx.xxx.xxx:xxxx
+    headless=True,  # run the browser headless
+    timeout=30,  # request timeout (seconds)
+    retry=1,  # retries on connection failure
+    interval=0.5,  # retry interval on connection failure (seconds)
+    page_load=30,
+    render_time=0,  # render wait, i.e. page-load timeout after opening a page
+    window_size=(1024, 800),  # window size
+    driver_type="chromium",
+    load_mode="normal",  # page load strategy: "normal", "eager" or "none"
+    download_path=None,  # directory for downloaded files
+    custom_argument=[
+        "--no-sandbox",
+        "--ignore-certificate-errors"
+    ]
+)
+
+
+class Spider(feapder.BiddingDetailSpider):
     __custom_setting__ = dict(
-        WEBDRIVER=dict(
-            driver_type="FIREFOX"
-        )
+        PROXY_EXTRACT_API="http://172.17.162.28:16001/sam",
+        PROXY_POOL="feapder.network.proxy_pool.SpringBoardProxyPool",
+        DRISSIONPAGE=DRISSIONPAGE
     )
 
     def start_requests(self):
@@ -43,27 +67,15 @@ class Details(feapder.BiddingDetailSpider):
             if item.get('ex_python'):
                 exec(item.get('ex_python'))
 
-            if item.get('proxies'):
-                yield feapder.Request(url=item.get('parse_url'),
-                                      timeout=timeout,
-                                      render=True,
-                                      render_time=item.get('render_time', 5),
-                                      callback=eval(item.get('parse')),
-                                      item=item,
-                                      files_info=item.get('files'),
-                                      deal_detail=item.get('deal_detail'),
-                                      **request_params)
-            else:
-                yield feapder.Request(url=item.get('parse_url'),
-                                      proxies=False,
-                                      timeout=timeout,
-                                      render=True,
-                                      render_time=item.get('render_time', 5),
-                                      callback=eval(item.get('parse')),
-                                      item=item,
-                                      files_info=item.get('files'),
-                                      deal_detail=item.get('deal_detail'),
-                                      **request_params)
+            yield feapder.Request(url=item.get('parse_url'),
+                                  timeout=timeout,
+                                  render=True,
+                                  render_time=item.get('render_time', 5),
+                                  callback=eval(item.get('parse')),
+                                  item=item,
+                                  files_info=item.get('files'),
+                                  deal_detail=item.get('deal_detail'),
+                                  **request_params)
 
     def detail_get(self, request, response):
         items = request.item
@@ -105,8 +117,8 @@ class Details(feapder.BiddingDetailSpider):
                 else:
                     file_type = files_info.get('file_type')
 
-                if request.proxies:
-                    fpx = request.proxies()
+                if request.get_proxies():
+                    fpx = request.get_proxies()
                 else:
                     fpx = False
 
@@ -130,4 +142,4 @@ class Details(feapder.BiddingDetailSpider):
 
 
 if __name__ == "__main__":
-    Details(redis_key='detail:firefox').start()
+    Spider(redis_key='detail:firefox', thread_count=10).start()
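
Unlike detail_chrome.py (pool_size=1, default threads), this spider pairs pool_size=10 with thread_count=10 so each worker thread can render in its own tab. A tiny sketch of that sizing rule (the one-tab-per-thread relationship is an assumption about feapder's render pool, not documented behaviour):

```python
# Hypothetical helper: keep the tab pool at least as large as the number
# of spider threads so render requests don't queue waiting for a free tab.
def tab_pool_size(thread_count: int, minimum: int = 1) -> int:
    return max(minimum, thread_count)

assert tab_pool_size(10) == 10  # detail_firefox.py: thread_count=10 -> pool_size=10
assert tab_pool_size(1) == 1    # detail_chrome.py: single-threaded default
```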

+ 2 - 2
通用采集/details.py

@@ -97,8 +97,8 @@ class Details(feapder.BiddingDetailSpider):
                 else:
                     file_type = files_info.get('file_type')
 
-                if request.proxies:
-                    fpx = request.proxies()
+                if request.get_proxies():
+                    fpx = request.get_proxies()
                 else:
                     fpx = False
 
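
details.py receives the same proxy-API migration as the rendered spiders: request.proxies was previously a callable returning the proxy dict, while request.get_proxies() now returns the mapping directly (or None). A minimal sketch of the normalization every detail_get performs (AttachmentDownloader accepting a dict or False is taken from the surrounding code):

```python
def resolve_proxies(request):
    """Return the request's proxy mapping, or False when none is set,
    matching what AttachmentDownloader.fetch_attachment expects."""
    fpx = request.get_proxies()  # e.g. {"http": "...", "https": "..."} or None
    return fpx if fpx else False
```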

+ 67 - 45
通用采集/njpc_details.py

@@ -6,10 +6,6 @@ Created on 2023-10-08
 ---------
 @author: njpc_feapder
 """
-import re
-import json
-import time
-import random
 
 import feapder
 from items.njpc_item import DataNjpcItem
@@ -17,6 +13,34 @@ from lxml.html import fromstring
 from untils.attachment import AttachmentDownloader as AD
 from untils.attachment_res import AttachmentDownloader as ADres
 from untils.tools import remove_htmldata, extract_file_type
+try:
+    import re
+    import json
+    import time
+    import random
+except ImportError:
+    raise
+
+DRISSIONPAGE = dict(
+    pool_size=10,  # number of browser tabs
+    user_agent=None,  # UA string
+    load_images=False,  # whether to load images
+    proxy=None,  # xxx.xxx.xxx.xxx:xxxx
+    headless=True,  # run the browser headless
+    timeout=30,  # request timeout (seconds)
+    retry=1,  # retries on connection failure
+    interval=0.5,  # retry interval on connection failure (seconds)
+    page_load=30,
+    render_time=0,  # render wait, i.e. page-load timeout after opening a page
+    window_size=(1024, 800),  # window size
+    driver_type="chromium",
+    load_mode="normal",  # page load strategy: "normal", "eager" or "none"
+    download_path=None,  # directory for downloaded files
+    custom_argument=[
+        "--no-sandbox",
+        "--ignore-certificate-errors"
+    ]
+)
 
 
 # Attachment downloader for proposed-construction (njpc) spiders
@@ -27,19 +51,23 @@ def njpc_get_files(html, headers, file_type="", s_key="http", proxies=False):
         filetype = val.split('.')[-1].replace('"', '').replace("'", "")
         filetypes.append(filetype)
 
+    attachments = {}
+
     root = fromstring(html)
     file_info = root.xpath('//a[@href]')
     if file_info:
-        attachments = {}
+        file_types = ['zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb',
+                      'hzzbs',
+                      'jpg', 'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg']
+        res_downloader = ADres()
+        downloader = AD()
         for info in file_info:
             file_url = "".join(info.xpath('./@href'))
-            file_types = ['zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'hzzbs',
-                          'jpg', 'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg']
             file_name = "".join(info.xpath('./@title') or info.xpath('.//text()'))
             if file_type.lower() == "res":
                 if s_key in file_url and file_name:
                     file_name = file_name.strip()
-                    attachment = ADres().fetch_attachment(
+                    attachment = res_downloader.fetch_attachment(
                         file_name=file_name,
                         download_url=file_url,
                         callback=parse_filetype,
@@ -55,14 +83,23 @@ def njpc_get_files(html, headers, file_type="", s_key="http", proxies=False):
 
                 if file_tp and s_key in file_url and file_name:
                     file_name = file_name.strip()
-                    attachment = AD().fetch_attachment(
-                        file_name=file_name, file_type=file_tp, download_url=file_url,
-                        proxies=proxies, headers=headers)
+                    attachment = downloader.fetch_attachment(
+                        file_name=file_name,
+                        file_type=file_tp,
+                        download_url=file_url,
+                        proxies=proxies,
+                        headers=headers)
                     attachments[str(len(attachments) + 1)] = attachment
-        return attachments
 
+    return attachments
 
-class Details(feapder.PlanToBuildDetailSpider):
+
+class Spider(feapder.PlanToBuildDetailSpider):
+    __custom_setting__ = dict(
+        PROXY_EXTRACT_API="http://172.17.162.28:16001/sam",
+        PROXY_POOL="feapder.network.proxy_pool.SpringBoardProxyPool",
+        DRISSIONPAGE=DRISSIONPAGE
+    )
 
     def start_requests(self):
         data_list = self.get_tasks_by_rabbitmq(limit=500, timeout=60)
@@ -76,35 +113,20 @@ class Details(feapder.PlanToBuildDetailSpider):
             render_time = item.get("render_time") or 3          # browser render time
             extra_activity = item.get("extra_activity")         # extra custom actions
             file_params = item.get("file_params")               # attachment download config
-            if item.get("proxies"):
-                yield feapder.Request(url=item.get("parser_url"),
-                                      timeout=timeout,
-                                      render=render,
-                                      render_time=render_time,
-                                      callback=item.get("parser"),
-                                      item=item,
-                                      deal_detail=item.get("deal_detail"),
-                                      is_join_html=is_join_html,
-                                      extra_html=extra_html,
-                                      title_xpath=title_xpath,
-                                      file_params=file_params,
-                                      extra_activity=extra_activity,
-                                      **request_params)
-            else:
-                yield feapder.Request(url=item.get("parser_url"),
-                                      proxies=False,
-                                      timeout=timeout,
-                                      render=render,
-                                      render_time=render_time,
-                                      callback=item.get("parser"),
-                                      item=item,
-                                      deal_detail=item.get("deal_detail"),
-                                      is_join_html=is_join_html,
-                                      extra_html=extra_html,
-                                      title_xpath=title_xpath,
-                                      file_params=file_params,
-                                      extra_activity=extra_activity,
-                                      **request_params)
+
+            yield feapder.Request(url=item.get("parser_url"),
+                                  timeout=timeout,
+                                  render=render,
+                                  render_time=render_time,
+                                  callback=item.get("parser"),
+                                  item=item,
+                                  deal_detail=item.get("deal_detail"),
+                                  is_join_html=is_join_html,
+                                  extra_html=extra_html,
+                                  title_xpath=title_xpath,
+                                  file_params=file_params,
+                                  extra_activity=extra_activity,
+                                  **request_params)
 
     def detail_get(self, request, response):
         items = request.item
@@ -143,8 +165,8 @@ class Details(feapder.PlanToBuildDetailSpider):
 
         data_item.contenthtml = html
 
-        if request.proxies:
-            fpx = request.proxies()
+        if request.get_proxies():
+            fpx = request.get_proxies()
         else:
             fpx = False
 
@@ -168,4 +190,4 @@ class Details(feapder.PlanToBuildDetailSpider):
 
 
 if __name__ == '__main__':
-    Details(redis_key="detail:njpc_details", thread_count=10).start()
+    Spider(redis_key="detail:njpc_details", thread_count=10).start()
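
Besides hoisting the file_types list and the two downloader instances out of the per-link loop, this hunk fixes a return-value bug in njpc_get_files: the return now sits outside the `if file_info:` block, so pages without any `<a href>` links yield an empty dict instead of falling off the end and returning None. A quick illustration (the import path is hypothetical):

```python
from njpc_details import njpc_get_files  # hypothetical import path

# Previously this returned None when the page had no links, forcing every
# caller to guard before len()/iteration; it now always returns a dict.
attachments = njpc_get_files("<html><body>no links here</body></html>",
                             headers={})
assert attachments == {}
```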

+ 68 - 50
通用采集/njpc_details_firefox.py

@@ -6,10 +6,6 @@ Created on 2023-10-08
 ---------
 @author: njpc_feapder
 """
-import re
-import json
-import time
-import random
 
 import feapder
 from items.njpc_item import DataNjpcItem
@@ -17,6 +13,35 @@ from lxml.html import fromstring
 from untils.attachment import AttachmentDownloader as AD
 from untils.attachment_res import AttachmentDownloader as ADres
 from untils.tools import remove_htmldata, extract_file_type
+try:
+    import re
+    import json
+    import time
+    import random
+except ImportError:
+    raise
+
+
+DRISSIONPAGE = dict(
+    pool_size=1,  # number of browser tabs
+    user_agent=None,  # UA string
+    load_images=False,  # whether to load images
+    proxy=None,  # xxx.xxx.xxx.xxx:xxxx
+    headless=True,  # run the browser headless
+    timeout=30,  # request timeout (seconds)
+    retry=1,  # retries on connection failure
+    interval=0.5,  # retry interval on connection failure (seconds)
+    page_load=30,
+    render_time=0,  # render wait, i.e. page-load timeout after opening a page
+    window_size=(1024, 800),  # window size
+    driver_type="chromium",
+    load_mode="normal",  # page load strategy: "normal", "eager" or "none"
+    download_path=None,  # directory for downloaded files
+    custom_argument=[
+        "--no-sandbox",
+        "--ignore-certificate-errors"
+    ]
+)
 
 
 # Attachment downloader for proposed-construction (njpc) spiders
@@ -27,19 +52,23 @@ def njpc_get_files(html, headers, file_type="", s_key="http", proxies=False):
         filetype = val.split('.')[-1].replace('"', '').replace("'", "")
         filetypes.append(filetype)
 
+    attachments = {}
+
     root = fromstring(html)
     file_info = root.xpath('//a[@href]')
     if file_info:
-        attachments = {}
+        file_types = ['zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb',
+                      'hzzbs',
+                      'jpg', 'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg']
+        res_downloader = ADres()
+        downloader = AD()
         for info in file_info:
             file_url = "".join(info.xpath('./@href'))
-            file_types = ['zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'hzzbs',
-                          'jpg', 'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg']
             file_name = "".join(info.xpath('./@title') or info.xpath('.//text()'))
             if file_type.lower() == "res":
                 if s_key in file_url and file_name:
                     file_name = file_name.strip()
-                    attachment = ADres().fetch_attachment(
+                    attachment = res_downloader.fetch_attachment(
                         file_name=file_name,
                         download_url=file_url,
                         callback=parse_filetype,
@@ -55,18 +84,22 @@ def njpc_get_files(html, headers, file_type="", s_key="http", proxies=False):
 
                 if file_tp and s_key in file_url and file_name:
                     file_name = file_name.strip()
-                    attachment = AD().fetch_attachment(
-                        file_name=file_name, file_type=file_tp, download_url=file_url,
-                        proxies=proxies, headers=headers)
+                    attachment = downloader.fetch_attachment(
+                        file_name=file_name,
+                        file_type=file_tp,
+                        download_url=file_url,
+                        proxies=proxies,
+                        headers=headers)
                     attachments[str(len(attachments) + 1)] = attachment
-        return attachments
 
+    return attachments
 
-class Details(feapder.PlanToBuildDetailSpider):
+
+class Spider(feapder.PlanToBuildDetailSpider):
     __custom_setting__ = dict(
-        WEBDRIVER=dict(
-            driver_type="FIREFOX"
-        )
+        PROXY_EXTRACT_API="http://172.17.162.28:16001/sam",
+        PROXY_POOL="feapder.network.proxy_pool.SpringBoardProxyPool",
+        DRISSIONPAGE=DRISSIONPAGE
     )
 
     def start_requests(self):
@@ -80,35 +113,20 @@ class Details(feapder.PlanToBuildDetailSpider):
             render_time = item.get("render_time") or 3    # browser render time
             extra_activity = item.get("extra_activity")   # extra custom actions
             file_params = item.get("file_params")         # attachment download config
-            if item.get("proxies"):
-                yield feapder.Request(url=item.get("parser_url"),
-                                      timeout=timeout,
-                                      render=True,
-                                      render_time=render_time,
-                                      callback=item.get("parser"),
-                                      item=item,
-                                      deal_detail=item.get("deal_detail"),
-                                      is_join_html=is_join_html,
-                                      extra_html=extra_html,
-                                      title_xpath=title_xpath,
-                                      file_params=file_params,
-                                      extra_activity=extra_activity,
-                                      **request_params)
-            else:
-                yield feapder.Request(url=item.get("parser_url"),
-                                      proxies=False,
-                                      timeout=timeout,
-                                      render=True,
-                                      render_time=render_time,
-                                      callback=item.get("parser"),
-                                      item=item,
-                                      deal_detail=item.get("deal_detail"),
-                                      is_join_html=is_join_html,
-                                      extra_html=extra_html,
-                                      title_xpath=title_xpath,
-                                      file_params=file_params,
-                                      extra_activity=extra_activity,
-                                      **request_params)
+
+            yield feapder.Request(url=item.get("parser_url"),
+                                  timeout=timeout,
+                                  render=True,
+                                  render_time=render_time,
+                                  callback=item.get("parser"),
+                                  item=item,
+                                  deal_detail=item.get("deal_detail"),
+                                  is_join_html=is_join_html,
+                                  extra_html=extra_html,
+                                  title_xpath=title_xpath,
+                                  file_params=file_params,
+                                  extra_activity=extra_activity,
+                                  **request_params)
 
     def detail_get(self, request, response):
         items = request.item
@@ -127,7 +145,7 @@ class Details(feapder.PlanToBuildDetailSpider):
 
         if request.title_xpath:
             for sxpath in request.title_xpath:
-                title = response.xpath(sxpath).extract_first() # third-level page title
+                title = response.xpath(sxpath).extract_first()  # third-level page title
                 if title:
                     data_item.title = title.strip()
                     if "..." in data_item.projectname or "…" in data_item.projectname:
@@ -136,7 +154,7 @@ class Details(feapder.PlanToBuildDetailSpider):
 
         try:
             if request.extra_activity:
-                from untils.tools import njpc_fields_extract,njpc_fields_extract_special
+                from untils.tools import njpc_fields_extract, njpc_fields_extract_special
                 exec(request.extra_activity)
         except:
             pass
@@ -147,8 +165,8 @@ class Details(feapder.PlanToBuildDetailSpider):
 
         data_item.contenthtml = html
 
-        if request.proxies:
-            fpx = request.proxies()
+        if request.get_proxies():
+            fpx = request.get_proxies()
         else:
             fpx = False
 
@@ -172,4 +190,4 @@ class Details(feapder.PlanToBuildDetailSpider):
 
 
 if __name__ == '__main__':
-    Details(redis_key="detail:njpc_firefox_details").start()
+    Spider(redis_key="detail:njpc_firefox_details").start()
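
All four rendered spiders now share the same SpringBoardProxyPool settings. A quick reachability check for the extract endpoint (the URL comes from __custom_setting__; the response shape is an assumption, so only connectivity is verified):

```python
import requests

PROXY_EXTRACT_API = "http://172.17.162.28:16001/sam"

# Smoke-test the proxy extract API used by SpringBoardProxyPool before
# deploying; checks status/connectivity only, not the payload format.
resp = requests.get(PROXY_EXTRACT_API, timeout=10)
print(resp.status_code, resp.text[:200])
```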