瀏覽代碼

脚本修复

dzr 3 周之前
父節點
當前提交
f3ca486571

+ 6 - 26
hb_hbszfcgwssc_ddcg_cjgg_python/spider_details.py

@@ -7,36 +7,15 @@ Created on 2024-01-08
 @author: lzz
 """
 import feapder
+from feapder.utils.tools import log
 from items.spider_item import DataBakItem
 from untils.attachment import AttachmentDownloader
-from feapder.utils.tools import log
-import requests
 from untils.tools import get_proxy
-import re
-from untils.get_imgcode import jy_ocr
-
-
-def ocr_captcha(headers, proxies=False, max_retries=10):
-    session = requests.session()
-    session.proxies = proxies
-
-    s = re.compile("'src', '(.*?)'", flags=re.S)  # src
-    href = 'http://wssc.hubeigp.gov.cn/simple_captcha'
-
-    code = ''
-    for _ in range(max_retries):
-        resp1 = session.get(href, headers=headers, timeout=30, verify=False)
-        text = resp1.content.decode()
-        img_url = "http://wssc.hubeigp.gov.cn" + "".join(s.findall(text))
-        resp2 = session.get(img_url, headers=headers, timeout=30, verify=False)
-        code = jy_ocr(image=resp2.content)
-        if code and len(code) == 6:
-            break
 
-    return code, session.cookies.get_dict()
+from tools import ocr_captcha
 
 
-class Details(feapder.BiddingDetailSpider):
+class Spider(feapder.BiddingDetailSpider):
     __custom_setting__ = dict(
         SPIDER_MAX_RETRY_TIMES=10
     )
@@ -56,6 +35,7 @@ class Details(feapder.BiddingDetailSpider):
             'zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'jpg',
             'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg', 'wps', 'ofd'
         ]
+        self.downloader = AttachmentDownloader()
 
     def start_requests(self):
         data_list = self.get_tasks_by_rabbitmq(limit=50)
@@ -105,7 +85,7 @@ class Details(feapder.BiddingDetailSpider):
                 file_type = file_url.split('.')[-1].lower()
 
             if file_type in self.file_types and "file" in file_url:
-                attachment = AttachmentDownloader().fetch_attachment(
+                attachment = self.downloader.fetch_attachment(
                     file_name=file_name,
                     file_type=file_type,
                     download_url=file_url
@@ -123,4 +103,4 @@ class Details(feapder.BiddingDetailSpider):
 
 
 if __name__ == "__main__":
-    Details(redis_key="lzz:Hbszfcgwssc").start()
+    Spider(redis_key="lzz:Hbszfcgwssc").start()

+ 2 - 26
hb_hbszfcgwssc_ddcg_cjgg_python/spider_list.py

@@ -6,34 +6,13 @@ Created on 2024-01-08
 ---------
 @author: lzz
 """
-import re
 from collections import namedtuple
 
 import feapder
-import requests
 from items.spider_item import BidingListItem
-from untils.get_imgcode import jy_ocr
 from untils.tools import get_proxy
 
-
-def ocr_captcha(headers, proxies=False, max_retries=10):
-    session = requests.session()
-    session.proxies = proxies
-
-    s = re.compile("'src', '(.*?)'", flags=re.S)  # src
-    href = 'http://wssc.hubeigp.gov.cn/simple_captcha'
-
-    code = ''
-    for _ in range(max_retries):
-        resp1 = session.get(href, headers=headers, timeout=30, verify=False)
-        text = resp1.content.decode()
-        img_url = "http://wssc.hubeigp.gov.cn" + "".join(s.findall(text))
-        resp2 = session.get(img_url, headers=headers, timeout=30, verify=False)
-        code = jy_ocr(image=resp2.content)
-        if code and len(code) == 6:
-            break
-
-    return code, session.cookies.get_dict()
+from tools import ocr_captcha
 
 
 class Spider(feapder.BiddingListSpider):
@@ -53,9 +32,6 @@ class Spider(feapder.BiddingListSpider):
         self.headers = {
             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
             "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "no-cache",
-            "Connection": "close",
-            "Pragma": "no-cache",
             "Upgrade-Insecure-Requests": "1",
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36",
         }
@@ -63,7 +39,7 @@ class Spider(feapder.BiddingListSpider):
     def start_requests(self):
         for menu in self.menus:
             url = f'http://wssc.hubeigp.gov.cn/upgrade/fixed_project_notices?type={menu.typeone}&pt=all'
-            yield feapder.Request(url, item=menu._asdict(), use_session=True, page=1, proxies=False)
+            yield feapder.Request(url, item=menu._asdict(), page=1)
 
     def download_midware(self, request):
         page = request.page

+ 61 - 0
hb_hbszfcgwssc_ddcg_cjgg_python/tools.py

@@ -0,0 +1,61 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-07-31
+---------
+@summary: captcha fetch + OCR helper imported by the spiders in this directory
+"""
+import logging
+import re
+
+import requests
+from untils.get_imgcode import jy_ocr
+
+logger = logging.getLogger(__name__)
+
+
+def ocr_captcha(headers, proxies=None, max_retries=3):
+    """Solve the site captcha and return ``(code, cookies)``.
+
+    Each attempt loads the captcha page, pulls the image path out of it,
+    downloads the image and hands the bytes to ``jy_ocr``. A result counts
+    as solved only when it is exactly 6 characters long.
+
+    :param headers: headers sent with both the page and the image request
+    :param proxies: proxy mapping for the session; a falsy value
+        (``None``, ``False``, ``{}``) leaves the mapping empty
+    :param max_retries: maximum number of attempts
+    :return: ``(code, cookies)``; ``cookies`` is the session cookie dict.
+        If no attempt is solved, ``code`` is the last OCR result (possibly
+        empty or the wrong length), or ``''`` when OCR never ran.
+    """
+    with requests.session() as s:
+        s.proxies = proxies or {}
+
+        src = re.compile("'src', '(.*?)'", flags=re.S)  # captures the image path
+        href = 'http://wssc.hubeigp.gov.cn/simple_captcha'
+
+        code = ''
+        for attempt in range(1, max_retries + 1):
+            try:
+                resp = s.get(href, headers=headers, timeout=30, verify=False)
+                resp.raise_for_status()
+                found = src.search(resp.content.decode())
+                if found is None:
+                    # No image path on the page: retry rather than request
+                    # the bare site root and feed that response to OCR.
+                    logger.warning("captcha attempt %d/%d: image path not found",
+                                   attempt, max_retries)
+                    continue
+                img_url = "http://wssc.hubeigp.gov.cn" + found.group(1)
+                img = s.get(img_url, headers=headers, timeout=30, verify=False)
+                img.raise_for_status()
+                code = jy_ocr(image=img.content)
+                if code and len(code) == 6:
+                    return code, s.cookies.get_dict()
+            except requests.exceptions.RequestException as exc:
+                # A failed request uses up one attempt; the loop goes on to
+                # the next one instead of giving up.
+                logger.warning("captcha attempt %d/%d failed: %s",
+                               attempt, max_retries, exc)
+
+        return code, s.cookies.get_dict()

+ 8 - 30
hb_hbszfcgwssc_ddcg_zzgg_python/spider_details.py

@@ -6,37 +6,17 @@ Created on 2024-01-08
 ---------
 @author: lzz
 """
+
 import feapder
+from feapder.utils.tools import log
 from items.spider_item import DataBakItem
 from untils.attachment import AttachmentDownloader
-from feapder.utils.tools import log
-import requests
 from untils.tools import get_proxy
-import re
-from untils.get_imgcode import jy_ocr
-
-
-def ocr_captcha(headers, proxies=False, max_retries=10):
-    session = requests.session()
-    session.proxies = proxies
 
-    s = re.compile("'src', '(.*?)'", flags=re.S)  # src
-    href = 'http://wssc.hubeigp.gov.cn/simple_captcha'
+from tools import ocr_captcha
 
-    code = ''
-    for _ in range(max_retries):
-        resp1 = session.get(href, headers=headers, timeout=30, verify=False)
-        text = resp1.content.decode()
-        img_url = "http://wssc.hubeigp.gov.cn" + "".join(s.findall(text))
-        resp2 = session.get(img_url, headers=headers, timeout=30, verify=False)
-        code = jy_ocr(image=resp2.content)
-        if code and len(code) == 6:
-            break
 
-    return code, session.cookies.get_dict()
-
-
-class Details(feapder.BiddingDetailSpider):
+class Spider(feapder.BiddingDetailSpider):
     __custom_setting__ = dict(
         SPIDER_MAX_RETRY_TIMES=10
     )
@@ -51,11 +31,11 @@ class Details(feapder.BiddingDetailSpider):
             "Upgrade-Insecure-Requests": "1",
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
         }
-
         self.file_types = [
             'zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'jpg',
             'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg', 'wps', 'ofd'
         ]
+        self.downloader = AttachmentDownloader()
 
     def start_requests(self):
         data_list = self.get_tasks_by_rabbitmq(limit=50)
@@ -71,9 +51,7 @@ class Details(feapder.BiddingDetailSpider):
 
     def download_midware(self, request):
         captcha, cookies = ocr_captcha(self.headers, self.proxy)
-        params = {
-            "captcha": captcha
-        }
+        params = {"captcha": captcha}
         request.params = params
         request.cookies = cookies
         request.headers = self.headers
@@ -105,7 +83,7 @@ class Details(feapder.BiddingDetailSpider):
                 file_type = file_url.split('.')[-1].lower()
 
             if file_type in self.file_types and "file" in file_url:
-                attachment = AttachmentDownloader().fetch_attachment(
+                attachment = self.downloader.fetch_attachment(
                     file_name=file_name,
                     file_type=file_type,
                     download_url=file_url
@@ -123,4 +101,4 @@ class Details(feapder.BiddingDetailSpider):
 
 
 if __name__ == "__main__":
-    Details(redis_key="lzz:Hbszfcgwssc").start()
+    Spider(redis_key="lzz:Hbszfcgwssc").start()

+ 8 - 28
hb_hbszfcgwssc_ddcg_zzgg_python/spider_list.py

@@ -6,39 +6,19 @@ Created on 2024-01-08
 ---------
 @author: lzz
 """
-import re
 from collections import namedtuple
 
 import feapder
-import requests
 from items.spider_item import BidingListItem
-from untils.get_imgcode import jy_ocr
-from untils.tools import get_proxy
-
-
-def ocr_captcha(headers, proxies=False, max_retries=10):
-    session = requests.session()
-    session.proxies = proxies
-
-    s = re.compile("'src', '(.*?)'", flags=re.S)  # src
-    href = 'http://wssc.hubeigp.gov.cn/simple_captcha'
 
-    code = ''
-    for _ in range(max_retries):
-        resp1 = session.get(href, headers=headers, timeout=30, verify=False)
-        text = resp1.content.decode()
-        img_url = "http://wssc.hubeigp.gov.cn" + "".join(s.findall(text))
-        resp2 = session.get(img_url, headers=headers, timeout=30, verify=False)
-        code = jy_ocr(image=resp2.content)
-        if code and len(code) == 6:
-            break
-
-    return code, session.cookies.get_dict()
+from tools import ocr_captcha
+from untils.tools import get_proxy
 
 
 class Spider(feapder.BiddingListSpider):
     __custom_setting__ = dict(
-        SPIDER_MAX_RETRY_TIMES=10
+        SPIDER_MAX_RETRY_TIMES=10,
+        SESSION_DOWNLOADER="tools.CustomSessionDownloader"
     )
 
     def start_callback(self):
@@ -46,9 +26,8 @@ class Spider(feapder.BiddingListSpider):
         self.site = "湖北省政府采购网上商城"
 
         self.menus = [
-            Menu('定点采购-终止公告', 'hb_hbszfcgwssc_ddcg_zzgg_python', '4', 2),
+            Menu('定点采购-终止公告', 'hb_hbszfcgwssc_ddcg_zzgg_python', '4', 10),
         ]
-
         self.proxy = get_proxy()
         self.headers = {
             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
@@ -63,12 +42,13 @@ class Spider(feapder.BiddingListSpider):
     def start_requests(self):
         for menu in self.menus:
             url = f'http://wssc.hubeigp.gov.cn/upgrade/fixed_project_notices?type={menu.typeone}&pt=all'
-            yield feapder.Request(url, item=menu._asdict(), use_session=True, page=1, proxies=False)
+            yield feapder.Request(url, item=menu._asdict(), use_session=True, page=1, proxy=False)
 
     def download_midware(self, request):
         page = request.page
         menu = request.item
         request.headers = self.headers
+        request.proxies = self.proxy
 
         if page != 1:
             captcha, cookies = ocr_captcha(self.headers, self.proxy)
@@ -86,7 +66,7 @@ class Spider(feapder.BiddingListSpider):
     def validate(self, request, response):
         items = response.xpath('//div[@class="jmr_noticelist"]/ul/li')
         if not items:
-            raise ValueError('列表数据为空!')
+            raise ValueError('访问被拒绝!')
         return True
 
     def parse(self, request, response):

+ 65 - 0
hb_hbszfcgwssc_ddcg_zzgg_python/tools.py

@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-07-31
+---------
+@summary: captcha fetch + OCR helper imported by the spiders in this directory
+"""
+import logging
+import re
+
+import requests
+from untils.get_imgcode import jy_ocr
+
+logger = logging.getLogger(__name__)
+
+# NOTE(review): spider_list.py in this directory sets
+# SESSION_DOWNLOADER="tools.CustomSessionDownloader", but this module does not
+# define CustomSessionDownloader -- add the class here or drop that setting.
+
+
+def ocr_captcha(headers, proxies=None, max_retries=3):
+    """Solve the site captcha and return ``(code, cookies)``.
+
+    Each attempt loads the captcha page, pulls the image path out of it,
+    downloads the image and hands the bytes to ``jy_ocr``. A result counts
+    as solved only when it is exactly 6 characters long.
+
+    :param headers: headers sent with both the page and the image request
+    :param proxies: proxy mapping for the session; a falsy value
+        (``None``, ``False``, ``{}``) leaves the mapping empty
+    :param max_retries: maximum number of attempts
+    :return: ``(code, cookies)``; ``cookies`` is the session cookie dict.
+        If no attempt is solved, ``code`` is the last OCR result (possibly
+        empty or the wrong length), or ``''`` when OCR never ran.
+    """
+    with requests.session() as s:
+        s.proxies = proxies or {}
+
+        src = re.compile("'src', '(.*?)'", flags=re.S)  # captures the image path
+        href = 'http://wssc.hubeigp.gov.cn/simple_captcha'
+
+        code = ''
+        for attempt in range(1, max_retries + 1):
+            try:
+                resp = s.get(href, headers=headers, timeout=30, verify=False)
+                resp.raise_for_status()
+                found = src.search(resp.content.decode())
+                if found is None:
+                    # No image path on the page: retry rather than request
+                    # the bare site root and feed that response to OCR.
+                    logger.warning("captcha attempt %d/%d: image path not found",
+                                   attempt, max_retries)
+                    continue
+                img_url = "http://wssc.hubeigp.gov.cn" + found.group(1)
+                img = s.get(img_url, headers=headers, timeout=30, verify=False)
+                img.raise_for_status()
+                code = jy_ocr(image=img.content)
+                if code and len(code) == 6:
+                    return code, s.cookies.get_dict()
+            except requests.exceptions.RequestException as exc:
+                # A failed request uses up one attempt; the loop goes on to
+                # the next one instead of giving up.
+                logger.warning("captcha attempt %d/%d failed: %s",
+                               attempt, max_retries, exc)
+
+        return code, s.cookies.get_dict()

+ 7 - 29
hb_hbszfcgwssc_xygh_cjgg_python/spider_details.py

@@ -7,36 +7,15 @@ Created on 2024-01-08
 @author: lzz
 """
 import feapder
+from feapder.utils.tools import log
 from items.spider_item import DataBakItem
 from untils.attachment import AttachmentDownloader
-from feapder.utils.tools import log
-import requests
 from untils.tools import get_proxy
-import re
-from untils.get_imgcode import jy_ocr
-
-
-def ocr_captcha(headers, proxies=False, max_retries=10):
-    session = requests.session()
-    session.proxies = proxies
 
-    s = re.compile("'src', '(.*?)'", flags=re.S)  # src
-    href = 'http://wssc.hubeigp.gov.cn/simple_captcha'
+from tools import ocr_captcha
 
-    code = ''
-    for _ in range(max_retries):
-        resp1 = session.get(href, headers=headers, timeout=30, verify=False)
-        text = resp1.content.decode()
-        img_url = "http://wssc.hubeigp.gov.cn" + "".join(s.findall(text))
-        resp2 = session.get(img_url, headers=headers, timeout=30, verify=False)
-        code = jy_ocr(image=resp2.content)
-        if code and len(code) == 6:
-            break
 
-    return code, session.cookies.get_dict()
-
-
-class Details(feapder.BiddingDetailSpider):
+class Spider(feapder.BiddingDetailSpider):
     __custom_setting__ = dict(
         SPIDER_MAX_RETRY_TIMES=10
     )
@@ -56,6 +35,7 @@ class Details(feapder.BiddingDetailSpider):
             'zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'jpg',
             'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg', 'wps', 'ofd'
         ]
+        self.downloader = AttachmentDownloader()
 
     def start_requests(self):
         data_list = self.get_tasks_by_rabbitmq(limit=50)
@@ -71,9 +51,7 @@ class Details(feapder.BiddingDetailSpider):
 
     def download_midware(self, request):
         captcha, cookies = ocr_captcha(self.headers, self.proxy)
-        params = {
-            "captcha": captcha
-        }
+        params = {"captcha": captcha}
         request.params = params
         request.cookies = cookies
         request.headers = self.headers
@@ -105,7 +83,7 @@ class Details(feapder.BiddingDetailSpider):
                 file_type = file_url.split('.')[-1].lower()
 
             if file_type in self.file_types and "file" in file_url:
-                attachment = AttachmentDownloader().fetch_attachment(
+                attachment = self.downloader.fetch_attachment(
                     file_name=file_name,
                     file_type=file_type,
                     download_url=file_url
@@ -123,4 +101,4 @@ class Details(feapder.BiddingDetailSpider):
 
 
 if __name__ == "__main__":
-    Details(redis_key="lzz:Hbszfcgwssc").start()
+    Spider(redis_key="lzz:Hbszfcgwssc").start()

+ 1 - 22
hb_hbszfcgwssc_xygh_cjgg_python/spider_list.py

@@ -6,34 +6,13 @@ Created on 2024-01-08
 ---------
 @author: lzz
 """
-import re
 from collections import namedtuple
 
 import feapder
-import requests
 from items.spider_item import BidingListItem
-from untils.get_imgcode import jy_ocr
 from untils.tools import get_proxy
 
-
-def ocr_captcha(headers, proxies=False, max_retries=10):
-    session = requests.session()
-    session.proxies = proxies
-
-    s = re.compile("'src', '(.*?)'", flags=re.S)  # src
-    href = 'http://wssc.hubeigp.gov.cn/simple_captcha'
-
-    code = ''
-    for _ in range(max_retries):
-        resp1 = session.get(href, headers=headers, timeout=30, verify=False)
-        text = resp1.content.decode()
-        img_url = "http://wssc.hubeigp.gov.cn" + "".join(s.findall(text))
-        resp2 = session.get(img_url, headers=headers, timeout=30, verify=False)
-        code = jy_ocr(image=resp2.content)
-        if code and len(code) == 6:
-            break
-
-    return code, session.cookies.get_dict()
+from tools import ocr_captcha
 
 
 class Spider(feapder.BiddingListSpider):

+ 61 - 0
hb_hbszfcgwssc_xygh_cjgg_python/tools.py

@@ -0,0 +1,61 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-07-31
+---------
+@summary: captcha fetch + OCR helper imported by the spiders in this directory
+"""
+import logging
+import re
+
+import requests
+from untils.get_imgcode import jy_ocr
+
+logger = logging.getLogger(__name__)
+
+
+def ocr_captcha(headers, proxies=None, max_retries=3):
+    """Solve the site captcha and return ``(code, cookies)``.
+
+    Each attempt loads the captcha page, pulls the image path out of it,
+    downloads the image and hands the bytes to ``jy_ocr``. A result counts
+    as solved only when it is exactly 6 characters long.
+
+    :param headers: headers sent with both the page and the image request
+    :param proxies: proxy mapping for the session; a falsy value
+        (``None``, ``False``, ``{}``) leaves the mapping empty
+    :param max_retries: maximum number of attempts
+    :return: ``(code, cookies)``; ``cookies`` is the session cookie dict.
+        If no attempt is solved, ``code`` is the last OCR result (possibly
+        empty or the wrong length), or ``''`` when OCR never ran.
+    """
+    with requests.session() as s:
+        s.proxies = proxies or {}
+
+        src = re.compile("'src', '(.*?)'", flags=re.S)  # captures the image path
+        href = 'http://wssc.hubeigp.gov.cn/simple_captcha'
+
+        code = ''
+        for attempt in range(1, max_retries + 1):
+            try:
+                resp = s.get(href, headers=headers, timeout=30, verify=False)
+                resp.raise_for_status()
+                found = src.search(resp.content.decode())
+                if found is None:
+                    # No image path on the page: retry rather than request
+                    # the bare site root and feed that response to OCR.
+                    logger.warning("captcha attempt %d/%d: image path not found",
+                                   attempt, max_retries)
+                    continue
+                img_url = "http://wssc.hubeigp.gov.cn" + found.group(1)
+                img = s.get(img_url, headers=headers, timeout=30, verify=False)
+                img.raise_for_status()
+                code = jy_ocr(image=img.content)
+                if code and len(code) == 6:
+                    return code, s.cookies.get_dict()
+            except requests.exceptions.RequestException as exc:
+                # A failed request uses up one attempt; the loop goes on to
+                # the next one instead of giving up.
+                logger.warning("captcha attempt %d/%d failed: %s",
+                               attempt, max_retries, exc)
+
+        return code, s.cookies.get_dict()

+ 7 - 26
hb_hbszfcgwssc_xygh_zzgg_python/spider_details.py

@@ -7,36 +7,15 @@ Created on 2024-01-08
 @author: lzz
 """
 import feapder
+from feapder.utils.tools import log
 from items.spider_item import DataBakItem
 from untils.attachment import AttachmentDownloader
-from feapder.utils.tools import log
-import requests
 from untils.tools import get_proxy
-import re
-from untils.get_imgcode import jy_ocr
-
-
-def ocr_captcha(headers, proxies=False, max_retries=10):
-    session = requests.session()
-    session.proxies = proxies
 
-    s = re.compile("'src', '(.*?)'", flags=re.S)  # src
-    href = 'http://wssc.hubeigp.gov.cn/simple_captcha'
+from tools import ocr_captcha
 
-    code = ''
-    for _ in range(max_retries):
-        resp1 = session.get(href, headers=headers, timeout=30, verify=False)
-        text = resp1.content.decode()
-        img_url = "http://wssc.hubeigp.gov.cn" + "".join(s.findall(text))
-        resp2 = session.get(img_url, headers=headers, timeout=30, verify=False)
-        code = jy_ocr(image=resp2.content)
-        if code and len(code) == 6:
-            break
 
-    return code, session.cookies.get_dict()
-
-
-class Details(feapder.BiddingDetailSpider):
+class Spider(feapder.BiddingDetailSpider):
     __custom_setting__ = dict(
         SPIDER_MAX_RETRY_TIMES=10
     )
@@ -57,6 +36,8 @@ class Details(feapder.BiddingDetailSpider):
             'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg', 'wps', 'ofd'
         ]
 
+        self.downloader = AttachmentDownloader()
+
     def start_requests(self):
         data_list = self.get_tasks_by_rabbitmq(limit=50)
         for item in data_list:
@@ -105,7 +86,7 @@ class Details(feapder.BiddingDetailSpider):
                 file_type = file_url.split('.')[-1].lower()
 
             if file_type in self.file_types and "file" in file_url:
-                attachment = AttachmentDownloader().fetch_attachment(
+                attachment = self.downloader.fetch_attachment(
                     file_name=file_name,
                     file_type=file_type,
                     download_url=file_url
@@ -123,4 +104,4 @@ class Details(feapder.BiddingDetailSpider):
 
 
 if __name__ == "__main__":
-    Details(redis_key="lzz:Hbszfcgwssc").start()
+    Spider(redis_key="lzz:Hbszfcgwssc").start()

+ 1 - 22
hb_hbszfcgwssc_xygh_zzgg_python/spider_list.py

@@ -6,34 +6,13 @@ Created on 2024-01-08
 ---------
 @author: lzz
 """
-import re
 from collections import namedtuple
 
 import feapder
-import requests
 from items.spider_item import BidingListItem
-from untils.get_imgcode import jy_ocr
 from untils.tools import get_proxy
 
-
-def ocr_captcha(headers, proxies=False, max_retries=10):
-    session = requests.session()
-    session.proxies = proxies
-
-    s = re.compile("'src', '(.*?)'", flags=re.S)  # src
-    href = 'http://wssc.hubeigp.gov.cn/simple_captcha'
-
-    code = ''
-    for _ in range(max_retries):
-        resp1 = session.get(href, headers=headers, timeout=30, verify=False)
-        text = resp1.content.decode()
-        img_url = "http://wssc.hubeigp.gov.cn" + "".join(s.findall(text))
-        resp2 = session.get(img_url, headers=headers, timeout=30, verify=False)
-        code = jy_ocr(image=resp2.content)
-        if code and len(code) == 6:
-            break
-
-    return code, session.cookies.get_dict()
+from tools import ocr_captcha
 
 
 class Spider(feapder.BiddingListSpider):

+ 61 - 0
hb_hbszfcgwssc_xygh_zzgg_python/tools.py

@@ -0,0 +1,61 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-07-31
+---------
+@summary: captcha fetch + OCR helper imported by the spiders in this directory
+"""
+import logging
+import re
+
+import requests
+from untils.get_imgcode import jy_ocr
+
+logger = logging.getLogger(__name__)
+
+
+def ocr_captcha(headers, proxies=None, max_retries=3):
+    """Solve the site captcha and return ``(code, cookies)``.
+
+    Each attempt loads the captcha page, pulls the image path out of it,
+    downloads the image and hands the bytes to ``jy_ocr``. A result counts
+    as solved only when it is exactly 6 characters long.
+
+    :param headers: headers sent with both the page and the image request
+    :param proxies: proxy mapping for the session; a falsy value
+        (``None``, ``False``, ``{}``) leaves the mapping empty
+    :param max_retries: maximum number of attempts
+    :return: ``(code, cookies)``; ``cookies`` is the session cookie dict.
+        If no attempt is solved, ``code`` is the last OCR result (possibly
+        empty or the wrong length), or ``''`` when OCR never ran.
+    """
+    with requests.session() as s:
+        s.proxies = proxies or {}
+
+        src = re.compile("'src', '(.*?)'", flags=re.S)  # captures the image path
+        href = 'http://wssc.hubeigp.gov.cn/simple_captcha'
+
+        code = ''
+        for attempt in range(1, max_retries + 1):
+            try:
+                resp = s.get(href, headers=headers, timeout=30, verify=False)
+                resp.raise_for_status()
+                found = src.search(resp.content.decode())
+                if found is None:
+                    # No image path on the page: retry rather than request
+                    # the bare site root and feed that response to OCR.
+                    logger.warning("captcha attempt %d/%d: image path not found",
+                                   attempt, max_retries)
+                    continue
+                img_url = "http://wssc.hubeigp.gov.cn" + found.group(1)
+                img = s.get(img_url, headers=headers, timeout=30, verify=False)
+                img.raise_for_status()
+                code = jy_ocr(image=img.content)
+                if code and len(code) == 6:
+                    return code, s.cookies.get_dict()
+            except requests.exceptions.RequestException as exc:
+                # A failed request uses up one attempt; the loop goes on to
+                # the next one instead of giving up.
+                logger.warning("captcha attempt %d/%d failed: %s",
+                               attempt, max_retries, exc)
+
+        return code, s.cookies.get_dict()

+ 6 - 26
hb_hbszfcgwssc_zxcg_cjgg_python/spider_details.py

@@ -7,36 +7,15 @@ Created on 2024-01-08
 @author: lzz
 """
 import feapder
+from feapder.utils.tools import log
 from items.spider_item import DataBakItem
 from untils.attachment import AttachmentDownloader
-from feapder.utils.tools import log
-import requests
 from untils.tools import get_proxy
-import re
-from untils.get_imgcode import jy_ocr
-
-
-def ocr_captcha(headers, proxies=False, max_retries=10):
-    session = requests.session()
-    session.proxies = proxies
-
-    s = re.compile("'src', '(.*?)'", flags=re.S)  # src
-    href = 'http://wssc.hubeigp.gov.cn/simple_captcha'
-
-    code = ''
-    for _ in range(max_retries):
-        resp1 = session.get(href, headers=headers, timeout=30, verify=False)
-        text = resp1.content.decode()
-        img_url = "http://wssc.hubeigp.gov.cn" + "".join(s.findall(text))
-        resp2 = session.get(img_url, headers=headers, timeout=30, verify=False)
-        code = jy_ocr(image=resp2.content)
-        if code and len(code) == 6:
-            break
 
-    return code, session.cookies.get_dict()
+from tools import ocr_captcha
 
 
-class Details(feapder.BiddingDetailSpider):
+class Spider(feapder.BiddingDetailSpider):
     __custom_setting__ = dict(
         SPIDER_MAX_RETRY_TIMES=10
     )
@@ -56,6 +35,7 @@ class Details(feapder.BiddingDetailSpider):
             'zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'jpg',
             'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg', 'wps', 'ofd'
         ]
+        self.downloader = AttachmentDownloader()
 
     def start_requests(self):
         data_list = self.get_tasks_by_rabbitmq(limit=50)
@@ -105,7 +85,7 @@ class Details(feapder.BiddingDetailSpider):
                 file_type = file_url.split('.')[-1].lower()
 
             if file_type in self.file_types and "file" in file_url:
-                attachment = AttachmentDownloader().fetch_attachment(
+                attachment = self.downloader.fetch_attachment(
                     file_name=file_name,
                     file_type=file_type,
                     download_url=file_url
@@ -123,4 +103,4 @@ class Details(feapder.BiddingDetailSpider):
 
 
 if __name__ == "__main__":
-    Details(redis_key="lzz:Hbszfcgwssc").start()
+    Spider(redis_key="lzz:Hbszfcgwssc").start()

+ 1 - 22
hb_hbszfcgwssc_zxcg_cjgg_python/spider_list.py

@@ -6,34 +6,13 @@ Created on 2024-01-08
 ---------
 @author: lzz
 """
-import re
 from collections import namedtuple
 
 import feapder
-import requests
 from items.spider_item import BidingListItem
-from untils.get_imgcode import jy_ocr
 from untils.tools import get_proxy
 
-
-def ocr_captcha(headers, proxies=False, max_retries=10):
-    session = requests.session()
-    session.proxies = proxies
-
-    s = re.compile("'src', '(.*?)'", flags=re.S)  # src
-    href = 'http://wssc.hubeigp.gov.cn/simple_captcha'
-
-    code = ''
-    for _ in range(max_retries):
-        resp1 = session.get(href, headers=headers, timeout=30, verify=False)
-        text = resp1.content.decode()
-        img_url = "http://wssc.hubeigp.gov.cn" + "".join(s.findall(text))
-        resp2 = session.get(img_url, headers=headers, timeout=30, verify=False)
-        code = jy_ocr(image=resp2.content)
-        if code and len(code) == 6:
-            break
-
-    return code, session.cookies.get_dict()
+from tools import ocr_captcha
 
 
 class Spider(feapder.BiddingListSpider):

+ 61 - 0
hb_hbszfcgwssc_zxcg_cjgg_python/tools.py

@@ -0,0 +1,61 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-07-31
+---------
+@summary: captcha fetch + OCR helper imported by the spiders in this directory
+"""
+import logging
+import re
+
+import requests
+from untils.get_imgcode import jy_ocr
+
+logger = logging.getLogger(__name__)
+
+
+def ocr_captcha(headers, proxies=None, max_retries=3):
+    """Solve the site captcha and return ``(code, cookies)``.
+
+    Each attempt loads the captcha page, pulls the image path out of it,
+    downloads the image and hands the bytes to ``jy_ocr``. A result counts
+    as solved only when it is exactly 6 characters long.
+
+    :param headers: headers sent with both the page and the image request
+    :param proxies: proxy mapping for the session; a falsy value
+        (``None``, ``False``, ``{}``) leaves the mapping empty
+    :param max_retries: maximum number of attempts
+    :return: ``(code, cookies)``; ``cookies`` is the session cookie dict.
+        If no attempt is solved, ``code`` is the last OCR result (possibly
+        empty or the wrong length), or ``''`` when OCR never ran.
+    """
+    with requests.session() as s:
+        s.proxies = proxies or {}
+
+        src = re.compile("'src', '(.*?)'", flags=re.S)  # captures the image path
+        href = 'http://wssc.hubeigp.gov.cn/simple_captcha'
+
+        code = ''
+        for attempt in range(1, max_retries + 1):
+            try:
+                resp = s.get(href, headers=headers, timeout=30, verify=False)
+                resp.raise_for_status()
+                found = src.search(resp.content.decode())
+                if found is None:
+                    # No image path on the page: retry rather than request
+                    # the bare site root and feed that response to OCR.
+                    logger.warning("captcha attempt %d/%d: image path not found",
+                                   attempt, max_retries)
+                    continue
+                img_url = "http://wssc.hubeigp.gov.cn" + found.group(1)
+                img = s.get(img_url, headers=headers, timeout=30, verify=False)
+                img.raise_for_status()
+                code = jy_ocr(image=img.content)
+                if code and len(code) == 6:
+                    return code, s.cookies.get_dict()
+            except requests.exceptions.RequestException as exc:
+                # A failed request uses up one attempt; the loop goes on to
+                # the next one instead of giving up.
+                logger.warning("captcha attempt %d/%d failed: %s",
+                               attempt, max_retries, exc)
+
+        return code, s.cookies.get_dict()

+ 6 - 26
hb_hbszfcgwssc_zxcg_zzgg_python/spider_details.py

@@ -7,36 +7,15 @@ Created on 2024-01-08
 @author: lzz
 """
 import feapder
+from feapder.utils.tools import log
 from items.spider_item import DataBakItem
 from untils.attachment import AttachmentDownloader
-from feapder.utils.tools import log
-import requests
 from untils.tools import get_proxy
-import re
-from untils.get_imgcode import jy_ocr
-
-
-def ocr_captcha(headers, proxies=False, max_retries=10):
-    session = requests.session()
-    session.proxies = proxies
-
-    s = re.compile("'src', '(.*?)'", flags=re.S)  # src
-    href = 'http://wssc.hubeigp.gov.cn/simple_captcha'
-
-    code = ''
-    for _ in range(max_retries):
-        resp1 = session.get(href, headers=headers, timeout=30, verify=False)
-        text = resp1.content.decode()
-        img_url = "http://wssc.hubeigp.gov.cn" + "".join(s.findall(text))
-        resp2 = session.get(img_url, headers=headers, timeout=30, verify=False)
-        code = jy_ocr(image=resp2.content)
-        if code and len(code) == 6:
-            break
 
-    return code, session.cookies.get_dict()
+from tools import ocr_captcha
 
 
-class Details(feapder.BiddingDetailSpider):
+class Spider(feapder.BiddingDetailSpider):
     __custom_setting__ = dict(
         SPIDER_MAX_RETRY_TIMES=10
     )
@@ -56,6 +35,7 @@ class Details(feapder.BiddingDetailSpider):
             'zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'jpg',
             'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg', 'wps', 'ofd'
         ]
+        self.downloader = AttachmentDownloader()
 
     def start_requests(self):
         data_list = self.get_tasks_by_rabbitmq(limit=50)
@@ -105,7 +85,7 @@ class Details(feapder.BiddingDetailSpider):
                 file_type = file_url.split('.')[-1].lower()
 
             if file_type in self.file_types and "file" in file_url:
-                attachment = AttachmentDownloader().fetch_attachment(
+                attachment = self.downloader.fetch_attachment(
                     file_name=file_name,
                     file_type=file_type,
                     download_url=file_url
@@ -123,4 +103,4 @@ class Details(feapder.BiddingDetailSpider):
 
 
 if __name__ == "__main__":
-    Details(redis_key="lzz:Hbszfcgwssc").start()
+    Spider(redis_key="lzz:Hbszfcgwssc").start()

+ 1 - 22
hb_hbszfcgwssc_zxcg_zzgg_python/spider_list.py

@@ -6,34 +6,13 @@ Created on 2024-01-08
 ---------
 @author: lzz
 """
-import re
 from collections import namedtuple
 
 import feapder
-import requests
 from items.spider_item import BidingListItem
-from untils.get_imgcode import jy_ocr
 from untils.tools import get_proxy
 
-
-def ocr_captcha(headers, proxies=False, max_retries=10):
-    session = requests.session()
-    session.proxies = proxies
-
-    s = re.compile("'src', '(.*?)'", flags=re.S)  # src
-    href = 'http://wssc.hubeigp.gov.cn/simple_captcha'
-
-    code = ''
-    for _ in range(max_retries):
-        resp1 = session.get(href, headers=headers, timeout=30, verify=False)
-        text = resp1.content.decode()
-        img_url = "http://wssc.hubeigp.gov.cn" + "".join(s.findall(text))
-        resp2 = session.get(img_url, headers=headers, timeout=30, verify=False)
-        code = jy_ocr(image=resp2.content)
-        if code and len(code) == 6:
-            break
-
-    return code, session.cookies.get_dict()
+from tools import ocr_captcha
 
 
 class Spider(feapder.BiddingListSpider):

+ 61 - 0
hb_hbszfcgwssc_zxcg_zzgg_python/tools.py

@@ -0,0 +1,61 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-07-31
+---------
+@summary: captcha fetch + OCR helper imported by the spiders in this directory
+"""
+import logging
+import re
+
+import requests
+from untils.get_imgcode import jy_ocr
+
+logger = logging.getLogger(__name__)
+
+
+def ocr_captcha(headers, proxies=None, max_retries=3):
+    """Solve the site captcha and return ``(code, cookies)``.
+
+    Each attempt loads the captcha page, pulls the image path out of it,
+    downloads the image and hands the bytes to ``jy_ocr``. A result counts
+    as solved only when it is exactly 6 characters long.
+
+    :param headers: headers sent with both the page and the image request
+    :param proxies: proxy mapping for the session; a falsy value
+        (``None``, ``False``, ``{}``) leaves the mapping empty
+    :param max_retries: maximum number of attempts
+    :return: ``(code, cookies)``; ``cookies`` is the session cookie dict.
+        If no attempt is solved, ``code`` is the last OCR result (possibly
+        empty or the wrong length), or ``''`` when OCR never ran.
+    """
+    with requests.session() as s:
+        s.proxies = proxies or {}
+
+        src = re.compile("'src', '(.*?)'", flags=re.S)  # captures the image path
+        href = 'http://wssc.hubeigp.gov.cn/simple_captcha'
+
+        code = ''
+        for attempt in range(1, max_retries + 1):
+            try:
+                resp = s.get(href, headers=headers, timeout=30, verify=False)
+                resp.raise_for_status()
+                found = src.search(resp.content.decode())
+                if found is None:
+                    # No image path on the page: retry rather than request
+                    # the bare site root and feed that response to OCR.
+                    logger.warning("captcha attempt %d/%d: image path not found",
+                                   attempt, max_retries)
+                    continue
+                img_url = "http://wssc.hubeigp.gov.cn" + found.group(1)
+                img = s.get(img_url, headers=headers, timeout=30, verify=False)
+                img.raise_for_status()
+                code = jy_ocr(image=img.content)
+                if code and len(code) == 6:
+                    return code, s.cookies.get_dict()
+            except requests.exceptions.RequestException as exc:
+                # A failed request uses up one attempt; the loop goes on to
+                # the next one instead of giving up.
+                logger.warning("captcha attempt %d/%d failed: %s",
+                               attempt, max_retries, exc)
+
+        return code, s.cookies.get_dict()