
Script code update

dzr, 4 months ago
Parent
Current commit
69951fb6c9
100 changed files with 11,349 additions and 1,219 deletions
  1. lzz_theme/bqgyjtgscgdzswpt/details_bqgyjt.py (+317 -0)
  2. lzz_theme/bqgyjtgscgdzswpt/list_bqgyjt.py (+170 -0)
  3. lzz_theme/bqgyjtgscgdzswpt/login_account.py (+59 -0)
  4. lzz_theme/bqgyjtgscgdzswpt/pass_slide.py (+106 -0)
  5. lzz_theme/bqgyjtgscgdzswpt/start.sh (+7 -0)
  6. lzz_theme/clgjzbcgjtyxgs/clgj_cookies.txt (+1 -1)
  7. lzz_theme/clgjzbcgjtyxgs/list_start.sh (+192 -0)
  8. lzz_theme/cqszfcgyptfwcs/cqszfcgy_cgxq_list.py (+171 -0)
  9. lzz_theme/cqszfcgyptfwcs/cqszfcgy_details.py (+176 -0)
  10. lzz_theme/cqszfcgyptfwcs/cqszfcgy_list.py (+157 -0)
  11. lzz_theme/cqszfcgyptfwcs/cqszfcgy_qx_details.py (+117 -0)
  12. lzz_theme/cqszfcgyptfwcs/start.sh (+15 -0)
  13. lzz_theme/gdgczbzxyxgs/gdgczb_cookies.json (+1 -1)
  14. lzz_theme/gdgczbzxyxgs/start.sh (+309 -0)
  15. lzz_theme/gnzggzyjyzx/gnz_gn_zbjh.py (+181 -0)
  16. lzz_theme/gnzggzyjyzx/gnz_gn_zgys.py (+235 -0)
  17. lzz_theme/gnzggzyjyzx/gnz_sx_zb.py (+211 -0)
  18. lzz_theme/gnzggzyjyzx/gnz_sx_zgys.py (+181 -0)
  19. lzz_theme/gnzggzyjyzx/gnz_ygcg.py (+211 -0)
  20. lzz_theme/gnzggzyjyzx/start.sh (+14 -0)
  21. lzz_theme/hnszfcgdzmc/dt_start.sh (+0 -1)
  22. lzz_theme/hnszfcgdzmc/hn_collector.py (+3 -3)
  23. lzz_theme/hnszfcgdzmc/jjjg_spider.py (+6 -32)
  24. lzz_theme/hnszfcgdzmc/spider.py (+2 -14)
  25. lzz_theme/hnszfcgdzmc/start.sh (+135 -0)
  26. lzz_theme/hnszfcgdzmc/zxjj_spider.py (+4 -10)
  27. lzz_theme/htdzcgpt/htdz_bggg_list.py (+160 -0)
  28. lzz_theme/htdzcgpt/htdz_cjgg_details.py (+99 -0)
  29. lzz_theme/htdzcgpt/htdz_cjgg_list.py (+151 -0)
  30. lzz_theme/htdzcgpt/htdz_jzxtp_details.py (+134 -0)
  31. lzz_theme/htdzcgpt/htdz_jzxtp_list.py (+179 -0)
  32. lzz_theme/htdzcgpt/htdz_login.py (+111 -0)
  33. lzz_theme/htdzcgpt/htdz_qtcg_list.py (+139 -0)
  34. lzz_theme/htdzcgpt/htdz_xjgg_details.py (+133 -0)
  35. lzz_theme/htdzcgpt/htdz_xjgg_list.py (+174 -0)
  36. lzz_theme/htdzcgpt/htdz_zbgg_details.py (+149 -0)
  37. lzz_theme/htdzcgpt/htdz_zbgg_list.py (+153 -0)
  38. lzz_theme/htdzcgpt/htdz_zbhxrgs_details.py (+153 -0)
  39. lzz_theme/htdzcgpt/htdz_zbhxrgs_list.py (+158 -0)
  40. lzz_theme/htdzcgpt/start.sh (+28 -0)
  41. lzz_theme/jsxmhjyxdjbbaxt/det_start.sh (+29 -0)
  42. lzz_theme/lcdzcgpt/lcdz_details.py (+96 -0)
  43. lzz_theme/lcdzcgpt/lcdz_list.py (+167 -0)
  44. lzz_theme/lcdzcgpt/lcdz_login.py (+43 -0)
  45. lzz_theme/lcdzcgpt/start.sh (+5 -0)
  46. lzz_theme/package-lock.json (+416 -0)
  47. lzz_theme/package.json (+6 -0)
  48. lzz_theme/qgzbgggsssyq/py_ssyq_details.py (+2 -2)
  49. lzz_theme/qgzbgggsssyq/py_ssyq_details2.py (+2 -2)
  50. lzz_theme/qgzbgggsssyq/py_ssyq_details3.py (+2 -2)
  51. lzz_theme/qgzbgggsssyq/py_ssyq_details4.py (+2 -2)
  52. lzz_theme/qgzbgggsssyq/py_ssyq_details_bu.py (+2 -2)
  53. lzz_theme/qgzbgggsssyq/sscrawl_details.py (+2 -2)
  54. lzz_theme/qgzbgggsssyq/sscrawl_list.py (+15 -11)
  55. lzz_theme/qgzbgggsssyq/start.sh (+126 -0)
  56. lzz_theme/qjwqzbcgxxw/qjwqzb_details.py (+126 -0)
  57. lzz_theme/qjwqzbcgxxw/qjwqzb_list.py (+149 -0)
  58. lzz_theme/qjwqzbcgxxw/start.sh (+8 -0)
  59. lzz_theme/rm_file.sh (+6 -0)
  60. lzz_theme/sfc/login_account.py (+59 -0)
  61. lzz_theme/sfc/pass_slide.py (+113 -0)
  62. lzz_theme/sfc/sfc_cjgg_detail.py (+200 -0)
  63. lzz_theme/sfc/sfc_cjgg_list.py (+172 -0)
  64. lzz_theme/sfc/sfc_cookies.txt (+1 -0)
  65. lzz_theme/sfc/sfc_gkbx_list.py (+180 -0)
  66. lzz_theme/sfc/sfc_gzgg_detail.py (+230 -0)
  67. lzz_theme/sfc/sfc_gzgg_list.py (+175 -0)
  68. lzz_theme/sfc/sfc_htgg_detail.py (+204 -0)
  69. lzz_theme/sfc/sfc_htgg_list.py (+172 -0)
  70. lzz_theme/sfc/sfc_uuid.txt (+1 -0)
  71. lzz_theme/sfc/sfc_zzgg_detail.py (+134 -0)
  72. lzz_theme/sfc/sfc_zzgg_list.py (+172 -0)
  73. lzz_theme/sfc/slice.png (binary)
  74. lzz_theme/sfc/start.sh (+22 -0)
  75. lzz_theme/sgycw/sgycw_ck.json (+0 -1)
  76. lzz_theme/sgycw/start.sh (binary)
  77. lzz_theme/szycycgpt/pass_slide.py (+0 -29)
  78. lzz_theme/szycycgpt/slice.png (binary)
  79. lzz_theme/szycycgpt/start.sh (+0 -3)
  80. lzz_theme/szycycgpt/szyc_list.py (+0 -1)
  81. lzz_theme/tjszfcgw/start.sh (+1 -4)
  82. lzz_theme/tjszfcgw/tjszfcgw_details2.py (+1 -4)
  83. lzz_theme/utils/PYCCS_cookies.py (+67 -0)
  84. lzz_theme/utils/RedisDB.py (+92 -0)
  85. lzz_theme/utils/aliyun.py (+43 -0)
  86. lzz_theme/utils/attachment.py (+308 -0)
  87. lzz_theme/utils/chaojiying.py (+89 -0)
  88. lzz_theme/utils/check_utils.py (+118 -0)
  89. lzz_theme/utils/clean_html.py (+147 -0)
  90. lzz_theme/utils/es_query.py (+58 -0)
  91. lzz_theme/utils/execptions.py (+37 -0)
  92. lzz_theme/utils/get_imgcode.py (+130 -0)
  93. lzz_theme/utils/js/stealth.min.js (+6 -0)
  94. lzz_theme/utils/robbot.py (+18 -0)
  95. lzz_theme/utils/title_participle.py (+50 -0)
  96. lzz_theme/utils/tools.py (+699 -0)
  97. lzz_theme/utils/webdriver.py (+461 -0)
  98. lzz_theme/xgyyglj/start.sh (+308 -310)
  99. lzz_theme/yyc/yyc_ck.json (+1 -1)
  100. lzz_theme/yyc/yyc_zbgg_details.py (+774 -781)

+ 317 - 0
lzz_theme/bqgyjtgscgdzswpt/details_bqgyjt.py

@@ -0,0 +1,317 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-02
+---------
+@summary: 兵器工业集团公司采购电子商务平台 (Norinco Group procurement e-commerce platform) - detail page
+---------
+@author: Lzz
+"""
+import sys
+import os
+import time
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.attachment import AttachmentDownloader
+from utils.tools import *
+from login_account import Login
+import warnings
+
+warnings.filterwarnings('ignore')
+
+
+class Details:
+
+    def __init__(self):
+        self.db_table = Mongo_client().py_spider
+        self.db_name = self.db_table.theme_list
+        self.zt_details = self.db_table.data_bak
+        self.Authorization = self.get_cookies()
+
+    def get_cookies(self):
+        if not os.path.isfile('./bqgy_cookies.txt'):
+            Login()
+
+        with open('./bqgy_cookies.txt', 'r', encoding='utf-8') as fr:
+            cks = fr.read()
+        return cks
+
+    def get_cgmx(self, hid):
+        cgmx_html = ""
+        headers = {
+            "Authorization": f"{self.Authorization}",
+            "Referer": "https://newtd.norincogroup-ebuy.com/",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+            "Accept": "application/json, text/plain, */*",
+            "Content-Type": "application/json;charset=utf-8"
+        }
+        url = f"https://msapi.norincogroup-ebuy.com/tdweb/index/source/{hid}"
+        params = {
+            "page": "1",
+            "size": "500",
+            "ssid": "M80160",
+            "sid": "M80160",
+            "subsystem": "S00000",
+            "_t": f"{int(time.time() * 1000)}"
+        }
+        try:
+            response = requests.get(url, headers=headers, params=params, timeout=30, verify=False)
+            time.sleep(1)
+            if "登录信息过期" in response.text:
+                try:
+                    os.remove('./bqgy_cookies.txt')
+                except:
+                    pass
+                raise EOFError("登录信息过期!")
+            data_info = response.json().get('data').get('goodsPage', {}).get('list')
+
+            if data_info:
+                index = 1
+                for info in data_info:
+                    cgmx_html += f'''
+                    <tr>
+                        <td>{index}</td>
+                        <td>{info.get('goodsName')}</td>
+                        <td>{info.get('categoryTotal')}</td>
+                        <td>{info.get('purchaseNum')}{info.get('measure')}</td>
+                        <td>{info.get('minReplyNum')}{info.get('measure')}</td>
+                        <td>{info.get('str1')}</td>
+                        <td>{info.get('str2')}</td>
+                    </tr>
+                    '''
+                    index += 1
+
+        except:
+            pass
+
+        return f'''
+        <div>采购明细</div>
+        <table>
+            <thead>
+            <tr>
+                <th>序号</th>
+                <th>商品名称</th>
+                <th>品类</th>
+                <th>采购数量</th>
+                <th>最少响应量</th>
+                <th>技术要求/型号</th>
+                <th>报价要求</th>
+            </tr>
+            </thead>
+        
+            <tbody>
+            {cgmx_html}
+            </tbody>
+        </table>
+        '''
+
+    def get_cjgg(self, hid):
+        cjgg_html = ""
+        headers = {
+            "Authorization": f"{self.Authorization}",
+            "Referer": "https://newtd.norincogroup-ebuy.com/",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+            "Accept": "application/json, text/plain, */*",
+            "Content-Type": "application/json;charset=utf-8"
+        }
+        url = f"https://msapi.norincogroup-ebuy.com/tdweb/index/price/traded/{hid}"
+        params = {
+            "page": "1",
+            "size": "500",
+            "ssid": "M80160",
+            "sid": "M80160",
+            "subsystem": "S00000",
+            "_t": f"{int(time.time() * 1000)}"
+        }
+        try:
+            response = requests.get(url, headers=headers, params=params, timeout=30, verify=False)
+            time.sleep(1)
+            if "登录信息过期" in response.text:
+                try:
+                    os.remove('./bqgy_cookies.txt')
+                except:
+                    pass
+                raise EOFError("登录信息过期!")
+            data_info = response.json().get('data').get('pricePage', {}).get('list')
+
+            if data_info:
+                index = 1
+                for info in data_info:
+                    cjgg_html += f'''
+                    <tr>
+                        <td>{index}</td>
+                        <td>{info.get('goodsName')}</td>
+                        <td>{info.get('joinInfoDTO', {}).get('sellMbName')}</td>
+                        <td>{info.get('str1')}</td>
+                        <td>{info.get('replyNum')}{info.get('measure')}</td>
+                        <td>{info.get('tradeNum')}{info.get('measure')}</td>
+                        <td>{info.get('tradeSumPrice', '0')}元</td>
+                        <td>{info.get('arrivalDate')}</td>
+                        <td>{info.get('arrivalPlace')}</td>
+                        <td>{info.get('contractCode')}</td>
+                    </tr>
+                    '''
+                    index += 1
+
+        except:
+            pass
+
+        return f'''
+        <div>成交公告</div>
+        <table>
+            <thead>
+            <tr>
+                <th>序号</th>
+                <th>商品名称</th>
+                <th>供应商名称</th>
+                <th>型号</th>
+                <th>供应量</th>
+                <th>成交数量</th>
+                <th>成交总价</th>
+                <th>到货日期</th>
+                <th>制造商/到站地</th>
+                <th>订单号</th>
+            </tr>
+            </thead>
+
+            <tbody>
+            {cjgg_html}
+            </tbody>
+        </table>
+        '''
+
+    def detail_get(self, response, item):
+        hid = item['competehref'].split('id=')[-1]
+        if "登录信息过期" in response.text:
+            try:
+                os.remove('./bqgy_cookies.txt')
+            except:
+                pass
+            return
+
+        dt = response.json().get('data')
+        file_list = dt.get('fileList')
+        file_html = ""
+        attachments = {}
+
+        cgmx_html = self.get_cgmx(hid)
+        cjgg_html = self.get_cjgg(hid)
+
+        if file_list:
+            for info in file_list:
+                file_name = info.get('fileName')
+                base_url = "https://resource.norincogroup-ebuy.com/v1/fs/web/download/public/"
+                file_url = base_url + info.get('filePath') + f"?fileName={file_name}&download=1"
+                file_type = extract_file_type(file_name=file_name, file_url=file_url)
+                if file_type:
+                    file_html += f'<div><a href="{file_url}">{file_name}</a></div>'
+                    attachment = AttachmentDownloader().fetch_attachment(
+                        file_name=file_name, file_type=file_type, download_url=file_url)
+                    attachments[str(len(attachments) + 1)] = attachment
+
+        if attachments:
+            item['projectinfo'] = {"attachments": attachments}
+
+        html = f'''
+        <div>基本信息</div>
+        <div>
+            <span>{dt.get('stage', {}).get('name')}</span> <h6>{dt.get('code')}</h6>
+            <ul>
+                <li><label>发布单位:</label> 
+                    <span>{dt.get('buyMbName')}</span>
+                </li>
+                <li><label for="">最终单位:</label><span>{dt.get('finalMbName')}</span></li>
+            </ul>
+            <ul>
+                <li><label>参与方式:</label><span>{dt.get('isOriented', {}).get('name')}</span></li>
+                <li><label>出价方式:</label><span>{dt.get('priceMethod', {}).get('name')}</span></li>
+                <li><label>付款方式:</label><span>{dt.get('paymentMethod')}</span></li>
+                <li><label style="width: 150px;">是否必须加盖电子签章:</label>
+                <span>{dt.get('needDeposit', {}).get('name')}</span></li>
+            </ul>
+            <ul>
+                <li><label>保证金:</label><span>{dt.get('deposit', '0')}元</span></li>
+                <li><label>附件:</label>
+                    {file_html}
+                </li>
+            </ul>
+            <div>
+                <p>{dt.get('contact')}</p>
+                <p>{dt.get('contactMobile')}</p>
+            </div>
+            <p>{dt.get('remark')}</p>
+        </div>
+        '''
+
+        item["contenthtml"] = (html + cgmx_html + cjgg_html).replace('None', '')
+
+        item = format_fileds(item)
+
+        try:
+            self.zt_details.insert_one(item)
+            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
+        except DuplicateKeyError:
+            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
+
+    def fetch_request(self, item):
+
+        headers = {
+            "Authorization": f"{self.Authorization}",
+            "Referer": "https://newtd.norincogroup-ebuy.com/",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+            "Accept": "application/json, text/plain, */*",
+            "Content-Type": "application/json;charset=utf-8"
+        }
+        params = {
+            "ssid": "M80160",
+            "sid": "M80160",
+            "subsystem": "S00000",
+            "_t": f"{int(time.time() * 1000)}"
+        }
+        response = requests.get(url=item.get("parse_url"), headers=headers, params=params, timeout=(30, 30),
+                                verify=False)
+        time.sleep(1)
+        return response
+
+    def deal_request(self, item):
+        retry_times = 0
+        org_item = item.copy()
+        while retry_times < 5:
+            try:
+                response = self.fetch_request(item)
+                res_code = response.status_code
+                if response and res_code == 200:
+                    self.detail_get(response, item=item)
+                    return True
+                else:
+                    retry_times += 1
+                    time.sleep(3)
+            except Exception as e:
+                item = org_item
+                logger.error(f"{item.get('competehref')} 采集异常:{e}")
+                retry_times += 1
+                time.sleep(random.randint(3, 6))
+        logger.warning(f"[采集失败]{item.get('competehref')}")
+        return False
+
+    def start(self, limit=1):
+        logger.debug("********** 详情页采集开始 **********")
+        logger.debug("********** 等待60s列表页数据加载... **********")
+        time.sleep(60)
+        with self.db_name.find({"parser_name": "ztpc_bqgyjtgscgdzswpt", "is_crawl": False, "failed": False},
+                               sort=[('publishtime', -1)]).limit(limit) as cursor:
+            data_list = [dd for dd in cursor]
+        for item in data_list:
+            # logger.debug(item)
+            update_id = item["_id"]
+            result = self.deal_request(item)
+            if result is True:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"is_crawl": True}})
+            else:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"failed": True}})
+            time.sleep(random.randint(5, 10))
+
+        logger.debug("********** 详情页采集结束 **********")
+
+
+if __name__ == "__main__":
+    Details().start(limit=100)
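
Note: this detail script and the list/login scripts that follow share one token-cache pattern: the access token lives in bqgy_cookies.txt, any response containing "登录信息过期" deletes that file, and the next get_cookies() call triggers a fresh Login(). A minimal sketch of the pattern in isolation (Login comes from login_account.py in this commit; the invalidate helper is illustrative, not code from the commit):

import os

from login_account import Login

TOKEN_FILE = './bqgy_cookies.txt'

def get_cookies():
    # re-login only when the cached token file is missing
    if not os.path.isfile(TOKEN_FILE):
        Login()  # writes a fresh access_token into TOKEN_FILE
    with open(TOKEN_FILE, 'r', encoding='utf-8') as fr:
        return fr.read()

def invalidate_on_expiry(response_text):
    # any endpoint may report an expired session; dropping the cache
    # file forces the next get_cookies() call to log in again
    if "登录信息过期" in response_text:
        try:
            os.remove(TOKEN_FILE)
        except OSError:
            pass
        return True
    return False

Since Details caches self.Authorization once in __init__, a wiped cookie file only takes effect on the next run of the script.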

+ 170 - 0
lzz_theme/bqgyjtgscgdzswpt/list_bqgyjt.py

@@ -0,0 +1,170 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-02
+---------
+@summary: 兵器工业集团公司采购电子商务平台 https://newtd.norincogroup-ebuy.com/inquiryweb/notice
+---------
+@author: Lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from collections import namedtuple
+from utils.tools import *
+import requests
+import json
+from login_account import Login
+import warnings
+warnings.filterwarnings('ignore')
+
+
+class Crawl_bqgy:
+
+    def __init__(self):
+        self.proxy = get_proxy()
+        self.py_spider = Mongo_client().py_spider
+        self.zb_list = self.py_spider.theme_list
+
+        self.r = Redis_client()
+        self.redis_key = 'ztpc_bqgyjtgscgdzswpt'
+
+        self.real_cont = 0
+
+    def get_cookies(self):
+        if not os.path.isfile('./bqgy_cookies.txt'):
+            Login()
+
+        with open('./bqgy_cookies.txt', 'r', encoding='utf-8') as fr:
+            cks = fr.read()
+        return cks
+
+    def fetch_list_page(self, page):
+        logger.debug(f' *** 开始采集第{page}页 ***')
+        Authorization = self.get_cookies()
+        headers = {
+            "Authorization": f"{Authorization}",
+            "Referer": "https://newtd.norincogroup-ebuy.com/",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+            "Accept": "application/json, text/plain, */*",
+            "Content-Type": "application/json;charset=UTF-8;"
+        }
+
+        url = "https://msapi.norincogroup-ebuy.com/tdweb/index/announce/goods"
+        params = {
+            "ssid": "M80160",
+            "sid": "M80160",
+            "subsystem": "S00000",
+            "_t": f"{int(time.time() * 1000)}"
+        }
+        data = {
+            "page": page,
+            "size": 50,
+            "stage": "ALL",
+            "order": "DESC",
+            "orderName": "datePublish",
+            "subsystem": "S00000",
+            "ssid": "M80160",
+            "sid": "M80160"
+        }
+        data = json.dumps(data, separators=(',', ':'))
+
+        request_params = {
+            "headers": headers,
+            "params": params,
+            "data": data,
+            "timeout": (30, 30),
+            "verify": False,
+        }
+        resp = requests.post(url, **request_params)
+        time.sleep(1)
+        return resp
+
+    def parser_list_page(self, response, page, menu):
+        if "登录信息过期" in response.text:
+            try:
+                os.remove('./bqgy_cookies.txt')
+            except:
+                pass
+            return
+        results_list = []
+        info_list = response.json().get('data').get('list')
+        for info in info_list:
+            hid = info.get('id')
+            title = info.get('title').strip()
+            publish_time = info.get('datePublish').strip()
+            href = f"https://newtd.norincogroup-ebuy.com/inquiryweb/notice?id={hid}"
+
+            dedup = md5value(title + href + publish_time)
+
+            if not self.r.hexists(self.redis_key, dedup):
+                item = {
+                    "site": "兵器工业集团公司采购电子商务平台",
+                    "channel": menu.channel,
+                    "spidercode": menu.spidercode,
+                    "area": "全国",
+                    "city": "",
+                    "district": "",
+                    "href": "#",
+                    "competehref": href,
+                    "title": title,
+                    "publishtime": publish_time,
+                    "parse_url": f"https://msapi.norincogroup-ebuy.com/tdweb/index/goods/{hid}",
+                    "parser_name": self.redis_key,
+                    "is_mixed": False,
+                    "is_theme": True,
+                    "retry": 0,
+                    "comeintime": int2long(int(time.time())),
+                    "is_crawl": False,
+                    "failed": False,
+                }
+
+                self.zb_list.insert_one(item)
+                self.r.hset(self.redis_key, dedup, '')
+                results_list.append(item)
+
+        logger.info(f' *** 第{page}页采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
+
+        return results_list
+
+    def crawl_list_spider(self, page, menu):
+        retry_times = 0
+        while retry_times < 3:
+            try:
+                response = self.fetch_list_page(page=page)
+                res_code = response.status_code
+                logger.debug(f"第{page}页 状态码:{res_code}")
+                if response and res_code == 200:
+                    informations = self.parser_list_page(response=response, page=page, menu=menu)
+                    if informations is None:
+                        # session expired: cookies were wiped, so the retry re-logs via get_cookies()
+                        retry_times += 1
+                        time.sleep(3)
+                        continue
+                    self.real_cont += len(informations)
+                    logger.info(f"当前已采集 {self.real_cont} 条数据")
+                    time.sleep(random.randint(10,20))
+                    break
+                else:
+                    retry_times += 1
+                    time.sleep(3)
+            except Exception as e:
+                logger.error(f"第{page}页 采集异常:{e}")
+                retry_times += 1
+                time.sleep(5)
+
+    def start_list(self, menus):
+        logger.debug("********** 列表页开始 **********")
+        for menu in menus:
+            logger.debug(f"++++++ {menu.channel} 开始采集 ++++++")
+            page = menu.crawl_page
+            for page in range(1, page + 1):
+                self.crawl_list_spider(page=page, menu=menu)
+            logger.debug(f"------ {menu.channel} 采集结束 ------")
+
+        logger.debug("********** 列表页结束 **********")
+
+
+if __name__ == '__main__':
+    Menu = namedtuple('Menu', ['channel', 'spidercode', 'crawl_page'])
+
+    menus = [
+        Menu('商品类询价', 'a_bqgyjtgscgdzswpt_splxj', 5),
+    ]
+
+    Crawl_bqgy().start_list(menus)
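
The list crawler above dedups on md5value(title + href + publish_time) and stores each digest as a field of a per-spider Redis hash. md5value and Redis_client come from utils/tools.py (added in this commit, presumably re-exported by the star import, but not shown). Assuming md5value is a plain MD5 hexdigest, the mechanism reduces to:

import hashlib

import redis  # redis-py; connection parameters here are illustrative

r = redis.Redis(host='localhost', port=6379)
redis_key = 'ztpc_bqgyjtgscgdzswpt'

def md5value(text):
    # assumed behaviour of utils.tools.md5value
    return hashlib.md5(text.encode('utf-8')).hexdigest()

# sample record fields (hypothetical values)
title, href = "某询价公告", "https://example.invalid/notice?id=1"
publish_time = "2025-01-02 10:00:00"

dedup = md5value(title + href + publish_time)
if not r.hexists(redis_key, dedup):
    # first sighting: insert into MongoDB, then mark the digest as seen
    r.hset(redis_key, dedup, '')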

+ 59 - 0
lzz_theme/bqgyjtgscgdzswpt/login_account.py

@@ -0,0 +1,59 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-02
+---------
+@summary: Account/password login https://baseapp.norincogroup-ebuy.com/child/member/home
+---------
+@author: Lzz
+"""
+import requests
+import time
+import random
+from hashlib import md5
+from pass_slide import get_token
+
+
+
+def Login(username="13213013670", password="Admin54321@"):
+    session = requests.session()
+    session.verify = False
+
+    try:
+        token = ""
+        for _ in range(30):
+            token = get_token(session)
+            if token:
+                break
+            time.sleep(random.randint(3,7))
+        headers = {
+            "Accept": "application/json, text/plain, */*",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Authorization": "Basic bWVtYmVyczo1MGRiZmI1ZTU4YWU0ZmZkYjQyMGE5YTZmNDU5YjQ2MA==",
+            "Connection": "keep-alive",
+            "Content-Type": "application/x-www-form-urlencoded",
+            "Origin": "https://msauth.norincogroup-ebuy.com",
+            "Referer": "https://msauth.norincogroup-ebuy.com/mslogin/index",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+        }
+
+        url = "https://msauth.norincogroup-ebuy.com/msauth/login"
+        params = {
+            "_t": f"{int(time.time() * 1000)}"
+        }
+        data = {
+            "username": username,
+            "password": md5(password.encode()).hexdigest().upper(),
+            "vailId": token
+        }
+        response = session.post(url, headers=headers, params=params, data=data, timeout=20)
+
+        hcookies = response.json().get('data').get('access_token')
+        with open('./bqgy_cookies.txt', 'w', encoding='utf-8') as fw:
+            fw.write(str(hcookies))
+
+        print(f" >>> 账号:{username} 登录完成!")
+        return True
+    except Exception as e:
+        print(f" >>> 账号:{username} 登录失败!{e}")
+        return False
+
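
Two details of the login call worth noting: the Authorization header is a fixed Basic credential identifying the OAuth client rather than the user, and the password is sent as an uppercase MD5 hexdigest with the slider token attached as vailId. The password transform on its own:

from hashlib import md5

password = "Admin54321@"  # sample value taken from the defaults above
hashed = md5(password.encode()).hexdigest().upper()
print(len(hashed), hashed.isupper())  # 32 True: an uppercase hex digest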

+ 106 - 0
lzz_theme/bqgyjtgscgdzswpt/pass_slide.py

@@ -0,0 +1,106 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-02
+---------
+@summary: Slider captcha verification
+---------
+@author: lzz
+"""
+from PIL import Image
+import cv2
+import numpy as np
+import warnings
+import re
+import base64
+import time
+import json
+
+warnings.filterwarnings('ignore')
+
+
+def pilImgToCv2(img: Image.Image, flag=cv2.COLOR_RGB2BGR):
+    return cv2.cvtColor(np.asarray(img), flag)
+
+
+def getDistance(img: Image.Image, slice: Image.Image):
+    # the background image and the slider piece need the same preprocessing
+    grayImg = pilImgToCv2(img, cv2.COLOR_BGR2GRAY)
+    graySlice = pilImgToCv2(slice, cv2.COLOR_BGR2GRAY)
+    # edge detection further suppresses noise
+    grayImg = cv2.Canny(grayImg, 255, 255)
+    graySlice = cv2.Canny(graySlice, 255, 255)
+    # template-match the two images to locate the gap
+    result = cv2.matchTemplate(grayImg, graySlice, cv2.TM_CCOEFF_NORMED)
+    # minMaxLoc returns (minVal, maxVal, minLoc, maxLoc); [3] is the best-match corner
+    maxLoc = cv2.minMaxLoc(result)[3]
+    # the matched horizontal offset is the sliding distance
+    distance = maxLoc[0]
+    sliceHeight, sliceWidth = graySlice.shape[:2]
+    # top-left corner of the matched region
+    x, y = maxLoc
+    # bottom-right corner
+    x2, y2 = x + sliceWidth, y + sliceHeight
+    # (debug) draw the matched region on a copy of the background
+    resultBg = pilImgToCv2(img, cv2.COLOR_RGB2BGR)
+    cv2.rectangle(resultBg, (x, y), (x2, y2), (0, 0, 255), 2)
+    return distance
+
+
+def get_dist(sliceimgpath, imgpath):
+    distance = getDistance(Image.open(imgpath), Image.open(sliceimgpath))
+    return distance
+
+
+def decode_image(filename, src):
+    # 1. extract the image format and base64 payload from the data URI
+    result = re.search("data:image/(?P<ext>.*?);base64,(?P<data>.*)", src, re.DOTALL)
+    if result:
+        ext = result.groupdict().get("ext")
+        data = result.groupdict().get("data")
+    else:
+        raise Exception("Do not parse!")
+
+    img = base64.urlsafe_b64decode(data)
+    with open(f"{filename}.png", "wb") as f:
+        f.write(img)
+
+    return filename
+
+def get_token(session):
+    headers = {
+        "Accept": "application/json, text/plain, */*",
+        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+        "Connection": "keep-alive",
+        "Content-Type": "application/json;charset=utf-8",
+        "Referer": "https://msauth.norincogroup-ebuy.com/mslogin/index",
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+    }
+
+    url = "https://msauth.norincogroup-ebuy.com/msauth/oauth/validata/code/slideImage"
+    params = {
+        "_t": f"{int(time.time() * 1000)}"
+    }
+    res = session.get(url, headers=headers, params=params, timeout=20)
+
+    img_info = res.json().get('data')
+    fullpage = img_info.get('background')
+    decode_image("fullpage", fullpage)
+    slicepage = img_info.get('drag')
+    vailId = img_info.get('vailId')
+    decode_image("slice", slicepage)
+
+    dis = get_dist('./slice.png', './fullpage.png')
+
+    url = "https://msauth.norincogroup-ebuy.com/msauth/oauth/validata/code/checkSlideImage"
+    params = {
+        "_t": f"{int(time.time() * 1000)}"
+    }
+    data = {
+        "imagecode": int(dis),
+        "vailId": vailId
+    }
+    data = json.dumps(data, separators=(',', ':'))
+    response = session.post(url, headers=headers, params=params, data=data, timeout=20)
+
+    token = response.json().get('data')
+
+    return token
+
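
getDistance() above computes the gap offset but never saves the rectangle it draws, which makes mismatches hard to diagnose. A small stand-alone debug harness using the same pipeline (file names follow the script; the imwrite step is an addition for inspection, not part of the commit):

import cv2
import numpy as np
from PIL import Image

bg = Image.open('./fullpage.png').convert('RGB')     # background with a gap
piece = Image.open('./slice.png').convert('RGB')     # slider piece

# same pipeline as getDistance: grayscale -> Canny edges -> template match
edges_bg = cv2.Canny(cv2.cvtColor(np.asarray(bg), cv2.COLOR_RGB2GRAY), 255, 255)
edges_piece = cv2.Canny(cv2.cvtColor(np.asarray(piece), cv2.COLOR_RGB2GRAY), 255, 255)
res = cv2.matchTemplate(edges_bg, edges_piece, cv2.TM_CCOEFF_NORMED)
_, _, _, max_loc = cv2.minMaxLoc(res)  # top-left corner of the best match
h, w = edges_piece.shape[:2]

annotated = cv2.cvtColor(np.asarray(bg), cv2.COLOR_RGB2BGR)
cv2.rectangle(annotated, max_loc, (max_loc[0] + w, max_loc[1] + h), (0, 0, 255), 2)
cv2.imwrite('./match_debug.png', annotated)  # eyeball whether the box sits on the gap
print('slide distance =', max_loc[0])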

+ 7 - 0
lzz_theme/bqgyjtgscgdzswpt/start.sh

@@ -0,0 +1,7 @@
+#!/bin/bash
+
+ps -ef |grep "list_bqgyjt.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "details_bqgyjt.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+nohup python3 list_bqgyjt.py > log/list_bqgyjt.out 2>&1 &
+nohup python3 details_bqgyjt.py > log/details_bqgyjt.out 2>&1 &
+

+ 1 - 1
lzz_theme/clgjzbcgjtyxgs/clgj_cookies.txt

@@ -1 +1 @@
-eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJzdWIiOiIyOTYyOTIiLCJpc3MiOiJwbGF0Zm9ybUNlbnRlciIsImlhdCI6MTczNDQ4Mzc0NCwiZXhwIjoxNzM0NTcwMTQ0LCJuYmYiOjE3MzQ0ODM3NDQsImp0aSI6ImE3YzgwZTM0YzllNTQ4ZGRhNzZkYmNlYzQyYWQxNDUxIiwiYXVkIjpbImJVc2VyIl0sInVzZXJJZCI6Mjk2MjkyLCJ1c2VyTmFtZSI6IjEzMjIzMDc0MDAzIiwidGVuYW50SWQiOjIyMDEsImVtYWlsQWRkcmVzcyI6IjEzMjIzMDc0MDAzQGludmFsaWQuY24iLCJ0eXBlIjoiYWNjZXNzVG9rZW4ifQ.ZPkftNcnbszc0A2X3gNRmjkZrtTPjJ6ptQxQm8SmCh3CJ1kNX2bXKKy0QxgKlZIl8OENcbgDugPWXOppZPqxkerX9G3mKzB_9B9_DSBS85lj69yVj0IGMr0dXmsr_-OjKi5nXKgRnRO3dwMr1jaTPh2KXKMCvQZWFh-vwb0Juy8WnXQT_xc3bJTRhQDUN41KUZXyKmvTpVY71LLjTTsvQWGTxXN6mkbKod8yyncEEP-6nhAWllG6g8b6cw59i6KaVxEp_p2UR6dw6RgWWNjK-pAQzx8B0_FLJnwrfcTOqYkIKTEZrCICzzEpia-Lhr7jWLS1Utfmehr8KJtJ8R1pFw
+eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJzdWIiOiIyOTYyOTIiLCJpc3MiOiJwbGF0Zm9ybUNlbnRlciIsImlhdCI6MTc0MzQ2OTIxOSwiZXhwIjoxNzQzNTU1NjE5LCJuYmYiOjE3NDM0NjkyMTksImp0aSI6IjU5ZTAwMjllNjQ2ZjRhNjA5MzgwY2IyY2U2ZDRiOWZmIiwiYXVkIjpbImJVc2VyIl0sInVzZXJJZCI6Mjk2MjkyLCJ1c2VyTmFtZSI6IjEzMjIzMDc0MDAzIiwidGVuYW50SWQiOjIyMDEsImVtYWlsQWRkcmVzcyI6IjEzMjIzMDc0MDAzQGludmFsaWQuY24iLCJ0eXBlIjoiYWNjZXNzVG9rZW4ifQ.H2Gugj4nsPdeY-WJvTyAjTzHOFiA4s9HDkq5C6b3eNcZTwUb_G8VG5HBA3ENBgXjCHARxpN7mIocAUPSGr5iIDw4_8_aImdZvp-NhoTRitkSYvnO-S2vabBJDLkw9GR4ui6vENfKMllcYvM6TpE_Z42RzvMkW0V_iLcuo2oK2VmncPoN1GhJhZtWfxmfHuLCEPPv130cLvdjB-GPy3EWyzYWUlxcOktZ6KuDAciUQernR0GU1GenGCyazszJpT7JAA9INWontdTZ-iBBkYk9rS4GbVuWDlPNQRz4fB8NiqcSQMRC5Z0yFfJSTHMnE1eTqs-wiBhsXHX6OV32zrP02g

+ 192 - 0
lzz_theme/clgjzbcgjtyxgs/list_start.sh

@@ -0,0 +1,192 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-03-13
+---------
+@summary: 重庆市政府采购云平台·服务超市
+---------
+@author: lzz
+"""
+import sys
+import os
+import time
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.tools import *
+import warnings
+
+warnings.simplefilter('ignore')
+
+
+class Details:
+
+    def __init__(self):
+        self.db_table = Mongo_client().py_spider
+        self.db_name = self.db_table.theme_list
+        self.zt_details = self.db_table.data_bak
+        self.headers = {
+            "accept": "application/json, text/plain, */*",
+            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "accesstoken": "undefined",
+            "content-type": "application/json",
+            "logintoken": "undefined",
+            "origin": "https://chinazhyc.zbj.com",
+            "priority": "u=1, i",
+            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
+            "x-auth-token": "undefined",
+            "x-requested-with": "XMLHttpRequest"
+        }
+
+    def get_announcement(self,hid):
+        url = "https://cgzb.zbj.com/api/workbench/queryPurchasingDemandHistoryById"
+        data = {
+            "data": {
+                "id": hid
+            }
+        }
+        response = requests.post(url, headers=self.headers, json=data, timeout=30, verify=False)
+        ann = response.json().get('data') or {}
+        return ann.get('announcement') or []
+
+    def detail_get(self, response, item):
+        dt = response.json().get('data')
+        ps_html = ""
+        hid = item.get('req_data').get('data').get('purchaseDemandId')
+        ps_list = self.get_announcement(hid)
+        if ps_list:
+            for kv in ps_list:
+                biddingType = kv.get('biddingType')
+                winningWay = kv.get('winningWay')
+                createTimeName = kv.get('createTimeName')
+                if biddingType not in [5, 7]:
+                    if winningWay == 1:
+                        wshow = "最低价"
+                    else:
+                        wshow = "综合评分"
+                else:
+                    if biddingType == 7:
+                        wshow = "随机中选"
+                    else:
+                        wshow = "-"
+
+                if createTimeName == "成交":
+                    if biddingType == 5:
+                        dshow = "已中选"
+                    elif biddingType == 7:
+                        dshow = "随机中选"
+                    else:
+                        if winningWay == 1:
+                            ww = "价格最低"
+                        else:
+                            ww = "评分最高"
+                        dshow = "有效供应商中" + ww
+                else:
+                    dshow = "-"
+                temp = f'''
+                <tr>
+                    <td rowspan="1">{kv.get('purchaseDemandName')}</td> 
+                    <td>{kv.get('spName')}</td>
+                    <td>{kv.get('latestQuotationPrice')}</td>
+                    <td>{kv.get('dealAmount')}</td> 
+                    <td>{kv.get('verifyUserName')}</td>
+                    <td>{wshow}</td>
+                    <td>{createTimeName}</td> 
+                    <td>{timestamp_to_date(int(str(kv.get('createTime'))[:10]))}</td>
+                    <td>{dshow}</td>
+                </tr>
+                '''
+                ps_html += temp
+
+            psjjgg_html = f'''
+            <ul>
+                <li><span>评审结果公告</span></li>
+                <li>
+                    <table>
+                        <thead>
+                        <th width="140px">分包名称</th> 
+                        <th>供应商名称</th>
+                        <th width="65px">报价金额</th> 
+                        <th width="65px">成交金额</th>
+                        <th width="90px">实际成交金额</th>
+                        <th width="65px">评审方式</th>
+                        <th width="65px">评审结果</th> 
+                        <th width="115px">需求日期</th>
+                        <th width="105px">成交/未成交原因</th>
+                        </thead>
+                        {ps_html}
+                    </table>
+                </li>
+            </ul>
+            '''
+        else:
+            psjjgg_html = ""
+
+        html = f'''
+        <div>
+            <ul>
+                <li><span>需求基本信息</span></li>
+                <li><label>发布时间:</label> {timestamp_to_date(int(str(dt.get('publishDate'))[:10]))}</li>
+                <li><label>采购编号:</label> {dt.get('id')}</li>
+                <li><label>其他要求:</label>{dt.get('purchaseDemandDescribe')}</li>
+            </ul>
+            <ul>
+                <li><label>采购需求方信息 </label><span>{dt.get('purchasingInformation')}</span></li>
+            </ul>
+            {psjjgg_html}
+        </div>
+        '''
+
+        item["contenthtml"] = html
+
+        item = format_fileds(item)
+
+        try:
+            self.zt_details.insert_one(item)
+            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
+        except DuplicateKeyError:
+            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
+
+    def fetch_request(self, item):
+        data = item.get('req_data')
+        response = requests.post(url=item.get("parse_url"),json=data, headers=self.headers, timeout=30, verify=False)
+        time.sleep(1)
+        return response
+
+    def deal_request(self, item):
+        retry_times = 0
+        org_item = item.copy()
+        while (retry_times := retry_times + 1) < 5:
+            try:
+                response = self.fetch_request(item)
+                res_code = response.status_code
+                if response and res_code == 200:
+                    self.detail_get(response, item=item)
+                    return True
+                else:
+                    time.sleep(3)
+            except Exception as e:
+                item = org_item
+                logger.exception(f"{item.get('competehref')} 采集异常:{e}")
+                time.sleep(random.randint(3, 6))
+        logger.warning(f"[采集失败]{item.get('competehref')}")
+        return False
+
+    def start(self, limit=1):
+        logger.debug("********** 详情页采集开始 **********")
+        time.sleep(30)
+        with self.db_name.find({"parser_name": "ztpc_cqszfcgy_cgxq", "is_crawl": False, "failed": False}).limit(limit) as cursor:
+            data_list = [dd for dd in cursor]
+        for item in data_list:
+            logger.debug(item)
+            update_id = item["_id"]
+            result = self.deal_request(item)
+            if result is True:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"is_crawl": True}})
+            else:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"failed": True}})
+            time.sleep(random.randint(2, 5))
+
+        logger.debug("********** 详情页采集结束 **********")
+
+
+if __name__ == "__main__":
+    Details().start(limit=100)
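
The wshow/dshow branching above (repeated verbatim in cqszfcgy_details.py below) encodes a small decision table: biddingType 7 maps to 随机中选 (random selection), biddingType 5 appears to be a direct-selection mode, and anything else falls back on winningWay (1 = 最低价, otherwise 综合评分). A behaviour-preserving helper, shown only to make that table readable (a refactor sketch, not code from the commit):

def review_labels(biddingType, winningWay, createTimeName):
    # evaluation method column (评审方式)
    if biddingType == 7:
        wshow = "随机中选"
    elif biddingType == 5:
        wshow = "-"
    else:
        wshow = "最低价" if winningWay == 1 else "综合评分"

    # deal/no-deal reason column (成交/未成交原因)
    if createTimeName != "成交":
        dshow = "-"
    elif biddingType == 5:
        dshow = "已中选"
    elif biddingType == 7:
        dshow = "随机中选"
    else:
        dshow = "有效供应商中" + ("价格最低" if winningWay == 1 else "评分最高")
    return wshow, dshow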

+ 171 - 0
lzz_theme/cqszfcgyptfwcs/cqszfcgy_cgxq_list.py

@@ -0,0 +1,171 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-03-13
+---------
+@summary: 重庆市政府采购云平台·服务超市
+---------
+@author: lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.RedisDB import RedisFilter
+from collections import namedtuple
+from utils.tools import *
+import requests
+import json
+import warnings
+
+warnings.simplefilter('ignore')
+
+
+class Crawl_Cqs:
+
+    def __init__(self):
+        self.py_spider = Mongo_client().py_spider
+        self.zb_list = self.py_spider.theme_list
+        self.RDS = RedisFilter()
+        self.real_cont = 0
+
+    def fetch_list_page(self, page, menu):
+        logger.debug(f' *** 开始采集第{page}页 ***')
+        headers = {
+            "accept": "application/json, text/plain, */*",
+            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "accesstoken": "undefined",
+            "content-type": "application/json",
+            "logintoken": "undefined",
+            "origin": "https://chinazhyc.zbj.com",
+            "priority": "u=1, i",
+            "referer": "https://chinazhyc.zbj.com/buyerHall",
+            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
+            "x-auth-token": "undefined",
+            "x-requested-with": "XMLHttpRequest"
+        }
+
+        url = "https://cgzb.zbj.com/api/platform/queryPurchase"
+        data = {
+            "data": {
+                "categoryUnionLevel1Id": None,
+                "categoryUnionLevel2Id": None,
+                "town": "",
+                "expirytTimeSort": 0,
+                "priceSort": 0,
+                "releaseTimeSort": 1,
+                "highPrice": "",
+                "lowPrice": "",
+                "page": page,
+                "biddingType": 0,
+                "purchaseState": menu.tid,
+                "categoryUnionLevel3Id": None,
+                "frontCatLevel1Id": None,
+                "frontCatLevel2Id": None,
+                "frontCatLevel3Id": None,
+                "frontCatLevel4Id": None,
+                "frontCatLevel5Id": None,
+                "frontCatLevel6Id": None,
+                "province": "重庆市",
+                "pageSize": 100,
+                "thirdIdType": 0,
+                "title": ""
+            }
+        }
+        data = json.dumps(data, separators=(',', ':'))
+        resp = requests.post(url, data=data, headers=headers,  timeout=30, verify=False)
+        time.sleep(1)
+        return resp
+
+    def parser_list_page(self, response, page, menu):
+        results_list = []
+        info_list = response.json().get('data').get('data')
+        for info in info_list:
+            hid = info.get('id')
+            href = f"https://chinazhyc.zbj.com/detailsDemand?id={hid}&&puste=6"
+            title = info.get('purchaseDemandName').strip()
+            create_time = timestamp_to_date(int(str(info.get('publishDate'))[:10]))
+
+            def_city = "重庆市"
+            def_area = "重庆"
+
+            dedup = [href]
+            if not self.RDS.data_filter(dedup):
+                req_data = {
+                    "request": "1531362372728",
+                    "data": {
+                        "purchaseDemandId": f"{hid}",
+                        "pushId": "",
+                        "serviceProviderId": "",
+                        "pushLogRulesId": ""
+                    }
+                }
+                parser_name = "ztpc_cqszfcgy_cgxq"
+                parse_url = "https://cgzb.zbj.com/api/workbench/searchProjectById"
+                item = {
+                    "site": "重庆市政府采购云平台·服务超市",
+                    "channel": menu.channel,
+                    "spidercode": menu.code,
+                    "area": def_area,
+                    "city": def_city,
+                    "district": "",
+                    "href": "#",
+                    "competehref": href,
+                    "title": title,
+                    "publishtime": create_time,
+                    "req_data": req_data,
+                    "parse_url": parse_url,
+                    "parser_name": parser_name,
+                    "is_mixed": False,
+                    "is_theme": True,
+                    "retry": 0,
+                    "comeintime": int2long(int(time.time())),
+                    "is_crawl": False,
+                    "failed": False,
+                }
+
+                self.zb_list.insert_one(item)
+                self.RDS.data_save_redis(dedup)
+                results_list.append(item)
+
+        logger.info(f' *** 第{page}页采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
+        self.real_cont += len(results_list)
+        logger.info(f"当前已采集 {self.real_cont} 条数据")
+        return results_list
+
+    def crawl_list_spider(self, page, menu):
+        retry_times = 0
+        while (retry_times := retry_times + 1) < 5:
+            try:
+                response = self.fetch_list_page(page=page, menu=menu)
+                res_code = response.status_code
+                logger.debug(f"第{page}页 状态码:{res_code}")
+                if response is not None and res_code == 200:
+                    self.parser_list_page(response=response, page=page, menu=menu)
+                    time.sleep(2)
+                    return
+                else:
+                    time.sleep(3)
+            except Exception as e:
+                logger.exception(f"第{page}页 采集异常:{e}")
+                time.sleep(6)
+
+    def start_list(self, menus):
+        logger.debug("********** 列表页开始 **********")
+        for menu in menus:
+            logger.debug(f"++++++ {menu.channel} 开始采集 ++++++")
+            page = menu.crawl_page
+            for page in range(1, page + 1):
+                self.crawl_list_spider(page=page, menu=menu)
+            logger.debug(f"------ {menu.channel} 采集结束 ------")
+
+        logger.debug("********** 列表页结束 **********")
+
+
+if __name__ == '__main__':
+    Menu = namedtuple('Menu', ['channel', 'code', 'tid', 'crawl_page'])
+
+    menus = [
+        Menu('采购需求-已开标', 'a_cqszfcgyptfwcs_cgxq_ykb', '6', 2),
+        Menu('采购需求-已完成', 'a_cqszfcgyptfwcs_cgxq_ywc', '12', 1),
+    ]
+    Crawl_Cqs().start_list(menus)
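
publishDate in this API arrives as a 13-digit millisecond epoch; int(str(...)[:10]) keeps the first ten digits, which for 13-digit values equals integer division by 1000, i.e. the epoch in seconds. Assuming timestamp_to_date (utils/tools.py, not shown) formats a seconds epoch, the conversion is just:

import time

publish_date_ms = 1736476800000          # sample millisecond timestamp
seconds = publish_date_ms // 1000        # same as int(str(publish_date_ms)[:10])
print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(seconds)))
# prints 2025-01-10 ... in the local timezone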

+ 176 - 0
lzz_theme/cqszfcgyptfwcs/cqszfcgy_details.py

@@ -0,0 +1,176 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-22
+---------
+@summary: 重庆市政府采购云平台·服务超市
+---------
+@author: lzz
+"""
+import sys
+import os
+import time
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.tools import *
+import warnings
+
+warnings.simplefilter('ignore')
+
+
+class Details:
+
+    def __init__(self):
+        self.db_table = Mongo_client().py_spider
+        self.db_name = self.db_table.theme_list
+        self.zt_details = self.db_table.data_bak
+        self.headers = {
+            "accept": "application/json, text/plain, */*",
+            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "accesstoken": "undefined",
+            "content-type": "application/json",
+            "logintoken": "undefined",
+            "origin": "https://chinazhyc.zbj.com",
+            "priority": "u=1, i",
+            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
+            "x-auth-token": "undefined",
+            "x-requested-with": "XMLHttpRequest"
+        }
+
+    def detail_get(self, response, item):
+        dt = response.json().get('data')
+        ps_html = ""
+        ps_list = dt.get('announcementDtoList') or []
+        for kv in ps_list:
+            biddingType = kv.get('biddingType')
+            winningWay = kv.get('winningWay')
+            createTimeName = kv.get('createTimeName')
+            if biddingType not in [5, 7]:
+                if winningWay == 1:
+                    wshow = "最低价"
+                else:
+                    wshow = "综合评分"
+            else:
+                if biddingType == 7:
+                    wshow = "随机中选"
+                else:
+                    wshow = "-"
+
+            if createTimeName == "成交":
+                if biddingType == 5:
+                    dshow = "已中选"
+                elif biddingType == 7:
+                    dshow = "随机中选"
+                else:
+                    if winningWay == 1:
+                        ww = "价格最低"
+                    else:
+                        ww = "评分最高"
+                    dshow = "有效供应商中" + ww
+            else:
+                dshow = "-"
+            temp = f'''
+            <tr>
+                <td rowspan="1">{kv.get('purchaseDemandName')}</td> 
+                <td>{kv.get('spName')}</td>
+                <td>{kv.get('latestQuotationPrice')}</td>
+                <td>{kv.get('dealAmount')}</td> 
+                <td>{kv.get('verifyUserName')}</td>
+                <td>{wshow}</td>
+                <td>{createTimeName}</td> 
+                <td>{timestamp_to_date(int(str(kv.get('createTime'))[:10]))}</td>
+                <td>{dshow}</td>
+            </tr>
+            '''
+            ps_html += temp
+
+        html = f'''
+        <div>
+            <div>按照直接采购成交规则,现将采购结果公告如下:</div>
+            <ul>
+                <li><span>需求基本信息</span></li>
+                <li><label>采购名称</label> {dt.get('purchaseDemandName')}</li>
+                <li><label>采购编号</label> {dt.get('id')}</li>
+                <li><label>预算金额</label><span class="price"> ¥ {dt.get('purchaserBudget')}</span></li>
+            </ul>
+            <ul>
+                <li><label>采购执行方 </label><span>{dt.get('businessId')}</span></li>
+                <li><label>联系人 </label><span>{dt.get('contacts')}</span></li>
+            </ul>
+            <ul>
+                <li><span>评审结果</span></li>
+                <li>
+                    <table>
+                        <thead>
+                        <th width="140px">分包名称</th> 
+                        <th>供应商名称</th>
+                        <th width="65px">报价金额</th> 
+                        <th width="65px">成交金额</th>
+                        <th width="90px">实际成交金额</th>
+                        <th width="65px">评审方式</th>
+                        <th width="65px">评审结果</th> 
+                        <th width="115px">需求日期</th>
+                        <th width="105px">成交/未成交原因</th>
+                        </thead>
+                        {ps_html}
+                    </table>
+                </li>
+            </ul>
+        </div>
+        '''
+
+        item["contenthtml"] = html
+
+        item = format_fileds(item)
+
+        try:
+            self.zt_details.insert_one(item)
+            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
+        except DuplicateKeyError:
+            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
+
+    def fetch_request(self, item):
+        data = item.get('req_data')
+        response = requests.post(url=item.get("parse_url"),json=data, headers=self.headers, timeout=30, verify=False)
+        time.sleep(1)
+        return response
+
+    def deal_request(self, item):
+        retry_times = 0
+        org_item = item.copy()
+        while (retry_times := retry_times + 1) < 5:
+            try:
+                response = self.fetch_request(item)
+                res_code = response.status_code
+                if response and res_code == 200:
+                    self.detail_get(response, item=item)
+                    return True
+                else:
+                    time.sleep(3)
+            except Exception as e:
+                item = org_item
+                logger.exception(f"{item.get('competehref')} 采集异常:{e}")
+                time.sleep(random.randint(3, 6))
+        logger.warning(f"[采集失败]{item.get('competehref')}")
+        return False
+
+    def start(self, limit=1):
+        logger.debug("********** 详情页采集开始 **********")
+        time.sleep(30)
+        with self.db_name.find({"parser_name": "ztpc_cqszfcgy", "is_crawl": False, "failed": False},
+                               sort=[('publishtime', -1)]).limit(limit) as cursor:
+            data_list = [dd for dd in cursor]
+        for item in data_list:
+            # logger.debug(item)
+            update_id = item["_id"]
+            result = self.deal_request(item)
+            if result is True:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"is_crawl": True}})
+            else:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"failed": True}})
+            time.sleep(random.randint(2, 5))
+
+        logger.debug("********** 详情页采集结束 **********")
+
+
+if __name__ == "__main__":
+    Details().start(limit=100)
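
One subtlety in deal_request above: the walrus condition `while (retry_times := retry_times + 1) < 5` increments before comparing, so the body runs for retry_times 1 through 4, i.e. at most four attempts. That is one fewer than the `while retry_times < 5` loops in the bqgyjtgscgdzswpt scripts, where the counter only advances on failure. A two-line check:

retry_times = 0
attempts = 0
while (retry_times := retry_times + 1) < 5:
    attempts += 1
print(attempts)  # 4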

+ 157 - 0
lzz_theme/cqszfcgyptfwcs/cqszfcgy_list.py

@@ -0,0 +1,157 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-22
+---------
+@summary: 重庆市政府采购云平台·服务超市
+---------
+@author: lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.RedisDB import RedisFilter
+from collections import namedtuple
+from utils.tools import *
+import requests
+import json
+import warnings
+
+warnings.simplefilter('ignore')
+
+
+class Crawl_Cqs:
+
+    def __init__(self):
+        self.py_spider = Mongo_client().py_spider
+        self.zb_list = self.py_spider.theme_list
+        self.RDS = RedisFilter()
+        self.real_cont = 0
+
+    def fetch_list_page(self, page, menu):
+        logger.debug(f' *** 开始采集第{page}页 ***')
+        headers = {
+            "accept": "application/json, text/plain, */*",
+            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "accesstoken": "undefined",
+            "content-type": "application/json",
+            "logintoken": "undefined",
+            "origin": "https://chinazhyc.zbj.com",
+            "priority": "u=1, i",
+            "referer": "https://chinazhyc.zbj.com/publicity?type=3",
+            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
+            "x-auth-token": "undefined",
+            "x-requested-with": "XMLHttpRequest"
+        }
+
+        url = "https://cgzb.zbj.com/api/notice/queryNoticeList"
+        data = {
+            "data": {
+                "requestId": "1531362372728",
+                "businessId": "",
+                "type": menu.tid,
+                "page": page,
+                "pageSize": 100,
+                "startTime": get_month(-3),
+                "endTime": get_month()
+            }
+        }
+        data = json.dumps(data, separators=(',', ':'))
+        resp = requests.post(url, data=data, headers=headers,  timeout=30, verify=False)
+        time.sleep(1)
+        return resp
+
+    def parser_list_page(self, response, page, menu):
+        results_list = []
+        info_list = response.json().get('data').get('data')
+        for info in info_list:
+            hid = info.get('id')
+            tp = info.get('type')
+            href = f"https://chinazhyc.zbj.com/publicityDetails?id={hid}&type={tp}"
+            title = info.get('name').strip()
+            create_time = timestamp_to_date(int(str(info.get('time'))[:10]))
+
+            def_city = "重庆市"
+            def_area = "重庆"
+
+            dedup = [href]
+            if not self.RDS.data_filter(dedup):
+                req_data = {
+                    "data": {
+                        "id": f"{hid}",
+                        "type": f"{tp}"
+                    }
+                }
+                parser_name = "ztpc_cqszfcgy"
+                parse_url = "https://cgzb.zbj.com/api/buyer/queryAnnouncementById"
+                if menu.tid == "1":
+                    parser_name = "ztpc_cqszfcgy_qx"
+                    parse_url = "https://cgzb.zbj.com/api/buyer/queryPurchasingDemandHistoryById"
+                item = {
+                    "site": "重庆市政府采购云平台·服务超市",
+                    "channel": menu.channel,
+                    "spidercode": menu.code,
+                    "area": def_area,
+                    "city": def_city,
+                    "district": "",
+                    "href": "#",
+                    "competehref": href,
+                    "title": title,
+                    "publishtime": create_time,
+                    "req_data": req_data,
+                    "parse_url": parse_url,
+                    "parser_name": parser_name,
+                    "is_mixed": False,
+                    "is_theme": True,
+                    "retry": 0,
+                    "comeintime": int2long(int(time.time())),
+                    "is_crawl": False,
+                    "failed": False,
+                }
+
+                self.zb_list.insert_one(item)
+                self.RDS.data_save_redis(dedup)
+                results_list.append(item)
+
+        logger.info(f' *** 第{page}页采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
+        self.real_cont += len(results_list)
+        logger.info(f"当前已采集 {self.real_cont} 条数据")
+        return results_list
+
+    def crawl_list_spider(self, page, menu):
+        retry_times = 0
+        while (retry_times := retry_times + 1) < 5:
+            try:
+                response = self.fetch_list_page(page=page, menu=menu)
+                res_code = response.status_code
+                logger.debug(f"第{page}页 状态码:{res_code}")
+                if response is not None and res_code == 200:
+                    self.parser_list_page(response=response, page=page, menu=menu)
+                    time.sleep(2)
+                    return
+                else:
+                    time.sleep(3)
+            except Exception as e:
+                logger.exception(f"第{page}页 采集异常:{e}")
+                time.sleep(6)
+
+    def start_list(self, menus):
+        logger.debug("********** 列表页开始 **********")
+        for menu in menus:
+            logger.debug(f"++++++ {menu.channel} 开始采集 ++++++")
+            page = menu.crawl_page
+            for page in range(1, page + 1):
+                self.crawl_list_spider(page=page, menu=menu)
+            logger.debug(f"------ {menu.channel} 采集结束 ------")
+
+        logger.debug("********** 列表页结束 **********")
+
+
+if __name__ == '__main__':
+    Menu = namedtuple('Menu', ['channel', 'code', 'tid', 'crawl_page'])
+
+    menus = [
+        Menu('结果公告', 'cq_cqszfcgyptfwcs_jggg', '3', 3),
+        Menu('取消公告', 'cq_cqszfcgyptfwcs_qxgg', '1', 1),
+    ]
+    Crawl_Cqs().start_list(menus)
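
The query window comes from get_month(-3) and get_month(), helpers in utils/tools.py (part of this commit, not shown here). From their use as startTime/endTime they presumably return a date string shifted by whole months; a rough stand-in under that assumption:

from datetime import date

def get_month(offset=0):
    # assumed behaviour only; the real utils.tools.get_month may differ
    # (exact format, how it clamps the day in short months, etc.)
    today = date.today()
    months = today.year * 12 + (today.month - 1) + offset
    year, month = divmod(months, 12)
    day = min(today.day, 28)  # crude clamp so the shifted date is always valid
    return date(year, month + 1, day).strftime("%Y-%m-%d")

print(get_month(-3), get_month())  # a window covering roughly the last three months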

+ 117 - 0
lzz_theme/cqszfcgyptfwcs/cqszfcgy_qx_details.py

@@ -0,0 +1,117 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-22
+---------
+@summary: 重庆市政府采购云平台·服务超市
+---------
+@author: lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.tools import *
+import warnings
+
+warnings.simplefilter('ignore')
+
+
+class Details:
+
+    def __init__(self):
+        self.db_table = Mongo_client().py_spider
+        self.db_name = self.db_table.theme_list
+        self.zt_details = self.db_table.data_bak
+        self.headers = {
+            "accept": "application/json, text/plain, */*",
+            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "accesstoken": "undefined",
+            "content-type": "application/json",
+            "logintoken": "undefined",
+            "origin": "https://chinazhyc.zbj.com",
+            "priority": "u=1, i",
+            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
+            "x-auth-token": "undefined",
+            "x-requested-with": "XMLHttpRequest"
+        }
+
+    def detail_get(self, response, item):
+        dt = response.json().get('data')
+
+        html = f'''
+        <table width="96%">
+            <tr>
+                <td colspan="2">需求编号:{dt.get('projectId')}</td>
+            </tr>
+            <tr>
+                <td colspan="2">创建时间:{timestamp_to_date(int(str(dt.get('cancelHistory').get('createTime'))[:10]))}</td>
+            </tr>
+            <tr>
+                <td colspan="2">需求名称:{dt.get('purchaseDemandName')}</td>
+            </tr>
+            <tr>
+                <td colspan="2">取消需求</td>
+            </tr>
+            <tr>
+                <td colspan="2">{dt.get('cancelHistory').get('cancelCause')}</td>
+            </tr>
+        </table>
+        '''
+
+        item["contenthtml"] = html
+
+        item = format_fileds(item)
+
+        try:
+            self.zt_details.insert_one(item)
+            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
+        except DuplicateKeyError:
+            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
+
+    def fetch_request(self, item):
+        data = item.get('req_data')
+        response = requests.post(url=item.get("parse_url"), json=data, headers=self.headers, timeout=30, verify=False)
+        time.sleep(1)
+        return response
+
+    def deal_request(self, item):
+        retry_times = 0
+        org_item = item.copy()
+        while (retry_times := retry_times + 1) < 5:
+            try:
+                response = self.fetch_request(item)
+                res_code = response.status_code
+                if response and res_code == 200:
+                    self.detail_get(response, item=item)
+                    return True
+                else:
+                    time.sleep(3)
+            except Exception as e:
+                item = org_item
+                logger.exception(f"{item.get('competehref')} 采集异常:{e}")
+                time.sleep(random.randint(3, 6))
+        logger.warning(f"[采集失败]{item.get('competehref')}")
+        return False
+
+    def start(self, limit=1):
+        logger.debug("********** 详情页采集开始 **********")
+        time.sleep(30)
+        with self.db_name.find({"parser_name": "ztpc_cqszfcgy_qx", "is_crawl": False, "failed": False},
+                               sort=[('publishtime', -1)]).limit(limit) as cursor:
+            data_list = [dd for dd in cursor]
+        for item in data_list:
+            # logger.debug(item)
+            update_id = item["_id"]
+            result = self.deal_request(item)
+            if result is True:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"is_crawl": True}})
+            else:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"failed": True}})
+            time.sleep(random.randint(2, 5))
+
+        logger.debug("********** 详情页采集结束 **********")
+
+
+if __name__ == "__main__":
+    Details().start(limit=100)
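The start() loop above drives a simple work queue in MongoDB: pull rows that are neither crawled nor failed, then flip is_crawl or failed per result. A hedged sketch of the same flow in plain pymongo (query and field names are taken from the diff; the URI and process() are placeholders):

from pymongo import MongoClient

def process(doc) -> bool:
    """Stand-in for Details().deal_request(); returns True on success."""
    return bool(doc.get("parse_url"))

client = MongoClient("mongodb://localhost:27017")   # placeholder URI
coll = client.py_spider.theme_list

query = {"parser_name": "ztpc_cqszfcgy_qx", "is_crawl": False, "failed": False}
for doc in coll.find(query).sort("publishtime", -1).limit(100):
    flag = {"is_crawl": True} if process(doc) else {"failed": True}
    coll.update_one({"_id": doc["_id"]}, {"$set": flag})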

+ 15 - 0
lzz_theme/cqszfcgyptfwcs/start.sh

@@ -0,0 +1,15 @@
+#!/bin/bash
+
+ps -ef |grep "cqszfcgy_qx_details.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "cqszfcgy_details.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "cqszfcgy_list.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "cqszfcgy_cgxq_list.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "cqszfcgy_cgxq_details.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+
+nohup python3 cqszfcgy_qx_details.py > log/cqszfcgy_qx_details.out 2>&1 &
+nohup python3 cqszfcgy_details.py > log/cqszfcgy_details.out 2>&1 &
+nohup python3 cqszfcgy_list.py > log/cqszfcgy_list.out 2>&1 &
+nohup python3 cqszfcgy_cgxq_list.py > log/cqszfcgy_cgxq_list.out 2>&1 &
+nohup python3 cqszfcgy_cgxq_details.py > log/cqszfcgy_cgxq_details.out 2>&1 &
+
+

+ 1 - 1
lzz_theme/gdgczbzxyxgs/gdgczb_cookies.json

@@ -1 +1 @@
-{'PHPSESSID': '1fdtf0lfjiq3rbaflqo0rstnu3', 'ZDEDebuggerPresent': 'php,phtml,php3', 'cookie_id_user': '459', 'cookie_sign_user': 'b7e167749e8f8ddac68a927e16e0756a', 'cookie_siteurl_user': 'a04869d4280afae7fe40a770e2263425'}
+{'PHPSESSID': 'dlesemm0s291kk6aq7t5lsm3a1', 'cookie_id_user': '459', 'cookie_sign_user': 'b7e167749e8f8ddac68a927e16e0756a', 'cookie_siteurl_user': '7c10f3f0c34c75d6da287914081fe027'}
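Despite the .json extension, the stored value uses Python dict syntax (single quotes), which json.loads rejects. One way to read it back, assuming the file content is exactly the literal above:

import ast

with open("gdgczb_cookies.json", encoding="utf-8") as f:
    cookies = ast.literal_eval(f.read())   # parses single-quoted dict literals safely
print(cookies["PHPSESSID"])                # dlesemm0s291kk6aq7t5lsm3a1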

+ 309 - 0
lzz_theme/gdgczbzxyxgs/start.sh

@@ -0,0 +1,309 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-12-09
+---------
+@summary: 
+---------
+@author: lzz
+"""
+import json
+import re
+import time
+import execjs
+import requests
+from parsel import Selector
+
+
+
+def get_file_info(url, purl, data, proxies):
+    headers = {
+        "accept": "*/*",
+        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
+        "cache-control": "no-cache",
+        "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
+        "origin": "https://ggzyjy.gnzrmzf.gov.cn",
+        "pragma": "no-cache",
+        "priority": "u=1, i",
+        "referer": url,
+        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+        "x-requested-with": "XMLHttpRequest"
+    }
+
+    response = requests.post(purl, headers=headers, data=data, proxies=proxies, timeout=30, verify=False)
+    time.sleep(.5)
+    if "the page you are looking for is currently unavailable" in response.text:
+        raise ValueError("数据异常")
+
+    file_list = []
+    file_dict = "".join(re.findall('attachmentMap=(.*?);',response.text))
+    if file_dict:
+        attachmentMap = file_dict.replace('null', '1').replace('false', '1').replace('true', '1')
+        f_dict = json.loads(attachmentMap)
+        for k, v in f_dict.items():
+            if isinstance(v, list):
+                file_list = v
+                break
+
+    return response.text, file_list
+
+
+def get_xmxx(url, proxies):
+    headers = {
+        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
+        "cache-control": "no-cache",
+        "pragma": "no-cache",
+        "priority": "u=0, i",
+        "referer": url,
+        "upgrade-insecure-requests": "1",
+        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
+    }
+
+    new_url = url.split("?")[0] + "?pageIndex=0"
+    response = requests.get(new_url, headers=headers, proxies=proxies, timeout=30, verify=False)
+    time.sleep(.5)
+    if "the page you are looking for is currently unavailable" in response.text:
+        raise ValueError("数据异常")
+    root = Selector(response.text)
+    text = root.xpath('//div[@class="jxTradingPublic"]').extract_first("")
+    order = root.xpath('//li[contains(@class,"jxPoint") and contains(@class,"jxPointRed")]/div/text()').extract()
+
+    purl = "https://ggzyjy.gnzrmzf.gov.cn/f/countytrade/tenderprojects/countyflowBidpackage"
+    pm = "".join(re.findall(f'countyflowBidpackage",(.*?),func',response.text))
+    if not pm:
+        purl = "https://ggzyjy.gnzrmzf.gov.cn/f/newtenderproject/flowBidpackage"
+        pm = "".join(re.findall(f'flowBidpackage",(.*?),func',response.text))
+
+    data = execjs.eval(pm)
+    file_text, file_list = get_file_info(new_url, purl, data, proxies)
+    return text + file_text, file_list, new_url, order
+
+
+def get_ggxx(url, proxies):
+    headers = {
+        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
+        "cache-control": "no-cache",
+        "pragma": "no-cache",
+        "priority": "u=0, i",
+        "referer": url,
+        "upgrade-insecure-requests": "1",
+        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
+    }
+
+    new_url = url.split("?")[0] + "?pageIndex=1"
+    response = requests.get(new_url, headers=headers, proxies=proxies, timeout=30, verify=False)
+    time.sleep(.5)
+    if "the page you are looking for is currently unavailable" in response.text:
+        raise ValueError("数据异常")
+    file_url = Selector(response.text).xpath('//div[@class="jxTradingPublic"]//a[@class="pdf"]/@href').extract_first()
+    if not file_url:
+        text = Selector(response.text).xpath('//div[@class="jxTradingPublic"]').extract_first("")
+    else:
+        text = ""
+    return text, file_url, new_url
+
+
+def get_kpb(url, proxies):
+    headers = {
+        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
+        "cache-control": "no-cache",
+        "pragma": "no-cache",
+        "priority": "u=0, i",
+        "referer": url,
+        "upgrade-insecure-requests": "1",
+        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
+    }
+
+    new_url = url.split("?")[0] + "?pageIndex=3"
+    response = requests.get(new_url, headers=headers, proxies=proxies, timeout=30, verify=False)
+    time.sleep(.5)
+    if "the page you are looking for is currently unavailable" in response.text:
+        raise ValueError("数据异常")
+    text = Selector(response.text).xpath('//div[@class="jxTradingPublic"]').extract_first()
+    return text, new_url
+
+
+
+def get_zbgs(url, proxies):
+    headers = {
+        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
+        "cache-control": "no-cache",
+        "pragma": "no-cache",
+        "priority": "u=0, i",
+        "referer": url,
+        "upgrade-insecure-requests": "1",
+        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
+    }
+
+    response = requests.get(url, headers=headers, proxies=proxies, timeout=30, verify=False)
+    time.sleep(.5)
+    if "the page you are looking for is currently unavailable" in response.text:
+        raise ValueError("数据异常")
+    file_url = Selector(response.text).xpath('//div[@class="jxTradingPublic"]//a[@class="pdf"]/@href').extract_first()
+    return "",file_url,url
+
+def get_gg(url, proxies):
+    headers = {
+        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
+        "cache-control": "no-cache",
+        "pragma": "no-cache",
+        "priority": "u=0, i",
+        "upgrade-insecure-requests": "1",
+        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
+    }
+
+    response = requests.get(url, headers=headers, proxies=proxies, timeout=30, verify=False)
+    time.sleep(.5)
+    if "the page you are looking for is currently unavailable" in response.text:
+        raise ValueError("数据异常")
+    file_url = Selector(response.text).xpath('//div[@class="jxTradingPublic"]//a[@class="pdf"]/@href').extract_first()
+    return file_url
+
+
+def get_plan(url, proxies):
+    headers = {
+        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
+        "cache-control": "no-cache",
+        "pragma": "no-cache",
+        "priority": "u=0, i",
+        "upgrade-insecure-requests": "1",
+        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
+    }
+
+    response = requests.get(url, headers=headers, proxies=proxies, timeout=30, verify=False)
+    time.sleep(.5)
+    if "the page you are looking for is currently unavailable" in response.text:
+        raise ValueError("数据异常")
+    file_url = Selector(response.text).xpath('//div[@class="jxTradingPublic"]//iframe[@class="pdf"]/@src').extract_first()
+    return file_url
+
+
+def get_gglist(url, proxies):
+    headers = {
+        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
+        "cache-control": "no-cache",
+        "pragma": "no-cache",
+        "priority": "u=0, i",
+        "upgrade-insecure-requests": "1",
+        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
+    }
+
+    response = requests.get(url, headers=headers, proxies=proxies, timeout=30, verify=False)
+    time.sleep(.5)
+    if "the page you are looking for is currently unavailable" in response.text:
+        raise ValueError("数据异常")
+    gglist = Selector(response.text).xpath('//div[@style="cursor:pointer;"]/@onclick').extract()
+    return gglist
+
+
+
+def getAnnoDetail(htp, projectId, annogoodsId, referer, proxies):
+    headers = {
+        "accept": "text/html, */*; q=0.01",
+        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
+        "cache-control": "no-cache",
+        "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
+        "origin": "https://ggzyjy.gnzrmzf.gov.cn",
+        "pragma": "no-cache",
+        "priority": "u=1, i",
+        "referer": referer,
+        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+        "x-requested-with": "XMLHttpRequest"
+    }
+
+    url = f"https://ggzyjy.gnzrmzf.gov.cn/f/{htp}/annoment/getAnnoDetail"
+    data = {
+        "projectId": projectId,
+        "annogoodsId": annogoodsId
+    }
+
+    response = requests.post(url, headers=headers, data=data, proxies=proxies, timeout=30, verify=False)
+    time.sleep(.5)
+    if "the page you are looking for is currently unavailable" in response.text:
+        raise ValueError("数据异常")
+    return "采购公告", response.text
+
+
+
+def getBidResult(htp, projectId, annogoodsId, referer, proxies):
+    headers = {
+        "accept": "text/html, */*; q=0.01",
+        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
+        "cache-control": "no-cache",
+        "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
+        "origin": "https://ggzyjy.gnzrmzf.gov.cn",
+        "pragma": "no-cache",
+        "priority": "u=1, i",
+        "referer": referer,
+        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+        "x-requested-with": "XMLHttpRequest"
+    }
+
+    url = f"https://ggzyjy.gnzrmzf.gov.cn/f/{htp}/annoment/getBidResult"
+    data = {
+        "projectId": projectId,
+    }
+
+    response = requests.post(url, headers=headers, data=data, proxies=proxies, timeout=30, verify=False)
+    time.sleep(.5)
+    if "the page you are looking for is currently unavailable" in response.text:
+        raise ValueError("数据异常")
+    return "竞价结果", response.text
+
+
+def getPublicityDetail(htp, projectId, annogoodsId, referer, proxies):
+    headers = {
+        "accept": "text/html, */*; q=0.01",
+        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
+        "cache-control": "no-cache",
+        "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
+        "origin": "https://ggzyjy.gnzrmzf.gov.cn",
+        "pragma": "no-cache",
+        "priority": "u=1, i",
+        "referer": referer,
+        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+        "x-requested-with": "XMLHttpRequest"
+    }
+
+    url = f"https://ggzyjy.gnzrmzf.gov.cn/f/{htp}/annoment/getPublicityDetail"
+    data = {
+        "projectId": projectId,
+    }
+
+    response = requests.post(url, headers=headers, data=data, proxies=proxies, timeout=30, verify=False)
+    time.sleep(.5)
+    if "the page you are looking for is currently unavailable" in response.text:
+        raise ValueError("数据异常")
+    return "成交公示", response.text
+
+
+def getContractList(htp, projectId, annogoodsId, referer, proxies):
+    headers = {
+        "accept": "text/html, */*; q=0.01",
+        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
+        "cache-control": "no-cache",
+        "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
+        "origin": "https://ggzyjy.gnzrmzf.gov.cn",
+        "pragma": "no-cache",
+        "priority": "u=1, i",
+        "referer": referer,
+        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+        "x-requested-with": "XMLHttpRequest"
+    }
+
+    url = f"https://ggzyjy.gnzrmzf.gov.cn/f/{htp}/annoment/getContractList"
+    data = {
+        "projectId": projectId,
+    }
+
+    response = requests.post(url, headers=headers, data=data, proxies=proxies, timeout=30, verify=False)
+    time.sleep(.5)
+    if "the page you are looking for is currently unavailable" in response.text:
+        raise ValueError("数据异常")
+    return "合同", response.text

+ 181 - 0
lzz_theme/gnzggzyjyzx/gnz_gn_zbjh.py

@@ -0,0 +1,181 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-12-10
+---------
+@summary: 甘南州公共资源交易中心
+---------
+@author: Lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.RedisDB import RedisFilter
+from collections import namedtuple
+from utils.attachment import AttachmentDownloader
+from utils.tools import *
+from api_param import get_plan
+from parsel import Selector
+import requests
+import warnings
+import urllib3
+
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+warnings.simplefilter(action='ignore', category=RuntimeWarning)
+
+
+class Crawl_Zgyd:
+
+    def __init__(self):
+        self.proxy = get_proxy()
+        self.py_spider = Mongo_client().py_spider
+        self.zb_list = self.py_spider.theme_list
+        self.zt_details = self.py_spider.data_bak
+        self.RDS = RedisFilter()
+        self.real_cont = 0
+
+    def fetch_list_page(self, page, menu):
+        logger.debug(f' *** 开始采集第{page}页 ***')
+
+        headers = {
+            "accept": "text/html, */*; q=0.01",
+            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "cache-control": "no-cache",
+            "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
+            "origin": "https://ggzyjy.gnzrmzf.gov.cn",
+            "pragma": "no-cache",
+            "priority": "u=1, i",
+            "referer": "https://ggzyjy.gnzrmzf.gov.cn/f/newtrade/annogoods/list?tradeStatus=0",
+            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+            "x-requested-with": "XMLHttpRequest"
+        }
+        data = f"pageNo={page}&pageSize=20&tradeStatus=0&prjpropertycode=1%2C2%2C3%2C4%2C5%2C6%2C7%2C8%2C9%2C10%2C11&prjpropertycode=21%2C22%2C23%2C24&prjpropertycode=31&prjpropertycode=13%2C14%2C15%2C16%2C18%2C19%2C20&prjpropertycode=600&tradeArea=14&projectname=&tabType=4&tradeType="
+        request_params = {
+            "headers": headers,
+            "data": data,
+            "proxies": self.proxy,
+            "timeout": (30, 30),
+            "verify": False,
+        }
+        url = "https://ggzyjy.gnzrmzf.gov.cn/f/newtrade/annogoods/getAnnoList"
+        resp = requests.post(url, **request_params)
+        time.sleep(1)
+        return resp
+
+    def parser_list_page(self, response, page, menu):
+        results_list = []
+        selector = Selector(text=response.text, base_url="https://ggzyjy.gnzrmzf.gov.cn/f/newtrade/annogoods/list")
+        selector.root.make_links_absolute()
+
+        info_list = selector.xpath('//dd')
+
+        for info in info_list:
+            title = "".join(info.xpath('./a/text()').extract()).replace('[]', '').strip()
+            href = info.xpath('./a/@href').extract_first("").strip()
+            publish_time = info.xpath('./span[@class="byTradingDetailTime"]/text()').extract_first("").strip()
+
+            area = "甘肃"
+            city = "甘南州"
+
+            dedup = [href, publish_time]
+            if not self.RDS.data_filter(dedup):
+                item = {
+                    "site": "甘南州公共资源交易中心",
+                    "channel": menu.channel,
+                    "spidercode": menu.spidercode,
+                    "area": area,
+                    "city": city,
+                    "district": "",
+                    "href": href,
+                    "title": title,
+                    "publishtime": publish_time,
+                    "parse_url": "parse_url",
+                    "parser_name": "ztpc_gnzggzyjyzx",
+                    "is_mixed": True,
+                    "is_theme": True,
+                    "retry": 0,
+                    "comeintime": int2long(int(time.time())),
+                    "is_crawl": False,
+                    "failed": False,
+                }
+                self.get_details(item)
+                self.RDS.data_save_redis(dedup)
+                results_list.append(item)
+
+        logger.info(f' *** 第{page}页采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
+
+        return results_list
+
+    def get_details(self, item):
+
+        file_url = get_plan(item['href'], self.proxy)
+
+        item["contenthtml"] = "详情请访问原网页!"
+
+        attachments = {}
+
+        file_name = item['title']
+        file_type = extract_file_type(file_url=file_url)
+        if file_type:
+            attachment = AttachmentDownloader().fetch_attachment(
+                file_name=file_name, file_type=file_type, download_url=file_url)
+            if attachment.__contains__("fid"):
+                attachments[str(len(attachments) + 1)] = attachment
+
+        if attachments:
+            item["projectinfo"] = {"attachments": attachments}
+        else:
+            return
+
+        item = format_fileds(item)
+
+        try:
+            self.zt_details.insert_one(item)
+            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
+        except DuplicateKeyError:
+            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
+
+    def crawl_list_spider(self, page, menu):
+        response = None
+        retry_times = 0
+        while retry_times < 3:
+            try:
+                response = self.fetch_list_page(page=page, menu=menu)
+                res_code = response.status_code
+                logger.debug(f"第{page}页 状态码:{res_code}")
+                if response is not None and res_code == 200:
+                    informations = self.parser_list_page(response=response, page=page, menu=menu)
+                    crawl_num = len(informations)
+                    self.real_cont += crawl_num
+                    logger.info(f"当前已采集 {self.real_cont} 条数据")
+                    time.sleep(random.random())
+                    break
+                else:
+                    self.proxy = get_proxy()
+                    retry_times += 1
+                    time.sleep(1)
+            except Exception as e:
+                logger.error(f"第{page}页 采集异常:{e}")
+                self.proxy = get_proxy()
+                retry_times += 1
+                time.sleep(2)
+
+    def start_list(self, menus):
+        logger.debug("********** 列表页开始 **********")
+        for menu in menus:
+            logger.debug(f"++++++ {menu.channel} 开始采集 ++++++")
+            page = menu.crawl_page
+            for page in range(1, page + 1):
+                self.crawl_list_spider(page=page, menu=menu)
+            logger.debug(f"------ {menu.channel} 采集结束 ------")
+
+        logger.debug("********** 列表页结束 **********")
+
+
+if __name__ == '__main__':
+    Menu = namedtuple('Menu', ['channel', 'spidercode', 'crawl_page'])
+
+    menus = [
+        Menu('交易类型全部-县级地区-招标资格预审公告', 'gs_gnzggzyjyzx_jsgc_zb_05', 1),
+    ]
+    Crawl_Zgyd().start_list(menus)
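Every list spider in this commit deduplicates on the pair [href, publish_time] via RedisFilter before inserting. RedisFilter is internal to this repo; the sketch below shows the same idea with an in-memory set standing in for Redis:

import hashlib

seen = set()   # stands in for the Redis-backed filter

def is_new(parts):
    """Hash the dedup key parts; True only the first time a key is seen."""
    key = hashlib.md5("|".join(parts).encode("utf-8")).hexdigest()
    if key in seen:
        return False
    seen.add(key)
    return True

print(is_new(["https://example.com/a", "2024-12-10"]))   # True
print(is_new(["https://example.com/a", "2024-12-10"]))   # False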

+ 235 - 0
lzz_theme/gnzggzyjyzx/gnz_gn_zgys.py

@@ -0,0 +1,235 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-12-10
+---------
+@summary: 甘南州公共资源交易中心
+---------
+@author: Lzz
+"""
+import sys
+import os
+import copy
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.RedisDB import RedisFilter
+from collections import namedtuple
+from utils.attachment import AttachmentDownloader
+from utils.tools import *
+from api_param import *
+from parsel import Selector
+import requests
+import warnings
+import urllib3
+
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+warnings.simplefilter(action='ignore', category=RuntimeWarning)
+
+
+class Crawl_Zgyd:
+
+    def __init__(self):
+        self.proxy = get_proxy()
+        self.py_spider = Mongo_client().py_spider
+        self.zb_list = self.py_spider.theme_list
+        self.zt_details = self.py_spider.data_bak
+        self.RDS = RedisFilter()
+        self.real_cont = 0
+
+    def fetch_list_page(self, page, menu):
+        logger.debug(f' *** 开始采集第{page}页 ***')
+        headers = {
+            "accept": "text/html, */*; q=0.01",
+            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "cache-control": "no-cache",
+            "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
+            "origin": "https://ggzyjy.gnzrmzf.gov.cn",
+            "pragma": "no-cache",
+            "priority": "u=1, i",
+            "referer": "https://ggzyjy.gnzrmzf.gov.cn/f/newtrade/annogoods/list?tradeStatus=0",
+            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+            "x-requested-with": "XMLHttpRequest"
+        }
+        data = f"pageNo={page}&pageSize=20&tradeStatus=0&prjpropertycode=1%2C2%2C3%2C4%2C5%2C6%2C7%2C8%2C9%2C10%2C11&prjpropertycode=21%2C22%2C23%2C24&prjpropertycode=31&prjpropertycode=13%2C14%2C15%2C16%2C18%2C19%2C20&prjpropertycode=600&tradeArea=14&projectname=&tabType={menu.tid}&tradeType="
+        request_params = {
+            "headers": headers,
+            "data": data,
+            "proxies": self.proxy,
+            "timeout": (30, 30),
+            "verify": False,
+        }
+        url = "https://ggzyjy.gnzrmzf.gov.cn/f/newtrade/annogoods/getAnnoList"
+        resp = requests.post(url, **request_params)
+        time.sleep(1)
+        return resp
+
+    def parser_list_page(self, response, page, menu):
+        results_list = []
+        selector = Selector(text=response.text, base_url="https://ggzyjy.gnzrmzf.gov.cn/f/newtrade/annogoods/list")
+        selector.root.make_links_absolute()
+
+        info_list = selector.xpath('//dd')
+
+        for info in info_list:
+            title = "".join(info.xpath('./a/text()').extract()).replace('[]', '').strip()
+            href = info.xpath('./a/@href').extract_first("").strip()
+            publish_time = info.xpath('./span[@class="byTradingDetailTime"]/text()').extract_first("").strip()
+
+            area = "甘肃"
+            city = "甘南州"
+
+            dedup = [href, publish_time]
+            if not self.RDS.data_filter(dedup):
+                item = {
+                    "site": "甘南州公共资源交易中心",
+                    "channel": menu.channel,
+                    "spidercode": menu.spidercode,
+                    "area": area,
+                    "city": city,
+                    "district": "",
+                    "href": href,
+                    "title": title,
+                    "publishtime": publish_time,
+                    "parse_url": "parse_url",
+                    "parser_name": "ztpc_gnzggzyjyzx",
+                    "is_mixed": True,
+                    "is_theme": True,
+                    "retry": 0,
+                    "comeintime": int2long(int(time.time())),
+                    "is_crawl": False,
+                    "failed": False,
+                }
+
+                self.get_details(item)
+                self.RDS.data_save_redis(dedup)
+                results_list.append(item)
+
+        logger.info(f' *** 第{page}页采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
+
+        return results_list
+
+    def get_details(self, item):
+        href = item['href']
+        xtext, file_list, new_url, order = get_xmxx(href, self.proxy)
+        gtext, gg_url, gg = get_ggxx(href, self.proxy)
+        if int(order[-1]) >= 3:
+            kbp_html, kpb = get_kpb(href, self.proxy)
+        else:
+            kbp_html = ""
+            kpb = ""
+
+        if int(order[-1]) >= 5:
+            hhref = href.split("?")[0] + "?pageIndex=5"
+            hxr_html, hxr_url, hxr = get_zbgs(hhref, self.proxy)
+        else:
+            hxr_html = ""
+            hxr_url = []
+            hxr = ""
+
+        if int(order[-1]) >= 6:
+            zhref = href.split("?")[0] + "?pageIndex=6"
+            jg_html, jg_url, jg = get_zbgs(zhref, self.proxy)
+        else:
+            jg_html = ""
+            jg_url = []
+            jg = ""
+
+        old_title = item['title']
+        org_item = copy.deepcopy(item)
+
+        dt_info = {"项目信息": {"text": xtext, "url": file_list, "new_url": new_url},
+                   "公告信息": {"text": gtext, "url": gg_url, "new_url": gg},
+                   "开评标信息": {"text": kbp_html, "url": [], "new_url": kpb},
+                   "中标候选人公示": {"text": hxr_html, "url": hxr_url, "new_url": hxr},
+                   "中标结果公示": {"text": jg_html, "url": jg_url, "new_url": jg},}
+        for k, v in dt_info.items():
+            if v.get('text'):
+                item["contenthtml"] = v.get('text')
+            else:
+                item["contenthtml"] = "详情请访问原网页!"
+
+            item['title'] = old_title + f"_{k}"
+            item['href'] = v.get('new_url')
+
+            attachments = {}
+            if isinstance(v.get('url'), str):
+                file_name = item['title']
+                file_url = v.get('url')
+                file_type = extract_file_type(file_url=file_url)
+                if file_type:
+                    attachment = AttachmentDownloader().fetch_attachment(
+                        file_name=file_name, file_type=file_type, download_url=file_url)
+                    if attachment.__contains__("fid"):
+                        attachments[str(len(attachments) + 1)] = attachment
+            elif isinstance(v.get('url'), list):
+                for mm in v.get('url'):
+                    f_name = mm.get('filename')
+                    f_url = mm.get('fileurl', '')
+                    if "http" not in f_url:
+                        f_url = f"https://ggzyjy.gnzrmzf.gov.cn/f/OssUploadDownload/countyDownloadFile?fileName={f_name}&urlStr={mm.get('fileurl', '')}"
+                    f_type = extract_file_type(file_url=f_url)
+                    if f_type:
+                        attachment = AttachmentDownloader().fetch_attachment(
+                            file_name=f_name, file_type=f_type, download_url=f_url)
+                        if attachment.__contains__("fid"):
+                            attachments[str(len(attachments) + 1)] = attachment
+
+            if attachments:
+                item["projectinfo"] = {"attachments": attachments}
+            elif not attachments and not v.get('text'):
+                return
+
+            item = format_fileds(item)
+
+            try:
+                self.zt_details.insert_one(item)
+                logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
+            except DuplicateKeyError:
+                logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
+
+            item = copy.deepcopy(org_item)
+
+    def crawl_list_spider(self, page, menu):
+        response = None
+        retry_times = 0
+        while retry_times < 3:
+            try:
+                response = self.fetch_list_page(page=page, menu=menu)
+                res_code = response.status_code
+                logger.debug(f"第{page}页 状态码:{res_code}")
+                if response is not None and res_code == 200:
+                    informations = self.parser_list_page(response=response, page=page, menu=menu)
+                    crawl_num = len(informations)
+                    self.real_cont += crawl_num
+                    logger.info(f"当前已采集 {self.real_cont} 条数据")
+                    time.sleep(random.random())
+                    break
+                else:
+                    self.proxy = get_proxy()
+                    retry_times += 1
+                    time.sleep(1)
+            except Exception as e:
+                logger.error(f"第{page}页 采集异常:{e}")
+                self.proxy = get_proxy()
+                retry_times += 1
+                time.sleep(2)
+
+    def start_list(self, menus):
+        logger.debug("********** 列表页开始 **********")
+        for menu in menus:
+            logger.debug(f"++++++ {menu.channel} 开始采集 ++++++")
+            page = menu.crawl_page
+            for page in range(1, page + 1):
+                self.crawl_list_spider(page=page, menu=menu)
+            logger.debug(f"------ {menu.channel} 采集结束 ------")
+
+        logger.debug("********** 列表页结束 **********")
+
+
+if __name__ == '__main__':
+    Menu = namedtuple('Menu', ['channel', 'spidercode', 'tid', 'crawl_page'])
+
+    menus = [
+        Menu('交易类型全部-县级地区-招标资格预审公告', 'gs_gnzggzyjyzx_jsgc_zb_05', '1', 1),
+        Menu('交易类型全部-县级地区-招标资格预审公告', 'gs_gnzggzyjyzx_jsgc_zb_05', '2', 1),
+        Menu('交易类型全部-县级地区-招标资格预审公告', 'gs_gnzggzyjyzx_jsgc_zb_05', '3', 1),
+    ]
+    Crawl_Zgyd().start_list(menus)
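get_details above fans one list row out into several records (项目信息, 公告信息, and so on), restoring item from a deep copy after each insert so per-variant mutations cannot leak into the next round. A minimal sketch of why the deep copy matters (values are illustrative):

import copy

org = {"title": "项目A", "projectinfo": {"attachments": {}}}

for suffix in ("项目信息", "公告信息", "中标公示"):
    item = copy.deepcopy(org)                          # fresh, fully independent copy
    item["title"] = f'{org["title"]}_{suffix}'
    item["projectinfo"]["attachments"]["1"] = suffix   # would bleed through a shallow copy
    print(item["title"])

With dict.copy() the nested attachments dict would be shared, so each variant would inherit the previous one's files.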

+ 211 - 0
lzz_theme/gnzggzyjyzx/gnz_sx_zb.py

@@ -0,0 +1,211 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-12-10
+---------
+@summary: 甘南州公共资源交易中心
+---------
+@author: Lzz
+"""
+import sys
+import os
+import copy
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.RedisDB import RedisFilter
+from collections import namedtuple
+from utils.attachment import AttachmentDownloader
+from utils.tools import *
+from api_param import *
+from parsel import Selector
+import requests
+import warnings
+import urllib3
+
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+warnings.simplefilter(action='ignore', category=RuntimeWarning)
+
+
+class Crawl_Zgyd:
+
+    def __init__(self):
+        self.proxy = get_proxy()
+        self.py_spider = Mongo_client().py_spider
+        self.zb_list = self.py_spider.theme_list
+        self.zt_details = self.py_spider.data_bak
+        self.RDS = RedisFilter()
+        self.real_cont = 0
+
+    def fetch_list_page(self, page, menu):
+        logger.debug(f' *** 开始采集第{page}页 ***')
+
+        headers = {
+            "accept": "text/html, */*; q=0.01",
+            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "cache-control": "no-cache",
+            "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
+            "origin": "https://ggzyjy.gnzrmzf.gov.cn",
+            "pragma": "no-cache",
+            "priority": "u=1, i",
+            "referer": "https://ggzyjy.gnzrmzf.gov.cn/f/newtrade/annogoods/list?index=1&selectedProjectType=1&tradeStatus=0",
+            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+            "x-requested-with": "XMLHttpRequest"
+        }
+        data = f"pageNo={page}&pageSize=20&tradeStatus=0&prjpropertycode=1%2C2%2C3%2C4%2C5%2C6%2C7%2C8%2C9%2C10%2C11&prjpropertycode=21%2C22%2C23%2C24&prjpropertycode=31&prjpropertycode=13%2C14%2C15%2C16%2C18%2C19%2C20&prjpropertycode=600&tradeArea=1&tradeArea=2&tradeArea=3&tradeArea=4&tradeArea=5&tradeArea=6&tradeArea=7&tradeArea=8&projectname=&tabType=3&tradeType="
+        request_params = {
+            "headers": headers,
+            "data": data,
+            "proxies": self.proxy,
+            "timeout": (30, 30),
+            "verify": False,
+        }
+        url = "https://ggzyjy.gnzrmzf.gov.cn/f/newtrade/annogoods/getAnnoList"
+        resp = requests.post(url, **request_params)
+        time.sleep(1)
+        return resp
+
+    def parser_list_page(self, response, page, menu):
+        results_list = []
+        selector = Selector(text=response.text, base_url="https://ggzyjy.gnzrmzf.gov.cn/f/newtrade/annogoods/list")
+        selector.root.make_links_absolute()
+        info_list = selector.xpath('//dd')
+
+        for info in info_list:
+            title = "".join(info.xpath('./a/text()').extract()).replace('[]', '').strip()
+            href = info.xpath('./a/@href').extract_first("").strip()
+            publish_time = info.xpath('./span[@class="byTradingDetailTime"]/text()').extract_first("").strip()
+
+            area = "甘肃"
+            city = "甘南州"
+
+            dedup = [href, publish_time]
+            if not self.RDS.data_filter(dedup):
+                item = {
+                    "site": "甘南州公共资源交易中心",
+                    "channel": menu.channel,
+                    "spidercode": menu.spidercode,
+                    "area": area,
+                    "city": city,
+                    "district": "",
+                    "href": href,
+                    "title": title,
+                    "publishtime": publish_time,
+                    "parse_url": "parse_url",
+                    "parser_name": "ztpc_gnzggzyjyzx",
+                    "is_mixed": True,
+                    "is_theme": True,
+                    "retry": 0,
+                    "comeintime": int2long(int(time.time())),
+                    "is_crawl": False,
+                    "failed": False,
+                }
+
+                self.get_details(item)
+                self.RDS.data_save_redis(dedup)
+                results_list.append(item)
+
+        logger.info(f' *** 第{page}页采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
+
+        return results_list
+
+    def get_details(self, item):
+
+        xtext, file_list, new1_url, order = get_xmxx(item['href'], self.proxy)
+        gtext, gg_url, new2_url = get_ggxx(item['href'], self.proxy)
+        ztext, zb_url, new3_url = get_zbgs(item['href'], self.proxy)
+
+        old_title = item['title']
+        org_item = copy.deepcopy(item)
+
+        dt_info = {"项目信息": {"text": xtext, "url": file_list, "new_url": new1_url},
+                   "公告信息": {"text": gtext, "url": gg_url, "new_url": new2_url},
+                   "中标公示": {"text": ztext, "url": zb_url, "new_url": new3_url}}
+        for k, v in dt_info.items():
+            if v.get('text'):
+                item["contenthtml"] = v.get('text')
+            else:
+                item["contenthtml"] = "详情请访问原网页!"
+
+            item['title'] = old_title + f"_{k}"
+            item['href'] = v.get('new_url')
+
+            attachments = {}
+            if isinstance(v.get('url'), str):
+                file_name = item['title']
+                file_url = v.get('url')
+                file_type = extract_file_type(file_url=file_url)
+                if file_type:
+                    attachment = AttachmentDownloader().fetch_attachment(
+                        file_name=file_name, file_type=file_type, download_url=file_url)
+                    if attachment.__contains__("fid"):
+                        attachments[str(len(attachments) + 1)] = attachment
+            elif isinstance(v.get('url'), list):
+                for mm in v.get('url'):
+                    f_name = mm.get('filename')
+                    f_url = mm.get('fileurl', '')
+                    if "http" not in f_url:
+                        f_url = f"https://ggzyjy.gnzrmzf.gov.cn/f/OssUploadDownload/countyDownloadFile?fileName={f_name}&urlStr={mm.get('fileurl', '')}"
+                    f_type = extract_file_type(file_url=f_url)
+                    if f_type:
+                        attachment = AttachmentDownloader().fetch_attachment(
+                            file_name=f_name, file_type=f_type, download_url=f_url)
+                        if attachment.__contains__("fid"):
+                            attachments[str(len(attachments) + 1)] = attachment
+
+            if attachments:
+                item["projectinfo"] = {"attachments": attachments}
+            elif not attachments and not v.get('text'):
+                return
+
+            item = format_fileds(item)
+
+            try:
+                self.zt_details.insert_one(item)
+                logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
+            except DuplicateKeyError:
+                logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
+
+            item = copy.deepcopy(org_item)
+
+    def crawl_list_spider(self, page, menu):
+        retry_times = 0
+        while retry_times < 3:
+            try:
+                response = self.fetch_list_page(page=page, menu=menu)
+                res_code = response.status_code
+                logger.debug(f"第{page}页 状态码:{res_code}")
+                if response is not None and res_code == 200:
+                    informations = self.parser_list_page(response=response, page=page, menu=menu)
+                    crawl_num = len(informations)
+                    self.real_cont += crawl_num
+                    logger.info(f"当前已采集 {self.real_cont} 条数据")
+                    time.sleep(random.random())
+                    break
+                else:
+                    self.proxy = get_proxy()
+                    retry_times += 1
+                    time.sleep(1)
+            except Exception as e:
+                logger.error(f"第{page}页 采集异常:{e}")
+                self.proxy = get_proxy()
+                retry_times += 1
+                time.sleep(2)
+
+    def start_list(self, menus):
+        logger.debug("********** 列表页开始 **********")
+        for menu in menus:
+            logger.debug(f"++++++ {menu.channel} 开始采集 ++++++")
+            page = menu.crawl_page
+            for page in range(1, page + 1):
+                self.crawl_list_spider(page=page, menu=menu)
+            logger.debug(f"------ {menu.channel} 采集结束 ------")
+
+        logger.debug("********** 列表页结束 **********")
+
+
+if __name__ == '__main__':
+    Menu = namedtuple('Menu', ['channel', 'spidercode', 'crawl_page'])
+
+    menus = [
+        Menu('交易类型全部-县级地区-项目信息', 'gs_gnzggzyjyzx_jsgc_zb_02', 1),
+    ]
+    Crawl_Zgyd().start_list(menus)

+ 181 - 0
lzz_theme/gnzggzyjyzx/gnz_sx_zgys.py

@@ -0,0 +1,181 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-12-10
+---------
+@summary: 甘南州公共资源交易中心
+---------
+@author: Lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.RedisDB import RedisFilter
+from collections import namedtuple
+from utils.attachment import AttachmentDownloader
+from utils.tools import *
+from api_param import get_gg
+from parsel import Selector
+import requests
+import warnings
+import urllib3
+
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+warnings.simplefilter(action='ignore', category=RuntimeWarning)
+
+
+class Crawl_Zgyd:
+
+    def __init__(self):
+        self.proxy = get_proxy()
+        self.py_spider = Mongo_client().py_spider
+        self.zb_list = self.py_spider.theme_list
+        self.zt_details = self.py_spider.data_bak
+        self.RDS = RedisFilter()
+        self.real_cont = 0
+
+    def fetch_list_page(self, page, menu):
+        logger.debug(f' *** 开始采集第{page}页 ***')
+
+        headers = {
+            "accept": "text/html, */*; q=0.01",
+            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "cache-control": "no-cache",
+            "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
+            "origin": "https://ggzyjy.gnzrmzf.gov.cn",
+            "pragma": "no-cache",
+            "priority": "u=1, i",
+            "referer": "https://ggzyjy.gnzrmzf.gov.cn/f/newtrade/annogoods/list?index=1&selectedProjectType=1&tradeStatus=0",
+            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+            "x-requested-with": "XMLHttpRequest"
+        }
+
+        data = f"pageNo={page}&pageSize=20&tradeStatus=0&prjpropertycode=1%2C2%2C3%2C4%2C5%2C6%2C7%2C8%2C9%2C10%2C11&prjpropertycode=21%2C22%2C23%2C24&prjpropertycode=31&prjpropertycode=13%2C14%2C15%2C16%2C18%2C19%2C20&prjpropertycode=600&tradeArea=1&tradeArea=2&tradeArea=3&tradeArea=4&tradeArea=5&tradeArea=6&tradeArea=7&tradeArea=8&projectname=&tabType=1&tradeType="
+        request_params = {
+            "headers": headers,
+            "data": data,
+            "proxies": self.proxy,
+            "timeout": (30, 30),
+            "verify": False,
+        }
+        url = "https://ggzyjy.gnzrmzf.gov.cn/f/newtrade/annogoods/getAnnoList"
+        resp = requests.post(url, **request_params)
+        time.sleep(1)
+        return resp
+
+    def parser_list_page(self, response, page, menu):
+        results_list = []
+        selector = Selector(text=response.text, base_url="https://ggzyjy.gnzrmzf.gov.cn/f/newtrade/annogoods/list")
+        selector.root.make_links_absolute()
+        info_list = selector.xpath('//dd')
+        for info in info_list:
+            title = "".join(info.xpath('./a/text()').extract()).replace('[]', '').strip()
+            href = info.xpath('./a/@href').extract_first("").strip()
+            publish_time = info.xpath('./span[@class="byTradingDetailTime"]/text()').extract_first("").strip()
+
+            area = "甘肃"
+            city = "甘南州"
+
+            dedup = [href, publish_time]
+            if not self.RDS.data_filter(dedup):
+                item = {
+                    "site": "甘南州公共资源交易中心",
+                    "channel": menu.channel,
+                    "spidercode": menu.spidercode,
+                    "area": area,
+                    "city": city,
+                    "district": "",
+                    "href": href,
+                    "title": title,
+                    "publishtime": publish_time,
+                    "parse_url": "parse_url",
+                    "parser_name": "ztpc_gnzggzyjyzx",
+                    "is_mixed": True,
+                    "is_theme": True,
+                    "retry": 0,
+                    "comeintime": int2long(int(time.time())),
+                    "is_crawl": False,
+                    "failed": False,
+                }
+
+                self.get_details(item)
+                self.RDS.data_save_redis(dedup)
+                results_list.append(item)
+
+        logger.info(f' *** 第{page}页采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
+
+        return results_list
+
+    def get_details(self, item):
+
+        file_url = get_gg(item['href'], self.proxy)
+
+        item["contenthtml"] = "详情请访问原网页!"
+
+        attachments = {}
+
+        file_name = item['title']
+        file_type = extract_file_type(file_url=file_url)
+        if file_type:
+            attachment = AttachmentDownloader().fetch_attachment(
+                file_name=file_name, file_type=file_type, download_url=file_url)
+            if attachment.__contains__("fid"):
+                attachments[str(len(attachments) + 1)] = attachment
+
+        if attachments:
+            item["projectinfo"] = {"attachments": attachments}
+        else:
+            return
+
+        item = format_fileds(item)
+
+        try:
+            self.zt_details.insert_one(item)
+            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
+        except DuplicateKeyError:
+            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
+
+    def crawl_list_spider(self, page, menu):
+        response = None
+        retry_times = 0
+        while retry_times < 3:
+            try:
+                response = self.fetch_list_page(page=page, menu=menu)
+                res_code = response.status_code
+                logger.debug(f"第{page}页 状态码:{res_code}")
+                if response is not None and res_code == 200:
+                    informations = self.parser_list_page(response=response, page=page, menu=menu)
+                    crawl_num = len(informations)
+                    self.real_cont += crawl_num
+                    logger.info(f"当前已采集 {self.real_cont} 条数据")
+                    time.sleep(random.random())
+                    break
+                else:
+                    self.proxy = get_proxy()
+                    retry_times += 1
+                    time.sleep(1)
+            except Exception as e:
+                logger.error(f"第{page}页 采集异常:{e}")
+                self.proxy = get_proxy()
+                retry_times += 1
+                time.sleep(2)
+
+    def start_list(self, menus):
+        logger.debug("********** 列表页开始 **********")
+        for menu in menus:
+            logger.debug(f"++++++ {menu.channel} 开始采集 ++++++")
+            page = menu.crawl_page
+            for page in range(1, page + 1):
+                self.crawl_list_spider(page=page, menu=menu)
+            logger.debug(f"------ {menu.channel} 采集结束 ------")
+
+        logger.debug("********** 列表页结束 **********")
+
+
+if __name__ == '__main__':
+    Menu = namedtuple('Menu', ['channel', 'spidercode', 'crawl_page'])
+
+    menus = [
+        Menu('交易类型全部-县级地区-招标资格预审公告', 'gs_gnzggzyjyzx_jsgc_zb_05', 1),
+    ]
+    Crawl_Zgyd().start_list(menus)

+ 211 - 0
lzz_theme/gnzggzyjyzx/gnz_ygcg.py

@@ -0,0 +1,211 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-12-10
+---------
+@summary: 甘南州公共资源交易中心
+---------
+@author: Lzz
+"""
+import copy
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.RedisDB import RedisFilter
+from collections import namedtuple
+from utils.attachment import AttachmentDownloader
+from utils.tools import *
+from api_param import *
+from parsel import Selector
+import requests
+import warnings
+import urllib3
+
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+warnings.simplefilter(action='ignore', category=RuntimeWarning)
+
+
+class Crawl_Zgyd:
+
+    def __init__(self):
+        self.proxy = get_proxy()
+        self.py_spider = Mongo_client().py_spider
+        self.zb_list = self.py_spider.theme_list
+        self.zt_details = self.py_spider.data_bak
+        self.RDS = RedisFilter()
+        self.real_cont = 0
+
+    def fetch_list_page(self, page, menu):
+        logger.debug(f' *** 开始采集第{page}页 ***')
+
+        headers = {
+            "accept": "text/html, */*; q=0.01",
+            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "cache-control": "no-cache",
+            "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
+            "origin": "https://ggzyjy.gnzrmzf.gov.cn",
+            "pragma": "no-cache",
+            "priority": "u=1, i",
+            "referer": "https://ggzyjy.gnzrmzf.gov.cn/f/newtrade/annogoods/list?index=1&selectedProjectType=1&tradeStatus=0",
+            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+            "x-requested-with": "XMLHttpRequest"
+        }
+        params = f"pageNo={page}&pageSize=20&tradeStatus=0&prjpropertycode=800&prjpropertycode=801&prjpropertycode=802&prjpropertycode=803&tradeArea=14&projectname=&tabType={menu.tid}&tradeType=sunAll"
+        request_params = {
+            "headers": headers,
+            "params": params,
+            "proxies": self.proxy,
+            "timeout": (30, 30),
+            "verify": False,
+        }
+        url = "https://ggzyjy.gnzrmzf.gov.cn/f/purchase/purchaseAnnoment/getAnnoList?type=sunAll&annomentTitle="
+        resp = requests.post(url, **request_params)
+        time.sleep(1)
+        return resp
+
+    def parser_list_page(self, response, page, menu):
+        results_list = []
+        selector = Selector(text=response.text, base_url="https://ggzyjy.gnzrmzf.gov.cn/f/newtrade/annogoods/list")
+        selector.root.make_links_absolute()
+
+        info_list = selector.xpath('//dd')
+
+        for info in info_list:
+            title = "".join(info.xpath('./a/text()').extract()).replace('[]', '').strip()
+            href = info.xpath('./a/@href').extract_first("").strip()
+            publish_time = info.xpath('./span[@class="byTradingDetailTime"]/text()').extract_first("").strip()
+
+            area = "甘肃"
+            city = "甘南州"
+
+            dedup = [href, publish_time]
+            if not self.RDS.data_filter(dedup):
+                item = {
+                    "site": "甘南州公共资源交易中心",
+                    "channel": menu.channel,
+                    "spidercode": menu.spidercode,
+                    "area": area,
+                    "city": city,
+                    "district": "",
+                    "href": href,
+                    "title": title,
+                    "publishtime": publish_time,
+                    "parse_url": "parse_url",
+                    "parser_name": "ztpc_gnzggzyjyzx",
+                    "is_mixed": True,
+                    "is_theme": True,
+                    "retry": 0,
+                    "comeintime": int2long(int(time.time())),
+                    "is_crawl": False,
+                    "failed": False,
+                }
+
+                self.get_details(item)
+                self.RDS.data_save_redis(dedup)
+                results_list.append(item)
+
+        logger.info(f' *** 第{page}页采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
+
+        return results_list
+
+    def get_details(self, item):
+        href = item['href']
+        hid = "".join(re.findall('cn/f/(.*?)/', href))
+        pid = "".join(re.findall('nnoment/(.*?)/', href))
+        aid = "".join(re.findall('annogoodsId=[\d]{1,8}', href)).replace('annogoodsId=', '')
+        gglist = get_gglist(href, self.proxy)
+
+        old_title = item['title']
+        org_item = copy.deepcopy(item)
+
+        for gg in gglist:
+            if gg and "(this);" in gg:
+
+                tmp = (hid, pid, aid, href, self.proxy)
+                ggtype, detail_text = eval(gg.replace('(this);', str(tmp)))
+                root = Selector(text=detail_text)
+                ex_html = root.xpath('//div[@class="CaiGouPrompt"]').extract_first()
+                if ex_html:
+                    detail_text = detail_text.replace(ex_html, '')
+
+                item["contenthtml"] = detail_text
+                txp = '//div[@class="yDealMain"]/h6[@class="yDealMainTitle"]/text()|//h4[@class="yAnnounceName"]/text()'
+                s_title = root.xpath(txp).extract_first("").strip()
+                if s_title:
+                    item['s_title'] = s_title
+                else:
+                    item['title'] = old_title + f"_{ggtype}"
+                item['href'] = href + f"&{int(time.time()*1000)}"
+
+                attachments = {}
+
+                file_list = root.xpath('//a')
+                if file_list:
+                    for info in file_list:
+                        file_url = info.xpath('./@href').extract_first("").strip()
+                        file_name = info.xpath('./text()').extract_first("").strip()
+                        file_type = extract_file_type(file_name, file_url)
+                        if file_type:
+                            attachment = AttachmentDownloader().fetch_attachment(
+                                file_name=file_name, file_type=file_type, download_url=file_url)
+                            if attachment.__contains__("fid"):
+                                attachments[str(len(attachments) + 1)] = attachment
+
+                if attachments:
+                    item["projectinfo"] = {"attachments": attachments}
+
+                item = format_fileds(item)
+
+                try:
+                    self.zt_details.insert_one(item)
+                    logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
+                except DuplicateKeyError:
+                    logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
+
+                item = copy.deepcopy(org_item)
+
+    def crawl_list_spider(self, page, menu):
+        response = None
+        retry_times = 0
+        while retry_times < 3:
+            try:
+                response = self.fetch_list_page(page=page, menu=menu)
+                res_code = response.status_code
+                logger.debug(f"第{page}页 状态码:{res_code}")
+                if response is not None and res_code == 200:
+                    informations = self.parser_list_page(response=response, page=page, menu=menu)
+                    crawl_num = len(informations)
+                    self.real_cont += crawl_num
+                    logger.info(f"当前已采集 {self.real_cont} 条数据")
+                    time.sleep(random.random())
+                    break
+                else:
+                    self.proxy = get_proxy()
+                    retry_times += 1
+                    time.sleep(1)
+            except Exception as e:
+                logger.error(f"第{page}页 采集异常:{e}")
+                self.proxy = get_proxy()
+                retry_times += 1
+                time.sleep(2)
+
+    def start_list(self, menus):
+        logger.debug("********** 列表页开始 **********")
+        for menu in menus:
+            logger.debug(f"++++++ {menu.channel} 开始采集 ++++++")
+            page = menu.crawl_page
+            for page in range(1, page + 1):
+                self.crawl_list_spider(page=page, menu=menu)
+            logger.debug(f"------ {menu.channel} 采集结束 ------")
+
+        logger.debug("********** 列表页结束 **********")
+
+
+if __name__ == '__main__':
+    Menu = namedtuple('Menu', ['channel', 'spidercode', 'tid', 'crawl_page'])
+
+    menus = [
+        Menu('交易类型全部-阳光采购-合同', 'gs_gnzggzyjyzx_jsgc_zb_09', '1', 1),
+        Menu('交易类型全部-阳光采购-合同', 'gs_gnzggzyjyzx_jsgc_zb_09', '2', 1),
+    ]
+    Crawl_Zgyd().start_list(menus)

+ 14 - 0
lzz_theme/gnzggzyjyzx/start.sh

@@ -0,0 +1,14 @@
+#!/bin/bash
+
+ps -ef |grep "gnz_gn_zbjh.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "gnz_gn_zgys.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "gnz_sx_zb.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "gnz_sx_zgys.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "gnz_ygcg.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+
+nohup python3 gnz_gn_zbjh.py > log/gnz_gn_zbjh.out 2>&1 &
+nohup python3 gnz_gn_zgys.py > log/gnz_gn_zgys.out 2>&1 &
+nohup python3 gnz_sx_zb.py > log/gnz_sx_zb.out 2>&1 &
+nohup python3 gnz_sx_zgys.py > log/gnz_sx_zgys.out 2>&1 &
+nohup python3 gnz_ygcg.py > log/gnz_ygcg.out 2>&1 &
+

+ 0 - 1
lzz_theme/hnszfcgdzmc/dt_start.sh

@@ -2,4 +2,3 @@
 
 ps -ef |grep "zxjj_details.py" |grep -v grep |awk '{print $2}' |xargs kill -9
 nohup python3 zxjj_details.py > log/zxjj_details.out 2>&1 &
-

+ 3 - 3
lzz_theme/hnszfcgdzmc/hn_collector.py

@@ -24,8 +24,8 @@ menus_list = []
 # file_list = [file for file in files if re.findall('[\u4e00-\u9fa5]', file)]
 
 for file in file_list:
-    module_name = file[:-3].replace(".\\", "")  # 去除后缀名".py" windows
-    # module_name = file[:-3].replace("./", "")  # 去除后缀名".py"  centos mac
+    # module_name = file[:-3].replace(".\\", "")  # 去除后缀名".py" windows
+    module_name = file[:-3].replace("./", "")  # 去除后缀名".py"  centos mac
     if re.findall('^[\u4e00-\u9fa5]', module_name):
         spider_param = importlib.import_module(module_name)
         menus_list.append(spider_param.menus)
@@ -54,5 +54,5 @@ def main(work=1):
 
 
 if __name__ == '__main__':
-    main(work=1)
+    main(work=4)
 

+ 6 - 32
lzz_theme/hnszfcgdzmc/pinyi_jjjg_spider.py → lzz_theme/hnszfcgdzmc/jjjg_spider.py

@@ -21,23 +21,6 @@ import warnings
 
 warnings.filterwarnings('ignore')
 
-# def get_IP():
-#     url = "https://share.proxy.qg.net/get?key=09D1B211&num=1&area=&isp=0&format=txt&seq=&distinct=true"
-#     res = requests.get(url, timeout=10)
-#     proxy = {'http': f'http://{res.text}',
-#              'https': f'http://{res.text}'}
-#     logger.warning(proxy)
-#     return proxy
-
-def get_IP():
-    proxy = "http://CBD8E3AB:E936749E747D@tun-uzqqwl.qg.net:19281"
-    proxies = {
-        "http": proxy,
-        "https": proxy,
-    }
-    return proxies
-
-
 
 class Crawl_Hndzmc:
 
@@ -46,9 +29,6 @@ class Crawl_Hndzmc:
         self.zb_list = self.py_spider.theme_list
         self.RDS = RedisFilter()
         self.real_cont = 0
-        # self.ip_list = pinyi_proxy()
-        # self.py_proxy = self.ip_list.pop(0)
-        self.py_proxy = get_IP()
         self.params = {}
         self.cookies = {}
 
@@ -103,11 +83,11 @@ class Crawl_Hndzmc:
         except:
             return {}
 
-    def fetch_list_page(self, page, menu, proxy):
+    def fetch_list_page(self, page, menu):
         logger.debug(f' *** {menu.channel} 开始采集第{page}页 ***')
 
         session = requests.Session()
-        session.proxies = get_IP()
+        session.proxies = get_QGIP()
         session.verify = False
 
         headers = {
@@ -126,7 +106,7 @@ class Crawl_Hndzmc:
         data = {
             "backCategoryName": "",
             "pageNo": page,
-            "pageSize": 10,
+            "pageSize": 100,
             "stateList": [],
             "otherSearch": "",
             "districtCodeList": [],
@@ -210,10 +190,8 @@ class Crawl_Hndzmc:
         retry = 0
         while (retry := retry + 1) < 10:
             try:
-                # if len(self.ip_list) < 10:
-                #     self.ip_list = pinyi_proxy()
                 logger.debug(f"{menu.channel}_第{page}页 start")
-                response = self.fetch_list_page(page=page, menu=menu, proxy=self.py_proxy)
+                response = self.fetch_list_page(page=page, menu=menu)
                 if response is not None and response.status_code == 200:
                     informations = self.parser_list_page(response=response, page=page, menu=menu)
                     crawl_num = len(informations)
@@ -223,13 +201,9 @@ class Crawl_Hndzmc:
                     return
                 else:
                     time.sleep(2)
-                    # self.py_proxy = self.ip_list.pop(0)
-                    self.py_proxy = get_IP()
             except Exception as e:
                 logger.error(e)
                 time.sleep(2)
-                # self.py_proxy = self.ip_list.pop(0)
-                self.py_proxy = get_IP()
 
     def start(self, menus):
         logger.debug("采集开始 》》》 ")
@@ -243,7 +217,7 @@ if __name__ == '__main__':
     Menu = namedtuple('Menu', ['channel', 'spidercode', 'crawl_page'])
 
     menus = [
-        Menu('湖南省本级-竞价结果', 'hn_hnszfcgdzmc_hnsbj_jjjg', 200),
+        Menu('湖南省本级-竞价结果', 'hn_hnszfcgdzmc_hnsbj_jjjg', 1),
     ]
 
-    Crawl_Hndzmc().start(menus)
+    Crawl_Hndzmc().start(menus)

+ 2 - 14
lzz_theme/hnszfcgdzmc/pinyi_spider.py → lzz_theme/hnszfcgdzmc/spider.py

@@ -193,6 +193,7 @@ class Crawl_Hndzmc:
                                 break
                             self.proxy = get_proxy()
 
+
                 item = {
                     "site": "湖南省政府采购电子卖场",
                     "channel": menu.channel,
@@ -225,7 +226,7 @@ class Crawl_Hndzmc:
                 self.RDS.data_save_redis(dedup)
                 results_list.append(item)
 
-        logger.info(f' *** {menu.channel}_第{page}页 采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
+        logger.info(f' *** {menu.channel}_{menu.anTypes}_第{page}页 采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
         if len(results_list) < 10 or len(info_list) < 100:
             self.is_stop = True
         return results_list
@@ -259,16 +260,3 @@ class Crawl_Hndzmc:
                 if self.is_stop:
                     break
             self.is_stop = False
-
-
-if __name__ == '__main__':
-
-    Menu = namedtuple('Menu', ['channel', 'spidercode', 'district', 'anTypes', 'crawl_page'])
-
-    announcementTypes = [8020, 8025, 8026, 8013]
-
-    menus = [
-        Menu('湖南省本级-公告大厅', 'hn_hnszfcgdzmc_hnsbj_ggdt', '439900', announcementTypes, 30),
-    ]
-
-    Crawl_Hndzmc().start(menus)

+ 135 - 0
lzz_theme/hnszfcgdzmc/start.sh

@@ -0,0 +1,135 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-06-24
+---------
+@summary: 湖南省政府采购电子卖场 - 详情页
+---------
+@author: Lzz
+"""
+import sys
+import os
+sys.path.append(os.path.dirname(os.getcwd()))
+
+from utils.attachment import AttachmentDownloader
+from utils.tools import *
+import warnings
+
+warnings.filterwarnings('ignore')
+
+
+class Details:
+
+    def __init__(self):
+        self.proxy = get_proxy()
+        self.db_table = Mongo_client().py_spider
+        self.db_name = self.db_table.theme_list
+        self.zt_details = self.db_table.data_bak
+        self.rds = Redis_client()
+        self.headers = {
+            "Accept": "application/json, text/plain, */*",
+            "Accept-Language": "zh-CN,zh;q=0.9",
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "Pragma": "no-cache",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
+            "X-Requested-With": "XMLHttpRequest",
+        }
+
+
+    def detail_get(self, response, item):
+        html = response.json().get('result').get('content')
+
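+        # the API apparently embeds the literal string "None" in the HTML; strip it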
+        item["contenthtml"] = html.replace('None', '')
+
+        attachments = {}
+        file_list = response.json().get('result').get('attachments')
+        if file_list:
+            for info in file_list:
+                file_url = info.get('url', '')
+                file_name = info.get('attachmentName', '').strip()
+                file_type = extract_file_type(file_name, file_url)
+                if not file_type:
+                    continue
+
+                attachment = AttachmentDownloader().fetch_attachment(
+                    file_name=file_name,
+                    file_type=file_type,
+                    download_url=file_url,
+                    proxies=self.proxy
+                )
+                attachments[str(len(attachments) + 1)] = attachment
+
+        if attachments:
+            item['projectinfo'] = {"attachments": attachments}
+
+        item = format_fileds(item)
+
+        item['comeintime'] = int2long(time.time())
+
+        try:
+            self.zt_details.insert_one(item)
+            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
+        except DuplicateKeyError:
+            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
+
+    def fetch_request(self, item):
+        href = item.get("parse_url")
+        rid = "".join(re.findall('requisitionId=(.*?)&',href))
+        tp = "".join(re.findall('&type=(.*)',href))
+        url = "https://hunan.zcygov.cn/api/sparta/announcement/detail"
+        params = {
+            "requisitionId": rid,
+            "type": tp,
+            "timestamp": f"{int(time.time())}"
+        }
+        response = requests.get(url=url, headers=self.headers, params=params,
+                                proxies=self.proxy, timeout=30, verify=False)
+
+        return response
+
+    def deal_request(self, item):
+        response = None
+        retry_times = 0
+        org_item = item.copy()
+        while retry_times < 3:
+            try:
+                response = self.fetch_request(item)
+                if response is not None and response.status_code == 200:
+                    self.detail_get(response, item=item)
+                    time.sleep(random.random())
+                    return True
+                else:
+                    self.proxy = get_proxy()
+                    retry_times += 1
+                    time.sleep(1)
+            except Exception as e:
+                item = org_item
+                logger.error(f"{item['href']} 采集异常:{e}")
+                self.proxy = get_proxy()
+                retry_times += 1
+        logger.warning(f"[采集失败]{item['href']}")
+        return False
+
+    def start(self, limit=1):
+        logger.debug("********** 详情页采集开始 **********")
+        time.sleep(random.random())
+        count = 0
+        with self.db_name.find({"parser_name": "ztpc_hnszfcgdzmc", "failed": False, "is_crawl": False},
+                               no_cursor_timeout=True) as data_list:
+            for item in data_list:
+                # logger.debug(item)
+                if count >= limit:
+                    break
+                count += 1
+                update_id = item["_id"]
+                retry = item["retry"]
+                if self.deal_request(item):
+                    self.db_name.update_one({"_id": update_id}, {"$set": {"is_crawl": True}})
+                else:
+                    retry += 1
+                    self.db_name.update_one({"_id": update_id}, {"$set": {"failed": True, "retry": retry}})
+
+        logger.debug("********** 详情页采集结束 **********")
+
+
+if __name__ == "__main__":
+    Details().start(limit=200)

+ 4 - 10
lzz_theme/hnszfcgdzmc/pinyi_zxjj_spider.py → lzz_theme/hnszfcgdzmc/zxjj_spider.py

@@ -29,8 +29,6 @@ class Crawl_Hndzmc:
         self.zb_list = self.py_spider.theme_list
         self.RDS = RedisFilter()
         self.real_cont = 0
-        self.ip_list = pinyi_proxy()
-        self.py_proxy = self.ip_list.pop(0)
         self.params = {}
         self.cookies = {}
 
@@ -85,11 +83,11 @@ class Crawl_Hndzmc:
         except:
             return {}
 
-    def fetch_list_page(self, page, menu, proxy):
+    def fetch_list_page(self, page, menu):
         logger.debug(f' *** {menu.channel} 开始采集第{page}页 ***')
 
         session = requests.Session()
-        session.proxies = proxy
+        session.proxies = get_QGIP()
         session.verify = False
 
         headers = {
@@ -194,10 +192,8 @@ class Crawl_Hndzmc:
         retry = 0
         while (retry := retry + 1) < 10:
             try:
-                if len(self.ip_list) < 10:
-                    self.ip_list = pinyi_proxy()
                 logger.debug(f"{menu.channel}_第{page}页 start")
-                response = self.fetch_list_page(page=page, menu=menu, proxy=self.py_proxy)
+                response = self.fetch_list_page(page=page, menu=menu)
                 if response is not None and response.status_code == 200:
                     informations = self.parser_list_page(response=response, page=page, menu=menu)
                     crawl_num = len(informations)
@@ -207,11 +203,9 @@ class Crawl_Hndzmc:
                     return
                 else:
                     time.sleep(2)
-                    self.py_proxy = self.ip_list.pop(0)
             except Exception as e:
                 logger.error(e)
                 time.sleep(2)
-                self.py_proxy = self.ip_list.pop(0)
 
     def start(self, menus):
         logger.debug("采集开始 》》》 ")
@@ -230,4 +224,4 @@ if __name__ == '__main__':
         Menu('湖南省本级-最新竞价', 'hn_hnszfcgdzmc_hnsbj_zxjj', 1),
     ]
 
-    Crawl_Hndzmc().start(menus)
+    Crawl_Hndzmc().start(menus)

+ 160 - 0
lzz_theme/htdzcgpt/htdz_bggg_list.py

@@ -0,0 +1,160 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-10
+---------
+@summary: 航天电子采购平台
+---------
+@author: lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.RedisDB import RedisFilter
+from collections import namedtuple
+from utils.tools import *
+from htdz_login import create_cookie
+from parsel import Selector
+import requests
+import json
+import warnings
+
+warnings.simplefilter('ignore')
+
+
+def openChilUrl(url):
+    return url
+
+
+class Crawl_Htdz:
+
+    def __init__(self):
+        self.py_spider = Mongo_client().py_spider
+        self.zb_list = self.py_spider.theme_list
+        self.RDS = RedisFilter()
+        self.real_cont = 0
+
+    def get_cookies(self):
+        if not os.path.isfile('./htdz_ck.json'):
+            create_cookie()
+
+        with open('./htdz_ck.json', 'r', encoding='utf-8') as fr:
+            cks = fr.read()
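+        # tolerate repr-style dumps: normalize single quotes so json.loads accepts them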
+        ck = json.loads(cks.replace("'", '"'))
+        return ck
+
+    def fetch_list_page(self, page, menu):
+        logger.debug(f' *** 开始采集第{page}页 ***')
+        headers = {
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Cache-Control": "max-age=0",
+            "Connection": "keep-alive",
+            "Content-Type": "application/x-www-form-urlencoded",
+            "Origin": "https://td.ispacechina.com",
+            "Referer": "https://td.ispacechina.com/xjbgList.do",
+            "Upgrade-Insecure-Requests": "1",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+        }
+
+        url = f"https://td.ispacechina.com/{menu.tid}.do"
+        data = {
+            "hzdm": "",
+            "pm": "",
+            "subType": "",
+            "fphm": "",
+            "title": "",
+            "hyname": "",
+            "rqStart2": "",
+            "rqEnd2": "",
+            "pageNumber": f"{page}",
+            "pageSize": "10",
+            "sortColumns": "undefined"
+        }
+
+        resp = requests.post(url, headers=headers, data=data, timeout=30, verify=False)
+        time.sleep(1)
+        return resp
+
+    def parser_list_page(self, response, page, menu):
+        results_list = []
+        info_list = Selector(response.text).xpath('//div[@class="dynamic_tablebox"]/table/tbody/tr')
+        for info in info_list:
+            if menu.channel == "变更公告":
+                hid = info.xpath('./td[2]/@changeid').extract_first()
+                href = f"https://td.ispacechina.com/xjbgInfo.do?changeId={hid}"
+            else:
+                hid = info.xpath('./td[2]/@fphm').extract_first()
+                href = f"https://td.ispacechina.com/xjzzInfo.do?fphm={hid}"
+            title = info.xpath('./td[2]/@title').extract_first()
+            create_time = info.xpath('./td[last()]/@title').extract_first("") + ":00"
+
+            dedup = [href]
+            if not self.RDS.data_filter(dedup):
+                item = {
+                    "site": "航天电子采购平台",
+                    "channel": menu.channel,
+                    "spidercode": menu.code,
+                    "area": "全国",
+                    "city": "",
+                    "district": "",
+                    "href": "#",
+                    "competehref": href,
+                    "title": title,
+                    "publishtime": create_time,
+                    "parse_url": href,
+                    "parser_name": "ztpc_htdz_zbhxrgs",
+                    "is_mixed": False,
+                    "is_theme": True,
+                    "retry": 0,
+                    "comeintime": int2long(int(time.time())),
+                    "is_crawl": False,
+                    "failed": False,
+                }
+
+                self.zb_list.insert_one(item)
+                self.RDS.data_save_redis(dedup)
+                results_list.append(item)
+
+        logger.info(f' *** 第{page}页采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
+        self.real_cont += len(results_list)
+        logger.info(f"当前已采集 {self.real_cont} 条数据")
+        return results_list
+
+    def crawl_list_spider(self, page, menu):
+        retry_times = 0
+        while (retry_times := retry_times + 1) < 5:
+            try:
+                response = self.fetch_list_page(page=page, menu=menu)
+                res_code = response.status_code
+                logger.debug(f"第{page}页 状态码:{res_code}")
+                if response is not None and res_code == 200:
+                    self.parser_list_page(response=response, page=page, menu=menu)
+                    time.sleep(3)
+                    return
+                else:
+                    time.sleep(3)
+            except Exception as e:
+                logger.exception(f"第{page}页 采集异常:{e}")
+                time.sleep(6)
+
+    def start_list(self, menus):
+        logger.debug("********** 列表页开始 **********")
+        for menu in menus:
+            logger.debug(f"++++++ {menu.channel} 开始采集 ++++++")
+            page = menu.crawl_page
+            for page in range(1, page + 1):
+                self.crawl_list_spider(page=page, menu=menu)
+            logger.debug(f"------ {menu.channel} 采集结束 ------")
+
+        logger.debug("********** 列表页结束 **********")
+
+
+if __name__ == '__main__':
+    Menu = namedtuple('Menu', ['channel', 'code', 'tid', 'crawl_page'])
+
+    menus = [
+        Menu('变更公告', 'a_htdzcgpt_biggg', 'xjbgList', 1),
+        Menu('中止公告', 'a_htdzcgpt_zhzgg', 'xjzzList', 1),
+    ]
+    Crawl_Htdz().start_list(menus)

+ 99 - 0
lzz_theme/htdzcgpt/htdz_cjgg_details.py

@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-13
+---------
+@summary: 航天电子采购平台
+---------
+@author: lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.attachment import AttachmentDownloader
+from utils.tools import *
+from parsel import Selector
+import json
+import warnings
+import copy
+
+warnings.simplefilter('ignore')
+
+
+class Details:
+
+    def __init__(self):
+        self.db_table = Mongo_client().py_spider
+        self.db_name = self.db_table.theme_list
+        self.zt_details = self.db_table.data_bak
+        self.headers = {
+            "Accept": "application/json, text/plain, */*",
+            "Accept-Language": "zh-CN,zh;q=0.9",
+            "Connection": "keep-alive",
+            "Content-Type": "application/json;charset=utf-8",
+            "Origin": "https://jt.ispacechina.com",
+            "Referer": "https://jt.ispacechina.com/",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
+        }
+
+    def detail_get(self, response, item):
+
+        root = Selector(response.text)
+        html = root.xpath('//div[@class="htcg_dynamic_con"]').extract_first("")
+        rm_list = ['//div[@class="clearfix"]', ]
+        html = remove_htmldata(rm_list, html, root)
+        item['contenthtml'] = html
+
+        item = format_fileds(item)
+
+        try:
+            self.zt_details.insert_one(item)
+            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
+        except DuplicateKeyError:
+            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
+
+    def fetch_request(self, item):
+        response = requests.get(url=item.get("parse_url"), headers=self.headers, timeout=30, verify=False)
+        time.sleep(1)
+        return response
+
+    def deal_request(self, item):
+        retry_times = 0
+        org_item = copy.deepcopy(item)
+        while (retry_times := retry_times + 1) < 5:
+            try:
+                response = self.fetch_request(item)
+                res_code = response.status_code
+                if response and res_code == 200:
+                    self.detail_get(response, item=item)
+                    return True
+                else:
+                    time.sleep(3)
+            except Exception as e:
+                item = org_item
+                logger.exception(f"{item.get('competehref')} 采集异常:{e}")
+                time.sleep(random.randint(3, 6))
+        logger.warning(f"[采集失败]{item.get('competehref')}")
+        return False
+
+    def start(self, limit=1):
+        logger.debug("********** 详情页采集开始 **********")
+        time.sleep(30)
+        with self.db_name.find({"parser_name": "ztpc_htdz_cjgg", "is_crawl": False, "failed": False},
+                               sort=[('publishtime', -1)]).limit(limit) as cursor:
+            data_list = [dd for dd in cursor]
+        for item in data_list:
+            # logger.debug(item)
+            update_id = item["_id"]
+            result = self.deal_request(item)
+            if result is True:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"is_crawl": True}})
+            else:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"failed": True}})
+            time.sleep(random.randint(5, 10))
+
+        logger.debug("********** 详情页采集结束 **********")
+
+
+if __name__ == "__main__":
+    Details().start(limit=20)

+ 151 - 0
lzz_theme/htdzcgpt/htdz_cjgg_list.py

@@ -0,0 +1,151 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-13
+---------
+@summary: 航天电子采购平台
+---------
+@author: lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.RedisDB import RedisFilter
+from collections import namedtuple
+from utils.tools import *
+from parsel import Selector
+import requests
+import json
+import warnings
+
+warnings.simplefilter('ignore')
+
+
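+# stub mirroring the list page's onclick JS: eval'ing the attribute
+# (e.g. "viewDetail('<boutid>')") resolves to this function, which returns the id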
+def viewDetail(hid):
+    return hid
+
+
+class Crawl_Htdz:
+
+    def __init__(self):
+        self.py_spider = Mongo_client().py_spider
+        self.zb_list = self.py_spider.theme_list
+        self.RDS = RedisFilter()
+        self.real_cont = 0
+
+
+    def fetch_list_page(self, page, menu):
+        logger.debug(f' *** 开始采集第{page}页 ***')
+        headers = {
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "Content-Type": "application/x-www-form-urlencoded",
+            "Origin": "https://ws.ispacechina.com",
+            "Pragma": "no-cache",
+            "Referer": "https://ws.ispacechina.com/homeJjsellggList.do",
+            "Upgrade-Insecure-Requests": "1",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
+        }
+
+        url = "https://ws.ispacechina.com/homeJjsellggList.do"
+        data = {
+            "order": "",
+            "dxtype": "",
+            "dqname": "",
+            "priceType": "",
+            "showType": "1",
+            "fphm": "",
+            "title": "",
+            "rqStart": "",
+            "rqEnd": "",
+            "pageNumber": f"{page}",
+            "pageSize": "12",
+            "pageNumber1": "1",
+            "pageSize1": "12"
+        }
+        resp = requests.post(url, headers=headers, data=data, timeout=30, verify=False)
+        time.sleep(1)
+        return resp
+
+    def parser_list_page(self, response, page, menu):
+        results_list = []
+        info_list = Selector(response.text).xpath('//div[@class="htzszy_listmode mt16"][1]/div[@class="list_wrap htzszy_prolist rows"]/ul/li')
+        for info in info_list:
+            f_org = info.xpath('./a/@onclick').extract_first("").replace(';', '')
+            if not f_org:
+                continue
+            hid = eval(f_org)
+            href = f"https://ws.ispacechina.com/homeJjsellggInfo.do?boutid={hid}&isCjgg=1"
+            title = info.xpath('.//div[contains(@class,"proname")]/text()').extract_first("").strip()
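+            # publish date is derived from the bout id; note the hardcoded year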
+            create_time = f"2024-{hid[4:6]}-{hid[6:8]}"
+
+            dedup = [href]
+            if not self.RDS.data_filter(dedup):
+                item = {
+                    "site": "航天电子采购平台",
+                    "channel": menu.channel,
+                    "spidercode": menu.code,
+                    "area": "全国",
+                    "city": "",
+                    "district": "",
+                    "href": "#",
+                    "competehref": href,
+                    "title": title,
+                    "publishtime": create_time,
+                    "parse_url": href,
+                    "parser_name": "ztpc_htdz_cjgg",
+                    "is_mixed": False,
+                    "is_theme": True,
+                    "retry": 0,
+                    "comeintime": int2long(int(time.time())),
+                    "is_crawl": False,
+                    "failed": False,
+                }
+
+                self.zb_list.insert_one(item)
+                self.RDS.data_save_redis(dedup)
+                results_list.append(item)
+
+        logger.info(f' *** 第{page}页采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
+        self.real_cont += len(results_list)
+        logger.info(f"当前已采集 {self.real_cont} 条数据")
+        return results_list
+
+    def crawl_list_spider(self, page, menu):
+        retry_times = 0
+        while (retry_times := retry_times + 1) < 5:
+            try:
+                response = self.fetch_list_page(page=page, menu=menu)
+                res_code = response.status_code
+                logger.debug(f"第{page}页 状态码:{res_code}")
+                if response is not None and res_code == 200:
+                    self.parser_list_page(response=response, page=page, menu=menu)
+                    time.sleep(3)
+                    return
+                else:
+                    time.sleep(3)
+            except Exception as e:
+                logger.exception(f"第{page}页 采集异常:{e}")
+                time.sleep(6)
+
+    def start_list(self, menus):
+        logger.debug("********** 列表页开始 **********")
+        for menu in menus:
+            logger.debug(f"++++++ {menu.channel} 开始采集 ++++++")
+            page = menu.crawl_page
+            for page in range(1, page + 1):
+                self.crawl_list_spider(page=page, menu=menu)
+            logger.debug(f"------ {menu.channel} 采集结束 ------")
+
+        logger.debug("********** 列表页结束 **********")
+
+
+if __name__ == '__main__':
+    Menu = namedtuple('Menu', ['channel', 'code', 'crawl_page'])
+
+    menus = [
+        Menu('成交公告', 'a_htdzcgpt_cjgg', 1),
+    ]
+    Crawl_Htdz().start_list(menus)

+ 134 - 0
lzz_theme/htdzcgpt/htdz_jzxtp_details.py

@@ -0,0 +1,134 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-13
+---------
+@summary: 航天电子采购平台
+---------
+@author: lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.attachment import AttachmentDownloader
+from utils.tools import *
+from htdz_login import create_cookie
+from parsel import Selector
+import json
+import warnings
+import copy
+
+warnings.simplefilter('ignore')
+
+
+class Details:
+
+    def __init__(self):
+        self.db_table = Mongo_client().py_spider
+        self.db_name = self.db_table.theme_list
+        self.zt_details = self.db_table.data_bak
+
+
+    def get_cookies(self):
+        if not os.path.isfile('./htdz_ck.json'):
+            create_cookie()
+
+        with open('./htdz_ck.json', 'r', encoding='utf-8') as fr:
+            cks = fr.read()
+        ck = json.loads(cks.replace("'", '"'))
+        return ck
+
+    def detail_get(self, response, item):
+        if "您登录的账号没有[通用服务询价]场次的报价权限" in response.text:
+            logger.warning("权限不足!!!")
+            return
+
+        if "获取用户信息失败" in response.text or "授权信息错误" in response.text:
+            try:
+                os.remove('./htdz_ck.json')
+            except:
+                pass
+            raise CustomError("cookies 失效!重新生成...")
+
+        attachments = {}
+        dt = response.json().get('data')
+        if not item['publishtime']:
+            pbtime = dt.get('releaseDate')
+            item['publishtime'] = pbtime
+
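+        # the notice body appears to be served only as a generated PDF; download it
+        # as an attachment and leave a placeholder in contenthtml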
+        file_url = "https://resource.ispacechina.com/v1/fs/web/download/" + dt.get('pdfPath')
+        file_name = dt.get('pdfName') or item['title']
+        file_type = extract_file_type(file_name, file_url) or "pdf"
+
+        attachment = AttachmentDownloader().fetch_attachment(
+            file_name=file_name, file_type=file_type, download_url=file_url)
+        attachments[str(len(attachments) + 1)] = attachment
+        item['projectinfo'] = {"attachments": attachments}
+
+        item["contenthtml"] = "详情请访问原网页!"
+
+        item = format_fileds(item)
+
+        try:
+            self.zt_details.insert_one(item)
+            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
+        except DuplicateKeyError:
+            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
+
+    def fetch_request(self, item):
+        cookies = self.get_cookies()
+        headers = {
+            "Accept": "application/json, text/plain, */*",
+            "Accept-Language": "zh-CN,zh;q=0.9",
+            "Authorization": cookies.get('Market.ispacechina.com.access_token', ''),
+            "Connection": "keep-alive",
+            "Content-Type": "application/json;charset=utf-8",
+            "Origin": "https://jt.ispacechina.com",
+            "Referer": "https://jt.ispacechina.com/",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
+        }
+        params = item.get('params', None)
+        response = requests.get(url=item.get("parse_url"), params=params, headers=headers, timeout=30, verify=False)
+        time.sleep(1)
+        return response
+
+    def deal_request(self, item):
+        retry_times = 0
+        org_item = copy.deepcopy(item)
+        while (retry_times := retry_times + 1) < 5:
+            try:
+                response = self.fetch_request(item)
+                res_code = response.status_code
+                if response and res_code == 200:
+                    self.detail_get(response, item=item)
+                    return True
+                else:
+                    time.sleep(3)
+            except Exception as e:
+                item = org_item
+                logger.exception(f"{item.get('competehref')} 采集异常:{e}")
+                time.sleep(random.randint(3, 6))
+        logger.warning(f"[采集失败]{item.get('competehref')}")
+        return False
+
+    def start(self, limit=1):
+        logger.debug("********** 详情页采集开始 **********")
+        time.sleep(60)
+        with self.db_name.find({"parser_name": "ztpc_htdz_jzxtp", "is_crawl": False, "failed": False},
+                               sort=[('publishtime', -1)]).limit(limit) as cursor:
+            data_list = [dd for dd in cursor]
+        for item in data_list:
+            # logger.debug(item)
+            update_id = item["_id"]
+            result = self.deal_request(item)
+            if result is True:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"is_crawl": True}})
+            else:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"failed": True}})
+            time.sleep(random.randint(5, 10))
+
+        logger.debug("********** 详情页采集结束 **********")
+
+
+if __name__ == "__main__":
+    Details().start(limit=50)

+ 179 - 0
lzz_theme/htdzcgpt/htdz_jzxtp_list.py

@@ -0,0 +1,179 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-13
+---------
+@summary: 航天电子采购平台
+---------
+@author: lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.RedisDB import RedisFilter
+from collections import namedtuple
+from utils.tools import *
+from htdz_login import create_cookie
+from parsel import Selector
+import requests
+import json
+import warnings
+
+warnings.simplefilter('ignore')
+
+
+def openChilUrl(url):
+    return url
+
+
+class Crawl_Htdz:
+
+    def __init__(self):
+        self.py_spider = Mongo_client().py_spider
+        self.zb_list = self.py_spider.theme_list
+        self.RDS = RedisFilter()
+        self.real_cont = 0
+
+    def get_cookies(self):
+        if not os.path.isfile('./htdz_ck.json'):
+            create_cookie()
+
+        with open('./htdz_ck.json', 'r', encoding='utf-8') as fr:
+            cks = fr.read()
+        ck = json.loads(cks.replace("'", '"'))
+        return ck
+
+    def fetch_list_page(self, page, menu):
+        logger.debug(f' *** 开始采集第{page}页 ***')
+        headers = {
+            "Accept": "application/json, text/plain, */*",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Authorization": "74929d69-eca8-44d7-acfd-56f270d9d6bb",
+            "Connection": "keep-alive",
+            "Content-Type": "application/json;charset=UTF-8;",
+            "Origin": "https://jt.ispacechina.com",
+            "Referer": "https://jt.ispacechina.com/",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+        }
+
+        url = "https://msapi.ispacechina.com/htnegoweb/home/notice/search"
+        params = {
+            "ssid": "M80320",
+            "subsystem": "S00040",
+            "_t": f"{int(time.time()*1000)}"
+        }
+        data = {
+            "inSaleValid": 0,
+            "areaCode": "",
+            "cityCode": "",
+            "cityName": "",
+            "countyName": "",
+            "tradeName": "",
+            "detailedAddress": "",
+            "searchType": "",
+            "provinceList": "",
+            "projectType": "",
+            "noticeType": menu.tid,
+            "keyword": "",
+            "commodityName": "",
+            "purchaseAddress": "",
+            "releaseStartDate": "",
+            "releaseEndDate": "",
+            "releaseDateSort": "",
+            "page": page,
+            "size": 10
+        }
+        data = json.dumps(data, separators=(',', ':'))
+        resp = requests.post(url, headers=headers, data=data, params=params, timeout=30, verify=False)
+        time.sleep(1)
+        return resp
+
+    def parser_list_page(self, response, page, menu):
+        results_list = []
+        info_list = response.json().get('data').get('list')
+        for info in info_list:
+            hid = info.get('bizId')
+            tp = info.get('noticeType')
+            href = f"https://jt.ispacechina.com/negoweb/noticeinfo?id={hid}&type={tp}"
+            title = info.get('projectName').strip()
+            create_time = info.get('pushdate')
+
+            dedup = [href]
+            if not self.RDS.data_filter(dedup):
+                dparams = {
+                    "noticeType": f"{tp}",
+                    "bizId": f"{hid}",
+                    "ssid": "M80320",
+                    "subsystem": "S00040",
+                }
+                item = {
+                    "site": "航天电子采购平台",
+                    "channel": menu.channel,
+                    "spidercode": menu.code,
+                    "area": "全国",
+                    "city": "",
+                    "district": "",
+                    "href": "#",
+                    "competehref": href,
+                    "title": title,
+                    "publishtime": create_time,
+                    "params": dparams,
+                    "parse_url": "https://msapi.ispacechina.com/htnegoweb/home/notice/detail",
+                    "parser_name": "ztpc_htdz_jzxtp",
+                    "is_mixed": False,
+                    "is_theme": True,
+                    "retry": 0,
+                    "comeintime": int2long(int(time.time())),
+                    "is_crawl": False,
+                    "failed": False,
+                }
+
+                self.zb_list.insert_one(item)
+                self.RDS.data_save_redis(dedup)
+                results_list.append(item)
+
+        logger.info(f' *** 第{page}页采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
+        self.real_cont += len(results_list)
+        logger.info(f"当前已采集 {self.real_cont} 条数据")
+        return results_list
+
+    def crawl_list_spider(self, page, menu):
+        retry_times = 0
+        while (retry_times := retry_times + 1) < 5:
+            try:
+                response = self.fetch_list_page(page=page, menu=menu)
+                res_code = response.status_code
+                logger.debug(f"第{page}页 状态码:{res_code}")
+                if response is not None and res_code == 200:
+                    self.parser_list_page(response=response, page=page, menu=menu)
+                    time.sleep(3)
+                    return
+                else:
+                    time.sleep(3)
+            except Exception as e:
+                logger.exception(f"第{page}页 采集异常:{e}")
+                time.sleep(6)
+
+    def start_list(self, menus):
+        logger.debug("********** 列表页开始 **********")
+        for menu in menus:
+            logger.debug(f"++++++ {menu.channel} 开始采集 ++++++")
+            page = menu.crawl_page
+            for page in range(1, page + 1):
+                self.crawl_list_spider(page=page, menu=menu)
+            logger.debug(f"------ {menu.channel} 采集结束 ------")
+
+        logger.debug("********** 列表页结束 **********")
+
+
+if __name__ == '__main__':
+    Menu = namedtuple('Menu', ['channel', 'code', 'tid', 'crawl_page'])
+
+    menus = [
+        Menu('采购公告', 'a_htdzcgpt_cggg', 1, 1),
+        Menu('变更公告', 'a_htdzcgpt_bggg', 2, 1),
+        Menu('预成交结果公告', 'a_htdzcgpt_ycjjggg', 4, 1),
+        Menu('成交结果公告', 'a_htdzcgpt_cjjggg', 5, 1),
+        Menu('终止公告', 'a_htdzcgpt_zhozgg', 5, 1),
+    ]
+    Crawl_Htdz().start_list(menus)

+ 111 - 0
lzz_theme/htdzcgpt/htdz_login.py

@@ -0,0 +1,111 @@
+# -*- coding: utf-8 -*-
+import json
+import time
+
+import requests
+from hashlib import md5
+from loguru import logger
+
+from utils.get_imgcode import chaojiying_platform
+
+
+def create_cookie(username="jianyu2022", password="jY@123456"):
+    time.sleep(3)
+    session = requests.session()
+
+    iheaders = {
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+        "Cache-Control": "no-cache",
+        "Connection": "keep-alive",
+        "Pragma": "no-cache",
+        "Upgrade-Insecure-Requests": "1",
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
+    }
+    iurl = "https://bs.ispacechina.com/login/unifyLogin.do?title=%E7%AB%9E%E4%BA%89%E6%80%A7%E8%B0%88%E5%88%A4%EF%BC%88%E7%A3%8B%E5%95%86%EF%BC%89%E7%99%BB%E5%BD%95&url=https://jt.ispacechina.com/negoweb/"
+    session.get(iurl, headers=iheaders)
+
+    rheaders = {
+        "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
+        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+        "Cache-Control": "no-cache",
+        "Connection": "keep-alive",
+        "Pragma": "no-cache",
+        "Referer": iurl,
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
+    }
+
+    yzm_url = "https://bs.ispacechina.com/authimg.img"
+
+    res = session.get(yzm_url, headers=rheaders)
+    # with open('./image.jpg', 'wb+') as f:
+    #     f.write(res.content)
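+    # solve the captcha via the Chaojiying service (index 0 presumably holds the text)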
+    rcode = chaojiying_platform(res.content, pic_type=6001)[0]
+
+    rrheaders = {
+        "Accept": "text/plain, */*; q=0.01",
+        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+        "Cache-Control": "no-cache",
+        "Connection": "keep-alive",
+        "Content-Length": "0",
+        "Origin": "https://bs.ispacechina.com",
+        "Pragma": "no-cache",
+        "Referer": iurl,
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
+        "X-Requested-With": "XMLHttpRequest",
+    }
+
+    rurl = "https://bs.ispacechina.com/market/syscode/publicfun/getPwdRandom.do"
+    rrr = session.post(rurl, headers=rrheaders)
+    randomm = str(rrr.text)
+
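+    # the login form appears to expect MD5(MD5(password).upper() + random).upper(),
+    # with the salt fetched from getPwdRandom.do above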
+    def md5value(s: str, randomm: str):
+        sm1 = md5(s.encode()).hexdigest().upper()
+        sm2 = md5((sm1 + randomm).encode()).hexdigest().upper()
+        return sm2
+
+    url = "https://bs.ispacechina.com/login/doUnifyLogin.do"
+    headers = {
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+        "Cache-Control": "no-cache",
+        "Connection": "keep-alive",
+        "Content-Type": "application/x-www-form-urlencoded",
+        "Origin": "https://bs.ispacechina.com",
+        "Pragma": "no-cache",
+        "Referer": iurl,
+        "Upgrade-Insecure-Requests": "1",
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
+    }
+    retry = 0
+    cookies = {}
+    while retry < 5:
+        data = {
+            "title": "竞争性谈判(磋商)登录",
+            "type": "1",
+            "dltyp": "2",
+            "csstype": "1",
+            "logsid": "unifyLogin",
+            "pwd": md5value(password, randomm),
+            "url": "https://jt.ispacechina.com/negoweb/",
+            "defaultUrl": "https://www.ispacechina.com",
+            "isBind": "",
+            "uid": f"{username}",
+            "kl": "",
+            "randCode": f"{rcode}",
+            "openid": "",
+            "uid2": "",
+            "k2": "",
+            "sealdata": ""
+        }
+        response = session.post(url, headers=headers, data=data)
+        cookies = session.cookies.get_dict()
+
+        if "验证码错误,请重试" not in response.text:
+            with open(f'./htdz_ck.json', 'w', encoding='utf-8') as fw:
+                fw.write(json.dumps(cookies))
+            logger.success(f"{username} 登录成功!")
+            break
+        retry += 1
+        time.sleep(5)
+
+    return cookies

+ 139 - 0
lzz_theme/htdzcgpt/htdz_qtcg_list.py

@@ -0,0 +1,139 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-10
+---------
+@summary: 航天电子采购平台
+---------
+@author: lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.RedisDB import RedisFilter
+from collections import namedtuple
+from utils.tools import *
+from htdz_login import create_cookie
+from parsel import Selector
+import requests
+import json
+import warnings
+
+warnings.simplefilter('ignore')
+
+
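+# stub matching the onclick JS in the list page, so eval("openChilUrl('<url>')")
+# resolves here and simply returns the embedded URL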
+def openChilUrl(url):
+    return url
+
+
+class Crawl_Htdz:
+
+    def __init__(self):
+        self.py_spider = Mongo_client().py_spider
+        self.zb_list = self.py_spider.theme_list
+        self.RDS = RedisFilter()
+        self.real_cont = 0
+
+    def get_cookies(self):
+        if not os.path.isfile('./htdz_ck.json'):
+            create_cookie()
+
+        with open('./htdz_ck.json', 'r', encoding='utf-8') as fr:
+            cks = fr.read()
+        ck = json.loads(cks.replace("'", '"'))
+        return ck
+
+    def fetch_list_page(self, page, menu):
+        logger.debug(f' *** 开始采集第{page}页 ***')
+        headers = {
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Connection": "keep-alive",
+            "Referer": "https://www.ispacechina.com/zxdt/qtcg/index.htm",
+            "Upgrade-Insecure-Requests": "1",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+        }
+
+        url = f"https://www.ispacechina.com/zxdt/{menu.tid}/index_1.htm"
+
+        resp = requests.get(url, headers=headers, timeout=30, verify=False)
+        time.sleep(1)
+        return resp
+
+    def parser_list_page(self, response, page, menu):
+        results_list = []
+        info_list = Selector(response.text).xpath('//ul[@class="dynamiclist"]/li')
+        for info in info_list:
+            f_org = info.xpath('./div/h6/a/@onclick').extract_first("").replace(';', '')
+            if not f_org:
+                continue
+            href = eval(f_org)
+            title = info.xpath('./div/h6/a/text()').extract_first("").strip()
+            create_time = info.xpath('./div/span/text()').extract_first("")
+
+            dedup = [href]
+            if not self.RDS.data_filter(dedup):
+                item = {
+                    "site": "航天电子采购平台",
+                    "channel": menu.channel,
+                    "spidercode": menu.code,
+                    "area": "全国",
+                    "city": "",
+                    "district": "",
+                    "href": "#",
+                    "competehref": href,
+                    "title": title,
+                    "publishtime": create_time,
+                    "parse_url": href,
+                    "parser_name": "ztpc_htdz_zbhxrgs",
+                    "is_mixed": False,
+                    "is_theme": True,
+                    "retry": 0,
+                    "comeintime": int2long(int(time.time())),
+                    "is_crawl": False,
+                    "failed": False,
+                }
+
+                self.zb_list.insert_one(item)
+                self.RDS.data_save_redis(dedup)
+                results_list.append(item)
+
+        logger.info(f' *** 第{page}页采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
+        self.real_cont += len(results_list)
+        logger.info(f"当前已采集 {self.real_cont} 条数据")
+        return results_list
+
+    def crawl_list_spider(self, page, menu):
+        retry_times = 0
+        while (retry_times := retry_times + 1) < 5:
+            try:
+                response = self.fetch_list_page(page=page, menu=menu)
+                res_code = response.status_code
+                logger.debug(f"第{page}页 状态码:{res_code}")
+                if response is not None and res_code == 200:
+                    self.parser_list_page(response=response, page=page, menu=menu)
+                    time.sleep(3)
+                    return
+                else:
+                    time.sleep(3)
+            except Exception as e:
+                logger.exception(f"第{page}页 采集异常:{e}")
+                time.sleep(6)
+
+    def start_list(self, menus):
+        logger.debug("********** 列表页开始 **********")
+        for menu in menus:
+            logger.debug(f"++++++ {menu.channel} 开始采集 ++++++")
+            page = menu.crawl_page
+            for page in range(1, page + 1):
+                self.crawl_list_spider(page=page, menu=menu)
+            logger.debug(f"------ {menu.channel} 采集结束 ------")
+
+        logger.debug("********** 列表页结束 **********")
+
+
+if __name__ == '__main__':
+    Menu = namedtuple('Menu', ['channel', 'code', 'tid', 'crawl_page'])
+
+    menus = [
+        Menu('国际招标公告', 'a_htdzcgpt_gjzbgg', 'gjzbgg', 1),
+        Menu('其他采购', 'a_htdzcgpt_qtcg', 'qtcg', 1),
+    ]
+    Crawl_Htdz().start_list(menus)

+ 133 - 0
lzz_theme/htdzcgpt/htdz_xjgg_details.py

@@ -0,0 +1,133 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-13
+---------
+@summary: 航天电子采购平台
+---------
+@author: lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.attachment import AttachmentDownloader
+from utils.tools import *
+from htdz_login import create_cookie
+from parsel import Selector
+import json
+import warnings
+import copy
+
+warnings.simplefilter('ignore')
+
+
+class Details:
+
+    def __init__(self):
+        self.db_table = Mongo_client().py_spider
+        self.db_name = self.db_table.theme_list
+        self.zt_details = self.db_table.data_bak
+        self.headers = {
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Cache-Control": "max-age=0",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+        }
+
+
+    def get_cookies(self):
+        if not os.path.isfile('./htdz_ck.json'):
+            create_cookie()
+
+        with open('./htdz_ck.json', 'r', encoding='utf-8') as fr:
+            cks = fr.read()
+        ck = json.loads(cks.replace("'", '"'))
+        return ck
+
+    def detail_get(self, response, item):
+        if "您登录的账号没有[通用服务询价]场次的报价权限" in response.text:
+            logger.warning("权限不足!!!")
+            return
+
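+        # heuristic: a logged-in page carries the site vendor's name in its footer;
+        # its absence suggests the session cookie has expired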
+        if "合肥拓普网络" not in response.text :
+            try:
+                os.remove('./htdz_ck.json')
+            except:
+                pass
+            raise CustomError("cookies 失效!重新生成...")
+
+
+        f_url = Selector(response.text).xpath('//iframe/@src').extract_first("")
+
+        attachments = {}
+
+        file_url = "".join(re.findall('file=(.*)', f_url))
+        file_name = item['title']
+        file_type = extract_file_type(file_url=file_url) or "pdf"
+
+        if file_type and "http" in file_url:
+            attachment = AttachmentDownloader().fetch_attachment(
+                file_name=file_name, file_type=file_type, download_url=file_url)
+            attachments[str(len(attachments) + 1)] = attachment
+        if attachments:
+            item['projectinfo'] = {"attachments": attachments}
+
+        item["contenthtml"] = "详情请访问原网页!"
+
+        item = format_fileds(item)
+
+        try:
+            self.zt_details.insert_one(item)
+            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
+        except DuplicateKeyError:
+            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
+
+    def fetch_request(self, item):
+        cookies = self.get_cookies()
+        response = requests.get(url=item.get("parse_url"), headers=self.headers,
+                                cookies=cookies, timeout=30, verify=False)
+        time.sleep(1)
+        return response
+
+    def deal_request(self, item):
+        retry_times = 0
+        org_item = copy.deepcopy(item)
+        while (retry_times := retry_times + 1) < 5:
+            try:
+                response = self.fetch_request(item)
+                res_code = response.status_code
+                if response and res_code == 200:
+                    self.detail_get(response, item=item)
+                    return True
+                else:
+                    time.sleep(3)
+            except Exception as e:
+                item = org_item
+                logger.exception(f"{item.get('competehref')} 采集异常:{e}")
+                time.sleep(random.randint(3, 6))
+        logger.warning(f"[采集失败]{item.get('competehref')}")
+        return False
+
+    def start(self, limit=1):
+        logger.debug("********** 详情页采集开始 **********")
+        time.sleep(30)
+        with self.db_name.find({"parser_name": "ztpc_htdz_xjgg", "is_crawl": False, "failed": False},
+                               sort=[('publishtime', -1)]).limit(limit) as cursor:
+            data_list = [dd for dd in cursor]
+        for item in data_list:
+            # logger.debug(item)
+            update_id = item["_id"]
+            result = self.deal_request(item)
+            if result is True:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"is_crawl": True}})
+            else:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"failed": True}})
+            time.sleep(random.randint(5, 10))
+
+        logger.debug("********** 详情页采集结束 **********")
+
+
+if __name__ == "__main__":
+    Details().start(limit=50)

+ 174 - 0
lzz_theme/htdzcgpt/htdz_xjgg_list.py

@@ -0,0 +1,174 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-13
+---------
+@summary: 航天电子采购平台
+---------
+@author: lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.RedisDB import RedisFilter
+from collections import namedtuple
+from utils.tools import *
+from htdz_login import create_cookie
+from parsel import Selector
+import requests
+import json
+import warnings
+
+warnings.simplefilter('ignore')
+
+
+def openChilUrl(url):
+    return url
+
+
+class Crawl_Htdz:
+
+    def __init__(self):
+        self.py_spider = Mongo_client().py_spider
+        self.zb_list = self.py_spider.theme_list
+        self.RDS = RedisFilter()
+        self.real_cont = 0
+
+    def get_cookies(self):
+        if not os.path.isfile('./htdz_ck.json'):
+            create_cookie()
+
+        with open('./htdz_ck.json', 'r', encoding='utf-8') as fr:
+            cks = fr.read()
+        ck = json.loads(cks.replace("'", '"'))
+        return ck
+
+    def fetch_list_page(self, page, menu):
+        logger.debug(f' *** 开始采集第{page}页 ***')
+        headers = {
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Cache-Control": "max-age=0",
+            "Connection": "keep-alive",
+            "Content-Type": "application/x-www-form-urlencoded",
+            "Origin": "https://td.ispacechina.com",
+            "Referer": "https://td.ispacechina.com/xjList.do",
+            "Upgrade-Insecure-Requests": "1",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+        }
+
+        url = "https://td.ispacechina.com/xjList.do"
+        data = {
+            "typ": "1",
+            "flname01": "",
+            "days": "2",
+            "industryCode": "",
+            "dqcode": "",
+            "dqname": "",
+            "order": "1",
+            "procureType": "",
+            "title": "",
+            "statusStr": "2",
+            "fphmStr": "",
+            "type": "",
+            "rqStart": "",
+            "rqStart2": "",
+            "rqEnd": "",
+            "rqEnd2": "",
+            "word": "",
+            "wordType": "all",
+            "hyname": "",
+            "productTypeStr": "3",
+            "compareTypeStr": "2",
+            "pageNumber": f"{page}",
+            "pageSize": "10",
+            "sortColumns": "undefined"
+        }
+        cookies = self.get_cookies()
+        resp = requests.post(url, headers=headers, data=data, cookies=cookies, timeout=30, verify=False)
+        time.sleep(1)
+        return resp
+
+    def parser_list_page(self, response, page, menu):
+        results_list = []
+        info_list = Selector(response.text).xpath('//div[@class="menu_content_layer6 menu_content_layer11"]/table/tr')
+        for info in info_list[1:]:
+            fphm = info.xpath('./td[2]/@title').extract_first("")
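+            # a masked number ("***") means the cached session has expired and must be regenerated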
+            if "***" in fphm:
+                try:
+                    os.remove('./htdz_ck.json')
+                except OSError:
+                    pass
+                raise CustomError("cookies 失效!重新生成...")
+            href = f"https://td.ispacechina.com/xjInfo.do?fphm={fphm}&typ=1"
+            title = fphm + "_" + info.xpath('./td[3]/@title').extract_first("")
+            create_time = info.xpath('./td[last()-1]/@title').extract_first("").split("至")[0] + ":00"
+
+            dedup = [href]
+            if not self.RDS.data_filter(dedup):
+                item = {
+                    "site": "航天电子采购平台",
+                    "channel": menu.channel,
+                    "spidercode": menu.code,
+                    "area": "全国",
+                    "city": "",
+                    "district": "",
+                    "href": "#",
+                    "competehref": href,
+                    "title": title,
+                    "publishtime": create_time,
+                    "parse_url": href,
+                    "parser_name": "ztpc_htdz_xjgg",
+                    "is_mixed": False,
+                    "is_theme": True,
+                    "retry": 0,
+                    "comeintime": int2long(int(time.time())),
+                    "is_crawl": False,
+                    "failed": False,
+                }
+
+                self.zb_list.insert_one(item)
+                self.RDS.data_save_redis(dedup)
+                results_list.append(item)
+
+        logger.info(f' *** 第{page}页采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
+        self.real_cont += len(results_list)
+        logger.info(f"当前已采集 {self.real_cont} 条数据")
+        return results_list
+
+    def crawl_list_spider(self, page, menu):
+        retry_times = 0
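+        # walrus loop: retry_times is incremented before each check, so at most 4 attempts are made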
+        while (retry_times := retry_times + 1) < 5:
+            try:
+                response = self.fetch_list_page(page=page, menu=menu)
+                res_code = response.status_code
+                logger.debug(f"第{page}页 状态码:{res_code}")
+                if response is not None and res_code == 200:
+                    self.parser_list_page(response=response, page=page, menu=menu)
+                    time.sleep(3)
+                    return
+                else:
+                    time.sleep(3)
+            except Exception as e:
+                logger.exception(f"第{page}页 采集异常:{e}")
+                time.sleep(6)
+
+    def start_list(self, menus):
+        logger.debug("********** 列表页开始 **********")
+        for menu in menus:
+            logger.debug(f"++++++ {menu.channel} 开始采集 ++++++")
+            max_page = menu.crawl_page
+            for page in range(1, max_page + 1):
+                self.crawl_list_spider(page=page, menu=menu)
+            logger.debug(f"------ {menu.channel} 采集结束 ------")
+
+        logger.debug("********** 列表页结束 **********")
+
+
+if __name__ == '__main__':
+    Menu = namedtuple('Menu', ['channel', 'code', 'crawl_page'])
+
+    menus = [
+        Menu('询价公告', 'a_htdzcgpt_xjgg', 3),
+    ]
+    Crawl_Htdz().start_list(menus)

+ 149 - 0
lzz_theme/htdzcgpt/htdz_zbgg_details.py

@@ -0,0 +1,149 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-13
+---------
+@summary: 航天电子采购平台
+---------
+@author: lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.attachment import AttachmentDownloader
+from utils.tools import *
+from htdz_login import create_cookie
+from parsel import Selector
+import json
+import warnings
+import copy
+
+warnings.simplefilter('ignore')
+
+
+class Details:
+
+    def __init__(self):
+        self.db_table = Mongo_client().py_spider
+        self.db_name = self.db_table.theme_list
+        self.zt_details = self.db_table.data_bak
+        self.headers = {
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Cache-Control": "max-age=0",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+        }
+
+
+    def get_cookies(self):
+        if not os.path.isfile('./htdz_ck.json'):
+            create_cookie()
+
+        with open('./htdz_ck.json', 'r', encoding='utf-8') as fr:
+            cks = fr.read()
+        ck = json.loads(cks.replace("'", '"'))
+        return ck
+
+    def detail_get(self, response, item):
+        if "您登录的账号没有[通用服务询价]场次的报价权限" in response.text:
+            logger.warning("权限不足!!!")
+            return
+
+        if "合肥拓普网络" not in response.text :
+            try:
+                os.remove('./htdz_ck.json')
+            except OSError:
+                pass
+            raise CustomError("cookies 失效!重新生成...")
+        root = Selector(response.text)
+        html = root.xpath('//div[@class="detailmain"][last()]').extract_first()
+
+        s_pub = root.xpath('//div[@class="detailhead"]/div[last()]/p[1]/text()').extract_first("").strip()
+        pub_time = "".join(re.findall(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', s_pub))
+        item['publishtime'] = pub_time
+
+        if not html:
+            raise ValueError("空正文!!!")
+
+        rm_list = ['//div[@class="buttonbox txtc"]', '//h6[contains(text(),"发布公告的媒介")]/..']
+
+        html = remove_htmldata(rm_list, html, root)
+
+        item['contenthtml'] = html
+
+        file_info = root.xpath('//form[@id="gform"]//a[@href]')
+
+        if file_info:
+            attachments = {}
+            for info in file_info:
+                file_url = info.xpath('./@href').extract_first("")
+                file_name = "".join(info.xpath('.//text()').extract()).strip()
+                file_type = extract_file_type(file_name, file_url)
+
+                if file_type and "http" in file_url:
+                    attachment = AttachmentDownloader().fetch_attachment(
+                        file_name=file_name, file_type=file_type, download_url=file_url)
+                    attachments[str(len(attachments) + 1)] = attachment
+            if attachments:
+                item['projectinfo'] = {"attachments": attachments}
+
+        item = format_fileds(item)
+
+        try:
+            self.zt_details.insert_one(item)
+            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
+        except DuplicateKeyError:
+            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
+
+    def fetch_request(self, item):
+        cookies = self.get_cookies()
+        response = requests.get(url=item.get("parse_url"), headers=self.headers,
+                                cookies=cookies, timeout=30, verify=False)
+        time.sleep(1)
+        return response
+
+    def deal_request(self, item):
+        retry_times = 0
+        org_item = copy.deepcopy(item)
+        while (retry_times := retry_times + 1) < 5:
+            try:
+                response = self.fetch_request(item)
+                res_code = response.status_code
+                if response and res_code == 200:
+                    self.detail_get(response, item=item)
+                    return True
+                else:
+                    time.sleep(3)
+            except Exception as e:
+                item = org_item
+                logger.exception(f"{item.get('competehref')} 采集异常:{e}")
+                time.sleep(random.randint(3, 6))
+        logger.warning(f"[采集失败]{item.get('competehref')}")
+        return False
+
+    def start(self, limit=1):
+        logger.debug("********** 详情页采集开始 **********")
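+        # fixed start-up delay, presumably so the list crawlers can queue fresh rows first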
+        time.sleep(30)
+        with self.db_name.find({"parser_name": "ztpc_htdz_zbgg", "is_crawl": False, "failed": False},
+                               sort=[('publishtime', -1)]).limit(limit) as cursor:
+            data_list = [dd for dd in cursor]
+        for item in data_list:
+            # logger.debug(item)
+            update_id = item["_id"]
+            result = self.deal_request(item)
+            if result is True:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"is_crawl": True}})
+            else:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"failed": True}})
+            time.sleep(random.randint(5, 10))
+
+        logger.debug("********** 详情页采集结束 **********")
+
+
+if __name__ == "__main__":
+    Details().start(limit=50)

+ 153 - 0
lzz_theme/htdzcgpt/htdz_zbgg_list.py

@@ -0,0 +1,153 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-13
+---------
+@summary: 航天电子采购平台
+---------
+@author: lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.RedisDB import RedisFilter
+from collections import namedtuple
+from utils.tools import *
+from htdz_login import create_cookie
+from parsel import Selector
+import requests
+import json
+import warnings
+
+warnings.simplefilter('ignore')
+
+
+def openChilUrl(url):
+    return url
+
+
+class Crawl_Htdz:
+
+    def __init__(self):
+        self.py_spider = Mongo_client().py_spider
+        self.zb_list = self.py_spider.theme_list
+        self.RDS = RedisFilter()
+        self.real_cont = 0
+
+    def get_cookies(self):
+        if not os.path.isfile('./htdz_ck.json'):
+            create_cookie()
+
+        with open('./htdz_ck.json', 'r', encoding='utf-8') as fr:
+            cks = fr.read()
+        ck = json.loads(cks.replace("'", '"'))
+        return ck
+
+    def fetch_list_page(self, page, menu):
+        logger.debug(f' *** 开始采集第{page}页 ***')
+        headers = {
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Cache-Control": "max-age=0",
+            "Connection": "keep-alive",
+            "Content-Type": "application/x-www-form-urlencoded",
+            "Origin": "https://bd.ispacechina.com",
+            "Referer": "https://bd.ispacechina.com/exp/bidding/sell/signup/indexHt.do",
+            "Upgrade-Insecure-Requests": "1",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+        }
+
+        url = "https://bd.ispacechina.com/exp/bidding/sell/signup/indexHt.do"
+        data = {
+            "probid": "",
+            "typ": menu.tid,
+            "subpFl": "",
+            "ggnr": "",
+            "timetype": "",
+            "ggyxq_time": "",
+            "pageNumber": f"{page}",
+            "pageSize": "10",
+            "sortColumns": "undefined"
+        }
+        resp = requests.post(url, headers=headers, data=data, timeout=30, verify=False)
+        time.sleep(1)
+        return resp
+
+    def parser_list_page(self, response, page, menu):
+        results_list = []
+        info_list = Selector(response.text).xpath('//ul[@class="htcg_quailist"]/li')
+        for info in info_list:
+            hid = info.xpath('./div[1]//a[@class="tozbgginfo"]/@probid').extract_first()
+            href = f"https://bd.ispacechina.com/exp/bidding/sell/signup/toZbggInfoHt.do?probid={hid}"
+            title = info.xpath('./div[2]/h5/text()').extract_first("").strip()
+            create_time = ""
+
+            dedup = [href]
+            if not self.RDS.data_filter(dedup):
+                item = {
+                    "site": "航天电子采购平台",
+                    "channel": menu.channel,
+                    "spidercode": menu.code,
+                    "area": "全国",
+                    "city": "",
+                    "district": "",
+                    "href": "#",
+                    "competehref": href,
+                    "title": title,
+                    "publishtime": create_time,
+                    "parse_url": href,
+                    "parser_name": "ztpc_htdz_zbgg",
+                    "is_mixed": False,
+                    "is_theme": True,
+                    "retry": 0,
+                    "comeintime": int2long(int(time.time())),
+                    "is_crawl": False,
+                    "failed": False,
+                }
+
+                self.zb_list.insert_one(item)
+                self.RDS.data_save_redis(dedup)
+                results_list.append(item)
+
+        logger.info(f' *** 第{page}页采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
+        self.real_cont += len(results_list)
+        logger.info(f"当前已采集 {self.real_cont} 条数据")
+        return results_list
+
+    def crawl_list_spider(self, page, menu):
+        retry_times = 0
+        while (retry_times := retry_times + 1) < 5:
+            try:
+                response = self.fetch_list_page(page=page, menu=menu)
+                res_code = response.status_code
+                logger.debug(f"第{page}页 状态码:{res_code}")
+                if response is not None and res_code == 200:
+                    self.parser_list_page(response=response, page=page, menu=menu)
+                    time.sleep(3)
+                    return
+                else:
+                    time.sleep(3)
+            except Exception as e:
+                logger.exception(f"第{page}页 采集异常:{e}")
+                time.sleep(6)
+
+    def start_list(self, menus):
+        logger.debug("********** 列表页开始 **********")
+        for menu in menus:
+            logger.debug(f"++++++ {menu.channel} 开始采集 ++++++")
+            max_page = menu.crawl_page
+            for page in range(1, max_page + 1):
+                self.crawl_list_spider(page=page, menu=menu)
+            logger.debug(f"------ {menu.channel} 采集结束 ------")
+
+        logger.debug("********** 列表页结束 **********")
+
+
+if __name__ == '__main__':
+    Menu = namedtuple('Menu', ['channel', 'code', 'tid', 'crawl_page'])
+
+    menus = [
+        Menu('招标公告', 'a_htdzcgpt_zbgg', '1', 1),
+        Menu('资格预审公告', 'a_htdzcgpt_zgysgg', '2', 1),
+    ]
+    Crawl_Htdz().start_list(menus)

+ 153 - 0
lzz_theme/htdzcgpt/htdz_zbhxrgs_details.py

@@ -0,0 +1,153 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-10
+---------
+@summary: 航天电子采购平台
+---------
+@author: lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.attachment import AttachmentDownloader
+from utils.tools import *
+from htdz_login import create_cookie
+from parsel import Selector
+import json
+import warnings
+import copy
+
+warnings.simplefilter('ignore')
+
+
+class Details:
+
+    def __init__(self):
+        self.db_table = Mongo_client().py_spider
+        self.db_name = self.db_table.theme_list
+        self.zt_details = self.db_table.data_bak
+        self.headers = {
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Cache-Control": "max-age=0",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+        }
+
+
+    def get_cookies(self):
+        if not os.path.isfile('./htdz_ck.json'):
+            create_cookie()
+
+        with open('./htdz_ck.json', 'r', encoding='utf-8') as fr:
+            cks = fr.read()
+        ck = json.loads(cks.replace("'", '"'))
+        return ck
+
+    def detail_get(self, response, item):
+        if "您登录的账号没有[通用服务询价]场次的报价权限" in response.text:
+            logger.warning("权限不足!!!")
+            return
+
+        root = Selector(response.text)
+        xpath_list = ['//div[@class="mainlist"]', '//div[@id="ggInfo02"]', '//div[@id="contentSize"]']
+        html = ""
+        for xpath in xpath_list:
+            html = root.xpath(xpath).extract_first()
+            if html:
+                break
+
+        if "合肥拓普网络" not in response.text and not html:
+            try:
+                os.remove('./htdz_ck.json')
+            except OSError:
+                pass
+            raise CustomError("cookies 失效!重新生成...")
+        elif "合肥拓普网络" in response.text and not html:
+            logger.warning("空正文!!!")
+            return
+
+        s_title = root.xpath('//div[@class="detailhead"]/h4[@class="txtc"]/text()').extract_first("").strip()
+        if s_title:
+            item['s_title'] = s_title
+
+        f_list = ['//div[@class="mainlist"]//a[@href]', '//div[@id="ggInfo02"]//a[@href]']
+        file_info = None
+        for fl in f_list:
+            file_info = root.xpath(fl)
+            if file_info:
+                break
+
+        if file_info:
+            attachments = {}
+            for info in file_info:
+                file_url = info.xpath('./@href').extract_first("")
+                file_name = "".join(info.xpath('.//text()').extract()).strip()
+                file_type = extract_file_type(file_name, file_url)
+
+                if file_type and "http" in file_url:
+                    attachment = AttachmentDownloader().fetch_attachment(
+                        file_name=file_name, file_type=file_type, download_url=file_url)
+                    attachments[str(len(attachments) + 1)] = attachment
+            if attachments:
+                item['projectinfo'] = {"attachments": attachments}
+
+        item["contenthtml"] = html
+
+        item = format_fileds(item)
+
+        try:
+            self.zt_details.insert_one(item)
+            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
+        except DuplicateKeyError:
+            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
+
+    def fetch_request(self, item):
+        cookies = self.get_cookies()
+        response = requests.get(url=item.get("parse_url"), headers=self.headers,
+                                cookies=cookies, timeout=30, verify=False)
+        time.sleep(1)
+        return response
+
+    def deal_request(self, item):
+        retry_times = 0
+        org_item = copy.deepcopy(item)
+        while (retry_times := retry_times + 1) < 5:
+            try:
+                response = self.fetch_request(item)
+                res_code = response.status_code
+                if response and res_code == 200:
+                    self.detail_get(response, item=item)
+                    return True
+                else:
+                    time.sleep(3)
+            except Exception as e:
+                item = org_item
+                logger.exception(f"{item.get('competehref')} 采集异常:{e}")
+                time.sleep(random.randint(3, 6))
+        logger.warning(f"[采集失败]{item.get('competehref')}")
+        return False
+
+    def start(self, limit=1):
+        logger.debug("********** 详情页采集开始 **********")
+        time.sleep(60)
+        with self.db_name.find({"parser_name": "ztpc_htdz_zbhxrgs", "is_crawl": False, "failed": False},
+                               sort=[('publishtime', -1)]).limit(limit) as cursor:
+            data_list = [dd for dd in cursor]
+        for item in data_list:
+            # logger.debug(item)
+            update_id = item["_id"]
+            result = self.deal_request(item)
+            if result is True:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"is_crawl": True}})
+            else:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"failed": True}})
+            time.sleep(random.randint(5, 10))
+
+        logger.debug("********** 详情页采集结束 **********")
+
+
+if __name__ == "__main__":
+    Details().start(limit=50)

+ 158 - 0
lzz_theme/htdzcgpt/htdz_zbhxrgs_list.py

@@ -0,0 +1,158 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-10
+---------
+@summary: 航天电子采购平台
+---------
+@author: lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.RedisDB import RedisFilter
+from collections import namedtuple
+from utils.tools import *
+from parsel import Selector
+import requests
+import warnings
+
+warnings.simplefilter('ignore')
+
+
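+# Stub mirroring the site's JS onclick helper so the scraped onclick attribute can be eval()'d below.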
+def to_url(url, x, y):
+    return url
+
+
+class Crawl_Htdz:
+
+    def __init__(self):
+        self.py_spider = Mongo_client().py_spider
+        self.zb_list = self.py_spider.theme_list
+        self.RDS = RedisFilter()
+        self.real_cont = 0
+
+    def fetch_list_page(self, page, menu):
+        logger.debug(f' *** 开始采集第{page}页 ***')
+        headers = {
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Cache-Control": "max-age=0",
+            "Connection": "keep-alive",
+            "Content-Type": "application/x-www-form-urlencoded",
+            "Origin": "https://bd.ispacechina.com",
+            "Referer": "https://bd.ispacechina.com/retrieve.do",
+            "Upgrade-Insecure-Requests": "1",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+        }
+
+        url = "https://bd.ispacechina.com/retrieve.do"
+        data = {
+            "hy": "",
+            "dq": "",
+            "bidderHydm": "",
+            "keyFlag": "1",
+            "fbdays": "0",
+            "packtype": "",
+            "packtypeCode": "",
+            "packtypeValue": "",
+            "packtypeCodeValue": "",
+            "fl": "",
+            "typflag": menu.tid,
+            "esConditions": "",
+            "fbDateStart": "",
+            "fbDateEnd": "",
+            "keyConValue": "",
+            "keyCon": "",
+            "radio": "全部地区",
+            "orderby": "1",
+            "pageSize": "30",
+            "pageNumber": f"{page}",
+            "sortColumns": "undefined"
+        }
+        resp = requests.post(url, data=data, headers=headers, timeout=30, verify=False)
+        time.sleep(1)
+        return resp
+
+    def parser_list_page(self, response, page, menu):
+        results_list = []
+        info_list = Selector(response.text).xpath('//div[@class="menu_content_layer6_list hand searchContentSc"]')
+        for info in info_list:
+            f_org = info.xpath('./@onclick').extract_first("").replace(';', '')
+            if not f_org:
+                continue
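+            # onclick is presumably of the form to_url('/path.do', x, y); eval() runs it against the stub above and yields the relative detail path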
+            hid = eval(f_org)
+            href = f"https://bd.ispacechina.com{hid}"
+            title = info.xpath('./div[1]/h2/text()').extract_first("").strip()
+            create_time = info.xpath('./div[2]/h3/text()').extract_first("").replace('发布日期:', '').strip()
+
+            dedup = [href]
+            if not self.RDS.data_filter(dedup):
+                item = {
+                    "site": "航天电子采购平台",
+                    "channel": menu.channel,
+                    "spidercode": menu.code,
+                    "area": "全国",
+                    "city": "",
+                    "district": "",
+                    "href": "#",
+                    "competehref": href,
+                    "title": title,
+                    "publishtime": create_time,
+                    "parse_url": href,
+                    "parser_name": "ztpc_htdz_zbhxrgs",
+                    "is_mixed": False,
+                    "is_theme": True,
+                    "retry": 0,
+                    "comeintime": int2long(int(time.time())),
+                    "is_crawl": False,
+                    "failed": False,
+                }
+
+                self.zb_list.insert_one(item)
+                self.RDS.data_save_redis(dedup)
+                results_list.append(item)
+
+        logger.info(f' *** 第{page}页采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
+        self.real_cont += len(results_list)
+        logger.info(f"当前已采集 {self.real_cont} 条数据")
+        return results_list
+
+    def crawl_list_spider(self, page, menu):
+        retry_times = 0
+        while (retry_times := retry_times + 1) < 5:
+            try:
+                response = self.fetch_list_page(page=page, menu=menu)
+                res_code = response.status_code
+                logger.debug(f"第{page}页 状态码:{res_code}")
+                if response is not None and res_code == 200:
+                    self.parser_list_page(response=response, page=page, menu=menu)
+                    time.sleep(3)
+                    return
+                else:
+                    time.sleep(3)
+            except Exception as e:
+                logger.exception(f"第{page}页 采集异常:{e}")
+                time.sleep(6)
+
+    def start_list(self, menus):
+        logger.debug("********** 列表页开始 **********")
+        for menu in menus:
+            logger.debug(f"++++++ {menu.channel} 开始采集 ++++++")
+            max_page = menu.crawl_page
+            for page in range(1, max_page + 1):
+                self.crawl_list_spider(page=page, menu=menu)
+            logger.debug(f"------ {menu.channel} 采集结束 ------")
+
+        logger.debug("********** 列表页结束 **********")
+
+
+if __name__ == '__main__':
+    Menu = namedtuple('Menu', ['channel', 'code', 'tid', 'crawl_page'])
+
+    menus = [
+        Menu('变更公告', 'a_htdzcgpt_bbgg', '3', 1),
+        Menu('中标候选人公示', 'a_htdzcgpt_zbhxrgs', '5', 1),
+        Menu('中标结果公告', 'a_htdzcgpt_zbjggg', '6', 1),
+    ]
+    Crawl_Htdz().start_list(menus)

+ 28 - 0
lzz_theme/htdzcgpt/start.sh

@@ -0,0 +1,28 @@
+#!/bin/bash
+
+ps -ef |grep "htdz_bggg_list.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "htdz_cjgg_details.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "htdz_cjgg_list.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "htdz_jzxtp_details.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "htdz_jzxtp_list.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "htdz_qtcg_list.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "htdz_xjgg_details.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "htdz_xjgg_list.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "htdz_zbgg_details.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "htdz_zbgg_list.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "htdz_zbhxrgs_details.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "htdz_zbhxrgs_list.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+
+nohup python3 htdz_bggg_list.py > log/htdz_bggg_list.out 2>&1 &
+nohup python3 htdz_cjgg_details.py > log/htdz_cjgg_details.out 2>&1 &
+nohup python3 htdz_cjgg_list.py > log/htdz_cjgg_list.out 2>&1 &
+nohup python3 htdz_jzxtp_details.py > log/htdz_jzxtp_details.out 2>&1 &
+nohup python3 htdz_jzxtp_list.py > log/htdz_jzxtp_list.out 2>&1 &
+nohup python3 htdz_qtcg_list.py > log/htdz_qtcg_list.out 2>&1 &
+nohup python3 htdz_xjgg_details.py > log/htdz_xjgg_details.out 2>&1 &
+nohup python3 htdz_xjgg_list.py > log/htdz_xjgg_list.out 2>&1 &
+nohup python3 htdz_zbgg_details.py > log/htdz_zbgg_details.out 2>&1 &
+nohup python3 htdz_zbgg_list.py > log/htdz_zbgg_list.out 2>&1 &
+nohup python3 htdz_zbhxrgs_details.py > log/htdz_zbhxrgs_details.out 2>&1 &
+nohup python3 htdz_zbhxrgs_list.py > log/htdz_zbhxrgs_list.out 2>&1 &
+

+ 29 - 0
lzz_theme/jsxmhjyxdjbbaxt/det_start.sh

@@ -0,0 +1,29 @@
+const jsdom = require("jsdom");
+const {JSDOM} = jsdom;
+const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`,
+    {
+        url: "https://example.org/",
+        referrer: "https://example.com/",
+        contentType: "text/html",
+    });
+window = dom.window;  // expose a browser-like global for scripts that expect window to exist
+
+const crypto = require('crypto');
+
+function en_pwd(pwd) {
+    var c = "igoCloud2022!@It"  // 16-character string doubles as the AES-128 key
+    Buffer.from(c, "base64");   // no-op apparently carried over from the site's bundle; its result is discarded
+    let t = crypto.randomBytes(12)
+      , i = crypto.createCipheriv("aes-128-gcm", c, t)
+      , a = i.update(pwd, "utf8", "base64");
+    a += i.final("base64");
+    let s = i.getAuthTag();
+    a = Buffer.from(a, "base64");
+    let r = t.length + a.length + s.length
+      , l = Buffer.concat([t, a, s], r);
+    return l.toString("base64")
+}
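+
+// Output layout (all base64-encoded together): a 12-byte random IV,
+// then the AES-128-GCM ciphertext of the UTF-8 password, then the
+// 16-byte GCM auth tag. lcdz_login.py calls this via execjs and sends
+// the result as the "password" field of the login request.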
+
+
+
+

+ 96 - 0
lzz_theme/lcdzcgpt/lcdz_details.py

@@ -0,0 +1,96 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-02-18
+---------
+@summary: 浪潮电子采购平台
+---------
+@author: lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.tools import *
+import warnings
+
+warnings.simplefilter('ignore')
+
+
+class Details:
+
+    def __init__(self):
+        self.db_table = Mongo_client().py_spider
+        self.db_name = self.db_table.theme_list
+        self.zt_details = self.db_table.data_bak
+        self.headers = {
+            "Accept": "application/json, text/plain, */*",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Connection": "keep-alive",
+            "Referer": "https://scs.inspur.com/Announcement/announce",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
+        }
+        self.cookies = {
+            "Path": "/"
+        }
+
+
+    def detail_get(self, response, item):
+
+        html = response.json().get('result').get('noticebody')
+
+        item["contenthtml"] = html
+
+        item = format_fileds(item)
+
+        try:
+            self.zt_details.insert_one(item)
+            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
+        except DuplicateKeyError:
+            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
+
+    def fetch_request(self, item):
+        response = requests.get(url=item.get("parse_url"), headers=self.headers,
+                                cookies=self.cookies, timeout=30, verify=False)
+        time.sleep(1)
+        return response
+
+    def deal_request(self, item):
+        retry_times = 0
+        org_item = item.copy()
+        while (retry_times := retry_times + 1) < 5:
+            try:
+                response = self.fetch_request(item)
+                res_code = response.status_code
+                if response and res_code == 200:
+                    self.detail_get(response, item=item)
+                    return True
+                else:
+                    time.sleep(3)
+            except Exception as e:
+                item = org_item
+                logger.exception(f"{item.get('competehref')} 采集异常:{e}")
+                time.sleep(random.randint(3, 6))
+        logger.warning(f"[采集失败]{item.get('competehref')}")
+        return False
+
+    def start(self, limit=1):
+        logger.debug("********** 详情页采集开始 **********")
+
+        with self.db_name.find({"parser_name": "ztpc_lcdzcgpt", "is_crawl": False, "failed": False},
+                               sort=[('publishtime', -1)]).limit(limit) as cursor:
+            data_list = [dd for dd in cursor]
+        for item in data_list:
+            # logger.debug(item)
+            update_id = item["_id"]
+            result = self.deal_request(item)
+            if result is True:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"is_crawl": True}})
+            else:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"failed": True}})
+            time.sleep(random.randint(5, 10))
+
+        logger.debug("********** 详情页采集结束 **********")
+
+
+if __name__ == "__main__":
+    Details().start(limit=1)

+ 167 - 0
lzz_theme/lcdzcgpt/lcdz_list.py

@@ -0,0 +1,167 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-02-18
+---------
+@summary: 浪潮电子采购平台
+---------
+@author: lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.RedisDB import RedisFilter
+from collections import namedtuple
+from utils.tools import *
+import requests
+import json
+import warnings
+
+warnings.simplefilter('ignore')
+
+
+class Crawl_Lcdz:
+
+    def __init__(self):
+        self.py_spider = Mongo_client().py_spider
+        self.zb_list = self.py_spider.theme_list
+        self.zt_details = self.py_spider.data_bak
+        self.RDS = RedisFilter()
+        self.real_cont = 0
+
+    def fetch_list_page(self, page, menu):
+        logger.debug(f' *** 开始采集第{page}页 ***')
+
+        headers = {
+            "Accept": "application/json, text/plain, */*",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Connection": "keep-alive",
+            "Content-Type": "application/json",
+            "Origin": "https://scs.inspur.com",
+            "Referer": "https://scs.inspur.com/Announcement/announce",
+            "Sec-Fetch-Dest": "empty",
+            "Sec-Fetch-Mode": "cors",
+            "Sec-Fetch-Site": "same-origin",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
+            "sec-ch-ua": "\"Not A(Brand\";v=\"8\", \"Chromium\";v=\"132\", \"Google Chrome\";v=\"132\"",
+            "sec-ch-ua-mobile": "?0",
+            "sec-ch-ua-platform": "\"Windows\""
+        }
+        cookies = {
+            "Path": "/"
+        }
+
+        url = "https://scs.inspur.com/cloudapis/api/ep/hy/v1.0/portal/lcjt/website/gglist/list"
+        data = {
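+            # pageIndex is apparently zero-based, hence the page - 1 below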
+            "pageIndex": page-1,
+            "pageSize": 10,
+            "start": "",
+            "end": "",
+            "subType": "",
+            "noticeTitle": "",
+            "ggType": menu.t,
+            "rfqType": menu.c,
+            "rfqTypes": [],
+            "orderBy": "desc",
+            "isVail": "0",
+            "biddingagency": "",
+            "noticetitle": "",
+            "pubtime": [],
+            "noticestate": ""
+        }
+        data = json.dumps(data, separators=(',', ':'))
+        resp = requests.post(url, data=data, headers=headers, cookies=cookies, timeout=30, verify=False)
+        time.sleep(1)
+        return resp
+
+    def parser_list_page(self, response, page, menu):
+        results_list = []
+        info_list = response.json().get('result').get('content')
+        for info in info_list:
+            hid = info.get('id')
+            href = f"https://scs.inspur.com/Announcement/annDetails?id={hid}"
+            title = info.get('noticetitle')
+            create_time = info.get('pubtime')
+            html = info.get('noticebody')
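+            # the list response already carries the full notice body, so the detail record is written in the same pass (is_crawl preset to True)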
+
+            def_city = ""
+            def_area = "全国"
+
+            dedup = [href + title + create_time]
+            if not self.RDS.data_filter(dedup):
+                item = {
+                    "site": "浪潮电子采购平台",
+                    "channel": menu.channel,
+                    "spidercode": menu.code,
+                    "area": def_area,
+                    "city": def_city,
+                    "district": "",
+                    "href": "#",
+                    "competehref": href,
+                    "title": title,
+                    "publishtime": create_time,
+                    "parse_url": href,
+                    "parser_name": "ztpc_lcdzcgpt",
+                    "is_mixed": True,
+                    "is_theme": True,
+                    "retry": 0,
+                    "comeintime": int2long(int(time.time())),
+                    "is_crawl": True,
+                    "failed": False,
+                }
+
+                self.zb_list.insert_one(item)
+
+                item["contenthtml"] = html
+                detail_item = format_fileds(item)
+                self.zt_details.insert_one(detail_item)
+                self.RDS.data_save_redis(dedup)
+                results_list.append(item)
+
+        logger.info(f' *** 第{page}页采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
+        self.real_cont += len(results_list)
+        logger.info(f"当前已采集 {self.real_cont} 条数据")
+        return results_list
+
+    def crawl_list_spider(self, page, menu):
+        retry_times = 0
+        while (retry_times := retry_times + 1) < 5:
+            try:
+                response = self.fetch_list_page(page=page, menu=menu)
+                res_code = response.status_code
+                logger.debug(f"第{page}页 状态码:{res_code}")
+                if response is not None and res_code == 200:
+                    self.parser_list_page(response=response, page=page, menu=menu)
+                    time.sleep(random.randint(5, 10))
+                    return
+                else:
+                    time.sleep(3)
+            except Exception as e:
+                logger.exception(f"第{page}页 采集异常:{e}")
+                time.sleep(6)
+
+    def start_list(self, menus):
+        logger.debug("********** 列表页开始 **********")
+        for menu in menus:
+            logger.debug(f"++++++ {menu.channel} 开始采集 ++++++")
+            max_page = menu.crawl_page
+            for page in range(1, max_page + 1):
+                self.crawl_list_spider(page=page, menu=menu)
+            logger.debug(f"------ {menu.channel} 采集结束 ------")
+
+        logger.debug("********** 列表页结束 **********")
+
+
+if __name__ == '__main__':
+    Menu = namedtuple('Menu', ['channel', 'code', 't', 'c', 'crawl_page'])
+
+    menus = [
+        Menu('招标公告', 'a_lcdzcgpt_zbgg', 'GG', 'CRFQ', 1),
+        Menu('资格预审公告', 'a_lcdzcgpt_zgysgg', 'YG', 'CRFQ', 1),
+        Menu('询价公告', 'a_lcdzcgpt_xjcg', 'GG', 'SRFQ', 2),
+        Menu('竞争性谈判', 'a_lcdzcgpt_jzxtp', 'GG', 'TPFQ', 2),
+        Menu('变更公告', 'a_lcdzcgpt_bggg', 'GG', '0', 2),
+        Menu('候选人公告', 'a_lcdzcgpt_zhbgs', 'GS', '', 5),
+    ]
+    Crawl_Lcdz().start_list(menus)

+ 43 - 0
lzz_theme/lcdzcgpt/lcdz_login.py

@@ -0,0 +1,43 @@
+import json
+import requests
+import execjs
+from loguru import logger
+
+import warnings
+
+warnings.simplefilter('ignore')
+
+
+def create_cookie(proxies=None):
+    username = "15896959805"
+    password = "Sun15896959805"
+
+    with open('./lcdz.js', 'r', encoding='utf-8') as fr:
+        ex_js = fr.read()
+    ctx = execjs.compile(ex_js)
+    pwd = ctx.call('en_pwd', password)
+    try:
+        headers = {
+            "Accept": "application/json, text/plain, */*",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Connection": "keep-alive",
+            "Content-Type": "application/json",
+            "Origin": "https://scs.inspur.com",
+            "Referer": "https://scs.inspur.com/login",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
+        }
+        url = "https://scs.inspur.com/cloudapis/api/ep/hy/v1.0/portal/lcjt/website/login/uLogin"
+        data = {
+            "userCode": username,
+            "password": pwd
+        }
+        data = json.dumps(data, separators=(',', ':'))
+        response = requests.post(url, headers=headers, data=data, timeout=30, proxies=proxies, verify=False)
+
+        token = response.json().get('result').get('token')
+        with open('./lcdz_ck.json', 'w', encoding='utf-8') as fw:
+            fw.write(token)
+        logger.success(f"{username} 登录成功!")
+        return token
+    except Exception as e:
+        logger.error(f"[登录失败] {e}")
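+
+
+if __name__ == '__main__':
+    # minimal manual check (assumes network access to scs.inspur.com)
+    create_cookie()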

+ 5 - 0
lzz_theme/lcdzcgpt/start.sh

@@ -0,0 +1,5 @@
+#!/bin/bash
+
+ps -ef |grep "lcdz_list.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+nohup python3 lcdz_list.py > log/lcdz_list.out 2>&1 &
+

+ 416 - 0
lzz_theme/package-lock.json

@@ -0,0 +1,416 @@
+{
+  "requires": true,
+  "lockfileVersion": 1,
+  "dependencies": {
+    "@tootallnate/once": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmmirror.com/@tootallnate/once/-/once-2.0.0.tgz",
+      "integrity": "sha512-XCuKFP5PS55gnMVu3dty8KPatLqUoy/ZYzDzAGCQ8JNFCkLXzmI7vNHCR+XpbZaMWQK/vQubr7PkYq8g470J/A=="
+    },
+    "abab": {
+      "version": "2.0.6",
+      "resolved": "https://registry.npmmirror.com/abab/-/abab-2.0.6.tgz",
+      "integrity": "sha512-j2afSsaIENvHZN2B8GOpF566vZ5WVk5opAiMTvWgaQT8DkbOqsTfvNAvHoRGU2zzP8cPoqys+xHTRDWW8L+/BA=="
+    },
+    "acorn": {
+      "version": "8.14.0",
+      "resolved": "https://registry.npmmirror.com/acorn/-/acorn-8.14.0.tgz",
+      "integrity": "sha512-cl669nCJTZBsL97OF4kUQm5g5hC2uihk0NxY3WENAC0TYdILVkAyHymAntgxGkl7K+t0cXIrH5siy5S4XkFycA=="
+    },
+    "acorn-globals": {
+      "version": "6.0.0",
+      "resolved": "https://registry.npmmirror.com/acorn-globals/-/acorn-globals-6.0.0.tgz",
+      "integrity": "sha512-ZQl7LOWaF5ePqqcX4hLuv/bLXYQNfNWw2c0/yX/TsPRKamzHcTGQnlCjHT3TsmkOUVEPS3crCxiPfdzE/Trlhg==",
+      "requires": {
+        "acorn": "^7.1.1",
+        "acorn-walk": "^7.1.1"
+      },
+      "dependencies": {
+        "acorn": {
+          "version": "7.4.1",
+          "resolved": "https://registry.npmmirror.com/acorn/-/acorn-7.4.1.tgz",
+          "integrity": "sha512-nQyp0o1/mNdbTO1PO6kHkwSrmgZ0MT/jCCpNiwbUjGoRN4dlBhqJtoQuCnEOKzgTVwg0ZWiCoQy6SxMebQVh8A=="
+        }
+      }
+    },
+    "acorn-walk": {
+      "version": "7.2.0",
+      "resolved": "https://registry.npmmirror.com/acorn-walk/-/acorn-walk-7.2.0.tgz",
+      "integrity": "sha512-OPdCF6GsMIP+Az+aWfAAOEt2/+iVDKE7oy6lJ098aoe59oAmK76qV6Gw60SbZ8jHuG2wH058GF4pLFbYamYrVA=="
+    },
+    "agent-base": {
+      "version": "6.0.2",
+      "resolved": "https://registry.npmmirror.com/agent-base/-/agent-base-6.0.2.tgz",
+      "integrity": "sha512-RZNwNclF7+MS/8bDg70amg32dyeZGZxiDuQmZxKLAlQjr3jGyLx+4Kkk58UO7D2QdgFIQCovuSuZESne6RG6XQ==",
+      "requires": {
+        "debug": "4"
+      }
+    },
+    "asynckit": {
+      "version": "0.4.0",
+      "resolved": "https://registry.npmmirror.com/asynckit/-/asynckit-0.4.0.tgz",
+      "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q=="
+    },
+    "browser-process-hrtime": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmmirror.com/browser-process-hrtime/-/browser-process-hrtime-1.0.0.tgz",
+      "integrity": "sha512-9o5UecI3GhkpM6DrXr69PblIuWxPKk9Y0jHBRhdocZ2y7YECBFCsHm79Pr3OyR2AvjhDkabFJaDJMYRazHgsow=="
+    },
+    "combined-stream": {
+      "version": "1.0.8",
+      "resolved": "https://registry.npmmirror.com/combined-stream/-/combined-stream-1.0.8.tgz",
+      "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==",
+      "requires": {
+        "delayed-stream": "~1.0.0"
+      }
+    },
+    "cssom": {
+      "version": "0.5.0",
+      "resolved": "https://registry.npmmirror.com/cssom/-/cssom-0.5.0.tgz",
+      "integrity": "sha512-iKuQcq+NdHqlAcwUY0o/HL69XQrUaQdMjmStJ8JFmUaiiQErlhrmuigkg/CU4E2J0IyUKUrMAgl36TvN67MqTw=="
+    },
+    "cssstyle": {
+      "version": "2.3.0",
+      "resolved": "https://registry.npmmirror.com/cssstyle/-/cssstyle-2.3.0.tgz",
+      "integrity": "sha512-AZL67abkUzIuvcHqk7c09cezpGNcxUxU4Ioi/05xHk4DQeTkWmGYftIE6ctU6AEt+Gn4n1lDStOtj7FKycP71A==",
+      "requires": {
+        "cssom": "~0.3.6"
+      },
+      "dependencies": {
+        "cssom": {
+          "version": "0.3.8",
+          "resolved": "https://registry.npmmirror.com/cssom/-/cssom-0.3.8.tgz",
+          "integrity": "sha512-b0tGHbfegbhPJpxpiBPU2sCkigAqtM9O121le6bbOlgyV+NyGyCmVfJ6QW9eRjz8CpNfWEOYBIMIGRYkLwsIYg=="
+        }
+      }
+    },
+    "data-urls": {
+      "version": "3.0.2",
+      "resolved": "https://registry.npmmirror.com/data-urls/-/data-urls-3.0.2.tgz",
+      "integrity": "sha512-Jy/tj3ldjZJo63sVAvg6LHt2mHvl4V6AgRAmNDtLdm7faqtsx+aJG42rsyCo9JCoRVKwPFzKlIPx3DIibwSIaQ==",
+      "requires": {
+        "abab": "^2.0.6",
+        "whatwg-mimetype": "^3.0.0",
+        "whatwg-url": "^11.0.0"
+      },
+      "dependencies": {
+        "whatwg-url": {
+          "version": "11.0.0",
+          "resolved": "https://registry.npmmirror.com/whatwg-url/-/whatwg-url-11.0.0.tgz",
+          "integrity": "sha512-RKT8HExMpoYx4igMiVMY83lN6UeITKJlBQ+vR/8ZJ8OCdSiN3RwCq+9gH0+Xzj0+5IrM6i4j/6LuvzbZIQgEcQ==",
+          "requires": {
+            "tr46": "^3.0.0",
+            "webidl-conversions": "^7.0.0"
+          }
+        }
+      }
+    },
+    "debug": {
+      "version": "4.4.0",
+      "resolved": "https://registry.npmmirror.com/debug/-/debug-4.4.0.tgz",
+      "integrity": "sha512-6WTZ/IxCY/T6BALoZHaE4ctp9xm+Z5kY/pzYaCHRFeyVhojxlrm+46y68HA6hr0TcwEssoxNiDEUJQjfPZ/RYA==",
+      "requires": {
+        "ms": "^2.1.3"
+      }
+    },
+    "decimal.js": {
+      "version": "10.4.3",
+      "resolved": "https://registry.npmmirror.com/decimal.js/-/decimal.js-10.4.3.tgz",
+      "integrity": "sha512-VBBaLc1MgL5XpzgIP7ny5Z6Nx3UrRkIViUkPUdtl9aya5amy3De1gsUUSB1g3+3sExYNjCAsAznmukyxCb1GRA=="
+    },
+    "delayed-stream": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmmirror.com/delayed-stream/-/delayed-stream-1.0.0.tgz",
+      "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ=="
+    },
+    "domexception": {
+      "version": "4.0.0",
+      "resolved": "https://registry.npmmirror.com/domexception/-/domexception-4.0.0.tgz",
+      "integrity": "sha512-A2is4PLG+eeSfoTMA95/s4pvAoSo2mKtiM5jlHkAVewmiO8ISFTFKZjH7UAM1Atli/OT/7JHOrJRJiMKUZKYBw==",
+      "requires": {
+        "webidl-conversions": "^7.0.0"
+      }
+    },
+    "escodegen": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmmirror.com/escodegen/-/escodegen-2.1.0.tgz",
+      "integrity": "sha512-2NlIDTwUWJN0mRPQOdtQBzbUHvdGY2P1VXSyU83Q3xKxM7WHX2Ql8dKq782Q9TgQUNOLEzEYu9bzLNj1q88I5w==",
+      "requires": {
+        "esprima": "^4.0.1",
+        "estraverse": "^5.2.0",
+        "esutils": "^2.0.2",
+        "source-map": "~0.6.1"
+      }
+    },
+    "esprima": {
+      "version": "4.0.1",
+      "resolved": "https://registry.npmmirror.com/esprima/-/esprima-4.0.1.tgz",
+      "integrity": "sha512-eGuFFw7Upda+g4p+QHvnW0RyTX/SVeJBDM/gCtMARO0cLuT2HcEKnTPvhjV6aGeqrCB/sbNop0Kszm0jsaWU4A=="
+    },
+    "estraverse": {
+      "version": "5.3.0",
+      "resolved": "https://registry.npmmirror.com/estraverse/-/estraverse-5.3.0.tgz",
+      "integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA=="
+    },
+    "esutils": {
+      "version": "2.0.3",
+      "resolved": "https://registry.npmmirror.com/esutils/-/esutils-2.0.3.tgz",
+      "integrity": "sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g=="
+    },
+    "form-data": {
+      "version": "4.0.1",
+      "resolved": "https://registry.npmmirror.com/form-data/-/form-data-4.0.1.tgz",
+      "integrity": "sha512-tzN8e4TX8+kkxGPK8D5u0FNmjPUjw3lwC9lSLxxoB/+GtsJG91CO8bSWy73APlgAZzZbXEYZJuxjkHH2w+Ezhw==",
+      "requires": {
+        "asynckit": "^0.4.0",
+        "combined-stream": "^1.0.8",
+        "mime-types": "^2.1.12"
+      }
+    },
+    "html-encoding-sniffer": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmmirror.com/html-encoding-sniffer/-/html-encoding-sniffer-3.0.0.tgz",
+      "integrity": "sha512-oWv4T4yJ52iKrufjnyZPkrN0CH3QnrUqdB6In1g5Fe1mia8GmF36gnfNySxoZtxD5+NmYw1EElVXiBk93UeskA==",
+      "requires": {
+        "whatwg-encoding": "^2.0.0"
+      }
+    },
+    "http-proxy-agent": {
+      "version": "5.0.0",
+      "resolved": "https://registry.npmmirror.com/http-proxy-agent/-/http-proxy-agent-5.0.0.tgz",
+      "integrity": "sha512-n2hY8YdoRE1i7r6M0w9DIw5GgZN0G25P8zLCRQ8rjXtTU3vsNFBI/vWK/UIeE6g5MUUz6avwAPXmL6Fy9D/90w==",
+      "requires": {
+        "@tootallnate/once": "2",
+        "agent-base": "6",
+        "debug": "4"
+      }
+    },
+    "https-proxy-agent": {
+      "version": "5.0.1",
+      "resolved": "https://registry.npmmirror.com/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz",
+      "integrity": "sha512-dFcAjpTQFgoLMzC2VwU+C/CbS7uRL0lWmxDITmqm7C+7F0Odmj6s9l6alZc6AELXhrnggM2CeWSXHGOdX2YtwA==",
+      "requires": {
+        "agent-base": "6",
+        "debug": "4"
+      }
+    },
+    "iconv-lite": {
+      "version": "0.6.3",
+      "resolved": "https://registry.npmmirror.com/iconv-lite/-/iconv-lite-0.6.3.tgz",
+      "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==",
+      "requires": {
+        "safer-buffer": ">= 2.1.2 < 3.0.0"
+      }
+    },
+    "is-potential-custom-element-name": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmmirror.com/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz",
+      "integrity": "sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ=="
+    },
+    "jsdom": {
+      "version": "19.0.0",
+      "resolved": "https://registry.npmmirror.com/jsdom/-/jsdom-19.0.0.tgz",
+      "integrity": "sha512-RYAyjCbxy/vri/CfnjUWJQQtZ3LKlLnDqj+9XLNnJPgEGeirZs3hllKR20re8LUZ6o1b1X4Jat+Qd26zmP41+A==",
+      "requires": {
+        "abab": "^2.0.5",
+        "acorn": "^8.5.0",
+        "acorn-globals": "^6.0.0",
+        "cssom": "^0.5.0",
+        "cssstyle": "^2.3.0",
+        "data-urls": "^3.0.1",
+        "decimal.js": "^10.3.1",
+        "domexception": "^4.0.0",
+        "escodegen": "^2.0.0",
+        "form-data": "^4.0.0",
+        "html-encoding-sniffer": "^3.0.0",
+        "http-proxy-agent": "^5.0.0",
+        "https-proxy-agent": "^5.0.0",
+        "is-potential-custom-element-name": "^1.0.1",
+        "nwsapi": "^2.2.0",
+        "parse5": "6.0.1",
+        "saxes": "^5.0.1",
+        "symbol-tree": "^3.2.4",
+        "tough-cookie": "^4.0.0",
+        "w3c-hr-time": "^1.0.2",
+        "w3c-xmlserializer": "^3.0.0",
+        "webidl-conversions": "^7.0.0",
+        "whatwg-encoding": "^2.0.0",
+        "whatwg-mimetype": "^3.0.0",
+        "whatwg-url": "^10.0.0",
+        "ws": "^8.2.3",
+        "xml-name-validator": "^4.0.0"
+      }
+    },
+    "jsencrypt": {
+      "version": "3.3.2",
+      "resolved": "https://registry.npmmirror.com/jsencrypt/-/jsencrypt-3.3.2.tgz",
+      "integrity": "sha512-arQR1R1ESGdAxY7ZheWr12wCaF2yF47v5qpB76TtV64H1pyGudk9Hvw8Y9tb/FiTIaaTRUyaSnm5T/Y53Ghm/A=="
+    },
+    "mime-db": {
+      "version": "1.52.0",
+      "resolved": "https://registry.npmmirror.com/mime-db/-/mime-db-1.52.0.tgz",
+      "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg=="
+    },
+    "mime-types": {
+      "version": "2.1.35",
+      "resolved": "https://registry.npmmirror.com/mime-types/-/mime-types-2.1.35.tgz",
+      "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==",
+      "requires": {
+        "mime-db": "1.52.0"
+      }
+    },
+    "ms": {
+      "version": "2.1.3",
+      "resolved": "https://registry.npmmirror.com/ms/-/ms-2.1.3.tgz",
+      "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="
+    },
+    "nwsapi": {
+      "version": "2.2.16",
+      "resolved": "https://registry.npmmirror.com/nwsapi/-/nwsapi-2.2.16.tgz",
+      "integrity": "sha512-F1I/bimDpj3ncaNDhfyMWuFqmQDBwDB0Fogc2qpL3BWvkQteFD/8BzWuIRl83rq0DXfm8SGt/HFhLXZyljTXcQ=="
+    },
+    "parse5": {
+      "version": "6.0.1",
+      "resolved": "https://registry.npmmirror.com/parse5/-/parse5-6.0.1.tgz",
+      "integrity": "sha512-Ofn/CTFzRGTTxwpNEs9PP93gXShHcTq255nzRYSKe8AkVpZY7e1fpmTfOyoIvjP5HG7Z2ZM7VS9PPhQGW2pOpw=="
+    },
+    "psl": {
+      "version": "1.15.0",
+      "resolved": "https://registry.npmmirror.com/psl/-/psl-1.15.0.tgz",
+      "integrity": "sha512-JZd3gMVBAVQkSs6HdNZo9Sdo0LNcQeMNP3CozBJb3JYC/QUYZTnKxP+f8oWRX4rHP5EurWxqAHTSwUCjlNKa1w==",
+      "requires": {
+        "punycode": "^2.3.1"
+      }
+    },
+    "punycode": {
+      "version": "2.3.1",
+      "resolved": "https://registry.npmmirror.com/punycode/-/punycode-2.3.1.tgz",
+      "integrity": "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg=="
+    },
+    "querystringify": {
+      "version": "2.2.0",
+      "resolved": "https://registry.npmmirror.com/querystringify/-/querystringify-2.2.0.tgz",
+      "integrity": "sha512-FIqgj2EUvTa7R50u0rGsyTftzjYmv/a3hO345bZNrqabNqjtgiDMgmo4mkUjd+nzU5oF3dClKqFIPUKybUyqoQ=="
+    },
+    "requires-port": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmmirror.com/requires-port/-/requires-port-1.0.0.tgz",
+      "integrity": "sha512-KigOCHcocU3XODJxsu8i/j8T9tzT4adHiecwORRQ0ZZFcp7ahwXuRU1m+yuO90C5ZUyGeGfocHDI14M3L3yDAQ=="
+    },
+    "safer-buffer": {
+      "version": "2.1.2",
+      "resolved": "https://registry.npmmirror.com/safer-buffer/-/safer-buffer-2.1.2.tgz",
+      "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg=="
+    },
+    "saxes": {
+      "version": "5.0.1",
+      "resolved": "https://registry.npmmirror.com/saxes/-/saxes-5.0.1.tgz",
+      "integrity": "sha512-5LBh1Tls8c9xgGjw3QrMwETmTMVk0oFgvrFSvWx62llR2hcEInrKNZ2GZCCuuy2lvWrdl5jhbpeqc5hRYKFOcw==",
+      "requires": {
+        "xmlchars": "^2.2.0"
+      }
+    },
+    "source-map": {
+      "version": "0.6.1",
+      "resolved": "https://registry.npmmirror.com/source-map/-/source-map-0.6.1.tgz",
+      "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==",
+      "optional": true
+    },
+    "symbol-tree": {
+      "version": "3.2.4",
+      "resolved": "https://registry.npmmirror.com/symbol-tree/-/symbol-tree-3.2.4.tgz",
+      "integrity": "sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw=="
+    },
+    "tough-cookie": {
+      "version": "4.1.4",
+      "resolved": "https://registry.npmmirror.com/tough-cookie/-/tough-cookie-4.1.4.tgz",
+      "integrity": "sha512-Loo5UUvLD9ScZ6jh8beX1T6sO1w2/MpCRpEP7V280GKMVUQ0Jzar2U3UJPsrdbziLEMMhu3Ujnq//rhiFuIeag==",
+      "requires": {
+        "psl": "^1.1.33",
+        "punycode": "^2.1.1",
+        "universalify": "^0.2.0",
+        "url-parse": "^1.5.3"
+      }
+    },
+    "tr46": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmmirror.com/tr46/-/tr46-3.0.0.tgz",
+      "integrity": "sha512-l7FvfAHlcmulp8kr+flpQZmVwtu7nfRV7NZujtN0OqES8EL4O4e0qqzL0DC5gAvx/ZC/9lk6rhcUwYvkBnBnYA==",
+      "requires": {
+        "punycode": "^2.1.1"
+      }
+    },
+    "universalify": {
+      "version": "0.2.0",
+      "resolved": "https://registry.npmmirror.com/universalify/-/universalify-0.2.0.tgz",
+      "integrity": "sha512-CJ1QgKmNg3CwvAv/kOFmtnEN05f0D/cn9QntgNOQlQF9dgvVTHj3t+8JPdjqawCHk7V/KA+fbUqzZ9XWhcqPUg=="
+    },
+    "url-parse": {
+      "version": "1.5.10",
+      "resolved": "https://registry.npmmirror.com/url-parse/-/url-parse-1.5.10.tgz",
+      "integrity": "sha512-WypcfiRhfeUP9vvF0j6rw0J3hrWrw6iZv3+22h6iRMJ/8z1Tj6XfLP4DsUix5MhMPnXpiHDoKyoZ/bdCkwBCiQ==",
+      "requires": {
+        "querystringify": "^2.1.1",
+        "requires-port": "^1.0.0"
+      }
+    },
+    "w3c-hr-time": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmmirror.com/w3c-hr-time/-/w3c-hr-time-1.0.2.tgz",
+      "integrity": "sha512-z8P5DvDNjKDoFIHK7q8r8lackT6l+jo/Ye3HOle7l9nICP9lf1Ci25fy9vHd0JOWewkIFzXIEig3TdKT7JQ5fQ==",
+      "requires": {
+        "browser-process-hrtime": "^1.0.0"
+      }
+    },
+    "w3c-xmlserializer": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmmirror.com/w3c-xmlserializer/-/w3c-xmlserializer-3.0.0.tgz",
+      "integrity": "sha512-3WFqGEgSXIyGhOmAFtlicJNMjEps8b1MG31NCA0/vOF9+nKMUW1ckhi9cnNHmf88Rzw5V+dwIwsm2C7X8k9aQg==",
+      "requires": {
+        "xml-name-validator": "^4.0.0"
+      }
+    },
+    "webidl-conversions": {
+      "version": "7.0.0",
+      "resolved": "https://registry.npmmirror.com/webidl-conversions/-/webidl-conversions-7.0.0.tgz",
+      "integrity": "sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g=="
+    },
+    "whatwg-encoding": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmmirror.com/whatwg-encoding/-/whatwg-encoding-2.0.0.tgz",
+      "integrity": "sha512-p41ogyeMUrw3jWclHWTQg1k05DSVXPLcVxRTYsXUk+ZooOCZLcoYgPZ/HL/D/N+uQPOtcp1me1WhBEaX02mhWg==",
+      "requires": {
+        "iconv-lite": "0.6.3"
+      }
+    },
+    "whatwg-mimetype": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmmirror.com/whatwg-mimetype/-/whatwg-mimetype-3.0.0.tgz",
+      "integrity": "sha512-nt+N2dzIutVRxARx1nghPKGv1xHikU7HKdfafKkLNLindmPU/ch3U31NOCGGA/dmPcmb1VlofO0vnKAcsm0o/Q=="
+    },
+    "whatwg-url": {
+      "version": "10.0.0",
+      "resolved": "https://registry.npmmirror.com/whatwg-url/-/whatwg-url-10.0.0.tgz",
+      "integrity": "sha512-CLxxCmdUby142H5FZzn4D8ikO1cmypvXVQktsgosNy4a4BHrDHeciBBGZhb0bNoR5/MltoCatso+vFjjGx8t0w==",
+      "requires": {
+        "tr46": "^3.0.0",
+        "webidl-conversions": "^7.0.0"
+      }
+    },
+    "ws": {
+      "version": "8.18.0",
+      "resolved": "https://registry.npmmirror.com/ws/-/ws-8.18.0.tgz",
+      "integrity": "sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw=="
+    },
+    "xml-name-validator": {
+      "version": "4.0.0",
+      "resolved": "https://registry.npmmirror.com/xml-name-validator/-/xml-name-validator-4.0.0.tgz",
+      "integrity": "sha512-ICP2e+jsHvAj2E2lIHxa5tjXRlKDJo4IdvPvCXbXQGdzSfmSpNVyIKMvoZHjDY9DP0zV17iI85o90vRFXNccRw=="
+    },
+    "xmlchars": {
+      "version": "2.2.0",
+      "resolved": "https://registry.npmmirror.com/xmlchars/-/xmlchars-2.2.0.tgz",
+      "integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw=="
+    }
+  }
+}

+ 6 - 0
lzz_theme/package.json

@@ -0,0 +1,6 @@
+{
+  "dependencies": {
+    "jsdom": "^19.0.0",
+    "jsencrypt": "^3.3.2"
+  }
+}
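
The two Node dependencies above back the JS-reversal step in the list spiders: the type__1017 token is produced by site JavaScript that wants a DOM (jsdom) and RSA helpers (jsencrypt). A hedged sketch of how such a bundle is typically driven from Python via PyExecJS; the bundle filename type_1017.js is an assumption, while the entry point type_1017_ss matches the call in sscrawl_list.py below:

    # Evaluate the site's JS under Node, where the jsdom/jsencrypt packages
    # installed from this package.json can be require()'d from node_modules.
    import execjs  # pip install PyExecJS; needs a Node runtime on PATH

    with open("type_1017.js", "r", encoding="utf-8") as f:  # assumed filename
        ctx = execjs.compile(f.read())

    token = ctx.call("type_1017_ss", "%E6%8B%9B%E6%A0%87")  # URL-encoded keyword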

+ 2 - 2
lzz_theme/qgzbgggsssyq/py_ssyq_details.py

@@ -131,7 +131,7 @@ class Details:
             attachment = AttachmentDownloader().fetch_attachment(
                 file_name=item["title"], file_type="pdf", download_url=file_url,
                 proxies=self.proxy,headers=headers,params=params, is_check=True)
-            if attachment.get('size', '32.8 kb') not in ["32.8 kb"]:
+            if attachment.get('size'):
                 attachments[str(len(attachments) + 1)] = attachment
                 break
             time.sleep(random.randint(3, 6))
@@ -226,7 +226,7 @@ class Details:
             attachment = AttachmentDownloader().fetch_attachment(
                 file_name=item["title"], file_type="pdf", download_url=pdfurl,
                 proxies=get_QGIP(),headers=headers,params=params, is_check=True)
-            if attachment.get('size', '32.8 kb') not in ["32.8 kb"]:
+            if attachment.get('size'):
                 attachments[str(len(attachments) + 1)] = attachment
                 break
             time.sleep(random.randint(3, 6))
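
This two-line change, repeated verbatim across the py_ssyq_details*/sscrawl_details scripts below, replaces a magic-number test (the site's error page happens to download as exactly 32.8 kb) with a plain truthiness check on size. It presumably relies on fetch_attachment(..., is_check=True) returning a dict without a usable size when it rejects a download. The guard in isolation:

    # Sketch of the new guard, assuming fetch_attachment() omits "size" (or
    # returns an empty dict) for downloads its is_check validation rejects.
    def keep(attachments: dict, attachment: dict) -> bool:
        if attachment.get("size"):  # any real size; no '32.8 kb' sentinel
            attachments[str(len(attachments) + 1)] = attachment
            return True
        return False

A side benefit: the old sentinel silently dropped any legitimate file that happened to be exactly 32.8 kb; the truthiness test removes that false negative.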

+ 2 - 2
lzz_theme/qgzbgggsssyq/py_ssyq_details2.py

@@ -130,7 +130,7 @@ class Details:
             attachment = AttachmentDownloader().fetch_attachment(
                 file_name=item["title"], file_type="pdf", download_url=file_url,
                 proxies=self.proxy,headers=headers,params=params, is_check=True)
-            if attachment.get('size', '32.8 kb') not in ["32.8 kb"]:
+            if attachment.get('size'):
                 attachments[str(len(attachments) + 1)] = attachment
                 break
             time.sleep(random.randint(3, 6))
@@ -226,7 +226,7 @@ class Details:
             attachment = AttachmentDownloader().fetch_attachment(
                 file_name=item["title"], file_type="pdf", download_url=pdfurl,
                 proxies=get_QGIP(),headers=headers,params=params, is_check=True)
-            if attachment.get('size', '32.8 kb') not in ["32.8 kb"]:
+            if attachment.get('size'):
                 attachments[str(len(attachments) + 1)] = attachment
                 break
             if _ == 4:

+ 2 - 2
lzz_theme/qgzbgggsssyq/py_ssyq_details3.py

@@ -131,7 +131,7 @@ class Details:
             attachment = AttachmentDownloader().fetch_attachment(
                 file_name=item["title"], file_type="pdf", download_url=file_url,
                 proxies=self.proxy,headers=headers,params=params, is_check=True)
-            if attachment.get('size', '32.8 kb') not in ["32.8 kb"]:
+            if attachment.get('size'):
                 attachments[str(len(attachments) + 1)] = attachment
                 break
             time.sleep(random.randint(3, 6))
@@ -226,7 +226,7 @@ class Details:
             attachment = AttachmentDownloader().fetch_attachment(
                 file_name=item["title"], file_type="pdf", download_url=pdfurl,
                 proxies=get_QGIP(),headers=headers,params=params, is_check=True)
-            if attachment.get('size', '32.8 kb') not in ["32.8 kb"]:
+            if attachment.get('size'):
                 attachments[str(len(attachments) + 1)] = attachment
                 break
             if _ == 4:

+ 2 - 2
lzz_theme/qgzbgggsssyq/py_ssyq_details4.py

@@ -131,7 +131,7 @@ class Details:
             attachment = AttachmentDownloader().fetch_attachment(
                 file_name=item["title"], file_type="pdf", download_url=file_url,
                 proxies=self.proxy,headers=headers,params=params, is_check=True)
-            if attachment.get('size', '32.8 kb') not in ["32.8 kb"]:
+            if attachment.get('size'):
                 attachments[str(len(attachments) + 1)] = attachment
                 break
             time.sleep(random.randint(3, 6))
@@ -226,7 +226,7 @@ class Details:
             attachment = AttachmentDownloader().fetch_attachment(
                 file_name=item["title"], file_type="pdf", download_url=pdfurl,
                 proxies=get_QGIP(),headers=headers,params=params, is_check=True)
-            if attachment.get('size', '32.8 kb') not in ["32.8 kb"]:
+            if attachment.get('size'):
                 attachments[str(len(attachments) + 1)] = attachment
                 break
             if _ == 4:

+ 2 - 2
lzz_theme/qgzbgggsssyq/py_ssyq_details_bu.py

@@ -132,7 +132,7 @@ class Details:
             attachment = AttachmentDownloader().fetch_attachment(
                 file_name=item["title"], file_type="pdf", download_url=file_url,
                 proxies=self.proxy,headers=headers,params=params, is_check=True)
-            if attachment.get('size', '32.8 kb') not in ["32.8 kb"]:
+            if attachment.get('size'):
                 attachments[str(len(attachments) + 1)] = attachment
                 break
             time.sleep(random.randint(3, 6))
@@ -227,7 +227,7 @@ class Details:
             attachment = AttachmentDownloader().fetch_attachment(
                 file_name=item["title"], file_type="pdf", download_url=pdfurl,
                 proxies=get_QGIP(),headers=headers,params=params, is_check=True)
-            if attachment.get('size', '32.8 kb') not in ["32.8 kb"]:
+            if attachment.get('size'):
                 attachments[str(len(attachments) + 1)] = attachment
                 break
             if _ == 4:

+ 2 - 2
lzz_theme/qgzbgggsssyq/sscrawl_details.py

@@ -125,7 +125,7 @@ class dt_Spider:
             attachment = AttachmentDownloader().fetch_attachment(
                 file_name=item["title"], file_type="pdf", download_url=file_url,
                 proxies=self.proxy, headers=headers, params=params, is_check=True)
-            if attachment.get('size', '32.8 kb') not in ["32.8 kb"]:
+            if attachment.get('size'):
                 attachments[str(len(attachments) + 1)] = attachment
                 break
             time.sleep(random.randint(3, 6))
@@ -220,7 +220,7 @@ class dt_Spider:
         attachment = AttachmentDownloader().fetch_attachment(
             file_name=item["title"], file_type="pdf", download_url=pdfurl,
             proxies=get_QGIP(),headers=headers,params=params, is_check=True)
-        if attachment.get('size', '32.8 kb') not in ["32.8 kb"]:
+        if attachment.get('size'):
             attachments[str(len(attachments) + 1)] = attachment
         else:
             raise FileNotFoundError("附件下载失败!")

+ 15 - 11
lzz_theme/qgzbgggsssyq/sscrawl_list.py

@@ -111,18 +111,22 @@ class Spider:
         return ctx.call('type_1017_ss',typm)
 
     def fetch(self, keyword):
-        url = "https://ctbpsp.com/cutominfoapi/searchkeyword"
-        params = {
-            "keyword": keyword,
-            "uid": "0",
-            "PageSize": "10",
-            "CurrentPage": "1",
-            "searchType": "0",
-            "bulletinType": "5",
-            "type__1017": self.get_type_1017(quote(keyword))
-        }
-        response = requests.get(url, headers=self.headers, params=params, proxies=get_QGIP(), verify=False)
+        # url = "https://ctbpsp.com/cutominfoapi/searchkeyword"
+        # params = {
+        #     "keyword": keyword,
+        #     "uid": "0",
+        #     "PageSize": "10",
+        #     "CurrentPage": "1",
+        #     "searchType": "0",
+        #     "bulletinType": "5",
+        #     "type__1017": self.get_type_1017(quote(keyword))
+        # }
+        furl = f"https://ctbpsp.com/cutominfoapi/searchkeyword?keyword={keyword}&uid=0&PageSize=10&CurrentPage=1&searchType=0&bulletinType=5&type__1017={self.get_type_1017(quote(keyword, safe='/'))}"
+
+        response = requests.get(furl, headers=self.headers, proxies=get_QGIP(), verify=False)
         data_info = self.get_data(response.text.replace('"', ""))
+        if "error while performing request" in data_info:
+            raise ValueError("错误请求!")
 
         return data_info
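
The rewrite stops passing params= and splices the query string by hand, presumably because requests percent-encodes parameter values a second time and the type__1017 token must reach the server exactly as the JS emitted it (note the switch to quote(keyword, safe='/')). A small illustration of the double encoding it avoids:

    # requests re-encodes values that are already percent-encoded:
    from requests.models import PreparedRequest

    req = PreparedRequest()
    req.prepare_url("https://ctbpsp.com/cutominfoapi/searchkeyword",
                    {"type__1017": "ab%3Dcd"})
    print(req.url)  # ...?type__1017=ab%253Dcd ('%' became '%25')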
 

+ 126 - 0
lzz_theme/qgzbgggsssyq/start.sh


+ 126 - 0
lzz_theme/qjwqzbcgxxw/qjwqzb_details.py

@@ -0,0 +1,126 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-02-11
+---------
+@summary: 全军武器装备采购信息网 - 详情页
+---------
+@author: Lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.attachment import AttachmentDownloader
+from utils.tools import *
+from parsel import Selector
+import warnings
+
+warnings.filterwarnings('ignore')
+
+
+class Details:
+
+    def __init__(self):
+        self.db_table = Mongo_client().py_spider
+        self.db_name = self.db_table.theme_list
+        self.zt_details = self.db_table.data_bak
+        self.proxy = None
+
+    def detail_get(self, response, item):
+        response.encoding = response.apparent_encoding
+        root = Selector(response.text)
+
+        html = root.xpath('//div[@id="content"]|//div[@class="secret"]').extract_first("")
+        rl_list = ['//span[@id="demandPv"]', '点击次数:', '//div[@class="right"]',
+                   '//div[@id="demandDocking"]', '未经授权,严禁转载']
+        html = remove_htmldata(rl_list, html, root)
+
+        html2 = "".join(re.findall("htmlDecode\('(.*?)'\)\);", response.text, re.S))
+        html3 = "".join(re.findall("demandPrerequisites = '(.*?)';", response.text, re.S))
+
+        file_org = "".join(re.findall('var url = "file/(.*?)"', response.text))
+        file_types = ['zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'jpg',
+                      'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg', 'wps']
+        if file_org:
+            attachments = {}
+            file_url = f"http://www.weain.mil.cn/file/{file_org}"
+            file_name = file_url.split('/')[-1]
+            file_type = file_url.split('.')[-1].lower()
+
+            if file_type in file_types:
+                attachment = AttachmentDownloader().fetch_attachment(
+                    file_name=file_name, file_type=file_type,
+                    download_url=file_url, proxies=self.proxy)
+                attachments[str(len(attachments) + 1)] = attachment
+
+            if attachments:
+                item['projectinfo'] = {"attachments": attachments}
+
+        item["contenthtml"] = html + html2 + html3
+
+        item = format_fileds(item)
+
+        try:
+            self.zt_details.insert_one(item)
+            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
+        except DuplicateKeyError:
+            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
+
+    def fetch_request(self, item):
+        headers = {
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "Pragma": "no-cache",
+            "Upgrade-Insecure-Requests": "1",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
+        }
+        vv = item.get('publishtime').replace("-", "").replace(":", "").replace(" ", "")
+        response = requests.get(url=item.get("parse_url")+f"?v={vv}", headers=headers,
+                                proxies=self.proxy, timeout=(30, 30), verify=False)
+        time.sleep(1)
+        return response
+
+    def deal_request(self, item):
+        retry_times = 0
+        org_item = item.copy()
+        while retry_times < 5:
+            try:
+                response = self.fetch_request(item)
+                res_code = response.status_code
+                if response and res_code == 200:
+                    self.detail_get(response, item=item)
+                    return True
+                else:
+                    retry_times += 1
+                    time.sleep(3)
+            except Exception as e:
+                item = org_item
+                logger.error(f"{item.get('competehref')} 采集异常:{e}")
+                retry_times += 1
+                time.sleep(random.randint(3, 6))
+        logger.warning(f"[采集失败]{item.get('competehref')}")
+        return False
+
+    def start(self, limit=1):
+        logger.debug("********** 详情页采集开始 **********")
+
+        with self.db_name.find({"parser_name": "ztpc_qjwqzbcgxxw", "is_crawl": False, "failed": False},
+                               sort=[('publishtime', -1)]).limit(limit) as cursor:
+            data_list = [dd for dd in cursor]
+        for item in data_list:
+            # logger.debug(item)
+            update_id = item["_id"]
+            result = self.deal_request(item)
+            if result is True:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"is_crawl": True}})
+            else:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"failed": True}})
+            time.sleep(random.randint(5, 10))
+
+        logger.debug("********** 详情页采集结束 **********")
+
+
+if __name__ == "__main__":
+    Details().start(limit=50)
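
The ?v= parameter appended in fetch_request is a cache-buster: the item's publish time with every separator stripped.

    # "2025-02-11 09:30:00" -> "20250211093000"
    vv = "2025-02-11 09:30:00".replace("-", "").replace(":", "").replace(" ", "")
    assert vv == "20250211093000"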

+ 149 - 0
lzz_theme/qjwqzbcgxxw/qjwqzb_list.py

@@ -0,0 +1,149 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-02-11
+---------
+@summary: 全军武器装备采购信息网 - 列表页
+---------
+@author: Lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from collections import namedtuple
+from utils.tools import *
+import requests
+import warnings
+
+warnings.filterwarnings('ignore')
+
+
+class Crawl_Qjwq:
+
+    def __init__(self):
+        self.proxy = get_proxy()
+        self.py_spider = Mongo_client().py_spider
+        self.zb_list = self.py_spider.theme_list
+
+        self.r = Redis_client()
+        self.redis_key = 'ztpc_qjwqzbcgxxw'
+
+        self.real_cont = 0
+
+
+    def fetch_list_page(self, page, menu):
+        logger.debug(f' *** 开始采集第{page}页 ***')
+        headers = {
+            "Accept": "application/json, text/javascript, */*; q=0.01",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "Content-Type": "application/json",
+            "Pragma": "no-cache",
+            "Referer": "http://www.weain.mil.cn/",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
+            "X-Requested-With": "XMLHttpRequest",
+            "isEncrypt": "isNotEncrypt",
+            "token;": ""
+        }
+
+        url = f"http://www.weain.mil.cn/api/front/list/{menu.cid}/list"
+        params = {
+            "LMID": menu.tid,
+            "pageNo": f"{page}",
+            "_t": f"{round(time.time() * 1000)}"
+        }
+
+        request_params = {
+            "headers": headers,
+            "params": params,
+            "timeout": (30, 30),
+            "verify": False,
+        }
+        resp = requests.get(url, **request_params)
+        time.sleep(1)
+        return resp
+
+    def parser_list_page(self, response, page, menu):
+        results_list = []
+        info_list = response.json().get('list').get('contentList')
+        for info in info_list:
+            href = "http://www.weain.mil.cn" + info.get('pcUrl')
+            title = info.get('nonSecretTitle').strip()
+            create_time = info.get('publishTime')
+
+            dedup = md5value(title + href)
+            if not self.r.hexists(self.redis_key, dedup):
+                item = {
+                    "site": "全军武器装备采购信息网",
+                    "channel": menu.channel,
+                    "spidercode": menu.spidercode,
+                    "area": "全国",
+                    "city": "",
+                    "district": "",
+                    "href": "#",
+                    "competehref": href,
+                    "title": title,
+                    "publishtime": create_time,
+                    "parse_url": href,
+                    "parser_name": self.redis_key,
+                    "is_mixed": False,
+                    "is_theme": True,
+                    "retry": 0,
+                    "comeintime": int2long(int(time.time())),
+                    "is_crawl": False,
+                    "failed": False,
+                }
+
+                self.zb_list.insert_one(item)
+                self.r.hset(self.redis_key, dedup, '')
+                results_list.append(item)
+
+        logger.info(f' *** 第{page}页采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
+
+        return results_list
+
+    def crawl_list_spider(self, page, menu):
+        retry_times = 0
+        while retry_times < 3:
+            try:
+                response = self.fetch_list_page(page=page, menu=menu)
+                res_code = response.status_code
+                logger.debug(f"第{page}页 状态码:{res_code}")
+                if response and res_code == 200:
+                    informations = self.parser_list_page(response=response, page=page, menu=menu)
+                    self.real_cont += len(informations)
+                    logger.info(f"当前已采集 {self.real_cont} 条数据")
+                    time.sleep(random.randint(10, 20))
+                    break
+                else:
+                    retry_times += 1
+                    time.sleep(3)
+            except Exception as e:
+                logger.error(f"第{page}页 采集异常:{e}")
+                retry_times += 1
+                time.sleep(5)
+
+    def start_list(self, menus):
+        logger.debug("********** 列表页开始 **********")
+        for menu in menus:
+            logger.debug(f"++++++ {menu.channel} 开始采集 ++++++")
+            page = menu.crawl_page
+            for page in range(1, page + 1):
+                self.crawl_list_spider(page=page, menu=menu)
+            logger.debug(f"------ {menu.channel} 采集结束 ------")
+
+        logger.debug("********** 列表页结束 **********")
+
+
+if __name__ == '__main__':
+    Menu = namedtuple('Menu', ['channel', 'spidercode', 'tid', 'cid', 'crawl_page'])
+
+    menus = [
+        Menu('采购公告', 'a_qjwqzbcgxxw_cggg_jdgg', '1149231276155707394', 'cggg', 5),
+        Menu('采购公告', 'a_qjwqzbcgxxw_cggg_jdgg', '1149231318006472705', 'cggg', 10),
+        Menu('采购需求', 'a_qjwqzbcgxxw_cgxq_jdxq', 'HZ287281676ce46401676cf0975c000e', 'cgxq', 1),
+        Menu('采购需求', 'a_qjwqzbcgxxw_cgxq_jdxq', 'HZ287281676ce46401676cf59ca5001b', 'cgxq', 1),
+    ]
+
+    Crawl_Qjwq().start_list(menus)
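
Deduplication here is one Redis hash per site: the field name is md5(title + href), the value is empty, and hexists gates the Mongo insert. A self-contained sketch of the pattern, with plain hashlib/redis standing in for the md5value/Redis_client helpers from utils.tools (assuming md5value is a hex MD5 of the concatenation, which matches its use above):

    import hashlib
    import redis

    r = redis.Redis()  # stand-in for Redis_client()
    REDIS_KEY = "ztpc_qjwqzbcgxxw"

    def is_new(title: str, href: str) -> bool:
        dedup = hashlib.md5((title + href).encode()).hexdigest()
        if r.hexists(REDIS_KEY, dedup):
            return False
        r.hset(REDIS_KEY, dedup, "")  # mark as seen once the item is stored
        return True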

+ 8 - 0
lzz_theme/qjwqzbcgxxw/start.sh

@@ -0,0 +1,8 @@
+#!/bin/bash
+
+ps -ef |grep "qjwqzb_list.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "qjwqzb_details.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+
+nohup python3 qjwqzb_list.py > log/qjwqzb_list.out 2>&1 &
+nohup python3 qjwqzb_details.py > log/qjwqzb_details.out 2>&1 &
+

+ 6 - 0
lzz_theme/rm_file.sh

@@ -0,0 +1,6 @@
+#!/bin/bash
+
+rm -rf /mnt/lzz_theme/qgzbgggsssyq/file/*
+rm -rf /mnt/lzz_theme/zgzbtbggfwpt/file/*
+rm -rf /mnt/lzz_theme/gdszfcgw/file/*
+

+ 59 - 0
lzz_theme/sfc/login_account.py

@@ -0,0 +1,59 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-22
+---------
+@summary: 账密登录 https://baseapp.norincogroup-ebuy.com/child/member/home
+---------
+@author: Lzz
+"""
+import time
+import random
+from hashlib import md5
+from pass_slide import get_token
+import uuid
+import requests
+
+
+def Login(username="TOP123", password="123qwe!A"):
+    session = requests.session()
+    session.verify = False
+    uuId = str(uuid.uuid4())
+    with open('./sfc_uuid.txt', 'w', encoding='utf-8') as fw:
+        fw.write(str(uuId))
+    try:
+        for _ in range(10):
+            session = get_token(session,uuId)
+            if session:
+                break
+            time.sleep(random.randint(3,7))
+
+        en_pwd = md5(password.encode()).hexdigest()
+
+        headers = {
+            "Accept": "application/json, text/plain, */*",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Connection": "keep-alive",
+            "Content-Type": "application/x-www-form-urlencoded",
+            "Origin": "https://js.fwgov.cn:615",
+            "Referer": "https://js.fwgov.cn:615/",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
+            "userName": username,
+            "uuid": uuId
+        }
+        url = "https://js.fwgov.cn:868/store/passport/login/userLogin"
+        data = {
+            "username": username,
+            "password": en_pwd
+        }
+        response = session.post(url, headers=headers, data=data, timeout=20)
+
+        hcookies = response.json().get('result').get('accessToken')
+        with open('./sfc_cookies.txt', 'w', encoding='utf-8') as fw:
+            fw.write(str(hcookies))
+
+        print(f" >>> 账号:{username} 登录完成!")
+        return True
+    except Exception as e:
+        print(f" >>> 账号:{username} 登录失败!{e}")
+        return False
+
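
Login() persists two artifacts the sfc spiders re-read on every request: the per-session uuid (sfc_uuid.txt) and the accessToken (sfc_cookies.txt). The intended call pattern, as the spiders below implement it in get_cookies(), is lazy: only log in when the token file is missing.

    # Hedged usage sketch of the lazy login.
    import os
    from login_account import Login

    if not os.path.isfile("./sfc_cookies.txt"):
        Login("TOP123", "123qwe!A")
    with open("./sfc_cookies.txt", encoding="utf-8") as f:
        access_token = f.read()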

+ 113 - 0
lzz_theme/sfc/pass_slide.py

@@ -0,0 +1,113 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-22
+---------
+@summary: 滑块验证
+---------
+@author: lzz
+"""
+from PIL import Image
+import cv2
+import numpy as np
+import warnings
+import re
+import base64
+import time
+
+warnings.filterwarnings('ignore')
+
+
+def pilImgToCv2(img: Image.Image, flag=cv2.COLOR_RGB2BGR):
+    return cv2.cvtColor(np.asarray(img), flag)
+
+
+def getDistance(img: Image.Image, slice: Image.Image):
+    # 背景图和滑块图都需要做相同处理
+    grayImg = pilImgToCv2(img, cv2.COLOR_BGR2GRAY)
+    graySlice = pilImgToCv2(slice, cv2.COLOR_BGR2GRAY)
+    # 做边缘检测进一步降低干扰
+    grayImg = cv2.Canny(grayImg, 255, 255)
+    graySlice = cv2.Canny(graySlice, 255, 255)
+    # 通过模板匹配两张图片,找出缺口的位置
+    result = cv2.matchTemplate(grayImg, graySlice, cv2.TM_CCOEFF_NORMED)
+    maxLoc = cv2.minMaxLoc(result)[3]
+    # 匹配出来的滑动距离
+    distance = maxLoc[0]
+    sliceHeight, sliceWidth = graySlice.shape[:2]
+    # 左上角
+    x, y = maxLoc
+    # 右下角
+    x2, y2 = x + sliceWidth, y + sliceHeight
+    resultBg = pilImgToCv2(img, cv2.COLOR_RGB2BGR)
+    cv2.rectangle(resultBg, (x, y), (x2, y2), (0, 0, 255), 2)
+    return distance
+
+
+def get_dist(sliceimgpath, imgpath):
+    distance = getDistance(Image.open(imgpath), Image.open(sliceimgpath))
+    return distance
+
+
+def decode_image(filename, src):
+    # 1、信息提取
+    result = re.search("data:image/(?P<ext>.*?);base64,(?P<data>.*)", src, re.DOTALL)
+    if result:
+        ext = result.groupdict().get("ext")
+        data = result.groupdict().get("data")
+    else:
+        raise Exception("Do not parse!")
+
+    img = base64.urlsafe_b64decode(data)
+    with open(f"{filename}.png", "wb") as f:
+        f.write(img)
+
+    return filename
+
+def get_token(session,uuId):
+    headers = {
+        "Accept": "application/json, text/plain, */*",
+        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+        "Connection": "keep-alive",
+        "Origin": "https://js.fwgov.cn:615",
+        "Referer": "https://js.fwgov.cn:615/",
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
+        "uuid": uuId
+    }
+
+    url = "https://js.fwgov.cn:868/common/common/slider/LOGIN"
+    params = {
+        "_t": f"{int(time.time())}"
+    }
+    res = session.get(url, headers=headers, params=params, timeout=20)
+
+    img_info = res.json().get('result')
+    fullpage = img_info.get('backImage')
+    decode_image("fullpage", fullpage)
+    slicepage = img_info.get('slidingImage')
+    decode_image("slice", slicepage)
+
+    dis = get_dist('./slice.png', './fullpage.png')
+
+    headers = {
+        "Accept": "application/json, text/plain, */*",
+        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+        "Connection": "keep-alive",
+        "Content-Type": "application/x-www-form-urlencoded",
+        "Origin": "https://js.fwgov.cn:615",
+        "Referer": "https://js.fwgov.cn:615/",
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
+        "adcode": "3200",
+        "uuid": uuId
+    }
+    url = "https://js.fwgov.cn:868/common/common/slider/LOGIN"
+    data = {
+        "verificationEnums": "LOGIN",
+        "xPos": f"{dis}"
+    }
+    response = session.post(url, headers=headers, data=data)
+    if response.json().get('message') == 'success':
+        return session
+    else:
+        return None
+
+
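
The solver is standard template matching: both images are grayscaled and run through Canny with both thresholds at 255 (so only the strongest edges survive), then cv2.matchTemplate(..., cv2.TM_CCOEFF_NORMED) is scanned for its maximum, whose x coordinate is the slide distance in pixels. Usage, once the two data-URL images from the slider response are in hand (back_b64/slide_b64 are assumed variables holding backImage/slidingImage):

    from pass_slide import decode_image, get_dist

    decode_image("fullpage", back_b64)  # writes ./fullpage.png
    decode_image("slice", slide_b64)    # writes ./slice.png
    distance = get_dist("./slice.png", "./fullpage.png")
    print(f"drag the slider {distance}px to the right")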

+ 200 - 0
lzz_theme/sfc/sfc_cjgg_detail.py

@@ -0,0 +1,200 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-22
+---------
+@summary: 苏服采 - 详情页
+---------
+@author: Lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.attachment import AttachmentDownloader
+from utils.tools import *
+from login_account import Login
+import warnings
+
+warnings.filterwarnings('ignore')
+
+
+class Details:
+
+    def __init__(self):
+        self.db_table = Mongo_client().py_spider
+        self.db_name = self.db_table.theme_list
+        self.zt_details = self.db_table.data_bak
+        self.Authorization = ""
+
+    def get_cookies(self):
+        if not os.path.isfile('./sfc_cookies.txt'):
+            Login()
+
+        with open('./sfc_cookies.txt', 'r', encoding='utf-8') as fr:
+            cks = fr.read()
+        return cks
+
+    def get_uuid(self):
+        with open('./sfc_uuid.txt', 'r', encoding='utf-8') as fr:
+            uid = fr.read()
+        return uid
+
+    def get_file_list(self, hid, proxies=False):
+
+        headers = {
+            "Accept": "application/json, text/plain, */*",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Connection": "keep-alive",
+            "Origin": "https://js.fwgov.cn",
+            "Referer": "https://js.fwgov.cn/",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
+            "accessToken": f"{self.Authorization}",
+            "adcode": "3200",
+            "uuid": f"{self.get_uuid()}"
+        }
+
+        url = f"https://js.fwgov.cn:868/purchases/tenders/notice/resultAnnouncement/{hid}"
+        params = {
+            "_t": f"{int(time.time())}",
+        }
+        try:
+            response = requests.get(url, headers=headers, params=params, timeout=20, proxies=proxies, verify=False)
+            return response.json().get('result').get('simpleReasonsFile')
+        except:
+            return None
+
+    def detail_get(self, response, item):
+        hid = item['competehref'].split('/')[-1].split('?')[0]
+        dt = response.json().get('result')
+        file_list = self.get_file_list(hid)
+        file_html = ""
+        attachments = {}
+
+        if file_list:
+            for info in file_list:
+                file_name = info.get('name')
+                base_url = "https://zhenjiang.fwgov.cn:868/fileServer/"
+                file_url = base_url + info.get('path') + f"?accessToken={self.Authorization}"
+                file_type = extract_file_type(file_name=file_name, file_url=file_url)
+                if file_type:
+                    file_html += f'<div><a href="{file_url}">{file_name}</a></div>'
+                    attachment = AttachmentDownloader().fetch_attachment(
+                        file_name=file_name, file_type=file_type, download_url=file_url)
+                    if not attachment.get('size'):
+                        try:
+                            os.remove('./sfc_cookies.txt')
+                        except:
+                            pass
+                        raise ValueError("cookies 失效!")
+                    attachments[str(len(attachments) + 1)] = attachment
+
+        if attachments:
+            item['projectinfo'] = {"attachments": attachments}
+
+        html = f'''
+        <div>
+            <p><strong>一、项目编号:</strong><span>{dt.get('itemNo')}</span></p>
+            <p><strong>二、项目名称:</strong><span>{dt.get('itemName')}</span></p>
+            <p><strong>三、中选(成交)信息</strong></p>
+            <p> 供应商名称: <span>{dt.get('supplierName')}</span></p>
+            <p> 中选(成交)报价: <span><span>¥{dt.get('transactionPrice')}</span></span></p>
+            <p><strong>四、主要标的信息</strong></p>
+            <p> 项目名称: <span>{dt.get('itemName')}</span></p>
+            <p> 项目编号: <span>{dt.get('itemNo')}</span></p>
+            <p> 比选方式: <span>{dt.get('itemName')}</span></p>
+            <p> 服务品目: <span>{dt.get('servicePath')}</span></p>
+            <p> 项目预算: <span><span>¥{dt.get('itemBudget')}</span></span></p>
+            <p> 项目地点: <span> {dt.get('address')} <i></i></span></p>
+            <p> 评审开始时间: <span>{dt.get('reviewStartTime')}</span></p>
+            <p> 评审地点: <span> {dt.get('projectLocation')} <i></i></span></p>
+            <p> 采购单位: <span>{dt.get('procurementUnit')}</span></p>
+            <p> 联系人姓名: <span>{dt.get('itemPerson')}</span></p>
+            <p> 联系电话: <span>{dt.get('itemPersonTel')}</span></p>
+            <p> 固定电话: <span>{dt.get('itemFixedTel')}</span></p>
+            <p> 响应开始时间: <span>{dt.get('tenderStartTime')}</span></p>
+            <p> 响应截止时间: <span>{dt.get('tenderEndTime')}</span></p>
+            <p><strong>五、凡对本次公告内容提出询问,请按以下方式联系</strong></p>
+            <p>1.采购单位信息</p>
+            <p> 名称: <span>{dt.get('procurementUnit')}</span></p>
+            <p> 地址: <span> {dt.get('address')} <i></i></span></p>
+            <p>2.项目联系方式</p>
+            <p> 联系人姓名: <span>{dt.get('itemPerson')}</span></p>
+            <p> 联系电话: <span>{dt.get('itemPersonTel')}</span></p>
+            <strong>六、附件</strong>
+            {file_html}
+        </div>
+        '''
+
+        item["contenthtml"] = html
+
+        item = format_fileds(item)
+
+        try:
+            self.zt_details.insert_one(item)
+            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
+        except DuplicateKeyError:
+            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
+
+    def fetch_request(self, item):
+        self.Authorization = self.get_cookies()
+        headers = {
+            "Accept": "application/json, text/plain, */*",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Connection": "keep-alive",
+            "Origin": "https://js.fwgov.cn",
+            "Referer": "https://js.fwgov.cn/",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
+            "accessToken": f"{self.Authorization}",
+            "adcode": "3200",
+            "uuid": f"{self.get_uuid()}"
+        }
+        response = requests.get(url=item.get("parse_url"), headers=headers, timeout=(30, 30), verify=False)
+        time.sleep(1)
+        return response
+
+    def deal_request(self, item):
+        retry_times = 0
+        org_item = item.copy()
+        while retry_times < 5:
+            try:
+                response = self.fetch_request(item)
+                res_code = response.status_code
+                if response and res_code == 200:
+                    self.detail_get(response, item=item)
+                    return True
+                else:
+                    try:
+                        os.remove('./sfc_cookies.txt')
+                    except:
+                        pass
+                    raise ValueError("cookies 失效!")
+            except Exception as e:
+                item = org_item
+                logger.error(f"{item.get('competehref')} 采集异常:{e}")
+                retry_times += 1
+                time.sleep(random.randint(3, 6))
+        logger.warning(f"[采集失败]{item.get('competehref')}")
+        return False
+
+    def start(self, limit=1):
+        logger.debug("********** 详情页采集开始 **********")
+        logger.debug("********** 等待60s列表页数据加载... **********")
+        time.sleep(60)
+        with self.db_name.find({"parser_name": "ztpc_sfc_cjgg", "is_crawl": False, "failed": False},
+                               sort=[('publishtime', -1)]).limit(limit) as cursor:
+            data_list = [dd for dd in cursor]
+        for item in data_list:
+            # logger.debug(item)
+            update_id = item["_id"]
+            result = self.deal_request(item)
+            if result is True:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"is_crawl": True}})
+            else:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"failed": True}})
+            time.sleep(random.randint(5, 10))
+
+        logger.debug("********** 详情页采集结束 **********")
+
+
+if __name__ == "__main__":
+    Details().start(limit=10)
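
Token rotation is handled by deletion: a non-200 response or a zero-size attachment removes ./sfc_cookies.txt and raises, so the next retry's get_cookies() finds no file and re-runs Login(). The mechanism in miniature:

    # Hedged sketch of the invalidate-then-retry step used above.
    import os

    def invalidate_token():
        try:
            os.remove("./sfc_cookies.txt")  # next get_cookies() re-logs in
        except OSError:
            pass
        raise ValueError("cookies 失效!")  # caught by deal_request's retry loop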

+ 172 - 0
lzz_theme/sfc/sfc_cjgg_list.py

@@ -0,0 +1,172 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-22
+---------
+@summary: 苏服采 - 公告信息-成交公告
+---------
+@author: Lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from collections import namedtuple
+from utils.tools import *
+import requests
+import json
+from login_account import Login
+import warnings
+warnings.filterwarnings('ignore')
+
+
+class Crawl_sfc:
+
+    def __init__(self):
+        self.proxy = get_proxy()
+        self.py_spider = Mongo_client().py_spider
+        self.zb_list = self.py_spider.theme_list
+
+        self.r = Redis_client()
+        self.redis_key = 'ztpc_sfc_cjgg'
+
+        self.real_cont = 0
+
+    def get_cookies(self):
+        if not os.path.isfile('./sfc_cookies.txt'):
+            Login()
+
+        with open('./sfc_cookies.txt', 'r', encoding='utf-8') as fr:
+            cks = fr.read()
+        return cks
+
+    def get_uuid(self):
+        with open('./sfc_uuid.txt', 'r', encoding='utf-8') as fr:
+            uid = fr.read()
+        return uid
+
+    def fetch_list_page(self, page):
+        logger.debug(f' *** 开始采集第{page}页 ***')
+        headers = {
+            "Accept": "application/json, text/plain, */*",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Connection": "keep-alive",
+            "Content-Type": "application/json",
+            "Origin": "https://js.fwgov.cn",
+            "Referer": "https://js.fwgov.cn/",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
+            "adcode": "3200",
+            "uuid": self.get_uuid()
+        }
+
+        url = "https://js.fwgov.cn:868/purchases/tenders/notice/announcementResultListNew"
+        data = {
+            "biddingStatus": "",
+            "itemBudgetStart": "",
+            "itemBudgetEnd": "",
+            "transactionPriceStart": "",
+            "transactionPriceEnd": "",
+            "type": "3",
+            "sort": "releaseTime",
+            "order": "desc",
+            "nameOrunit": "",
+            "pageNumber": page,
+            "pageSize": 10,
+            "areaCode": [
+                "3200"
+            ],
+            "serviceType": "1"
+        }
+        data = json.dumps(data, separators=(',', ':'))
+
+        request_params = {
+            "headers": headers,
+            "data": data,
+            "timeout": (30, 30),
+            "verify": False,
+        }
+        resp = requests.post(url, **request_params)
+        time.sleep(1)
+        return resp
+
+    def parser_list_page(self, response, page, menu):
+        results_list = []
+        info_list = response.json().get('result').get('records')
+        for info in info_list:
+            hid = info.get('purchasesNeedId')
+            title = info.get('title').strip()
+            publish_time = info.get('releaseTime').strip() + ":00"
+            href = f"https://js.fwgov.cn/biddingAnnouncement/{hid}?type=3&serviceType=1"
+
+            dedup = md5value(title + href)
+
+            if not self.r.hexists(self.redis_key, dedup):
+                item = {
+                    "site": "苏服采",
+                    "channel": menu.channel,
+                    "spidercode": menu.spidercode,
+                    "area": "江苏",
+                    "city": "",
+                    "district": "",
+                    "href": "#",
+                    "competehref": href,
+                    "title": title,
+                    "publishtime": publish_time,
+                    "parse_url": f"https://js.fwgov.cn:868/purchases/tenders/notice/{hid}",
+                    "parser_name": self.redis_key,
+                    "is_mixed": False,
+                    "is_theme": True,
+                    "retry": 0,
+                    "comeintime": int2long(int(time.time())),
+                    "is_crawl": False,
+                    "failed": False,
+                }
+
+                self.zb_list.insert_one(item)
+                self.r.hset(self.redis_key, dedup, '')
+                results_list.append(item)
+
+        logger.info(f' *** 第{page}页采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
+
+        return results_list
+
+    def crawl_list_spider(self, page, menu):
+        retry_times = 0
+        while retry_times < 3:
+            try:
+                response = self.fetch_list_page(page=page)
+                res_code = response.status_code
+                logger.debug(f"第{page}页 状态码:{res_code}")
+                if response and res_code == 200:
+                    informations = self.parser_list_page(response=response, page=page, menu=menu)
+                    self.real_cont += len(informations)
+                    logger.info(f"当前已采集 {self.real_cont} 条数据")
+                    time.sleep(random.randint(10,20))
+                    break
+                else:
+                    retry_times += 1
+                    time.sleep(3)
+            except Exception as e:
+                logger.error(f"第{page}页 采集异常:{e}")
+                retry_times += 1
+                time.sleep(5)
+
+    def start_list(self, menus):
+        logger.debug("********** 列表页开始 **********")
+        for menu in menus:
+            logger.debug(f"++++++ {menu.channel} 开始采集 ++++++")
+            page = menu.crawl_page
+            for page in range(1, page + 1):
+                self.crawl_list_spider(page=page, menu=menu)
+            logger.debug(f"------ {menu.channel} 采集结束 ------")
+
+        logger.debug("********** 列表页结束 **********")
+
+
+if __name__ == '__main__':
+    Menu = namedtuple('Menu', ['channel', 'spidercode', 'crawl_page'])
+
+    menus = [
+        Menu('公告信息-成交公告', 'js_sfc_ggxx_cjgg', 2),
+    ]
+
+    Crawl_sfc().start_list(menus)
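
Note the body is serialized with json.dumps(data, separators=(',', ':')) and posted via data= with an explicit Content-Type, rather than requests' json= keyword: json= inserts spaces after separators, and the site presumably expects the compact form a browser sends. The difference:

    import json

    payload = {"pageNumber": 1, "pageSize": 10}
    print(json.dumps(payload))                         # {"pageNumber": 1, "pageSize": 10}
    print(json.dumps(payload, separators=(",", ":")))  # {"pageNumber":1,"pageSize":10}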

+ 1 - 0
lzz_theme/sfc/sfc_cookies.txt

@@ -0,0 +1 @@
+eyJhbGciOiJIUzI1NiJ9.eyJ1c2VyQ29udGV4dCI6IntcInVzZXJuYW1lXCI6XCJ0b3AxMjNcIixcIm5pY2tOYW1lXCI6XCLlvKDph5HlnaRcIixcImZhY2VcIjpcImdyb3VwMS9NMDAvMDUvMDQvd0tnQWNXZHpTeVNBUGhJc0FBQlVQVS16UUtNNy5uLmpwZyxncm91cDEvTTAwLzBGLzRDL3dLZ0FjR2R6U3ktQUlQSkRBQUE3bERVR3kyMDIubi5wbmdcIixcImlkXCI6XCIxODcxODU2NjE0MzQ2NjIwOTI4XCIsXCJsb25nVGVybVwiOmZhbHNlLFwicm9sZVwiOlwiTUVNQkVSXCIsXCJzdG9yZUlkXCI6XCIxODcxODYyOTM4MzE5ODU5NzE0XCIsXCJjbGVya0lkXCI6XCIxODczOTIzMjI4MDE2NDUxNTg2XCIsXCJzdG9yZU5hbWVcIjpcIuays-WNl-aLk-aZruiuoeeul-acuue9kee7nOW3peeoi-aciemZkOWFrOWPuFwiLFwiY29tcGFueVNjYWxlXCI6XCJtZWR1aW1cIixcImNvbXBhbnlOYW1lXCI6XCLmsrPljZfmi5Pmma7orqHnrpfmnLrnvZHnu5zlt6XnqIvmnInpmZDlhazlj7hcIixcImlzU3VwZXJcIjp0cnVlLFwiYXJlYUNvZGVcIjpcIjQxMDEwNVwiLFwibGFzdExvZ2luRGF0ZVwiOlwiQXByIDEsIDIwMjUgOToxMToxMCBBTVwiLFwicmVtb3RlSXBcIjpcIjEwMS4yMDAuMjA5LjExXCJ9Iiwic3ViIjoidG9wMTIzIiwiZXhwIjoxNzQzNTEzMDcyfQ.dXqdBhvxp7_nrtFLgbV589zwmjj9FriCP75mhyBgpbU
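
The cached accessToken is an HS256 JWT whose payload embeds the user context plus an exp claim, so a spider could check expiry up front instead of waiting for a failed request. A sketch (payload inspection only, no signature verification):

    import base64
    import json
    import time

    def jwt_expired(token: str) -> bool:
        payload_b64 = token.split(".")[1]
        payload_b64 += "=" * (-len(payload_b64) % 4)  # restore stripped padding
        payload = json.loads(base64.urlsafe_b64decode(payload_b64))
        return payload.get("exp", 0) < time.time()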

+ 180 - 0
lzz_theme/sfc/sfc_gkbx_list.py

@@ -0,0 +1,180 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-23
+---------
+@summary: 苏服采 - 公告信息-公开比选
+---------
+@author: Lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from collections import namedtuple
+from utils.tools import *
+import requests
+import json
+from login_account import Login
+import warnings
+
+warnings.filterwarnings('ignore')
+
+
+class Crawl_sfc:
+
+    def __init__(self):
+        self.proxy = get_proxy()
+        self.py_spider = Mongo_client().py_spider
+        self.zb_list = self.py_spider.theme_list
+
+        self.r = Redis_client()
+        self.redis_key = 'ztpc_sfc_gzgg'
+
+        self.real_cont = 0
+
+    def get_cookies(self):
+        if not os.path.isfile('./sfc_cookies.txt'):
+            Login()
+
+        with open('./sfc_cookies.txt', 'r', encoding='utf-8') as fr:
+            cks = fr.read()
+        return cks
+
+    def get_uuid(self):
+        with open('./sfc_uuid.txt', 'r', encoding='utf-8') as fr:
+            uid = fr.read()
+        return uid
+
+    def fetch_list_page(self, page):
+        logger.debug(f' *** 开始采集第{page}页 ***')
+        headers = {
+            "Accept": "application/json, text/plain, */*",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Connection": "keep-alive",
+            "Content-Type": "application/json",
+            "Origin": "https://js.fwgov.cn",
+            "Referer": "https://js.fwgov.cn/",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
+            "adcode": "3200",
+            "uuid": self.get_uuid()
+        }
+
+        url = "https://js.fwgov.cn:868/purchases/tenders/notice/page"
+        data = {
+            "samEnterprises": "",
+            "biddingStatus": "",
+            "itemBudgetStart": "",
+            "itemBudgetEnd": "",
+            "sort": "",
+            "order": "",
+            "nameOrunit": "",
+            "pageNumber": page,
+            "pageSize": 12,
+            "areaCode": [
+                "3200"
+            ],
+            "serviceType": "1"
+        }
+        data = json.dumps(data, separators=(',', ':'))
+
+        request_params = {
+            "headers": headers,
+            "data": data,
+            "timeout": (30, 30),
+            "verify": False,
+        }
+        resp = requests.post(url, **request_params)
+        time.sleep(1)
+        return resp
+
+    def parser_list_page(self, response, page, menu):
+        results_list = []
+        info_list = response.json().get('result').get('records')
+        for info in info_list:
+            hid = info.get('id')
+            biddingStatus = str(info.get('biddingStatus','2'))
+            if biddingStatus == "1":
+                tp = "2"
+            elif biddingStatus == "2":
+                tp = "1"
+            else:
+                tp = "3"
+            title = info.get('itemName').strip()
+            notes = info.get('notes', '')
+            publish_time = info.get('publishTime').strip() + ":00"
+            href = f"https://js.fwgov.cn/biddingAnnouncement/{hid}?type={tp}&serviceType=1"
+
+            dedup = md5value(title + href)
+
+            if not self.r.hexists(self.redis_key, dedup):
+                item = {
+                    "site": "苏服采",
+                    "channel": menu.channel,
+                    "spidercode": menu.spidercode,
+                    "area": "江苏",
+                    "city": "",
+                    "district": "",
+                    "href": "#",
+                    "competehref": href,
+                    "title": title,
+                    "publishtime": publish_time,
+                    "notes": notes,
+                    "parse_url": f"https://js.fwgov.cn:868/purchases/tenders/notice/{hid}",
+                    "parser_name": self.redis_key,
+                    "is_mixed": False,
+                    "is_theme": True,
+                    "retry": 0,
+                    "comeintime": int2long(int(time.time())),
+                    "is_crawl": False,
+                    "failed": False,
+                }
+
+                self.zb_list.insert_one(item)
+                self.r.hset(self.redis_key, dedup, '')
+                results_list.append(item)
+
+        logger.info(f' *** 第{page}页采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
+
+        return results_list
+
+    def crawl_list_spider(self, page, menu):
+        retry_times = 0
+        while retry_times < 3:
+            try:
+                response = self.fetch_list_page(page=page)
+                res_code = response.status_code
+                logger.debug(f"第{page}页 状态码:{res_code}")
+                if response and res_code == 200:
+                    informations = self.parser_list_page(response=response, page=page, menu=menu)
+                    self.real_cont += len(informations)
+                    logger.info(f"当前已采集 {self.real_cont} 条数据")
+                    time.sleep(random.randint(10, 20))
+                    break
+                else:
+                    retry_times += 1
+                    time.sleep(3)
+            except Exception as e:
+                logger.error(f"第{page}页 采集异常:{e}")
+                retry_times += 1
+                time.sleep(5)
+
+    def start_list(self, menus):
+        logger.debug("********** 列表页开始 **********")
+        for menu in menus:
+            logger.debug(f"++++++ {menu.channel} 开始采集 ++++++")
+            page = menu.crawl_page
+            for page in range(1, page + 1):
+                self.crawl_list_spider(page=page, menu=menu)
+            logger.debug(f"------ {menu.channel} 采集结束 ------")
+
+        logger.debug("********** 列表页结束 **********")
+
+
+if __name__ == '__main__':
+    Menu = namedtuple('Menu', ['channel', 'spidercode', 'crawl_page'])
+
+    menus = [
+        Menu('公开比选', 'js_sfc_gkbx', 1),
+    ]
+
+    Crawl_sfc().start_list(menus)
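
The if/elif chain translating biddingStatus into the type URL parameter (1 to 2, 2 to 1, anything else to 3) reads more directly as a lookup table; the meanings of the codes themselves are not documented in the source:

    # Equivalent lookup for the status-to-type translation above.
    TYPE_BY_STATUS = {"1": "2", "2": "1"}
    tp = TYPE_BY_STATUS.get(biddingStatus, "3")  # biddingStatus assumed in scope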

+ 230 - 0
lzz_theme/sfc/sfc_gzgg_detail.py

@@ -0,0 +1,230 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-22
+---------
+@summary: 苏服采 - 详情页
+---------
+@author: Lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.attachment import AttachmentDownloader
+from utils.tools import *
+from login_account import Login
+import warnings
+
+warnings.filterwarnings('ignore')
+
+
+class Details:
+
+    def __init__(self):
+        self.db_table = Mongo_client().py_spider
+        self.db_name = self.db_table.theme_list
+        self.zt_details = self.db_table.data_bak
+        self.Authorization = ""
+
+    def get_cookies(self):
+        if not os.path.isfile('./sfc_cookies.txt'):
+            Login()
+
+        with open('./sfc_cookies.txt', 'r', encoding='utf-8') as fr:
+            cks = fr.read()
+        return cks
+
+    def get_uuid(self):
+        with open('./sfc_uuid.txt', 'r', encoding='utf-8') as fr:
+            uid = fr.read()
+        return uid
+
+    def get_file_list(self, hid, proxies=False):
+
+        headers = {
+            "Accept": "application/json, text/plain, */*",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Connection": "keep-alive",
+            "Origin": "https://js.fwgov.cn",
+            "Referer": "https://js.fwgov.cn/",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
+            "accessToken": f"{self.Authorization}",
+            "adcode": "3200",
+            "uuid": f"{self.get_uuid()}"
+        }
+
+        url = "https://js.fwgov.cn:868/purchases/need/selectdocument"
+        data = {
+            "purchasesNeedId": f"{hid}",
+            "amend": False
+        }
+        try:
+            response = requests.post(url, headers=headers, json=data, timeout=20, proxies=proxies, verify=False)
+            return response.json().get('result').get('documentFile')
+        except:
+            return []
+
+    def detail_get(self, response, item):
+        hid = item['competehref'].split('/')[-1].split('?')[0]
+        dt = response.json().get('result')
+        file_list = self.get_file_list(hid) or []
+        file_html = ""
+        attachments = {}
+        file_list.extend(dt.get('requirementsFile') or [])
+        if file_list:
+            for info in file_list:
+                file_name = info.get('name') or info.get('fileOriginalName')
+                base_url = "https://zhenjiang.fwgov.cn:868/fileServer/"
+                fid = info.get('path') or info.get('param')
+                file_url = base_url + fid + f"?accessToken={self.Authorization}"
+                file_type = extract_file_type(file_name=file_name, file_url=file_url)
+                if file_type:
+                    file_html += f'<div><a href="{file_url}">{file_name}</a></div>'
+                    attachment = AttachmentDownloader().fetch_attachment(
+                        file_name=file_name, file_type=file_type, download_url=file_url)
+                    if not attachment.get('size'):
+                        try:
+                            os.remove('./sfc_cookies.txt')
+                        except:
+                            pass
+                        raise ValueError("cookies 失效!")
+                    attachments[str(len(attachments) + 1)] = attachment
+
+        if attachments:
+            item['projectinfo'] = {"attachments": attachments}
+
+        itemSelect_list = {"0":"公开比选","1":"邀请比选","2":"直选","4":"直选"}
+        if str(dt.get('reviewMethod','')) == "0":
+            reviewMethod = "综合评分法"
+        elif str(dt.get('reviewMethod','')) == "1":
+            reviewMethod = "最低成交价法"
+        else:
+            reviewMethod = ""
+
+        zgtj_html = ""
+        zgtj_list = dt.get('qualificationReviewContentObj') or []
+        index = 1
+        for zg in zgtj_list:
+            temp = f'''
+            <div>
+                    <div>{index}.</div>
+                    <div><p>{zg.get('reviewContent')}</p></div>
+                </div>
+            '''
+            zgtj_html += temp
+            index += 1
+
+        html = f'''
+        <div class="content">
+            <div><p><strong>更正说明</strong></p>
+                <span>{item.get('notes')}</span>
+            </div>
+            <p><strong>一、比选信息</strong></p>
+            <p> 项目名称: <span>{dt.get('itemName')}</span></p>
+            <p> 项目编号: <span>{dt.get('itemNo')}</span></p>
+            <p> 比选方式: <span>{itemSelect_list.get(str(dt.get('itemSelect')))}</span></p>
+            <p> 评审方法: <span>{reviewMethod}</span></p>
+            <p> 服务品目: <span>{dt.get('servicePath')}</span></p>
+            <p> 项目预算: <span><span>¥{dt.get('itemBudget')}</span></span></p>
+            <p> 项目地点: <span> {dt.get('address')} <i></i></span></p>
+            <p> 评审开始时间: <span>{dt.get('reviewStartTime')}</span></p>
+            <p> 评审地点: <span> {dt.get('projectLocation')} <i></i></span></p>
+            <p> 采购单位: <span>{dt.get('procurementUnit')}</span></p>
+            <p> 联系人姓名: <span>{dt.get('itemPerson')}</span></p>
+            <p> 联系电话: <span>{dt.get('itemPersonTel')}</span></p>
+            <p> 固定电话: <span>{dt.get('itemFixedTel')}</span></p>
+            <p> 响应开始时间: <span>{dt.get('tenderStartTime')}</span></p>
+            <p> 响应截止时间: <span>{dt.get('tenderEndTime')}</span></p>
+            <p><strong>二、比选供应商资格条件</strong></p>
+            <div>
+                {zgtj_html}
+            </div>
+            <div><p><strong>三、获取比选文件时间、方式</strong></p>
+                <div>
+                    <div>1.</div>
+                    <div><p>自即日起至响应截止时间前</p></div>
+                </div>
+                <div>
+                    <div>2.</div>
+                    <div><p>附件:</p></div>
+                    <div>
+                       {file_html}
+                    </div>
+                </div>
+            </div>
+        </div>
+        '''
+
+        item["contenthtml"] = html
+
+        item = format_fileds(item)
+
+        try:
+            self.zt_details.insert_one(item)
+            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
+        except DuplicateKeyError:
+            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
+
+    def fetch_request(self, item):
+        self.Authorization = self.get_cookies()
+        headers = {
+            "Accept": "application/json, text/plain, */*",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Connection": "keep-alive",
+            "Origin": "https://js.fwgov.cn",
+            "Referer": "https://js.fwgov.cn/",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
+            "accessToken": f"{self.Authorization}",
+            "adcode": "3200",
+            "uuid": f"{self.get_uuid()}"
+        }
+        response = requests.get(url=item.get("parse_url"), headers=headers, timeout=(30, 30), verify=False)
+        time.sleep(1)
+        return response
+
+    def deal_request(self, item):
+        retry_times = 0
+        org_item = item.copy()
+        while retry_times < 5:
+            try:
+                response = self.fetch_request(item)
+                res_code = response.status_code
+                if response and res_code == 200:
+                    self.detail_get(response, item=item)
+                    return True
+                else:
+                    try:
+                        os.remove('./sfc_cookies.txt')
+                    except:
+                        pass
+                    raise ValueError("cookies 失效!")
+            except Exception as e:
+                item = org_item
+                logger.error(f"{item.get('competehref')} 采集异常:{e}")
+                retry_times += 1
+                time.sleep(random.randint(3, 6))
+        logger.warning(f"[采集失败]{item.get('competehref')}")
+        return False
+
+    def start(self, limit=1):
+        logger.debug("********** 详情页采集开始 **********")
+        logger.debug("********** 等待60s列表页数据加载... **********")
+        time.sleep(60)
+        with self.db_name.find({"parser_name": "ztpc_sfc_gzgg", "is_crawl": False, "failed": False},
+                               sort=[('publishtime', -1)]).limit(limit) as cursor:
+            data_list = [dd for dd in cursor]
+        for item in data_list:
+            # logger.debug(item)
+            update_id = item["_id"]
+            result = self.deal_request(item)
+            if result is True:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"is_crawl": True}})
+            else:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"failed": True}})
+            time.sleep(random.randint(5, 10))
+
+        logger.debug("********** 详情页采集结束 **********")
+
+
+if __name__ == "__main__":
+    Details().start(limit=10)
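
Every detail crawler in this batch recovers from auth expiry the same way: a non-200 response or a zero-size attachment deletes sfc_cookies.txt and raises, so the next fetch_request call re-runs Login(). A minimal sketch of that pattern on its own (the cookie path and Login behavior come from the file above; with_relogin is a hypothetical name):

    import os
    import random
    import time

    COOKIE_FILE = './sfc_cookies.txt'

    def with_relogin(fetch, retries=5):
        """Retry `fetch`; on failure drop the cached cookie so the next
        attempt forces a fresh Login()."""
        for _ in range(retries):
            try:
                return fetch()
            except Exception:
                try:
                    os.remove(COOKIE_FILE)  # next get_cookies() will re-login
                except OSError:
                    pass
                time.sleep(random.randint(3, 6))
        return None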

+ 175 - 0
lzz_theme/sfc/sfc_gzgg_list.py

@@ -0,0 +1,175 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-23
+---------
+@summary: 苏服采 - 公告信息-更正公告
+---------
+@author: Lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from collections import namedtuple
+from utils.tools import *
+import requests
+import json
+from login_account import Login
+import warnings
+
+warnings.filterwarnings('ignore')
+
+
+class Crawl_sfc:
+
+    def __init__(self):
+        self.proxy = get_proxy()
+        self.py_spider = Mongo_client().py_spider
+        self.zb_list = self.py_spider.theme_list
+
+        self.r = Redis_client()
+        self.redis_key = 'ztpc_sfc_gzgg'
+
+        self.real_cont = 0
+
+    def get_cookies(self):
+        if not os.path.isfile('./sfc_cookies.txt'):
+            Login()
+
+        with open('./sfc_cookies.txt', 'r', encoding='utf-8') as fr:
+            cks = fr.read()
+        return cks
+
+    def get_uuid(self):
+        with open('./sfc_uuid.txt', 'r', encoding='utf-8') as fr:
+            uid = fr.read()
+        return uid
+
+    def fetch_list_page(self, page):
+        logger.debug(f' *** 开始采集第{page}页 ***')
+        headers = {
+            "Accept": "application/json, text/plain, */*",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Connection": "keep-alive",
+            "Content-Type": "application/json",
+            "Origin": "https://js.fwgov.cn",
+            "Referer": "https://js.fwgov.cn/",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
+            "adcode": "3200",
+            "uuid": self.get_uuid()
+        }
+
+        url = "https://js.fwgov.cn:868/purchases/tenders/notice/announcementResultListNew"
+        data = {
+            "biddingStatus": "",
+            "itemBudgetStart": "",
+            "itemBudgetEnd": "",
+            "transactionPriceStart": "",
+            "transactionPriceEnd": "",
+            "type": 2,
+            "sort": "releaseTime",
+            "order": "desc",
+            "nameOrunit": "",
+            "pageNumber": page,
+            "pageSize": 10,
+            "areaCode": [
+                "3200"
+            ],
+            "serviceType": "1"
+        }
+        data = json.dumps(data, separators=(',', ':'))
+
+        request_params = {
+            "headers": headers,
+            "data": data,
+            "timeout": (30, 30),
+            "verify": False,
+        }
+        resp = requests.post(url, **request_params)
+        time.sleep(1)
+        return resp
+
+    def parser_list_page(self, response, page, menu):
+        results_list = []
+        info_list = response.json().get('result').get('records')
+        for info in info_list:
+            hid = info.get('purchasesNeedId')
+            title = info.get('title').strip()
+            notes = info.get('notes', '')
+            publish_time = info.get('releaseTime').strip() + ":00"
+            href = f"https://js.fwgov.cn/biddingAnnouncement/{hid}?type=2&serviceType=1"
+
+            dedup = md5value(title + href)
+
+            if not self.r.hexists(self.redis_key, dedup):
+                item = {
+                    "site": "苏服采",
+                    "channel": menu.channel,
+                    "spidercode": menu.spidercode,
+                    "area": "江苏",
+                    "city": "",
+                    "district": "",
+                    "href": "#",
+                    "competehref": href,
+                    "title": title,
+                    "publishtime": publish_time,
+                    "notes": notes,
+                    "parse_url": f"https://js.fwgov.cn:868/purchases/tenders/notice/{hid}",
+                    "parser_name": self.redis_key,
+                    "is_mixed": False,
+                    "is_theme": True,
+                    "retry": 0,
+                    "comeintime": int2long(int(time.time())),
+                    "is_crawl": False,
+                    "failed": False,
+                }
+
+                self.zb_list.insert_one(item)
+                self.r.hset(self.redis_key, dedup, '')
+                results_list.append(item)
+
+        logger.info(f' *** 第{page}页采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
+
+        return results_list
+
+    def crawl_list_spider(self, page, menu):
+        retry_times = 0
+        while retry_times < 3:
+            try:
+                response = self.fetch_list_page(page=page)
+                res_code = response.status_code
+                logger.debug(f"第{page}页 状态码:{res_code}")
+                if response and res_code == 200:
+                    informations = self.parser_list_page(response=response, page=page, menu=menu)
+                    self.real_cont += len(informations)
+                    logger.info(f"当前已采集 {self.real_cont} 条数据")
+                    time.sleep(random.randint(10, 20))
+                    break
+                else:
+                    retry_times += 1
+                    time.sleep(3)
+            except Exception as e:
+                logger.error(f"第{page}页 采集异常:{e}")
+                retry_times += 1
+                time.sleep(5)
+
+    def start_list(self, menus):
+        logger.debug("********** 列表页开始 **********")
+        for menu in menus:
+            logger.debug(f"++++++ {menu.channel} 开始采集 ++++++")
+            page = menu.crawl_page
+            for page in range(1, page + 1):
+                self.crawl_list_spider(page=page, menu=menu)
+            logger.debug(f"------ {menu.channel} 采集结束 ------")
+
+        logger.debug("********** 列表页结束 **********")
+
+
+if __name__ == '__main__':
+    Menu = namedtuple('Menu', ['channel', 'spidercode', 'crawl_page'])
+
+    menus = [
+        Menu('公告信息-更正公告', 'js_sfc_ggxx_gzgg', 1),
+    ]
+
+    Crawl_sfc().start_list(menus)
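
The list side dedups before insert: md5value(title + href) becomes a field in one Redis hash per parser (here ztpc_sfc_gzgg), so hexists/hset stay O(1) regardless of history size. The same idea in plain redis-py, sketched with a local connection instead of the project's Redis_client():

    import hashlib

    import redis

    r = redis.StrictRedis()      # illustrative; the crawler uses Redis_client()
    REDIS_KEY = 'ztpc_sfc_gzgg'  # one hash per parser_name

    def is_duplicate(title: str, href: str) -> bool:
        dedup = hashlib.md5((title + href).encode('utf-8')).hexdigest()
        if r.hexists(REDIS_KEY, dedup):
            return True
        r.hset(REDIS_KEY, dedup, '')  # the crawler marks only after the Mongo insert
        return False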

+ 204 - 0
lzz_theme/sfc/sfc_htgg_detail.py

@@ -0,0 +1,204 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-22
+---------
+@summary: 苏服采 - 详情页
+---------
+@author: Lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.attachment import AttachmentDownloader
+from utils.tools import *
+from login_account import Login
+import warnings
+
+warnings.filterwarnings('ignore')
+
+
+class Details:
+
+    def __init__(self):
+        self.db_table = Mongo_client().py_spider
+        self.db_name = self.db_table.theme_list
+        self.zt_details = self.db_table.data_bak
+        self.Authorization = ""
+
+    def get_cookies(self):
+        if not os.path.isfile('./sfc_cookies.txt'):
+            Login()
+
+        with open('./sfc_cookies.txt', 'r', encoding='utf-8') as fr:
+            cks = fr.read()
+        return cks
+
+    def get_uuid(self):
+        with open('./sfc_uuid.txt', 'r', encoding='utf-8') as fr:
+            uid = fr.read()
+        return uid
+
+    def get_other(self, hid, proxies=False):
+
+        headers = {
+            "Accept": "application/json, text/plain, */*",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Connection": "keep-alive",
+            "Origin": "https://js.fwgov.cn",
+            "Referer": "https://js.fwgov.cn/",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
+            "accessToken": f"{self.Authorization}",
+            "adcode": "3200",
+            "uuid": f"{self.get_uuid()}"
+        }
+
+        url = f"https://js.fwgov.cn:868/purchases/tenders/queryContract/{hid}"
+        data = {}
+        try:
+            response = requests.post(url, headers=headers, json=data, timeout=20, proxies=proxies, verify=False)
+            return response.json().get('result')
+        except:
+            return {}
+
+    def detail_get(self, response, item):
+        hid = item['competehref'].split('/')[-1].split('?')[0]
+        dt = response.json().get('result')
+        dtt = self.get_other(hid)
+        file_info = dtt.get('files')
+        file_html = ""
+        attachments = {}
+        if file_info:
+            file_name = file_info.get('fileOriginalName')
+            base_url = "https://zhenjiang.fwgov.cn:868/fileServer/"
+            fid = file_info.get('param')
+            file_url = base_url + fid + f"?accessToken={self.Authorization}"
+            file_type = extract_file_type(file_name=file_name, file_url=file_url)
+            if file_type:
+                file_html += f'<div><a href="{file_url}">{file_name}</a></div>'
+                attachment = AttachmentDownloader().fetch_attachment(
+                    file_name=file_name, file_type=file_type, download_url=file_url)
+                if not attachment.get('size'):
+                    try:
+                        os.remove('./sfc_cookies.txt')
+                    except:
+                        pass
+                    raise ValueError("cookies 失效!")
+                attachments[str(len(attachments) + 1)] = attachment
+
+        if attachments:
+            item['projectinfo'] = {"attachments": attachments}
+
+        itemSelect_list = {"0": "公开比选", "1": "邀请比选", "2": "直选", "4": "直选"}
+
+        html = f'''
+        <div>
+            <p><strong>一、合同编号:</strong><span>{dtt.get('contractNo')}</span></p>
+            <p ><strong>二、合同名称:</strong><span>{dtt.get('contractName')}</span></p>
+            <p><strong>三、项目编号:</strong><span>{dt.get('itemNo')}</span></p>
+            <p><strong>四、项目名称:</strong><span>{dt.get('itemName')}</span></p>
+            <p><strong>五、合同主体</strong></p>
+            <div><p>采购人(甲方):<span>{dt.get('procurementUnit')}</span></p>
+                <p>联系方式:<span>{dt.get('itemPersonTel')}</span></p>
+                <p>供应商(乙方):<span>{dtt.get('supplierName')}</span></p>
+                <p>联系方式:<span>{dtt.get('supplierPhone')}</span></p>
+            </div>
+            <p><strong>六、合同主体信息</strong></p>
+            <div><p>1.主要标的信息:</p>
+                <p>项目名称:<span>{dt.get('itemName')}</span></p>
+                <p>项目编号:<span>{dt.get('itemNo')}</span></p>
+                <p>比选方式:<span>{itemSelect_list.get(str(dt.get('itemSelect')))}</span></p>
+                <p>服务品目:<span>{dt.get('servicePath')}</span></p>
+                <p>项目预算:<span><span>¥{dt.get('itemBudget')}</span></span></p>
+                <p>项目地点:<span>{dt.get('address')} <i></i></span></p>
+                <p>评审开始时间:<span>{dt.get('reviewStartTime')} <i></i></span></p>
+                <p>评审地点:<span>{dt.get('projectLocation')} <i></i></span></p>
+                <p>采购单位:<span>{dt.get('procurementUnit')}</span></p>
+                <p>联系人姓名:<span>{dt.get('itemPerson')}</span></p>
+                <p>联系电话:<span>{dt.get('itemPersonTel')}</span></p>
+                <p>固定电话:<span>{dt.get('itemFixedTel')}</span></p>
+                <p> 响应开始时间:<span>{dt.get('tenderStartTime')}</span></p>
+                <p> 响应截止时间:<span>{dt.get('tenderEndTime')}</span></p>
+                <p>2.合同金额:<span><span>¥{dt.get('transactionPrice')}</span></span></p>
+                <p>3.履行时间(期限):<span>{dt.get('servicePeriod')}天</span></p>
+            </div>
+            <p><strong>七、合同签订日期:</strong><span>{dtt.get('contractSignDate')}</span></p>
+            <p><strong>八、合同公告日期:</strong><span>{dt.get('contractPublishTime')}</span></p>
+            <p>附件信息:</p>
+            {file_html}
+        </div>
+        '''
+
+        item["contenthtml"] = html
+
+        item = format_fileds(item)
+
+        try:
+            self.zt_details.insert_one(item)
+            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
+        except DuplicateKeyError:
+            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
+
+    def fetch_request(self, item):
+        self.Authorization = self.get_cookies()
+        headers = {
+            "Accept": "application/json, text/plain, */*",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Connection": "keep-alive",
+            "Origin": "https://js.fwgov.cn",
+            "Referer": "https://js.fwgov.cn/",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
+            "accessToken": f"{self.Authorization}",
+            "adcode": "3200",
+            "uuid": f"{self.get_uuid()}"
+        }
+        response = requests.get(url=item.get("parse_url"), headers=headers, timeout=(30, 30), verify=False)
+        time.sleep(1)
+        return response
+
+    def deal_request(self, item):
+        retry_times = 0
+        org_item = item.copy()
+        while retry_times < 5:
+            try:
+                response = self.fetch_request(item)
+                res_code = response.status_code
+                if response and res_code == 200:
+                    self.detail_get(response, item=item)
+                    return True
+                else:
+                    try:
+                        os.remove('./sfc_cookies.txt')
+                    except:
+                        pass
+                    raise ValueError("cookies 失效!")
+            except Exception as e:
+                item = org_item
+                logger.error(f"{item.get('competehref')} 采集异常:{e}")
+                retry_times += 1
+                time.sleep(random.randint(3, 6))
+        logger.warning(f"[采集失败]{item.get('competehref')}")
+        return False
+
+    def start(self, limit=1):
+        logger.debug("********** 详情页采集开始 **********")
+        logger.debug("********** 等待60s列表页数据加载... **********")
+        time.sleep(60)
+        with self.db_name.find({"parser_name": "ztpc_sfc_htgg", "is_crawl": False, "failed": False},
+                               sort=[('publishtime', -1)]).limit(limit) as cursor:
+            data_list = [dd for dd in cursor]
+        for item in data_list:
+            # logger.debug(item)
+            update_id = item["_id"]
+            result = self.deal_request(item)
+            if result is True:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"is_crawl": True}})
+            else:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"failed": True}})
+            time.sleep(random.randint(5, 10))
+
+        logger.debug("********** 详情页采集结束 **********")
+
+
+if __name__ == "__main__":
+    Details().start(limit=10)

+ 172 - 0
lzz_theme/sfc/sfc_htgg_list.py

@@ -0,0 +1,172 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-23
+---------
+@summary: 苏服采 - 公告信息-合同公告
+---------
+@author: Lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from collections import namedtuple
+from utils.tools import *
+import requests
+import json
+from login_account import Login
+import warnings
+warnings.filterwarnings('ignore')
+
+
+class Crawl_sfc:
+
+    def __init__(self):
+        self.proxy = get_proxy()
+        self.py_spider = Mongo_client().py_spider
+        self.zb_list = self.py_spider.theme_list
+
+        self.r = Redis_client()
+        self.redis_key = 'ztpc_sfc_htgg'
+
+        self.real_cont = 0
+
+    def get_cookies(self):
+        if not os.path.isfile('./sfc_cookies.txt'):
+            Login()
+
+        with open('./sfc_cookies.txt', 'r', encoding='utf-8') as fr:
+            cks = fr.read()
+        return cks
+
+    def get_uuid(self):
+        with open('./sfc_uuid.txt', 'r', encoding='utf-8') as fr:
+            uid = fr.read()
+        return uid
+
+    def fetch_list_page(self, page):
+        logger.debug(f' *** 开始采集第{page}页 ***')
+        headers = {
+            "Accept": "application/json, text/plain, */*",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Connection": "keep-alive",
+            "Content-Type": "application/json",
+            "Origin": "https://js.fwgov.cn",
+            "Referer": "https://js.fwgov.cn/",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
+            "adcode": "3200",
+            "uuid": self.get_uuid()
+        }
+
+        url = "https://js.fwgov.cn:868/purchases/tenders/notice/announcementResultListNew"
+        data = {
+            "biddingStatus": "",
+            "itemBudgetStart": "",
+            "itemBudgetEnd": "",
+            "transactionPriceStart": "",
+            "transactionPriceEnd": "",
+            "type": 4,
+            "sort": "releaseTime",
+            "order": "desc",
+            "nameOrunit": "",
+            "pageNumber": page,
+            "pageSize": 10,
+            "areaCode": [
+                "3200"
+            ],
+            "serviceType": "1"
+        }
+        data = json.dumps(data, separators=(',', ':'))
+
+        request_params = {
+            "headers": headers,
+            "data": data,
+            "timeout": (30, 30),
+            "verify": False,
+        }
+        resp = requests.post(url, **request_params)
+        time.sleep(1)
+        return resp
+
+    def parser_list_page(self, response, page, menu):
+        results_list = []
+        info_list = response.json().get('result').get('records')
+        for info in info_list:
+            hid = info.get('purchasesNeedId')
+            title = info.get('title').strip()
+            publish_time = info.get('releaseTime').strip() + ":00"
+            href = f"https://js.fwgov.cn/biddingAnnouncement/{hid}?type=4&serviceType=1"
+
+            dedup = md5value(title + href)
+
+            if not self.r.hexists(self.redis_key, dedup):
+                item = {
+                    "site": "苏服采",
+                    "channel": menu.channel,
+                    "spidercode": menu.spidercode,
+                    "area": "江苏",
+                    "city": "",
+                    "district": "",
+                    "href": "#",
+                    "competehref": href,
+                    "title": title,
+                    "publishtime": publish_time,
+                    "parse_url": f"https://js.fwgov.cn:868/purchases/tenders/notice/{hid}",
+                    "parser_name": self.redis_key,
+                    "is_mixed": False,
+                    "is_theme": True,
+                    "retry": 0,
+                    "comeintime": int2long(int(time.time())),
+                    "is_crawl": False,
+                    "failed": False,
+                }
+
+                self.zb_list.insert_one(item)
+                self.r.hset(self.redis_key, dedup, '')
+                results_list.append(item)
+
+        logger.info(f' *** 第{page}页采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
+
+        return results_list
+
+    def crawl_list_spider(self, page, menu):
+        retry_times = 0
+        while retry_times < 3:
+            try:
+                response = self.fetch_list_page(page=page)
+                res_code = response.status_code
+                logger.debug(f"第{page}页 状态码:{res_code}")
+                if response and res_code == 200:
+                    informations = self.parser_list_page(response=response, page=page, menu=menu)
+                    self.real_cont += len(informations)
+                    logger.info(f"当前已采集 {self.real_cont} 条数据")
+                    time.sleep(random.randint(10,20))
+                    break
+                else:
+                    retry_times += 1
+                    time.sleep(3)
+            except Exception as e:
+                logger.error(f"第{page}页 采集异常:{e}")
+                retry_times += 1
+                time.sleep(5)
+
+    def start_list(self, menus):
+        logger.debug("********** 列表页开始 **********")
+        for menu in menus:
+            logger.debug(f"++++++ {menu.channel} 开始采集 ++++++")
+            page = menu.crawl_page
+            for page in range(1, page + 1):
+                self.crawl_list_spider(page=page, menu=menu)
+            logger.debug(f"------ {menu.channel} 采集结束 ------")
+
+        logger.debug("********** 列表页结束 **********")
+
+
+if __name__ == '__main__':
+    Menu = namedtuple('Menu', ['channel', 'spidercode', 'crawl_page'])
+
+    menus = [
+        Menu('公告信息-合同公告', 'js_sfc_ggxx_htgg', 1),
+    ]
+
+    Crawl_sfc().start_list(menus)

+ 1 - 0
lzz_theme/sfc/sfc_uuid.txt

@@ -0,0 +1 @@
+91600922-8bd3-46cc-b8fc-ea99bb7173c0

+ 134 - 0
lzz_theme/sfc/sfc_zzgg_detail.py

@@ -0,0 +1,134 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-23
+---------
+@summary: 苏服采 - 详情页
+---------
+@author: Lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from utils.tools import *
+from login_account import Login
+import warnings
+
+warnings.filterwarnings('ignore')
+
+
+class Details:
+
+    def __init__(self):
+        self.db_table = Mongo_client().py_spider
+        self.db_name = self.db_table.theme_list
+        self.zt_details = self.db_table.data_bak
+        self.Authorization = ""
+
+    def get_cookies(self):
+        if not os.path.isfile('./sfc_cookies.txt'):
+            Login()
+
+        with open('./sfc_cookies.txt', 'r', encoding='utf-8') as fr:
+            cks = fr.read()
+        return cks
+
+    def get_uuid(self):
+        with open('./sfc_uuid.txt', 'r', encoding='utf-8') as fr:
+            uid = fr.read()
+        return uid
+
+    def detail_get(self, response, item):
+        dt = response.json().get('result')
+
+        html = f'''
+        <div>
+            <p><strong>一、项目基本情况</strong></p>
+            <p> 项目编号: <span>{dt.get('itemNo')}</span>
+            </p>
+            <p> 项目名称: <span>{dt.get('itemName')}</span>
+            </p>
+            <p><strong>二、终止采购的原因</strong></p>
+            <div><p>{dt.get('terminationDescription')}</p></div>
+            <p><strong>三、凡对本次公告内容提出询问,请按以下方式联系。</strong></p>
+            <p> 采购单位: <span>{dt.get('procurementUnit')}</span></p>
+            <p> 联系人姓名: <span>{dt.get('itemPerson')}</span></p>
+            <p> 联系电话: <span>{dt.get('itemPersonTel')}</span></p>
+        </div>
+        '''
+
+        item["contenthtml"] = html
+
+        item = format_fileds(item)
+
+        try:
+            self.zt_details.insert_one(item)
+            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
+        except DuplicateKeyError:
+            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
+
+    def fetch_request(self, item):
+        self.Authorization = self.get_cookies()
+        headers = {
+            "Accept": "application/json, text/plain, */*",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Connection": "keep-alive",
+            "Origin": "https://js.fwgov.cn",
+            "Referer": "https://js.fwgov.cn/",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
+            "accessToken": f"{self.Authorization}",
+            "adcode": "3200",
+            "uuid": f"{self.get_uuid()}"
+        }
+        response = requests.get(url=item.get("parse_url"), headers=headers, timeout=(30, 30), verify=False)
+        time.sleep(1)
+        return response
+
+    def deal_request(self, item):
+        retry_times = 0
+        org_item = item.copy()
+        while retry_times < 5:
+            try:
+                response = self.fetch_request(item)
+                res_code = response.status_code
+                if response and res_code == 200:
+                    self.detail_get(response, item=item)
+                    return True
+                else:
+                    try:
+                        os.remove('./sfc_cookies.txt')
+                    except:
+                        pass
+                    raise ValueError("cookies 失效!")
+
+            except Exception as e:
+                item = org_item
+                logger.error(f"{item.get('competehref')} 采集异常:{e}")
+                retry_times += 1
+                time.sleep(random.randint(3, 6))
+        logger.warning(f"[采集失败]{item.get('competehref')}")
+        return False
+
+    def start(self, limit=1):
+        logger.debug("********** 详情页采集开始 **********")
+        logger.debug("********** 等待60s列表页数据加载... **********")
+        time.sleep(60)
+        with self.db_name.find({"parser_name": "ztpc_sfc_zzgg", "is_crawl": False, "failed": False},
+                               sort=[('publishtime', -1)]).limit(limit) as cursor:
+            data_list = [dd for dd in cursor]
+        for item in data_list:
+            # logger.debug(item)
+            update_id = item["_id"]
+            result = self.deal_request(item)
+            if result is True:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"is_crawl": True}})
+            else:
+                self.db_name.update_one({"_id": update_id}, {"$set": {"failed": True}})
+            time.sleep(random.randint(5, 10))
+
+        logger.debug("********** 详情页采集结束 **********")
+
+
+if __name__ == "__main__":
+    Details().start(limit=10)
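
All three detail writers rely on insert_one raising DuplicateKeyError for re-crawled rows, which only works if the target collection has a unique index; without one, retries would silently duplicate data. A sketch of that contract (the index fields are an assumption, not taken from this commit):

    from pymongo import MongoClient, errors

    coll = MongoClient().py_spider.data_bak  # the project connects via Mongo_client()
    item = {"title": "示例公告", "publishtime": "2025-01-23 00:00:00"}
    # assumed to exist in production, e.g.:
    # coll.create_index([("title", 1), ("publishtime", 1)], unique=True)
    try:
        coll.insert_one(item)
    except errors.DuplicateKeyError:
        pass  # surfaces in the logs above as 重复采集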

+ 172 - 0
lzz_theme/sfc/sfc_zzgg_list.py

@@ -0,0 +1,172 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-01-23
+---------
+@summary: 苏服采 - 公告信息-终止公告
+---------
+@author: Lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from collections import namedtuple
+from utils.tools import *
+import requests
+import json
+from login_account import Login
+import warnings
+warnings.filterwarnings('ignore')
+
+
+class Crawl_sfc:
+
+    def __init__(self):
+        self.proxy = get_proxy()
+        self.py_spider = Mongo_client().py_spider
+        self.zb_list = self.py_spider.theme_list
+
+        self.r = Redis_client()
+        self.redis_key = 'ztpc_sfc_zzgg'
+
+        self.real_cont = 0
+
+    def get_cookies(self):
+        if not os.path.isfile('./sfc_cookies.txt'):
+            Login()
+
+        with open('./sfc_cookies.txt', 'r', encoding='utf-8') as fr:
+            cks = fr.read()
+        return cks
+
+    def get_uuid(self):
+        with open('./sfc_uuid.txt', 'r', encoding='utf-8') as fr:
+            uid = fr.read()
+        return uid
+
+    def fetch_list_page(self, page):
+        logger.debug(f' *** 开始采集第{page}页 ***')
+        headers = {
+            "Accept": "application/json, text/plain, */*",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Connection": "keep-alive",
+            "Content-Type": "application/json",
+            "Origin": "https://js.fwgov.cn",
+            "Referer": "https://js.fwgov.cn/",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
+            "adcode": "3200",
+            "uuid": self.get_uuid()
+        }
+
+        url = "https://js.fwgov.cn:868/purchases/tenders/notice/announcementResultListNew"
+        data = {
+            "biddingStatus": "",
+            "itemBudgetStart": "",
+            "itemBudgetEnd": "",
+            "transactionPriceStart": "",
+            "transactionPriceEnd": "",
+            "type": 5,
+            "sort": "releaseTime",
+            "order": "desc",
+            "nameOrunit": "",
+            "pageNumber": page,
+            "pageSize": 10,
+            "areaCode": [
+                "3200"
+            ],
+            "serviceType": "1"
+        }
+        data = json.dumps(data, separators=(',', ':'))
+
+        request_params = {
+            "headers": headers,
+            "data": data,
+            "timeout": (30, 30),
+            "verify": False,
+        }
+        resp = requests.post(url, **request_params)
+        time.sleep(1)
+        return resp
+
+    def parser_list_page(self, response, page, menu):
+        results_list = []
+        info_list = response.json().get('result').get('records')
+        for info in info_list:
+            hid = info.get('purchasesNeedId')
+            title = info.get('title').strip()
+            publish_time = info.get('releaseTime').strip() + ":00"
+            href = f"https://js.fwgov.cn/biddingAnnouncement/{hid}?type=5&serviceType=1"
+
+            dedup = md5value(title + href)
+
+            if not self.r.hexists(self.redis_key, dedup):
+                item = {
+                    "site": "苏服采",
+                    "channel": menu.channel,
+                    "spidercode": menu.spidercode,
+                    "area": "江苏",
+                    "city": "",
+                    "district": "",
+                    "href": "#",
+                    "competehref": href,
+                    "title": title,
+                    "publishtime": publish_time,
+                    "parse_url": f"https://js.fwgov.cn:868/purchases/tenders/notice/{hid}",
+                    "parser_name": self.redis_key,
+                    "is_mixed": False,
+                    "is_theme": True,
+                    "retry": 0,
+                    "comeintime": int2long(int(time.time())),
+                    "is_crawl": False,
+                    "failed": False,
+                }
+
+                self.zb_list.insert_one(item)
+                self.r.hset(self.redis_key, dedup, '')
+                results_list.append(item)
+
+        logger.info(f' *** 第{page}页采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
+
+        return results_list
+
+    def crawl_list_spider(self, page, menu):
+        retry_times = 0
+        while retry_times < 3:
+            try:
+                response = self.fetch_list_page(page=page)
+                res_code = response.status_code
+                logger.debug(f"第{page}页 状态码:{res_code}")
+                if response and res_code == 200:
+                    informations = self.parser_list_page(response=response, page=page, menu=menu)
+                    self.real_cont += len(informations)
+                    logger.info(f"当前已采集 {self.real_cont} 条数据")
+                    time.sleep(random.randint(10,20))
+                    break
+                else:
+                    retry_times += 1
+                    time.sleep(3)
+            except Exception as e:
+                logger.error(f"第{page}页 采集异常:{e}")
+                retry_times += 1
+                time.sleep(5)
+
+    def start_list(self, menus):
+        logger.debug("********** 列表页开始 **********")
+        for menu in menus:
+            logger.debug(f"++++++ {menu.channel} 开始采集 ++++++")
+            page = menu.crawl_page
+            for page in range(1, page + 1):
+                self.crawl_list_spider(page=page, menu=menu)
+            logger.debug(f"------ {menu.channel} 采集结束 ------")
+
+        logger.debug("********** 列表页结束 **********")
+
+
+if __name__ == '__main__':
+    Menu = namedtuple('Menu', ['channel', 'spidercode','crawl_page'])
+
+    menus = [
+        Menu('公告信息-终止公告', 'js_sfc_ggxx_zzgg', 1),
+    ]
+
+    Crawl_sfc().start_list(menus)

二进制
lzz_theme/sfc/slice.png


+ 22 - 0
lzz_theme/sfc/start.sh

@@ -0,0 +1,22 @@
+#!/bin/bash
+
+ps -ef |grep "sfc_cjgg_detail.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "sfc_cjgg_list.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "sfc_gkbx_list.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "sfc_gzgg_detail.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "sfc_gzgg_list.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "sfc_htgg_detail.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "sfc_htgg_list.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "sfc_zzgg_detail.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+ps -ef |grep "sfc_zzgg_list.py" |grep -v grep |awk '{print $2}' |xargs kill -9
+
+nohup python3 sfc_cjgg_detail.py > log/sfc_cjgg_detail.out 2>&1 &
+nohup python3 sfc_cjgg_list.py > log/sfc_cjgg_list.out 2>&1 &
+nohup python3 sfc_gkbx_list.py > log/sfc_gkbx_list.out 2>&1 &
+nohup python3 sfc_gzgg_detail.py > log/sfc_gzgg_detail.out 2>&1 &
+nohup python3 sfc_gzgg_list.py > log/sfc_gzgg_list.out 2>&1 &
+nohup python3 sfc_htgg_detail.py > log/sfc_htgg_detail.out 2>&1 &
+nohup python3 sfc_htgg_list.py > log/sfc_htgg_list.out 2>&1 &
+nohup python3 sfc_zzgg_detail.py > log/sfc_zzgg_detail.out 2>&1 &
+nohup python3 sfc_zzgg_list.py > log/sfc_zzgg_list.out 2>&1 &
+

+ 0 - 1
lzz_theme/sgycw/sgycw_ck.json

@@ -1 +0,0 @@
-{"ASP.NET_SessionId": "izmpld3phndpow0hiz1nhcuw", "REACH_IS_AUTO": "", "REACH_IS_REMEMBER": "1", "REACH_LOGIN_NAME": "topnet123"}

二进制
lzz_theme/sgycw/start.sh


+ 0 - 29
lzz_theme/szycycgpt/pass_slide.py

@@ -16,7 +16,6 @@ import base64
 from datetime import datetime, timedelta
 import pytz
 import json
-
 import requests
 import time
 import warnings
@@ -26,35 +25,26 @@ warnings.filterwarnings('ignore')
 
 
 
-# 将 Image 转换为 Mat,通过 flag 可以控制颜色
 def pilImgToCv2(img: Image.Image, flag=cv2.COLOR_RGB2BGR):
     return cv2.cvtColor(np.asarray(img), flag)
 
 
-# 弹窗查看图片
 def showImg(bg: cv2.Mat, name='test', delay=0):
     cv2.imshow(name, bg)
     cv2.waitKey(delay)
     cv2.destroyAllWindows()
 
 
-# 获取滑动距离
 def getDistance(img: Image.Image, slice: Image.Image):
-    # 背景图和滑块图都需要做相同处理
     grayImg = pilImgToCv2(img, cv2.COLOR_BGR2GRAY)
     graySlice = pilImgToCv2(slice, cv2.COLOR_BGR2GRAY)
-    # 做边缘检测进一步降低干扰
     grayImg = cv2.Canny(grayImg, 255, 255)
     graySlice = cv2.Canny(graySlice, 255, 255)
-    # 通过模板匹配两张图片,找出缺口的位置
     result = cv2.matchTemplate(grayImg, graySlice, cv2.TM_CCOEFF_NORMED)
     maxLoc = cv2.minMaxLoc(result)[3]
-    # 匹配出来的滑动距离
     distance = maxLoc[0]
     sliceHeight, sliceWidth = graySlice.shape[:2]
-    # 左上角
     x, y = maxLoc
-    # 右下角
     x2, y2 = x + sliceWidth, y + sliceHeight
     resultBg = pilImgToCv2(img, cv2.COLOR_RGB2BGR)
     cv2.rectangle(resultBg, (x, y), (x2, y2), (0, 0, 255), 2)
@@ -67,7 +57,6 @@ def get_dist(sliceimgpath, imgpath):
 
 
 def __ease_out_expo(sep):
-    '''轨迹相关操作'''
     if sep == 1:
         return 1
     else:
@@ -77,23 +66,17 @@ def __ease_out_expo(sep):
 def get_slide_track(distance):
     if not isinstance(distance, int) or distance < 0:
         raise ValueError(f"distance类型必须是大于等于0的整数: distance: {distance}, type: {type(distance)}")
-    # 初始化轨迹列表
     slide_track = [
         {"x": 0, "y": 0, "type": "down", "t": 4534},
         {"x": 1, "y": 0, "type": "move", "t": 4645}
     ]
-    # 共记录count次滑块位置信息
     count = 40 + int(distance / 2)
-    # 初始化滑动时间
     t = random.randint(4000, 6000)
-    # 记录上一次滑动的距离
     _x = 0
     _y = 0
     for i in range(count):
-        # 已滑动的横向距离
         x = round(__ease_out_expo(i / count) * distance)
         y = round(__ease_out_expo(i / count) * 4)
-        # 滑动过程消耗的时间
         t += random.randint(10, 100)
         if x == _x:
             continue
@@ -108,7 +91,6 @@ def get_slide_track(distance):
 
 
 def decode_image(filename, src):
-    # 1、信息提取
     result = re.search("data:image/(?P<ext>.*?);base64,(?P<data>.*)", src, re.DOTALL)
     if result:
         ext = result.groupdict().get("ext")
@@ -125,13 +107,9 @@ def decode_image(filename, src):
 
 def delta(time_str=None, tm=None):
     if time_str and tm:
-        # 去除末尾的 'Z' 并解析为datetime对象(假设时间为UTC时间)
         dt_obj = datetime.strptime(time_str[:-1], '%Y-%m-%dT%H:%M:%S.%f')
-        # 创建一个时间间隔为5秒的timedelta对象
         delta = timedelta(seconds=tm)
-        # 将时间对象加上时间间隔
         new_dt_obj = dt_obj + delta
-        # 重新格式化为原格式的字符串
         new_time_str = new_dt_obj.strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z'
         return new_time_str
     else:
@@ -141,17 +119,11 @@ def delta(time_str=None, tm=None):
 
 
 def transfer(time_str):
-    # 定义时间格式,用于解析原始时间字符串
     format_str = "%a, %d %b %Y %H:%M:%S %Z"
-    # 将原始时间字符串解析为datetime对象
     dt_obj = datetime.strptime(time_str, format_str)
-    # 设置时区为GMT
     gmt_tz = pytz.timezone('GMT')
-    # 将dt_obj本地化到GMT时区
     localized_dt_obj = gmt_tz.localize(dt_obj)
-    # 转换为UTC时间
     utc_dt_obj = localized_dt_obj.astimezone(pytz.UTC)
-    # 格式化为目标格式的字符串
     new_time_str = utc_dt_obj.strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z'
     return new_time_str
 
@@ -190,7 +162,6 @@ def create_cookies(base_url,proxies=False):
     }
     res = session.get(url, headers=headers, params=params, timeout=20)
     system_time = transfer(res.headers.get('Date'))
-
     img_info = res.json()
     fullpage = img_info.get('captcha').get('backgroundImage')
     decode_image("fullpage", fullpage)

二进制
lzz_theme/szycycgpt/slice.png


+ 0 - 3
lzz_theme/szycycgpt/start.sh

@@ -48,11 +48,9 @@ def fm(proxies=False):
         "params": "{\"width\":\"100\",\"height\":\"40\",\"codeNum\":\"4\",\"interferenceLine\":\"1\",\"codeGuid\":\"\"}"
     }
     resp = requests.post(url, headers=headers, data=data, proxies=proxies, timeout=30)
-
     img_info = resp.json().get('custom')
     vc = img_info.get('verificationCodeGuid')
     vv = get_code(img_info.get('imgCode'))
-
     cookies = resp.cookies.get_dict()
 
     return {"cookies": cookies, "vc": vc, "vv": vv}
@@ -91,7 +89,6 @@ class Details:
         if file_list:
             for info in file_list:
                 onclick = info.xpath('./@onclick').extract_first("")
-
                 try:
                     fh = eval(onclick)
                     attachGuid = "".join(re.findall('attachGuid=(.*?)&', fh[0]))

+ 0 - 1
lzz_theme/szycycgpt/szyc_list.py

@@ -102,7 +102,6 @@ class Crawl_Szyc:
             hid = info.get('linkurl')
             href = f"https://cjyc.hbbidding.com.cn/hubeiyth{hid}"
             create_time = info.get('infodate')
-
             dedup = [href]
             if not self.RDS.data_filter(dedup):
                 item = {

+ 1 - 4
lzz_theme/tjszfcgw/start.sh

@@ -47,9 +47,7 @@ class TjszfcgwDownFiles(AttachmentDownloader):
             "Accept-Encoding": "gzip, deflate",
             "Accept-Language": "zh-CN,zh;q=0.9",
         }
-
         ss.get(url, headers=headers1, timeout=(30,30))
-
         retries = 0
         while retries < 3:
             try:
@@ -230,8 +228,7 @@ class Details:
         count = 0
         ts = Timer(590, self.de_redis_key)  # 声明一个定时器,设置多少s后执行
         ts.start()                          # 启动定时器
-        with self.db_name.find({"parser_name": "ztpc_tjszfcgw", "failed": False, "is_crawl": False},
-                               sort=[('publishtime', -1)]) as data_lsit:
+        with self.db_name.find({"parser_name": "ztpc_tjszfcgw", "failed": False, "is_crawl": False}) as data_lsit:
             for item in data_lsit:
                 # logger.debug(item)
                 if self.end_state:

+ 1 - 4
lzz_theme/tjszfcgw/tjszfcgw_details2.py

@@ -47,9 +47,7 @@ class TjszfcgwDownFiles(AttachmentDownloader):
             "Accept-Encoding": "gzip, deflate",
             "Accept-Language": "zh-CN,zh;q=0.9",
         }
-
         ss.get(url, headers=headers1, timeout=(30,30))
-
         retries = 0
         while retries < 3:
             try:
@@ -230,8 +228,7 @@ class Details:
         count = 0
         ts = Timer(590, self.de_redis_key)  # 声明一个定时器,设置多少s后执行
         ts.start()                          # 启动定时器
-        with self.db_name.find({"parser_name": "ztpc_tjszfcgw", "failed": False, "is_crawl": False},
-                               sort=[('publishtime', -1)]) as data_lsit:
+        with self.db_name.find({"parser_name": "ztpc_tjszfcgw", "failed": False, "is_crawl": False}) as data_lsit:
             for item in data_lsit:
                 # logger.debug(item)
                 if self.end_state:

+ 67 - 0
lzz_theme/utils/PYCCS_cookies.py

@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-02-20
+---------
+@summary: 进入前检查浏览器 PYCCS
+---------
+@author: Lzz
+"""
+import re
+import execjs
+
+
+def get_PYCCS_ck(session,proxies=False):
+
+    session.proxies = proxies
+
+    url = "http://www.yngp.com/page/procurement/purchaseList.html"
+    headers = {
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+        "Accept-Language": "zh-CN,zh;q=0.9",
+        "Cache-Control": "max-age=0",
+        "Content-Type": "application/x-www-form-urlencoded",
+        "Origin": "http://www.yngp.com",
+        "Proxy-Connection": "keep-alive",
+        "Referer": "http://www.yngp.com/page/procurement/purchaseList.html",
+        "Upgrade-Insecure-Requests": "1",
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
+    }
+
+    ex_js = '''
+    function get_ck(a,b,c) {
+        var x08c924 = parseInt(a);
+        x08c924 = x08c924 * parseInt(b);
+        x08c924 = x08c924 + parseInt(c);
+        x08c924 = (x08c924 * 0x3 + 0x7);
+        if (x08c924 < 0x7b)
+            x08c924 = x08c924 + 0x929;
+        if (x08c924 > 0x929)
+            x08c924 = Math['floor'](x08c924 / 0x7b);
+        return x08c924
+    }
+    '''
+    ctx = execjs.compile(ex_js)
+    count = 0
+    while count < 3:
+        try:
+            res = session.get(url, headers=headers, timeout=60, verify=False)
+
+            pm_data = "".join(re.findall('\|function\|(.*?)\|version\|', res.text, re.S)).split('|')
+
+            answer = ctx.call('get_ck', pm_data[1], pm_data[3], pm_data[-1])
+
+            data = {
+                "answer": f"{answer}"
+            }
+            resp = session.post(url.split('?')[0], headers=headers, data=data, timeout=60, verify=False)
+            cookies = session.cookies.get_dict()
+
+            if re.findall(r'\|function\|(.*?)\|version\|', resp.text, re.S):
+                count += 1
+                print(f"请求解析异常!重试 {count} 次")
+            else:
+                return cookies
+        except:
+            print("cookies_PYCCS 获取失败!")
+            return {}
+
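
get_PYCCS_ck drives yngp.com's anti-bot check: it pulls the obfuscated challenge parameters out of the page, evaluates the same arithmetic through execjs, posts the answer back, and returns the session cookies (or {} after repeated failures). A usage sketch:

    import requests
    from utils.PYCCS_cookies import get_PYCCS_ck

    session = requests.Session()
    cookies = get_PYCCS_ck(session)  # the session keeps the cookies either way
    if cookies:
        resp = session.get(
            "http://www.yngp.com/page/procurement/purchaseList.html",
            timeout=60, verify=False)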

+ 92 - 0
lzz_theme/utils/RedisDB.py

@@ -0,0 +1,92 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-02-27
+---------
+@summary: redis 去重
+---------
+@author: Lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+import redis
+from utils.tools import get_sha256
+
+
+class RedisFilter():
+    redis_db = None
+
+    def __init__(self, redis_url='redis://:k5ZJR5KV4q7DRZ92DQ@172.17.189.142:7361/2', expire_time=None):
+        # redis_url='redis://:k5ZJR5KV4q7DRZ92DQ@172.17.162.28:8361/0',
+        self.__class__.redis_db = redis.StrictRedis.from_url(redis_url)  # 单机
+
+        self._ex = expire_time or 86400 * 365 * 1  # 1年 = 86400 * 365 * 1
+
+    def __repr__(self):
+        return "<RedisFilter: {}>".format(self.redis_db)
+
+    def exists(self, key):
+        """全量检索"""
+        if self.redis_db.exists(key) > 0:
+            return True
+        return False
+
+    def add(self, keys, *args, **kwargs):
+        """
+        添加数据  删除数据:redis_db.delete("pylist_" + key)
+        @param keys: 检查关键词在 redis 中是否存在,支持列表批量
+        @return: list / 单个值(如果数据已存在 返回 False 否则返回 True, 可以理解为是否添加成功)
+        """
+        is_list = isinstance(keys, list)
+        keys = keys if is_list else [keys]
+
+        is_added = []
+        for key in keys:
+            if not self.exists(key):
+                is_added.append(self.redis_db.set(key, 1, ex=self._ex))
+            else:
+                is_added.append(False)
+
+        return is_added if is_list else is_added[0]
+
+    def get(self, keys):
+        """
+        检查数据是否存在
+        @param keys: list / 单个值
+        @return: list / 单个值 (存在返回True 不存在返回False)
+        """
+        is_list = isinstance(keys, list)
+        keys = keys if is_list else [keys]
+
+        is_exist = []
+        for key in keys:
+            is_exist.append(self.exists(key))
+
+        # 判断数据本身是否重复
+        temp_set = set()
+        for i, key in enumerate(keys):
+            if key in temp_set:
+                is_exist[i] = True
+            else:
+                temp_set.add(key)
+
+        return is_exist if is_list else is_exist[0]
+
+    def data_filter(self, data):
+        data = [data] if not isinstance(data, list) else data
+        args = sorted(data)
+        pykey = "pylist_" + get_sha256(*args)
+        if self.get(pykey):
+            # 已存在
+            return True
+        else:
+            # 不存在
+            return False
+
+    def data_save_redis(self, data):
+        data = [data] if not isinstance(data, list) else data
+        args = sorted(data)
+        pykey = "pylist_" + get_sha256(*args)
+        state = self.add(pykey)
+        return state
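
Unlike the hash-per-parser scheme above, RedisFilter stores one string key per record: the fields are sorted, SHA-256-hashed and prefixed with pylist_, so the same fields in any order map to the same key, and the ex TTL (one year by default) ages entries out. A usage sketch; note the constructor connects to the hard-coded Redis URL:

    from utils.RedisDB import RedisFilter

    RDS = RedisFilter()                     # default expiry: one year
    key = ["https://example.com/notice/1"]  # any list of fields; order is normalized

    if not RDS.data_filter(key):            # False -> not crawled yet
        # ... insert the item into Mongo here ...
        RDS.data_save_redis(key)            # mark only after a successful save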

+ 43 - 0
lzz_theme/utils/aliyun.py

@@ -0,0 +1,43 @@
+import oss2
+
+
+# 远程bucket配置
+oss_conf = {
+    "key_id": "LTAI4G5x9aoZx8dDamQ7vfZi",
+    "key_secret": "Bk98FsbPYXcJe72n1bG3Ssf73acuNh",
+    "endpoint": "oss-cn-beijing-internal.aliyuncs.com",
+    # "endpoint": "oss-cn-beijing.aliyuncs.com",
+    "bucket_name": "jy-datafile"
+}
+
+
+class AliYunService:
+
+    def __init__(self):
+        self.__acc_key_id = oss_conf['key_id']
+        self.__acc_key_secret = oss_conf['key_secret']
+        self.__endpoint = oss_conf['endpoint']
+        self.__bucket_name = oss_conf['bucket_name']
+
+    def push_oss_from_local(self, key, filename):
+        """
+        上传一个本地文件到OSS的普通文件
+
+        :param str key: 上传到OSS的文件名
+        :param str filename: 本地文件名,需要有可读权限
+        """
+        auth = oss2.Auth(self.__acc_key_id, self.__acc_key_secret)
+        bucket = oss2.Bucket(auth, self.__endpoint, self.__bucket_name)
+        bucket.put_object_from_file(key, filename)
+
+    def push_oss_from_stream(self, key, data):
+        """
+        流式上传oss
+
+        :param str key: 上传到OSS的文件名
+        :param data: 待上传的内容。
+        :type data: bytes,str或file-like object
+        """
+        auth = oss2.Auth(self.__acc_key_id, self.__acc_key_secret)
+        bucket = oss2.Bucket(auth, self.__endpoint, self.__bucket_name)
+        bucket.put_object(key, data)
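
Both methods build the oss2 Auth/Bucket pair per call and differ only in the payload type. A stream upload is one call; the key becomes the object name in the jy-datafile bucket (illustrative data, and the internal endpoint only resolves from Aliyun hosts):

    from utils.aliyun import AliYunService

    data = b"%PDF-1.4 ..."  # bytes, str or a file-like object
    AliYunService().push_oss_from_stream("abc123.pdf", data)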

+ 308 - 0
lzz_theme/utils/attachment.py

@@ -0,0 +1,308 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-02-26
+---------
+@summary: 附件下载模块
+---------
+@author: Dzr
+"""
+import sys
+import os
+sys.path.append(os.path.dirname(os.getcwd()))
+import io
+import uuid
+
+import tqdm
+import urllib3
+
+from utils.tools import *
+from utils.aliyun import AliYunService
+
+
+
+urllib3.disable_warnings()
+
+headers = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
+    "Accept": "*/*"
+}
+
+
+def clear_file_type_suffix(filename: str, filetype: str):
+    """去掉与 filetype 重复的文件名后缀,避免生成 xxx.pdf.pdf"""
+    filename = filename.strip()
+    if filename.lower().endswith(f".{filetype.lower()}"):
+        filename = filename[:-(len(filetype) + 1)]
+    return filename
+
+
+class AttachmentDownloader:
+
+    def __init__(self):
+        self.dir_name = "file"
+
+    def create_file(self, filename, filetype):
+        os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
+        file = "{filename}.{filetype}".format(
+            filename=get_sha1("{}_{}".format(filename, uuid.uuid4())),
+            filetype=filetype
+        )
+        return "{}/{}".format(self.dir_name, file)
+
+    @staticmethod
+    def clean_attachment(filepath):
+        """
+        删除文件
+
+        :param str filepath: 文件路径
+        """
+        try:
+            os.remove(filepath)
+        except FileNotFoundError:
+            pass
+
+    def remove(self, file):
+        self.clean_attachment(file)
+
+    @staticmethod
+    def calculate_size(data):
+        """
+        计算数据大小
+
+        :param int data: 准备计算大小的内容
+        :return: float
+        """
+        _kb = float(data / 1024.0)
+        return float(_kb / 1024.0)
+
+    @staticmethod
+    def getsize(data):
+        """
+        计算数据大小
+
+        :param data: 待上传的内容。
+        :type data: bytes,str或file-like object
+        :return str
+        """
+        size = 0
+        if isinstance(data, str):
+            try:
+                size = os.path.getsize(data)
+            except FileNotFoundError:
+                pass
+        elif isinstance(data, bytes):
+            size = len(data)
+        else:
+            pass
+
+        _kb = float(size) / 1024
+        result = "{:.1f} kb".format(_kb)
+        if _kb >= 1024:
+            _M = _kb / 1024
+            if _M >= 1024:
+                _G = _M / 1024
+                result = "{:.1f} G".format(_G)
+            else:
+                result = "{:.1f} M".format(_M)
+        return result
+
+    def fetch_data(self, url, proxies=None, file=None, show_error_log=False, **kwargs):
+        """
+        下载数据
+
+        :param str url: 文件下载地址
+        :param proxies: 代理 {"http":"http://xxx", "https":"https://xxx"}
+        :param file: 本地文件
+        :param show_error_log: 展示错误堆栈信息日志
+        """
+        method = kwargs.pop("method", "get")
+
+        request_kwargs = {}
+        request_kwargs.setdefault("proxies", proxies)
+        request_kwargs.setdefault("headers", kwargs.get("headers") or headers)
+        request_kwargs.setdefault("params", kwargs.pop("params", None))
+        request_kwargs.setdefault("data", kwargs.pop("data", None))
+        request_kwargs.setdefault("json", kwargs.pop("json", None))
+        request_kwargs.setdefault("cookies", kwargs.pop("cookies", None))
+        request_kwargs.setdefault("timeout", kwargs.pop("timeout", (60,120)))
+        request_kwargs.setdefault("stream", kwargs.pop("stream", True))
+        request_kwargs.setdefault("verify", kwargs.pop("verify", False))
+        request_kwargs.setdefault("allow_redirects", kwargs.pop("allow_redirects", True))
+
+        stream = io.BytesIO()
+        retries = 0
+        while retries < 3:
+            try:
+                with requests.request(method, url, **request_kwargs) as req:
+                    req.raise_for_status()
+
+                    lower_headers = {k.lower(): v for k, v in req.headers.items()}
+                    content_length = lower_headers.get('content-length')
+                    if content_length is not None:
+                        content_length = int(content_length)
+                        if self.calculate_size(content_length) > 50:
+                            # 丢弃超过50Mb内容长度的文件
+                            return stream.getvalue()
+
+                    chunk_size = 1024 * 20  # 20KB chunks
+                    downloaded_size = 0
+                    with tqdm.tqdm(
+                            total=content_length,
+                            unit="B",
+                            initial=0,
+                            unit_scale=True,
+                            unit_divisor=1024,  # 1M=1024Kb,单位换算
+                            ascii=True,
+                            desc=file) as bar:
+
+                        iter_content = req.iter_content(chunk_size=chunk_size)
+                        if file is not None:
+                            with open(file, "wb") as f:
+                                for chunk in iter_content:
+                                    size = stream.write(chunk)
+                                    f.write(chunk)
+                                    bar.update(size)
+                                    downloaded_size += size
+                                    content_length = self.calculate_size(downloaded_size)
+                                    if content_length > 50:
+                                        stream.truncate(0)  # 截断流,保留前0个字节,即清空流
+                                        stream.seek(0)  # 将位置指针移回流的开始处
+                                        break
+                        else:
+                            for chunk in iter_content:
+                                size = stream.write(chunk)
+                                bar.update(size)
+                                downloaded_size += size
+                                content_length = self.calculate_size(downloaded_size)
+                                if content_length > 50:
+                                    stream.truncate(0)  # 截断流,保留前0个字节,即清空流
+                                    stream.seek(0)  # 将位置指针移回流的开始处
+                                    break
+
+                    return stream.getvalue()
+
+            except requests.RequestException as why:
+                stream.truncate(0)  # 截断流,保留前0个字节,即清空流
+                stream.seek(0)  # 将位置指针移回流的开始处
+                retries += 1
+                if show_error_log:
+                    logger.exception(why)
+
+        return stream.getvalue()
+
+    def _push_oss_from_stream(self, filename, filetype, url, **kwargs):
+        """
+        推送数据流到oss
+
+        :param str filename: 文件名称
+        :param str filetype: 文件类型
+        :param str url: 文件下载地址
+        """
+        stream = self.fetch_data(url, file=None, **kwargs)
+        attachment = {
+            "filename": "{}.{}".format(filename, filetype),
+            "org_url": url
+        }
+        if len(stream) > 0:
+            fid = get_sha1(stream)
+            try:
+                attachment["ftype"] = filetype
+                attachment["fid"] = "{}.{}".format(fid, filetype)
+                attachment["size"] = self.getsize(stream)
+                attachment["url"] = "oss"
+                AliYunService().push_oss_from_stream(attachment["fid"], stream)
+            except Exception as e:
+                logger.error(
+                    "[{}]上传失败,原因:{}".format(filename, e.__class__.__name__)
+                )
+
+        return attachment
+
+    def read_pdf_in_chunks(self, pdf_path, chunk_size=1024):
+        """Heuristic sanity check on the first chunk of a downloaded "PDF":
+        rejects a known placeholder fingerprint and HTML error pages."""
+        try:
+            with open(pdf_path, 'rb') as file:
+                chunk = str(file.read(chunk_size))
+                if "<</Names <</Dests 4 0 R>>" in chunk and "SourceModified" in chunk:
+                    return False  # known placeholder/template PDF
+                elif "doctypehtml" not in chunk:
+                    return True  # no HTML markers, assume a usable file
+                elif "%PDF" in chunk:
+                    return True  # HTML markers present but a PDF header was found
+                else:
+                    return False  # an HTML error page, not a PDF
+        except Exception:
+            return False
+
+    def _push_oss_from_local(self, filename, filetype, url, **kwargs):
+        """
+        上传本地文件到oss
+
+        :param str filename: 文件名称
+        :param str filetype: 文件类型
+        :param str url: 文件下载地址
+        """
+        file = self.create_file(filename, filetype)
+        stream = self.fetch_data(url, file=file, **kwargs)
+        '''上传/下载,无论失败成功都需要返回文件基础信息'''
+        attachment = {
+            "filename": "{}.{}".format(filename, filetype),
+            "org_url": url
+        }
+
+        if kwargs.get('is_check', None):
+            if not self.read_pdf_in_chunks(file):
+                self.remove(file)
+                return attachment
+
+        if len(stream) > 0:
+            content_hash = get_sha1(stream)
+            try:
+                attachment["fid"] = "{}.{}".format(content_hash, filetype)
+                attachment["size"] = self.getsize(file)
+                attachment["ftype"] = filetype
+                attachment["url"] = "oss"
+                AliYunService().push_oss_from_local(attachment["fid"], file)
+            except Exception as e:
+                logger.error(
+                    "[{}]上传失败,原因:{}".format(filename, e.__class__.__name__)
+                )
+
+        self.remove(file)  # 删除本地临时文件
+        return attachment
+
+    def fetch_attachment(
+        self,
+        file_name: str,
+        file_type: str,
+        download_url: str,
+        mode="local",
+        proxies=None,
+        **kwargs
+    ):
+        """
+        下载附件
+
+        @param file_name: 文件名称
+        @param file_type: 文件类型
+        @param download_url: 文件下载地址
+        @param mode: 附件上传模式 "local" = 本地文件 or "stream" = 数据流
+        @param proxies: 代理 {"http":"http://xxx", "https":"https://xxx"}
+        @return:
+        """
+        if not file_name or not file_type or not download_url:
+            raise AttachmentNullError
+
+        file_name = clear_file_type_suffix(file_name, file_type)  # 防止文件后缀重复
+        file_kwargs = dict(
+            filename=file_name,
+            filetype=file_type,
+            url=download_url,
+            proxies=proxies,
+            **kwargs
+        )
+        if mode == "stream":
+            attachment = self._push_oss_from_stream(**file_kwargs)
+        else:
+            attachment = self._push_oss_from_local(**file_kwargs)
+        return attachment
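
A minimal usage sketch for the downloader above. The enclosing class is not visible in this hunk, so `AttachmentDownloader` is an assumed name for it; the keyword arguments mirror `fetch_attachment`'s signature.

    downloader = AttachmentDownloader()  # assumed class name, not shown in this hunk
    attachment = downloader.fetch_attachment(
        file_name="招标文件",        # stored as "招标文件.pdf"
        file_type="pdf",
        download_url="http://example.com/files/notice.pdf",
        mode="local",                # "local" = temp-file upload; "stream" = in-memory bytes
    )
    # On success the dict gains fid/size/ftype/url; on failure it keeps only
    # filename and org_url, so callers can always persist it.
    print(attachment)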

+ 89 - 0
lzz_theme/utils/chaojiying.py

@@ -0,0 +1,89 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-02-26
+---------
+@summary: 超级鹰
+---------
+@author: Lzz
+"""
+import requests
+
+JY_SERVER = "http://pycaptcha.spdata.jianyu360.com"
+
+
+def _pack_file(file):
+    """包装验证码格式"""
+    if isinstance(file, str) and file.startswith("data:image"):
+        files = {"file": file}
+    elif isinstance(file, bytes):
+        files = {"file": file}
+    else:
+        with open(file, "rb") as f:
+            img_bytes = f.read()
+        files = {"file": img_bytes}
+    return files
+
+
+def postpic(im, codetype=6001, jy_code=None):
+    """
+    超级鹰识别平台
+
+    pic_type,详情查询地址: https://www.chaojiying.com/price.html
+    @param str|bytes im: 验证码图片
+    @param int codetype: 验证码图片类型
+    @param str jy_code: 剑鱼爬虫代码
+    """
+    files = _pack_file(im)
+    url = f"{JY_SERVER}/v1/images/discern?pic_type={codetype}"
+    if jy_code is not None:
+        url = f"{JY_SERVER}/v1/images/discern?pic_type={codetype}&jy_code={jy_code}"
+
+    data = {
+        'grant_type': '',
+        'username': 'jianyu001',
+        'password': '123qwe!A',
+        'scope': '',
+        'client_id': '',
+        'client_secret': ''
+    }
+    headers = {"accept": "application/json"}
+
+    response = requests.post(url, headers=headers, data=data, files=files, timeout=10)
+    json_resp = response.json()
+    '''when code is 0, the captcha platform returned data normally'''
+    result = json_resp.get("r") or {}
+    pic_id = result.get("pic_id")
+    if json_resp.get("code") == 0:
+        return result.get("pic_str"), pic_id
+    return None, pic_id
+
+
+def report_error(im_id):
+    """
+    超级鹰平台识别验证码错误时,提交识别错误的验证码pic_id
+
+    @param str im_id: 超级鹰验证码识别图片id
+    @return:
+    """
+    pic_id = im_id
+    url = f"{JY_SERVER}/v1/images/report_err?pic_id={pic_id}"
+    data = {
+        'grant_type': '',
+        'username': 'jianyu001',
+        'password': '123qwe!A',
+        'scope': '',
+        'client_id': '',
+        'client_secret': ''
+    }
+    headers = {
+        'accept': 'application/json',
+        'Content-Type': 'application/x-www-form-urlencoded'
+    }
+    response = requests.post(url, headers=headers, data=data, timeout=10)
+    '''
+    Successful callback: {'msg': 'OK', 'code': 0}
+    Do not call this endpoint casually! The program logic must be: only when the
+    recognition result is confirmed wrong should this error-report/refund endpoint
+    be called. If that cannot be determined, do not call it at all.
+    '''
+    return response.json()
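
A sketch of the intended call flow, matching the warning above: report back only when the recognition result is confirmed wrong. `site_accepts_captcha` is a hypothetical site-side validator, not part of this module.

    from utils.chaojiying import postpic, report_error

    with open("captcha.jpg", "rb") as f:
        pic_str, pic_id = postpic(f.read(), codetype=6001)
    if pic_str and not site_accepts_captcha(pic_str):  # hypothetical validator
        report_error(pic_id)  # refund only on a confirmed miss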

+ 118 - 0
lzz_theme/utils/check_utils.py

@@ -0,0 +1,118 @@
+import sys
+import os
+sys.path.append(os.path.dirname(os.getcwd()))
+import re
+
+from utils.execptions import (
+    AccountError,
+    CheckError,
+)
+
+__all__ = ['CheckText', 'CheckTask']
+
+
+class CheckContent:
+
+    def __init__(self):
+        self.sensitive_words = {
+            '正式会员', '账户充值', 'VIP会员查阅', '>(注册)<', '>(登录)<', '高级会员',
+            '标准会员', '点击支付',
+            # '隐私政策及用户服务协议','.*<a href=\"(.*?)\">点击查看内容'
+        }
+
+    @staticmethod
+    def check_text_length(val: str):
+        if len(val) == 0:
+            raise CheckError(code=10101, reason='文本内容为空')
+        elif not re.findall(r'[\u4e00-\u9fa5]+', val, re.S):
+            raise CheckError(code=10102, reason='不存在中文字符')
+        else:
+            '''清洗数字、字母、中文之外的干扰元素'''
+            sub_pattern = ['<[^>]+>', '[^0-9a-zA-Z\u4e00-\u9fa5]+']
+            for pattern in sub_pattern:
+                val = re.sub(pattern, '', val)
+            # 若文本长度小于2,表示页面内容无详情内容
+            if len(val) < 2:
+                raise CheckError(code=10102, reason='页面无有效内容')
+
+    @staticmethod
+    def check_content(val: str):
+        if val.count("部分文件可能不支持在线浏览"):
+            raise CheckError(code=10103, reason='文件不支持在线浏览')
+
+    @staticmethod
+    def check_account_privilege(val: str):
+        if val.count("高级会员"):
+            raise AccountError(code=10011, reason='账号权限等级过低')
+        elif "本招标项目仅供正式会员查阅" in val:
+            raise AccountError(code=10012, reason='账号无会员访问权限')
+
+    def check_sensitive_word(self, val: str):
+        total = set()
+        for word in self.sensitive_words:
+            result = re.search(word, val)
+            if result is not None:
+                total.add(word)
+
+        if len(total) > 0:
+            raise CheckError(code=10104, reason='敏感词过滤')
+
+    def __check(self, text):
+        self.check_sensitive_word(text)
+        self.check_text_length(text)
+        self.check_content(text)
+        self.check_account_privilege(text)
+
+    def __call__(self, text: str, *args, **kwargs):
+        self.__check(text)
+
+
+class CheckPrePareRequest:
+
+    def __init__(self):
+        self.crawl_keywords = {
+            '招标', '流标', '评标', '询价', '中标候选人', '抽签', '谈判', '中选', '意见征询',
+            '更正公告', '废标', '补遗', '议价', '邀请', '资格预审', '竞标', '变更', '遴选',
+            '磋商', '项目', '评审', '询比', '开标', '澄清', '比选', '中止', '采购', '竟价',
+            '招投标', '拟建', '成交', '中标', '竞争性谈判', '工程', '验收公告', '更正',
+            '单一来源', '变更公告', '合同', '违规', '评判', '监理', '竞价', '答疑',
+            '终止', '系统'
+        }
+
+    @staticmethod
+    def check_es_cache(title: str, publish_time: int, rows: dict):
+        """
+
+        :param title:  标题
+        :param publish_time: 发布时间的时间戳(l_np_publishtime)
+        :param rows: 采集内容
+        """
+        # retrieved_result = es_search(title, publish_time)
+        retrieved_result = 0
+        if retrieved_result != 0:
+            '''es查询数据结果'''
+            rows['count'] = retrieved_result
+            raise CheckError(code=10105, reason='es已收录标题')
+
+    def check_crawl_title(self, title: str):
+        for keyword in self.crawl_keywords:
+            valid_keyword = re.search(keyword, title)
+            if valid_keyword is not None:
+                break
+        else:
+            raise CheckError(code=10106, reason='标题未检索到采集关键词', title=title)
+
+    def __check(self, rows: dict):
+        title, publish_time = rows['title'], rows['l_np_publishtime']
+        self.check_crawl_title(title)
+        self.check_es_cache(title, publish_time, rows)
+
+    def __call__(self, rows: dict, *args, **kwargs):
+        self.__check(rows)
+
+
+CheckText = CheckContent()
+CheckTask = CheckPrePareRequest()
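
Both module-level callables raise on failure, so a typical spider wraps them in try/except; a minimal sketch:

    from utils.check_utils import CheckText, CheckTask
    from utils.execptions import YbwCrawlError

    row = {"title": "某单位设备采购招标公告", "l_np_publishtime": 1718000000}
    try:
        CheckTask(row)                        # keyword check (the es de-dup lookup is currently stubbed out)
        CheckText("<div>详情页正文……</div>")   # length / sensitive-word / privilege checks
    except YbwCrawlError as e:
        print(e.code, e.reason)               # e.g. 10104 敏感词过滤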
+

+ 147 - 0
lzz_theme/utils/clean_html.py

@@ -0,0 +1,147 @@
+import re
+__all__ = ['cleaner']
+
+# 独立元素
+INDEPENDENT_TAGS = {
+    '<head>[\s\S]*?</head>': '',
+    '<html>|<html [^>]*>|</html>': '',
+    '<body>|<body [^>]*>|</body>': '',
+    '<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # 元数据
+    '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # 空格
+    '\\xa0|\\u3000': '',  # 空格
+    '<!--[\s\S]*?-->': '',  # 注释
+    '<style[^<>]*>[\s\S]*?</style>': '',  # 样式
+    '<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
+    '<input>': '',  # 输入框
+    '<img[^>]*>': '<br>',  # 图片
+}
+# 行内元素
+INLINE_TAGS = {
+    '<a>|<a [^>]*>|</a>': '',  # 超链接
+    '<link>|<link [^>]*>|</link>': '',  # 超链接
+    '<span>|<span [^>]*>|</span>': '',  # span
+    '<label>|<label [^>]*>|</label>': '<br>',  # label
+    '<font>|<font [^>]*>|</font>': '',  # font
+    'data:image(.*?) ': '',            # 图片base64
+}
+# 块级元素
+BLOCK_TAGS = {
+    '<div>\s*?</div>':'',
+    '<h[1-6][^>]*>|</h[1-6]>': '',  # 标题
+    '<p>|<p [^>]*>': '<br>',  # 段落
+    '</p>': '',  # 段落
+    '<div>|<div [^>]*>': '<br>',  # 分割 division
+    '</div>': '',  # 分割 division
+    '<o:p>|<o:p [^>]*>|</o:p>': ''  # OFFICE微软WORD段落
+}
+# 其他
+OTHER = {
+    '<?xml[^>]*>|<?xml [^>]*>|<?xml:.*?>': '',
+    '<epointform>': '',
+    '<!doctype html>|<!doctype html [^>]*>': '',
+    '【关闭】|关闭': '',
+    '【打印】|打印本页': '',
+    '【字体:[\s\S]*】': '',
+    '文章来源:[\u4e00-\u9fa5]+': '',
+    '浏览次数:.*[<]+': '',
+    '(责任编辑:.*?)': '',
+    '分享到[:]': '',
+
+}
+# 样式
+CSS_STYLE = {
+    'style="[\s\S]*?"|style ="[\s\S]*?"': '',
+    'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
+    'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
+    'class="[\s\S]*?"|class ="[\s\S]*?"': '',
+    'align="[\s\S]*?"|align ="[\s\S]*?"': '',
+    'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
+
+}
+# 空白符
+BLANKS = {
+    '\n\s*\n': '\n',
+    '\s*\n\s*': '\n',
+    '[^\S\n]': ' ',
+    '\s+': ' ',
+}
+# css标签集合
+TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
+# css属性集合
+ATTRS = {'id', 'class', 'style', 'width'}
+
+
+def _repair_tag():
+    """异常的标签组合,用来替换非标准页面的标签"""
+    _repairs = {}
+    for tag in TAGS:
+        for attr in ATTRS:
+            key = '{}{}'.format(tag, attr)
+            val = '{} {}'.format(tag, attr)
+            _repairs[key] = val
+    return _repairs
+
+
+def _escape_character(html):
+    """转义字符"""
+    html = html.replace('&lt;', '<')
+    html = html.replace('&gt;', '>')
+    html = html.replace('&quot;', '"')
+    html = html.replace('&amp;', '&')
+    # 不显示输入框边框
+    html = html.replace('<input', '<input style="border-color: transparent;"')
+    return html
+
+
+def _lowercase_tag(html):
+    """标签归一化处理(全部小写 + 标签修复)"""
+    tags = re.findall("<[^>]+>", html)
+    tag_sets = set(tags)
+
+    if len(tag_sets) > 10000:
+        from bs4 import BeautifulSoup
+        soup = BeautifulSoup(html, "lxml")
+        html = str(soup.body.next_element)
+    else:
+        for tag in tag_sets:
+            html = html.replace(tag, str(tag).lower())
+
+    repair_tags = _repair_tag()
+    for err, right in repair_tags.items():
+        html = html.replace(err, right)
+
+    return html
+
+
+def cleaner(html, special=None, completely=False):
+    """
+    数据清洗
+
+    :param html: 清洗的页面
+    :param special: 额外指定页面清洗规则
+    :param completely: 是否完全清洗页面
+    :return: 清洗后的页面源码
+    """
+    if special is None:
+        special = {}
+
+    remove_tags = {
+        **INDEPENDENT_TAGS,
+        **INLINE_TAGS,
+        **BLOCK_TAGS,
+        **OTHER,
+        **special,  # merge per-call rules without mutating the module-level OTHER dict
+        **CSS_STYLE,
+        **BLANKS,
+    }
+    html = _lowercase_tag(html)
+    for tag, repl in remove_tags.items():
+        html = re.sub(tag, repl, html)
+
+    if completely:
+        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # 画布
+        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # 内框架
+        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
+
+    html = _escape_character(html)
+    return html
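
For example, a site-specific rule can be passed through `special`; with the merge above it applies to this call only and no longer leaks into later calls:

    from utils.clean_html import cleaner

    raw = '<div style="color:red"><p>项目名称:测试</p><script>alert(1)</script></div>'
    print(cleaner(raw, special={'项目名称[::]': ''}))
    # scripts/styles are stripped, opening <p>/<div> tags collapse to <br>,
    # and the extra rule removes the "项目名称:" label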

+ 58 - 0
lzz_theme/utils/es_query.py

@@ -0,0 +1,58 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2023-12-25 
+---------
+@summary: es
+---------
+@author: Lzz
+"""
+import sys
+import os
+sys.path.append(os.path.dirname(os.getcwd()))
+from elasticsearch import Elasticsearch
+from utils.title_participle import get_should
+
+
+# es:
+#   host: 172.17.4.184
+#   username: "jybid"
+#   pwd: "Top2023_JEB01i@31"
+#   port: !!int 19905
+#   db: biddingall # es库别名
+
+
+def es_client():
+    cfg = {"host": "172.17.4.184",
+           "port": 19905,
+           "usename": "jybid",
+           "pwd": "Top2023_JEB01i@31"}
+    return Elasticsearch([{"host": cfg['host'], "port": cfg['port']}],http_auth=(cfg['usename'], cfg['pwd']))
+
+
+def es_search(title: str, publish_time: int):
+    """
+    查询es
+
+    :param title: 标题
+    :param publish_time: 发布时间
+    :return:
+    """
+    client = es_client()
+    stime = publish_time - 432000  # 往前推5天
+    etime = publish_time + 432000
+
+    time_limit = {"range": {"publishtime": {"from": stime, "to": etime}}}
+    should_list = get_should(title)   # 对标题进行分词组合query语句
+    # 通过发布标题和发布时间范围查询
+    query = {
+        "query": {
+            "bool": {
+                "must": [time_limit],
+                "should": should_list,
+                "minimum_should_match": "10<90%",
+            }
+        }
+    }
+    result = client.search(index="biddingall", body=query, request_timeout=100)
+    total = int(result['hits']['total']['value'])
+    return total
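
A quick sketch of the de-duplication check; the ±5-day window is applied around the article's own publish timestamp:

    from utils.es_query import es_search
    from utils.tools import date_to_timestamp

    ts = date_to_timestamp("2024-04-09 10:00:00")
    if es_search("某单位设备采购招标公告", ts) > 0:
        print("标题已收录,跳过")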

+ 37 - 0
lzz_theme/utils/execptions.py

@@ -0,0 +1,37 @@
+class YbwCrawlError(Exception):
+
+    def __init__(self, *args, **kwargs):
+        self.code = kwargs.get('code', 10000)
+        self.reason = kwargs.get('reason', '元博网采集未知错误,请手动处理')
+
+        if 'code' not in kwargs:
+            kwargs['code'] = self.code
+        if 'reason' not in kwargs:
+            kwargs['reason'] = self.reason
+
+        for key, val in kwargs.items():
+            setattr(self, key, val)
+        super(YbwCrawlError, self).__init__(*args, kwargs)
+
+
+class AccountError(YbwCrawlError):
+
+    def __init__(self, reason='账号异常', code=10001, **kwargs):
+        super(AccountError, self).__init__(code=code, reason=reason, **kwargs)
+
+
+class CheckError(YbwCrawlError):
+
+    def __init__(self, reason='数据检查异常', code=10002, **kwargs):
+        super(CheckError, self).__init__(code=code, reason=reason, **kwargs)
+
+
+class CrawlError(YbwCrawlError):
+
+    def __init__(self, reason='数据采集异常', code=10003, **kwargs):
+        super(CrawlError, self).__init__(code=code, reason=reason, **kwargs)
+
+
+class AttachmentError(YbwCrawlError):
+
+    def __init__(self, reason='附件异常', code=10004, **kwargs):
+        super(AttachmentError, self).__init__(code=code, reason=reason, **kwargs)
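
All four subclasses funnel through `YbwCrawlError`, which copies `code` and `reason` onto the instance, so callers can branch on either; a sketch:

    from utils.execptions import AccountError, YbwCrawlError

    try:
        raise AccountError(reason='账号无会员访问权限', code=10012)
    except YbwCrawlError as e:
        print(e.code, e.reason)  # -> 10012 账号无会员访问权限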

+ 130 - 0
lzz_theme/utils/get_imgcode.py

@@ -0,0 +1,130 @@
+import requests
+
+
+headers = {"accept": "application/json"}
+
+
+def _pack_file(file):
+    """包装验证码格式"""
+    if isinstance(file, str) and file.startswith("data:image"):
+        img_file = {"file": file}
+    elif isinstance(file, bytes):
+        img_file = {"file": file}
+    else:
+        with open(file, "rb") as f:
+            img_bytes = f.read()
+        img_file = {"file": img_bytes}
+    return img_file
+
+
+def _simple_captcha(file):
+    """
+    普通验证码
+
+    @param file: 验证码 - 可以是图片或者图片base64编码
+    @return:
+    """
+    url = "http://pycaptcha.spdata.jianyu360.com/v1/images/verify"
+    files = _pack_file(file)
+    r = requests.post(url, headers=headers, files=files, stream=True, timeout=10)
+    rp_json = r.json()
+    if "msg" in rp_json and "success" == rp_json["msg"]:
+        return str(rp_json["r"]["code"])
+    return None
+
+
+def _arithmetic_captcha(file):
+    """算术验证码"""
+    url = "http://pycaptcha.spdata.jianyu360.com/v1/images/arithmetic"
+    files = _pack_file(file)
+    r = requests.post(url, headers=headers, files=files, stream=True, timeout=10)
+    json_resp = r.json()
+    if "msg" in json_resp and "success" == json_resp["msg"]:
+        return str(json_resp["r"]["code"])
+    return None
+
+
+def _get_click_verify_captcha(file):
+    """点触式验证码"""
+    url = "http://pycaptcha.spdata.jianyu360.com/v1/images/verify_det"
+    files = _pack_file(file)
+    r = requests.post(url, headers=headers, files=files, stream=True, timeout=10)
+    return r.json()
+
+
+def swordfish_platform(file, mode="simple"):
+    """剑鱼验证码识别平台"""
+    if mode.lower() == "arithmetic":
+        return _arithmetic_captcha(file)
+    elif mode.lower() == "det":
+        return _get_click_verify_captcha(file)
+    else:
+        return _simple_captcha(file)
+
+
+def chaojiying_platform(file, pic_type: int, spidercode=None):
+    """
+    超级鹰识别平台
+
+    pic_type,详情查询地址: https://www.chaojiying.com/price.html
+    @param str spidercode: 爬虫代码
+    """
+    files = _pack_file(file)
+    url = f"http://pycaptcha.spdata.jianyu360.com/v1/images/discern?pic_type={pic_type}"
+    if spidercode is not None:
+        url = f"http://pycaptcha.spdata.jianyu360.com/v1/images/discern?pic_type={pic_type}&jy_code={spidercode}"
+
+    headers = {'accept': 'application/json'}
+    data = {
+        'grant_type': '',
+        'username': 'jianyu001',
+        'password': '123qwe!A',
+        'scope': '',
+        'client_id': '',
+        'client_secret': ''
+    }
+    response = requests.post(url, headers=headers, data=data, files=files, timeout=20)
+    json_resp = response.json()
+    '''when code is 0, the captcha platform returned data normally'''
+    result = json_resp.get("r") or {}
+    pic_id = result.get("pic_id")
+    if json_resp.get("code") == 0:
+        return result.get("pic_str"), pic_id
+    return None, pic_id
+
+
+def chaojiying_report(pic_id: str):
+    """超级鹰平台识别验证码错误时,提交识别错误的验证码pic_id"""
+    url = f"http://pycaptcha.spdata.jianyu360.com/v1/images/report_err?pic_id={pic_id}"
+    headers = {
+        'accept': 'application/json',
+        'Content-Type': 'application/x-www-form-urlencoded'
+    }
+    data = {
+        'grant_type': '',
+        'username': 'jianyu001',
+        'password': '123qwe!A',
+        'scope': '',
+        'client_id': '',
+        'client_secret': ''
+    }
+    response = requests.post(url, headers=headers, data=data, timeout=10)
+    '''
+    Successful callback: {'msg': 'OK', 'code': 0}
+    Do not call this endpoint casually! Only call the error-report/refund endpoint
+    when the recognition result is confirmed wrong; otherwise do not call it.
+    '''
+    return response.json()
+
+
+def get_code(file_path: str) -> dict:
+    return swordfish_platform(file_path) or {}
+
+
+def get_code_det(image_bytes) -> dict:
+    return swordfish_platform(image_bytes, mode="det")
+
+
+# 算术
+def arithmetic_captcha(image_stream):
+    return swordfish_platform(image_stream, mode="arithmetic")
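
The three thin wrappers at the bottom are the entry points the spiders import; a sketch:

    from utils.get_imgcode import get_code, get_code_det, arithmetic_captcha

    code = get_code("captcha.jpg")             # plain captcha -> str, or {} on failure
    with open("det.jpg", "rb") as f:
        points = get_code_det(f.read())        # click-captcha -> raw JSON dict
    with open("calc.jpg", "rb") as f:
        answer = arithmetic_captcha(f.read())  # e.g. "3+4=?" -> "7"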
+

File diff content too large to display
+ 6 - 0
lzz_theme/utils/js/stealth.min.js


+ 18 - 0
lzz_theme/utils/robbot.py

@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-02-05 
+---------
+@summary: 企业微信告警
+---------
+@author: Lzz
+"""
+import requests
+
+
+# 发送邮件
+def send_msg(content):
+    key = "6cd9e893-b351-4ffd-80a3-5c6781c77977"
+    wx_url = f'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key={key}'  # 发送消息接口地址
+    markdown = {"msgtype": "markdown", "markdown": {"content": content}}
+    r2 = requests.post(url=wx_url, json=markdown)  # post请求消息
+    return r2.json()
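
Message bodies use WeChat Work's markdown subset; a sketch:

    from utils.robbot import send_msg

    resp = send_msg('**爬虫告警**\n> site: example\n> reason: cookie 失效')
    print(resp)  # {'errcode': 0, 'errmsg': 'ok'} on success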

+ 50 - 0
lzz_theme/utils/title_participle.py

@@ -0,0 +1,50 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2023-10-10 
+---------
+@summary: 标题分词,组合es查询语句
+---------
+@author: Lzz
+"""
+from requests.auth import HTTPBasicAuth
+import requests
+import json
+
+
+def get_should(title):
+    """Tokenize the title with the ES ik_smart analyzer and build one
+    multi_match phrase clause per token for a bool-should query."""
+
+    # url = "http://192.168.3.149:9201/_analyze"  # 测试
+    url = "http://172.17.4.184:19905/_analyze"  # 线上
+    username = "jybid"
+    password = "Top2023_JEB01i@31"
+
+    headers = {"Content-Type": "application/json"}
+    auth = HTTPBasicAuth(username, password)
+    data = {
+        "analyzer": "ik_smart",
+        "text": title
+    }
+
+    res = requests.post(url, headers=headers, auth=auth, json=data, timeout=10)
+
+    try:
+        res_text = json.loads(res.text).get('tokens') or [{"token": title}]
+    except Exception:
+        res_text = [{"token": title}]
+
+    should_list = []
+    for key in res_text:
+        single_dict = {
+            "multi_match": {
+                "query": f"{key.get('token')}",
+                "type": "phrase",
+                "fields": [
+                    "title"
+                ]
+            }
+        }
+        should_list.append(single_dict)
+
+    return should_list
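
Each token becomes one phrase clause against the `title` field; combined with the `minimum_should_match` setting in `es_query.py`, this approximates fuzzy title matching (the exact tokens depend on the analyzer):

    from utils.title_participle import get_should

    print(get_should("设备采购招标公告"))
    # -> [{'multi_match': {'query': '设备', 'type': 'phrase', 'fields': ['title']}}, ...]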
+
+

+ 699 - 0
lzz_theme/utils/tools.py

@@ -0,0 +1,699 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-04-09
+---------
+@summary: 主题爬虫 工具类
+---------
+@author: Lzz
+"""
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.getcwd()))
+import re
+import time
+import bson
+import redis
+import requests
+import datetime
+import calendar
+import hashlib
+import random
+import execjs
+import functools
+from hashlib import md5
+from loguru import logger
+from collections import namedtuple
+from pymongo import MongoClient
+from pymongo.errors import DuplicateKeyError
+from .clean_html import cleaner
+
+SearchText = namedtuple('SearchText', ['total'])
+
+
+def nsssjss():
+    ex_js = '''
+    const jsdom = require("jsdom");
+    const {JSDOM} = jsdom;
+    const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`);
+    window = dom.window;
+    document = window.document;
+
+    JSEncrypt = require('jsencrypt')
+
+    function encryptByRSA(value) {
+        var encrypt = new JSEncrypt;
+        var RSAPublicKey = "MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQCS2TZDs5+orLYCL5SsJ54+bPCVs1ZQQwP2RoPkFQF2jcT0HnNNT8ZoQgJTrGwNi5QNTBDoHC4oJesAVYe6DoxXS9Nls8WbGE8ZNgOC5tVv1WVjyBw7k2x72C/qjPoyo/kO7TYl6Qnu4jqW/ImLoup/nsJppUznF0YgbyU/dFFNBQIDAQAB";
+        encrypt.setPublicKey('-----BEGIN PUBLIC KEY-----' + RSAPublicKey + '-----END PUBLIC KEY-----')
+        return encrypt.encrypt(value)
+    }
+    function get_njs(){
+        nsssjss = encryptByRSA('/freecms' + '/rest/v1/notice/selectInfoMoreChannel.do' + '$$' + new Date().getTime())
+        return nsssjss
+    }
+    '''
+    ctx = execjs.compile(ex_js)
+    njs = ctx.call('get_njs')
+    return njs
+
+
+def get_QGIP():
+    proxy = "http://6278CF0D:41D9C796172D@tun-vdpzuj.qg.net:15254"
+    proxies = {
+        "http": proxy,
+        "https": proxy,
+    }
+    return proxies
+
+
+def pinyi_proxy(count=100):
+    url = f"http://zltiqu.pyhttp.taolop.com/getip?count={count}&neek=80160&type=2&yys=0&port=2&sb=&mr=1&sep=0&ts=1"
+    retry = 0
+    while (retry := retry + 1) < 30:
+        try:
+            res = requests.get(url, timeout=10)
+            data_list = res.json().get('data')
+            if not data_list:
+                time.sleep(3)
+                continue
+            if "白名单" in res.text:
+                logger.warning("请将此IP加入品易白名单")
+                return []
+            new_list = []
+            for pp in data_list:
+                proxy = {'http': f'http://{pp.get("ip")}:{pp.get("port")}',
+                         'https': f'http://{pp.get("ip")}:{pp.get("port")}'}
+                new_list.append(proxy)
+            return new_list
+        except Exception:
+            logger.error("pinyi访问异常!")
+            time.sleep(3)
+    return []  # give up after 30 attempts instead of implicitly returning None
+
+
+def get_proxy(scheme=None, default=None, socks5h=False):
+    headers = {
+        "Authorization": "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
+    }
+    while True:
+        proxy = requests.get("http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch", headers=headers).json()
+        # proxy = requests.get("http://39.106.157.58:1405/crawl/proxy/socks5/fetch", headers=headers).json()
+        proxies = proxy.get("data")
+        if proxies:
+            break
+        else:
+            logger.warning("暂无代理...")
+            time.sleep(3)
+    if socks5h:
+        proxyh = {
+            "http": proxies.get("http").replace("socks5", "socks5h"),
+            "https": proxies.get("http").replace("socks5", "socks5h")
+        }
+        proxies = proxyh
+    logger.info(f"切换代理: {proxies}")
+    if not scheme:
+        return proxies
+    else:
+        return proxies.get(scheme, default)
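
A sketch of pulling a proxy for requests; `socks5h=True` asks the proxy, rather than the client, to resolve DNS:

    import requests
    from utils.tools import get_proxy

    proxies = get_proxy(socks5h=True)  # e.g. {'http': 'socks5h://ip:port', 'https': ...}
    requests.get("http://example.com", proxies=proxies, timeout=10)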
+
+
+def Mongo_client():
+    client = MongoClient("172.17.4.87", 27080)
+    # client = MongoClient("127.0.0.1", 27017)
+    return client
+
+
+def Redis_client():
+    _pool = redis.ConnectionPool(
+        host='172.17.162.28',
+        # host='127.0.0.1',
+        port=7361,
+        password='k5ZJR5KV4q7DRZ92DQ',
+        db=1
+    )
+    # _pool = redis.ConnectionPool(
+    #     host='127.0.0.1',
+    #     port=6379,
+    #     db=1
+    # )
+    r = redis.Redis(connection_pool=_pool, decode_responses=True)
+    return r
+
+
+def int2long(param: int):
+    """int 转换成 long """
+    return bson.int64.Int64(param)
+
+
+def get_current_date(date_format="%Y-%m-%d %H:%M:%S"):
+    return datetime.datetime.now().strftime(date_format)
+
+
+def date_to_timestamp(date, time_format="%Y-%m-%d %H:%M:%S"):
+    """
+    @summary:
+    ---------
+    @param date:将"2011-09-28 10:00:00"时间格式转化为时间戳
+    @param format:时间格式
+    ---------
+    @result: 返回时间戳
+    """
+    if ":" in date:
+        timestamp = time.mktime(time.strptime(date, time_format))
+    else:
+        timestamp = time.mktime(time.strptime(date, "%Y-%m-%d"))
+    return int(timestamp)
+
+
+def timestamp_to_date(timestamp, time_format="%Y-%m-%d %H:%M:%S"):
+    """
+    @summary:
+    ---------
+    @param timestamp: 将时间戳转化为日期
+    @param format: 日期格式
+    ---------
+    @result: 返回日期
+    """
+    if timestamp is None:
+        raise ValueError("timestamp is null")
+
+    date = time.localtime(timestamp)
+    return time.strftime(time_format, date)
+
+
+def get_sha1(*args):
+    """
+    @summary: 获取唯一的40位值, 用于获取唯一的id
+    ---------
+    @param *args: 参与联合去重的值
+    ---------
+    @result: ba4868b3f277c8e387b55d9e3d0be7c045cdd89e
+    """
+
+    sha1 = hashlib.sha1()
+    for arg in args:
+        sha1.update(str(arg).encode())
+    return sha1.hexdigest()  # 40位
+
+
+def get_sha256(*args):
+    """
+    @summary: 获取唯一的64位值, 用于获取唯一的id
+    ---------
+    @param *args: 参与联合去重的值
+    ---------
+    @result: 5580c91ea29bf5bd963f4c08dfcacd983566e44ecea1735102bc380576fd6f30
+    """
+
+    sha256 = hashlib.sha256()
+    for arg in args:
+        sha256.update(str(arg).encode())
+    return sha256.hexdigest()  # 64位
+
+
+def md5value(val):
+    md5 = hashlib.md5()
+    if isinstance(val, bytes):
+        md5.update(str(val).encode("utf-8"))
+    elif isinstance(val, str):
+        md5.update(val.encode("utf-8"))
+    return md5.hexdigest()
+
+
+def ensure_int64(n):
+    """
+    >>> ensure_int64(None)
+    0
+    >>> ensure_int64(False)
+    0
+    >>> ensure_int64(12)
+    12
+    >>> ensure_int64("72")
+    72
+    """
+    if not n:
+        return bson.int64.Int64(0)
+    return bson.int64.Int64(n)
+
+
+def get_today_of_day(day_offset=0):
+    return str(datetime.date.today() + datetime.timedelta(days=day_offset))
+
+
+def get_current_timestamp():
+    return int(time.time())
+
+
+def add_zero(n):
+    return "%02d" % n
+
+
+def sup_zero(indate):
+    deal = indate.split(' ')
+    head = deal[0].split('-')
+    tail = ""
+    if len(deal) == 2:
+        tail = " " + deal[1]
+    year = int(head[0])
+    month = int(head[1])
+    day = int(head[2])
+    fdate = datetime.datetime(year=year, month=month, day=day)
+    formatted_date = fdate.strftime("%Y-%m-%d") + tail
+    return formatted_date
+
+
+def get_days_of_month(year, month):
+    """
+    返回天数
+    """
+
+    return calendar.monthrange(year, month)[1]
+
+
+def get_year_month_and_days(month_offset=0):
+    """
+    @summary:
+    ---------
+    @param month_offset: 月份偏移量
+    ---------
+    @result: ('2019', '04', '30')
+    """
+
+    today = datetime.datetime.now()
+    year, month = today.year, today.month
+
+    this_year = int(year)
+    this_month = int(month)
+    total_month = this_month + month_offset
+    if month_offset >= 0:
+        if total_month <= 12:
+            days = str(get_days_of_month(this_year, total_month))
+            total_month = add_zero(total_month)
+            return (year, total_month, days)
+        else:
+            i = total_month // 12
+            j = total_month % 12
+            if j == 0:
+                i -= 1
+                j = 12
+            this_year += i
+            days = str(get_days_of_month(this_year, j))
+            j = add_zero(j)
+            return (str(this_year), str(j), days)
+    else:
+        if (total_month > 0) and (total_month < 12):
+            days = str(get_days_of_month(this_year, total_month))
+            total_month = add_zero(total_month)
+            return (year, total_month, days)
+        else:
+            i = total_month // 12
+            j = total_month % 12
+            if j == 0:
+                i -= 1
+                j = 12
+            this_year += i
+            days = str(get_days_of_month(this_year, j))
+            j = add_zero(j)
+            return (str(this_year), str(j), days)
+
+
+def get_month(month_offset=0):
+    """
+    Get the date N months away from today.
+    if month_offset > 0, returns the date N months after today
+    if month_offset < 0, returns the date N months before today
+    date format = "YYYY-MM-DD"
+    """
+    today = datetime.datetime.now()
+    day = add_zero(today.day)
+
+    (y, m, d) = get_year_month_and_days(month_offset)
+    arr = (y, m, d)
+    if int(day) < int(d):
+        arr = (y, m, day)
+    return "-".join("%s" % i for i in arr)
+
+
+def extract_file_type(file_name="附件名", file_url="附件地址", file_type_list=None):
+    """
+        抽取附件类型
+    Args:
+        file_name: 附件名
+        file_url: 附件地址
+        file_type_list: 其他附件后缀
+    Returns: 附件类型
+    """
+    if file_name and file_url:
+        file_name = file_name.strip()
+        file_types = ['zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'hzzbs',
+                      'jpg', 'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg']
+        if file_type_list:
+            ftp_list = list(map(lambda x: x.lower(), file_type_list))
+            file_types.extend(ftp_list)
+
+        file_type = file_url.split('?')[0].split('.')[-1].lower()
+        if file_type not in file_types:
+            file_type = file_url.split('?')[-1].split('.')[-1].lower()
+            if file_type in file_types:
+                return file_type
+            else:
+                for ftp in file_types:
+                    file_type = re.search(ftp, file_name) or re.search("\." + ftp, file_url)
+                    if file_type:
+                        return file_type.group(0).replace('.', '')
+
+        else:
+            return file_type
+    return None
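
The lookup order is: URL path suffix, then URL query suffix, then a regex search over the file name and URL; a sketch (note the custom suffixes are lower-cased before matching):

    from utils.tools import extract_file_type

    print(extract_file_type("招标文件.pdf", "http://e.com/f?id=1"))   # 'pdf'  (from the name)
    print(extract_file_type("附件", "http://e.com/files/a.docx"))     # 'docx' (from the url)
    print(extract_file_type("附件.ceb", "http://e.com/dl", ["CEB"]))  # 'ceb'  (custom suffix)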
+
+
+def remove_htmldata(remove_info_list: list, html: str, response):
+    """
+        过滤详情页无效数据
+    Args:
+        remove_info_list: 需删除内容的xpath或文本 -> list [xpath,re,str] eg:['<re>data:image/(.*?)"',]
+        html: 待清洗文本
+        response: 原文响应体
+
+    Returns: 清洗后的文本
+
+    """
+    if html and remove_info_list:
+        for extra_item in remove_info_list:
+            if re.search('^//.*', extra_item):
+                extra_html_list = response.xpath(extra_item).extract()
+                for extra_html in extra_html_list:
+                    if extra_html:
+                        html = html.replace(extra_html, '')
+            elif re.search('^<re>.*', extra_item):
+                extra_item = extra_item.replace('<re>', '')
+                extra_html_list = re.findall(f'{extra_item}', html, re.S | re.I | re.M)
+                if extra_html_list:
+                    for exhtml in extra_html_list:
+                        html = html.replace(exhtml, '')
+            else:
+                extra_html = extra_item
+                if extra_html:
+                    html = html.replace(extra_html, '')
+    return html
+
+
+def text_search(content: str) -> SearchText:
+    """
+    中文检索
+
+    :param content: 文本
+    :return: 中文数量
+    """
+    if not content:
+        return SearchText(0)
+
+    results = re.findall('[\u4e00-\u9fa5]', content, re.S)
+    # 列表长度即是中文的字数
+    return SearchText(len(results))
+
+
+def clean_title(title):
+    '''清洗标题'''
+    if title:
+        rule_list = [
+            '\(\d{1,20}\)',
+            '\[[\u4e00-\u9fa5]{1,9}\]',
+            '【[\u4e00-\u9fa5]{1,9}】',
+        ]
+        for rule in rule_list:
+            title = re.sub(rule, '', title)
+
+    return title
+
+
+def substitute(html_str, special=None, completely=False):
+    """HTML 替换"""
+    html_str = cleaner(html=html_str, special=special, completely=completely)
+    return html_str
+
+
+def handle_publish_time(publishtime):
+    '''处理发布时间'''
+    try:
+        time_str = get_current_date().split(' ')[-1]
+        if ':' not in publishtime:
+            publishtime = publishtime + ' ' + time_str
+        else:
+            if '00:00:00' in publishtime:
+                publishtime = publishtime.split(' ')[0] + ' ' + time_str
+
+        l_np_publishtime = int2long(date_to_timestamp(publishtime))
+        publishtime, l_np_publishtime = handle_publish_time_overdue(publishtime, l_np_publishtime)
+        return publishtime, l_np_publishtime
+    except Exception:
+        raise ValueError("publishtime 格式错误!")  # malformed publish time
+
+
+def handle_publish_time_overdue(publishtime, l_np_publishtime):
+    """处理超期发布时间"""
+    if l_np_publishtime and l_np_publishtime > get_current_timestamp():
+        logger.warning("发布时间大于当前时间,已设置当前时间为发布时间!")
+        publishtime = get_current_date()
+        l_np_publishtime = ensure_int64(date_to_timestamp(publishtime))
+
+    return publishtime, l_np_publishtime
+
+
+def handle_page_html(item):
+    '''检测正文'''
+    title = item.get('title')
+    publishtime = item.get('publishtime')
+    href = item.get('href')
+    if href == "#":
+        href = item.get('competehref')
+    contenthtml = item.get('contenthtml')
+    detail = item.get('detail')
+    if not contenthtml:
+        logger.warning(f"页面源码不能为空!\n 发布地址:{href}\n 发布时间:{publishtime}\n 标题:{title}")
+        raise ValueError("无效正文!")
+    else:
+        if text_search(detail).total == 0:
+            logger.warning("无内容数据,数据不入保存服务!")
+            item['sendflag'] = "true"
+
+
+def check_data_validity(item):
+    '''检测基础字段是否完整'''
+    title = item.get('title')
+    publishtime = item.get('publishtime')
+    href = item.get('href')
+    if href == "#":
+        href = item.get('competehref')
+    if not title or not publishtime or not href:
+        logger.error(f"基础数据不能为空!\n 发布地址:{href}\n 发布时间:{publishtime}\n 标题:{title}")
+        raise ValueError("基础数据异常")
+
+
+def format_fileds(item, **kwargs):
+    '''格式化入库字段(bidding)'''
+    req_fileds = ['title', 'publishtime', 'spidercode', 'infoformat', 'site', 'channel', 'area', 'city', 'jsondata',
+                  'district', 'href', 'is_mixed', 'comeintime', 's_title', 'l_np_publishtime', 'contenthtml','competehref',
+                  'detail', 'iscompete', 'sendflag', '_d', 'publishdept', 'type', 'T', 'projectinfo', 'is_theme']
+    rm_list = []
+    for key, val in item.items():  # 过滤非必须字段
+        if key not in req_fileds:
+            rm_list.append(key)
+
+    for kk in rm_list:
+        item.pop(kk, None)
+
+    item['detail'] = substitute(item.get('contenthtml'))
+    item['s_title'] = item.get('s_title') or item.get('title')
+    pub_time = handle_publish_time(item.get('publishtime'))
+    item['publishtime'] = pub_time[0]
+    item['l_np_publishtime'] = pub_time[1]
+    item['infoformat'] = 1
+    item['iscompete'] = True
+    item['sendflag'] = "false"
+    item['_d'] = "comeintime"
+    item['publishdept'] = ""
+    item['type'] = ""
+    item['T'] = "bidding"
+
+    for k, v in kwargs.items():
+        if k in req_fileds:
+            item[k] = v
+        else:
+            logger.error(f"{k} 入库字段未定义!")
+
+    handle_page_html(item)
+    check_data_validity(item)
+    item['comeintime'] = int2long(time.time())
+
+    return item
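
A sketch of the normal ingestion path for a bidding item (field values are illustrative):

    from utils.tools import format_fileds

    item = {
        "site": "示例网站", "channel": "招标公告", "spidercode": "a_example_zbgg",
        "area": "全国", "city": "", "district": "",
        "title": "某单位设备采购招标公告",
        "publishtime": "2024-04-09",
        "href": "http://example.com/notice/1.html",
        "contenthtml": "<div>正文内容</div>",
    }
    item = format_fileds(item, is_mixed=False)  # extra kwargs must be declared in req_fileds
    # item now carries detail / s_title / l_np_publishtime / comeintime and is ready for Mongo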
+
+
+def format_fileds_njpc(item, **kwargs):
+    '''格式化入库字段(拟建爬虫)'''
+    req_fileds = ['site', 'approvenumber', 'method', 'project_scale', 'area', 'is_mixed','competehref',
+                  'air_conditioner', 'funds', 'scale', 'construction_area', 'channel', 'contenthtml', 'elevator',
+                  'building_floors', 'ownertel', 'parking', 'building', 'spidercode', 'title',
+                  'detail', 'projectinfo', 'exterior', 'constructionunit', 'owner_info', 'approvetime',
+                  'project_startdate', 'investment', 'heating', 'district', 'constructionunitperson',
+                  'designunitperson', 'publishtime', 'system', 'pace', 'total', 'project_scale_info', 'passive',
+                  'phone', 'construction', 'parking_pace', 'floors', 'freshair_system', 'other_project_scale',
+                  'conditioner', 'wall', 'designunit', 'owneraddr', 'prefabricated_building', 'materials',
+                  'constructionunitaddr', 'constructionunit_info', 'project_person', 'approvecontent',
+                  'constructionunittel', 'floor', 'person', 'city', 'floor_area', 'project', 'approvestatus',
+                  'project_completedate', 'completedate', 'ownerperson', 'sendflag', 'comeintime',
+                  'steel_structure', 'projectaddr', 'freshair', 'T', 'startdate', 'house', 'projectname',
+                  'exterior_wall_materials', 'other', 'passive_house', 'jsondata', 'air', 'prefabricated',
+                  'designunit_info', 'approvedept', 'total_investment', 'infoformat', 'project_phone',
+                  'owner', 'designunittel', 'projecttype', 'approvecode', 'steel', 'is_theme', 'designunitaddr',
+                  'heating_method', 'href', 'projectperiod', 'structure']
+
+    rm_list = []
+    for key, val in item.items():  # 过滤非必须字段
+        if key not in req_fileds:
+            rm_list.append(key)
+
+    for kk in rm_list:
+        item.pop(kk, None)
+
+    item['detail'] = substitute(item.get('contenthtml'))
+    item['title'] = item.get('title') or item.get('projectname')
+    pub_time = handle_publish_time(item.get('publishtime'))
+    item['publishtime'] = pub_time[1]
+    item['infoformat'] = 2
+    item['sendflag'] = "false"
+    item['T'] = "bidding"
+
+    for k, v in kwargs.items():
+        if k in req_fileds:
+            item[k] = v
+        else:
+            logger.error(f"{k} 入库字段未定义!")
+
+    handle_page_html(item)
+    check_data_validity(item)
+    item['comeintime'] = int2long(time.time())
+
+    return item
+
+
+def search(pattern, string):
+    result = re.search(pattern, string)
+    if result:
+        return result.groups()[0]
+
+
+def sleep_time(start_time: int, end_time=0, step=-1):
+    time.sleep(random.random())
+    for i in range(start_time, end_time, step):
+        print(f"\r *** 休眠中... {i} 秒 *** ", end='')
+        time.sleep(1)
+    print("\r <* 休眠结束 *> ", end='')
+
+
+# 装饰器
+class Singleton(object):
+    def __init__(self, cls):
+        self._cls = cls
+        self._instance = {}
+
+    def __call__(self, *args, **kwargs):
+        if self._cls not in self._instance:
+            self._instance[self._cls] = self._cls(*args, **kwargs)
+        return self._instance[self._cls]
+
+
+def down_load_image(proxy=None):
+    img_url = 'https://gdgpo.czt.gd.gov.cn/freecms/verify/verifyCode.do?createTypeFlag=n'
+    header = {
+        "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
+        "Accept-Language": "zh-CN,zh;q=0.9",
+        "Connection": "keep-alive",
+        "Referer": "https://gdgpo.czt.gd.gov.cn/cms-gd/site/guangdong/qwjsy/index.html?",
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
+    }
+    res = requests.get(img_url, headers=header, proxies=proxy, timeout=30, verify=False)
+    upload_address = "http://pycaptcha.spdata.jianyu360.com/v1/images/verify"
+    content = {'file': res.content}
+    # with open('image.jpg', 'wb+') as f:
+    #     f.write(res.content)
+    headers = {'accept': 'application/json'}
+    json_resp = requests.post(upload_address, headers=headers, files=content, stream=True).json()
+    if "msg" in json_resp and "success" == json_resp["msg"]:
+        code = json_resp["r"]["code"]
+        if len(code) == 4:
+            return code
+    return None
+
+
+def _pack_file(file):
+    """包装验证码格式"""
+    if isinstance(file, str) and file.startswith("data:image"):
+        img_file = {"file": file}
+    elif isinstance(file, bytes):
+        img_file = {"file": file}
+    else:
+        with open(file, "rb") as f:
+            img_bytes = f.read()
+        img_file = {"file": img_bytes}
+    return img_file
+
+
+def simple_captcha(file):
+    """
+    普通验证码
+
+    @param file: 验证码 - 可以是图片或者图片base64编码
+    @return:
+    """
+    url = "http://pycaptcha.spdata.jianyu360.com/v1/images/verify"
+    files = _pack_file(file)
+    r = requests.post(url, headers={"accept": "application/json"}, files=files, stream=True, timeout=10)
+    rp_json = r.json()
+    if "msg" in rp_json and "success" == rp_json["msg"]:
+        return str(rp_json["r"]["code"])
+    return None
+
+
+def retry_on_exception(retries=1, timeout=1):
+    def decorate(func):
+
+        @functools.wraps(func)
+        def warp(*args, **kwargs):
+            for _ in range(retries):
+                try:
+                    return func(*args, **kwargs)
+                except Exception as e:
+                    print(f"执行[{func.__name__}]失败, args:{args}, kwargs:{kwargs} 异常:{e}")
+                    time.sleep(timeout)
+
+            raise RuntimeError(f"执行[{func.__name__}]达到最大重试次数")
+
+        return warp
+
+    return decorate
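
A sketch of the retry decorator wrapping a flaky fetch:

    import requests
    from utils.tools import retry_on_exception

    @retry_on_exception(retries=3, timeout=2)
    def fetch_list_page(url):
        res = requests.get(url, timeout=10)
        res.raise_for_status()  # any exception triggers a retry after 2s
        return res.text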
+
+
+class PySpiderError(Exception):
+
+    def __init__(self, *args, **kwargs):
+        if 'code' not in kwargs and 'reason' not in kwargs:
+            kwargs['code'] = 10000
+            kwargs['reason'] = '未知爬虫错误,请手动处理'
+        for key, val in kwargs.items():
+            setattr(self, key, val)
+        super(PySpiderError, self).__init__(*args, kwargs)
+
+
+class AttachmentNullError(PySpiderError):
+
+    def __init__(self, code: int = 10004, reason: str = '附件下载异常'):
+        super(AttachmentNullError, self).__init__(code=code, reason=reason)
+
+
+class CustomError(Exception):
+
+    def __init__(self, ErrorInfo):
+        self.ErrorInfo = ErrorInfo
+
+    def __str__(self):
+        return self.ErrorInfo

+ 461 - 0
lzz_theme/utils/webdriver.py

@@ -0,0 +1,461 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-01-19
+---------
+@summary: 远程selenium服务
+---------
+@author: dzr
+"""
+import os
+import queue
+import threading
+
+from selenium import webdriver
+from selenium.webdriver.chrome.remote_connection import ChromeRemoteConnection
+from selenium.webdriver.firefox.remote_connection import FirefoxRemoteConnection
+from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
+
+
+
+# 浏览器渲染
+WEBDRIVER = dict(
+    pool_size=1,  # 浏览器的数量
+    load_images=False,  # 是否加载图片
+    user_agent=None,  # 字符串 或 无参函数,返回值为user_agent
+    proxy=None,  # xxx.xxx.xx.xxx:xxxx 或 无参函数,返回值为代理地址
+    headless=False,  # 是否为无头浏览器
+    driver_type="FIREFOX",  # CHROME、FIREFOX
+    timeout=30,  # 请求超时时间
+    window_size=(1280, 800),  # 窗口大小
+    executable_path=None,  # 浏览器路径,默认为默认路径
+    render_time=0,  # 渲染时长,即打开网页等待指定时间后再获取源码
+    custom_argument=["--ignore-certificate-errors"],  # 自定义浏览器渲染参数
+    usages_local_driver=True,  # 是否加载本地驱动
+    server_addr="http://192.168.3.182:8899/wd/hub",  # selenium 远程服务地址
+    version="",  # 远程浏览器版本
+    service_log_path=os.devnull  # 日志路径
+)
+
+from loguru import logger
+from utils.tools import Singleton
+
+DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
+
+
+class WebDriver(RemoteWebDriver):
+    """浏览器采集 - selenium"""
+    CHROME = "CHROME"
+    FIREFOX = "FIREFOX"
+
+    def __init__(
+        self,
+        load_images=True,
+        user_agent=None,
+        proxy=None,
+        driver_type=CHROME,
+        timeout=20,
+        headless=False,
+        usages_local_driver=False,
+        window_size=(1024, 800),
+        server_addr=None,
+        version=None,
+        custom_argument=None,
+        executable_path=None,
+        service_log_path=None,
+        **kwargs
+    ):
+        """
+        webdirver 封装,支持 chrome 和 firefox
+        Args:
+            load_images: 是否加载图片
+            user_agent: 字符串 或 无参函数,返回值为user_agent
+            proxy: xxx.xxx.xxx.xxx:xxxx 或 无参函数,返回值为代理地址
+            headless: 是否启用无头模式
+            driver_type: CHROME 或 FIREFOX...
+            timeout: 请求超时时间
+            window_size: # 窗口大小
+            executable_path: 浏览器路径,默认为默认路径
+            server_addr: 远程服务地址
+            usages_local_driver: 是否使用本地驱动
+            service_log_path: selenium service 日志路径
+            version: 浏览器版本
+            **kwargs:
+        """
+        self._load_images = load_images or WEBDRIVER["load_images"]
+        self._user_agent = user_agent or DEFAULT_USERAGENT
+        self._proxy = proxy or WEBDRIVER["proxy"]
+        self._headless = headless or WEBDRIVER["headless"]
+        self._usages_local_driver = usages_local_driver or WEBDRIVER["usages_local_driver"]
+        self._timeout = timeout or WEBDRIVER["timeout"]
+        self._window_size = window_size or WEBDRIVER["window_size"]
+        self._executable_path = executable_path or WEBDRIVER["executable_path"]
+        self._custom_argument = custom_argument or WEBDRIVER["custom_argument"]
+        self._server_addr = server_addr or WEBDRIVER["server_addr"]
+        self._version = version or WEBDRIVER["version"]
+        self._service_log_path = service_log_path or WEBDRIVER["service_log_path"]
+
+        if driver_type == WebDriver.CHROME:
+            self.driver = self.chrome_driver()
+
+        elif driver_type == WebDriver.FIREFOX:
+            self.driver = self.firefox_driver()
+
+        else:
+            raise TypeError(
+                "dirver_type must be one of CHROME or FIREFOX, but received {}".format(
+                    type(driver_type)
+                )
+            )
+
+        # driver.get(url) can hang without returning or raising, leaving the
+        # program stuck; setting a page-load timeout resolves this.
+        self.driver.set_page_load_timeout(self._timeout)
+        # script execution timeout (same value as the page-load timeout)
+        self.driver.set_script_timeout(self._timeout)
+
+        self._is_remote = not self._usages_local_driver
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_val:
+            logger.error(exc_val)
+
+        self.quit()
+        return False
+
+    def __getattr__(self, name):
+        if self.driver:
+            return getattr(self.driver, name)
+        else:
+            raise AttributeError
+
+    def get_driver(self):
+        return self.driver
+
+    def local_firefox_driver(self):
+        firefox_profile = webdriver.FirefoxProfile()
+        firefox_options = webdriver.FirefoxOptions()
+        firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX
+        firefox_profile.set_preference("dom.webdriver.enabled", False)
+
+        if self._proxy:
+            proxy = self._proxy() if callable(self._proxy) else self._proxy
+            proxy = proxy.replace("socks5://", "")
+            # 使用socks5 代理
+            ip, port = proxy.split(":")
+            firefox_profile.set_preference('network.proxy.type', 1)  # 不使用代理:0, 使用代理:1
+            firefox_profile.set_preference('network.proxy.socks', ip)
+            firefox_profile.set_preference('network.proxy.socks_port', int(port))
+
+        if self._user_agent:
+            firefox_profile.set_preference(
+                "general.useragent.override",
+                self._user_agent() if callable(
+                    self._user_agent) else self._user_agent,
+            )
+
+        if not self._load_images:
+            firefox_profile.set_preference("permissions.default.image", 2)
+
+        if self._headless:
+            firefox_options.add_argument("--headless")
+            firefox_options.add_argument("--disable-gpu")
+
+        # 添加自定义的配置参数
+        if self._custom_argument:
+            for arg in self._custom_argument:
+                firefox_options.add_argument(arg)
+
+        if self._executable_path:
+            driver = webdriver.Firefox(
+                capabilities=firefox_capabilities,
+                options=firefox_options,
+                firefox_profile=firefox_profile,
+                executable_path=self._executable_path,
+                service_log_path=self._service_log_path
+            )
+        else:
+            driver = webdriver.Firefox(
+                capabilities=firefox_capabilities,
+                options=firefox_options,
+                firefox_profile=firefox_profile,
+                service_log_path=self._service_log_path
+            )
+
+        if self._window_size:
+            driver.set_window_size(*self._window_size)
+
+        return driver
+
+    def remote_firefox_driver(self):
+        firefox_options = webdriver.FirefoxOptions()
+        desired_capabilities = firefox_options.to_capabilities()
+        firefox_options.set_preference("dom.webdriver.enabled", False)
+
+        if self._version:
+            desired_capabilities['version'] = self._version
+
+        if self._proxy:
+            proxy = self._proxy() if callable(self._proxy) else self._proxy
+            proxy = proxy.replace("socks5://", "")
+            # 使用socks5 代理
+            ip, port = proxy.split(":")
+            firefox_options.set_preference('network.proxy.type', 1)  # 不使用代理:0, 使用代理:1
+            firefox_options.set_preference('network.proxy.socks', ip)
+            firefox_options.set_preference('network.proxy.socks_port', int(port))
+
+        if self._user_agent:
+            firefox_options.set_preference(
+                "general.useragent.override",
+                self._user_agent() if callable(self._user_agent) else self._user_agent,
+            )
+
+        if not self._load_images:
+            firefox_options.set_preference("permissions.default.image", 2)
+
+        if self._headless:
+            firefox_options.add_argument("--headless")
+            firefox_options.add_argument("--disable-gpu")
+
+        if self._custom_argument:
+            for arg in self._custom_argument:
+                firefox_options.add_argument(arg)
+
+        executor = FirefoxRemoteConnection(remote_server_addr=self._server_addr)
+        browser = webdriver.Remote(
+            command_executor=executor,
+            desired_capabilities=desired_capabilities,
+            options=firefox_options
+        )
+
+        if self._window_size:
+            browser.set_window_size(*self._window_size)
+
+        return browser
+
+    def firefox_driver(self):
+        if self._usages_local_driver:
+            return self.local_firefox_driver()
+        return self.remote_firefox_driver()
+
+    def remote_chrome_driver(self):
+        chrome_options = webdriver.ChromeOptions()
+        desired_capabilities = chrome_options.to_capabilities()
+        # 此步骤很重要,设置为开发者模式,防止被各大网站识别出来使用了Selenium
+        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
+        chrome_options.add_experimental_option("useAutomationExtension", False)
+        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
+        # docker 里运行需要
+        chrome_options.add_argument('--no-sandbox')
+        chrome_options.add_argument('--disable-extensions')
+        chrome_options.add_argument('--disable-dev-shm-usage')
+
+        if self._version:
+            desired_capabilities['version'] = self._version
+
+        if self._proxy:
+            chrome_options.add_argument(
+                "--proxy-server={}".format(
+                    self._proxy() if callable(self._proxy) else self._proxy
+                )
+            )
+
+        if self._user_agent:
+            chrome_options.add_argument(
+                "user-agent={}".format(
+                    self._user_agent()
+                    if callable(self._user_agent)
+                    else self._user_agent
+                )
+            )
+
+        if not self._load_images:
+            chrome_options.add_experimental_option(
+                "prefs", {"profile.managed_default_content_settings.images": 2}
+            )
+
+        if self._headless:
+            chrome_options.add_argument("--headless")
+            chrome_options.add_argument("--disable-gpu")
+
+        if self._window_size:
+            chrome_options.add_argument(
+                "--window-size={},{}".format(self._window_size[0], self._window_size[1])
+            )
+
+        # 添加自定义的配置参数
+        if self._custom_argument:
+            for arg in self._custom_argument:
+                chrome_options.add_argument(arg)
+
+        browser = webdriver.Remote(
+            command_executor=ChromeRemoteConnection(
+                remote_server_addr=self._server_addr,
+                keep_alive=True),
+            desired_capabilities=desired_capabilities,
+            options=chrome_options
+        )
+
+        # 隐藏浏览器特征
+        with open(os.path.join(os.path.dirname(__file__), "./js/stealth.min.js")) as f:
+            js = f.read()
+            params = {
+                'cmd': 'Page.addScriptToEvaluateOnNewDocument',
+                'params': {'source': js}
+            }
+            response = browser.execute("executeCdpCommand", params)['value']
+        return browser
+
+    def local_chrome_driver(self):
+        chrome_options = webdriver.ChromeOptions()
+        # 此步骤很重要,设置为开发者模式,防止被各大网站识别出来使用了Selenium
+        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
+        chrome_options.add_experimental_option("useAutomationExtension", False)
+        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
+        # docker 里运行需要
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument('--disable-extensions')
+        chrome_options.add_argument('--disable-dev-shm-usage')
+
+        if self._proxy:
+            chrome_options.add_argument(
+                "--proxy-server={}".format(
+                    self._proxy() if callable(self._proxy) else self._proxy
+                )
+            )
+
+        if self._user_agent:
+            chrome_options.add_argument(
+                "user-agent={}".format(
+                    self._user_agent()
+                    if callable(self._user_agent)
+                    else self._user_agent
+                )
+            )
+
+        if not self._load_images:
+            chrome_options.add_experimental_option(
+                "prefs", {"profile.managed_default_content_settings.images": 2}
+            )
+
+        if self._headless:
+            chrome_options.add_argument("--headless")
+            chrome_options.add_argument("--disable-gpu")
+
+        if self._window_size:
+            chrome_options.add_argument(
+                "--window-size={},{}".format(self._window_size[0], self._window_size[1])
+            )
+
+        # 添加自定义的配置参数
+        if self._custom_argument:
+            for arg in self._custom_argument:
+                chrome_options.add_argument(arg)
+
+        if self._executable_path:
+            driver = webdriver.Chrome(
+                chrome_options=chrome_options,
+                executable_path=self._executable_path,
+                service_log_path=self._service_log_path
+            )
+        else:
+            driver = webdriver.Chrome(
+                chrome_options=chrome_options,
+                service_log_path=self._service_log_path
+            )
+
+        # 隐藏浏览器特征
+        with open(os.path.join(os.path.dirname(__file__), "./js/stealth.min.js")) as f:
+            js = f.read()
+            driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js})
+
+        return driver
+
+    def chrome_driver(self):
+        if self._usages_local_driver:
+            return self.local_chrome_driver()
+        return self.remote_chrome_driver()
+
+    @property
+    def cookies(self):
+        cookies_json = {}
+        for cookie in self.driver.get_cookies():
+            cookies_json[cookie["name"]] = cookie["value"]
+        return cookies_json
+
+    @cookies.setter
+    def cookies(self, val: dict):
+        """
+        设置cookie
+        Args:
+            val: {"key":"value", "key2":"value2"}
+
+        Returns:
+
+        """
+        for key, value in val.items():
+            self.driver.add_cookie({"name": key, "value": value})
+
+    def quit(self):
+        try:
+            self.get_driver().quit()
+        except Exception:
+            # We don't care about the message because something probably has gone wrong
+            pass
+
+    # def __del__(self):
+    #     if self.driver:
+    #         self.driver.quit()
+
+
+@Singleton
+class WebDriverPool:
+    def __init__(self, pool_size=5, **kwargs):
+        self.queue = queue.Queue(maxsize=pool_size)
+        self.kwargs = kwargs
+        self.lock = threading.RLock()
+        self.driver_count = 0
+
+    @property
+    def is_full(self):
+        return self.driver_count >= self.queue.maxsize
+
+    def get(self, user_agent: str = None, proxy: str = None) -> WebDriver:
+        """
+        获取webdriver
+        当webdriver为新实例时会使用 user_agen, proxy, cookie参数来创建
+        Args:
+            user_agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36
+            proxy: xxx.xxx.xxx.xxx
+        Returns:
+
+        """
+        if not self.is_full:
+            with self.lock:
+                if not self.is_full:
+                    kwargs = self.kwargs.copy()
+                    if user_agent:
+                        kwargs["user_agent"] = user_agent
+                    if proxy:
+                        kwargs["proxy"] = proxy
+                    driver = WebDriver(**kwargs)
+                    self.queue.put(driver)
+                    self.driver_count += 1
+
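+        # Blocks here until a driver is available if the pool is already at capacity.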
+        driver = self.queue.get()
+
+        return driver
+
+    def put(self, driver):
+        self.queue.put(driver)
+
+    def remove(self, driver):
+        driver.quit()
+        self.driver_count -= 1
+
+    def close(self):
+        while not self.queue.empty():
+            driver = self.queue.get()
+            driver.quit()
+            self.driver_count -= 1
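
For reference, a minimal usage sketch of the pool above. This is a sketch under assumptions: the headless keyword is inferred from WebDriver's private _headless field, and the wrapped selenium instance is reached through the .driver attribute used by the cookies property; neither is confirmed outside this diff.

    pool = WebDriverPool(pool_size=2, headless=True)   # extra kwargs are forwarded to WebDriver()
    driver = pool.get(user_agent="Mozilla/5.0 ...", proxy="xxx.xxx.xxx.xxx")
    try:
        driver.driver.get("https://example.com")       # .driver: the underlying selenium instance
        print(driver.cookies)                          # flat name -> value dict via the property
    finally:
        pool.put(driver)                               # return the driver for reuse
    pool.close()                                       # quit every pooled driver on shutdown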

+ 308 - 310
lzz_theme/xgyyglj/start.sh

@@ -44,362 +44,355 @@ class Details:
 
     def detail_get(self, response, item):
 
-        if response.status_code == 306:
-            try:
-                os.remove('./yyc_ck.json')
-            except:
-                pass
-            raise ValueError("cookie失效!重新登录")
-        else:
-            if response.json().get('success') == None:
-                if response.json().get('dataTables'):
-                    d_data = response.json().get('dataTables')
-                    detail_info = d_data.get('quotationInfoDataTable').get('rows')[0].get('data')
-                    goods_info_list = d_data.get('quotationDetailDataTable').get('rows')
-
-                    goods_list = ''
-                    for goods_info in goods_info_list:
-                        if goods_info.get('data').get('reqdate'):
-                            reqdate = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(
-                                int(str(goods_info.get('data').get('reqdate'))[:-3])))
-                        else:
-                            reqdate = ""
-
-                        temp = f'''
-                        <tr>
-                            <td>
-                                <span data-bind="text: row.ref('rowNo')">{goods_info.get('data').get('rowNo')}</span>
-                            </td>
-                            <td>
-                                <div style="display: block" data-bind="text: row.ref('productName')">{goods_info.get('data').get('productName')}</div>
-                                <div style="" data-bind="text: row.ref('productDesc')">{goods_info.get('data').get('productDesc')}</div>
-                                <div style="display: none" data-bind="">
-                                    <span style="color: #999;">规格型号:</span><span data-bind="text: row.ref('productType')">{goods_info.get('data').get('productType')}</span>
-                                </div>
-                                <div style="display: none" data-bind="">
-                                    <span style="color: #999;">存货描述:</span><span data-bind="text: row.ref('productSpec')">{goods_info.get('data').get('productSpec')}</span>
-                                </div>
-                                <div class="hidden" style="display: block" data-bind="">
-                                    <span style="color: #999;" data-bind=""></span>:
-                                    <span data-bind="text: row.ref('field1')">{goods_info.get('data').get('field1')}</span>
-                                </div>
-                                <div class="hidden" style="display: block" data-bind="">
-                                    <span style="color: #999;" data-bind=""></span>:
-                                    <span data-bind="text: row.ref('field2')">{goods_info.get('data').get('field2')}</span>
-                                </div>
-                                <div class="hidden" style="display: block" data-bind="">
-                                    <span style="color: #999;" data-bind=""></span>:
-                                    <span data-bind="text: row.ref('field3')">{goods_info.get('data').get('field3')}</span>
-                                </div>
-                                <div class="hidden" style="display: block" data-bind="">
-                                    <span style="color: #999;" data-bind=""></span>:
-                                    <span data-bind="text: row.ref('field4')">{goods_info.get('data').get('field4')}</span>
-                                </div>
-                                <div class="hidden" style="display: block" data-bind="">
-                                    <span style="color: #999;" data-bind=""></span>:
-                                    <span data-bind="text: row.ref('field14')">{goods_info.get('data').get('field14')}</span>
-                                </div>
-                                <div class="hidden" style="display: block" data-bind="">
-                                    <span style="color: #999;" data-bind=""></span>:
-                                    <span data-bind="text: row.ref('field15')">{goods_info.get('data').get('field15')}</span>
-                                </div>
-                            </td>
-                            <td>
-                                <span data-bind="text: row.ref('productCode')">{goods_info.get('data').get('productCode')}</span>
-                            </td>
-                            <td style="width: 80px; min-width: 80px" class="text-right p-r-m">
-                                <span data-bind="text: row.ref('purchaseamount')">{goods_info.get('data').get('purchaseamount')}</span>
-                                <span data-bind="text: row.ref('unit')">{goods_info.get('data').get('unit')}</span>
-                            </td>
-                            <td>
-                                <span data-bind="text: row.ref('reqErpRelatedInfo')">{goods_info.get('data').get('reqErpRelatedInfo')}</span>
-                            </td>
-                            <td>
-                                <div data-bind="text:row.ref('reqOrgName')">{goods_info.get('data').get('reqOrgName')}</div>
-                            </td>
-                            <td>
-                                <span data-bind="text: row.getValue('ERPProjectName')">{goods_info.get('data').get('ERPProjectName')}</span>
-                            </td>
-                            <td>
-                                <div class="twolinesintable" data-bind="visible: row.getValue('receiveOrgName')">
-                                    <div data-bind="text:row.ref('receiveOrgName')">{goods_info.get('data').get('receiveOrgName')}</div>
-                                </div>
-                                <div class="twolinesintable" data-bind="visible: row.getValue('receivePersonName')" style="display: none;">
-                                    <span style="color: #999;">收货人:</span><span data-bind="text: row.getValue('receivePersonName')">{goods_info.get('data').get('receivePersonName')}</span><br>
-                                </div>
-                                <div class="twolinesintable" data-bind="visible: row.getValue('receivePersonTel')" style="display: none;">
-                                    <span style="color: #999;">联系电话:</span>
-                                    <span><canvas class="text2canvas" height="25" data-bind="" width="125" style="height: 20px; width: 100px;">{goods_info.get('data').get('receivePersonTel')}</canvas></span>
-                                    <br>
-                                </div>
-                                <div class="twolinesintable" data-bind="visible: row.getValue('receiveAddress')" style="display: none;">
-                                    <span style="color: #999;">收货地址:</span><span data-bind="text: row.getValue('receiveAddress')">{goods_info.get('data').get('receiveAddress')}</span><br>
-                                </div>
-                            </td>
-                            <td>
-                                <div class="input-group date">
-                                    <span data-bind="text: row.ref('reqdate')">{reqdate}</span> 
-                                </div>
-                            </td>
-                            <td>
-                                <span data-bind="text: row.ref('bmemo')">{goods_info.get('data').get('bmemo')}</span>
-                            </td>
-                        </tr>
-                        '''
-                        goods_list += temp
-
-                    if detail_info.get('allowRisePrice'):
-                        allowRisePrice = "不限"
-                    else:
-                        allowRisePrice = "限制"
-                    if detail_info.get('supplierCertif') == "1":
-                        apply_require = "三证合一或传统五证"
-                    elif detail_info.get('quoteCertif') == "":
-                        apply_require = "已盖章的报价单"
-                    elif detail_info.get('supplierQualify'):
-                        apply_require = detail_info.get('supplierQualify')
+        if response.json().get('success') is None:
+            if response.json().get('dataTables'):
+                d_data = response.json().get('dataTables')
+                detail_info = d_data.get('quotationInfoDataTable').get('rows')[0].get('data')
+                goods_info_list = d_data.get('quotationDetailDataTable').get('rows')
+
+                goods_list = ''
+                for goods_info in goods_info_list:
+                    if goods_info.get('data').get('reqdate'):
+                        reqdate = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(
+                            int(str(goods_info.get('data').get('reqdate'))[:-3])))
                     else:
-                        apply_require = "无"
+                        reqdate = ""
+
+                    temp = f'''
+                    <tr>
+                        <td>
+                            <span data-bind="text: row.ref('rowNo')">{goods_info.get('data').get('rowNo')}</span>
+                        </td>
+                        <td>
+                            <div style="display: block" data-bind="text: row.ref('productName')">{goods_info.get('data').get('productName')}</div>
+                            <div style="" data-bind="text: row.ref('productDesc')">{goods_info.get('data').get('productDesc')}</div>
+                            <div style="display: none" data-bind="">
+                                <span style="color: #999;">规格型号:</span><span data-bind="text: row.ref('productType')">{goods_info.get('data').get('productType')}</span>
+                            </div>
+                            <div style="display: none" data-bind="">
+                                <span style="color: #999;">存货描述:</span><span data-bind="text: row.ref('productSpec')">{goods_info.get('data').get('productSpec')}</span>
+                            </div>
+                            <div class="hidden" style="display: block" data-bind="">
+                                <span style="color: #999;" data-bind=""></span>:
+                                <span data-bind="text: row.ref('field1')">{goods_info.get('data').get('field1')}</span>
+                            </div>
+                            <div class="hidden" style="display: block" data-bind="">
+                                <span style="color: #999;" data-bind=""></span>:
+                                <span data-bind="text: row.ref('field2')">{goods_info.get('data').get('field2')}</span>
+                            </div>
+                            <div class="hidden" style="display: block" data-bind="">
+                                <span style="color: #999;" data-bind=""></span>:
+                                <span data-bind="text: row.ref('field3')">{goods_info.get('data').get('field3')}</span>
+                            </div>
+                            <div class="hidden" style="display: block" data-bind="">
+                                <span style="color: #999;" data-bind=""></span>:
+                                <span data-bind="text: row.ref('field4')">{goods_info.get('data').get('field4')}</span>
+                            </div>
+                            <div class="hidden" style="display: block" data-bind="">
+                                <span style="color: #999;" data-bind=""></span>:
+                                <span data-bind="text: row.ref('field14')">{goods_info.get('data').get('field14')}</span>
+                            </div>
+                            <div class="hidden" style="display: block" data-bind="">
+                                <span style="color: #999;" data-bind=""></span>:
+                                <span data-bind="text: row.ref('field15')">{goods_info.get('data').get('field15')}</span>
+                            </div>
+                        </td>
+                        <td>
+                            <span data-bind="text: row.ref('productCode')">{goods_info.get('data').get('productCode')}</span>
+                        </td>
+                        <td style="width: 80px; min-width: 80px" class="text-right p-r-m">
+                            <span data-bind="text: row.ref('purchaseamount')">{goods_info.get('data').get('purchaseamount')}</span>
+                            <span data-bind="text: row.ref('unit')">{goods_info.get('data').get('unit')}</span>
+                        </td>
+                        <td>
+                            <span data-bind="text: row.ref('reqErpRelatedInfo')">{goods_info.get('data').get('reqErpRelatedInfo')}</span>
+                        </td>
+                        <td>
+                            <div data-bind="text:row.ref('reqOrgName')">{goods_info.get('data').get('reqOrgName')}</div>
+                        </td>
+                        <td>
+                            <span data-bind="text: row.getValue('ERPProjectName')">{goods_info.get('data').get('ERPProjectName')}</span>
+                        </td>
+                        <td>
+                            <div class="twolinesintable" data-bind="visible: row.getValue('receiveOrgName')">
+                                <div data-bind="text:row.ref('receiveOrgName')">{goods_info.get('data').get('receiveOrgName')}</div>
+                            </div>
+                            <div class="twolinesintable" data-bind="visible: row.getValue('receivePersonName')" style="display: none;">
+                                <span style="color: #999;">收货人:</span><span data-bind="text: row.getValue('receivePersonName')">{goods_info.get('data').get('receivePersonName')}</span><br>
+                            </div>
+                            <div class="twolinesintable" data-bind="visible: row.getValue('receivePersonTel')" style="display: none;">
+                                <span style="color: #999;">联系电话:</span>
+                                <span><canvas class="text2canvas" height="25" data-bind="" width="125" style="height: 20px; width: 100px;">{goods_info.get('data').get('receivePersonTel')}</canvas></span>
+                                <br>
+                            </div>
+                            <div class="twolinesintable" data-bind="visible: row.getValue('receiveAddress')" style="display: none;">
+                                <span style="color: #999;">收货地址:</span><span data-bind="text: row.getValue('receiveAddress')">{goods_info.get('data').get('receiveAddress')}</span><br>
+                            </div>
+                        </td>
+                        <td>
+                            <div class="input-group date">
+                                <span data-bind="text: row.ref('reqdate')">{reqdate}</span> 
+                            </div>
+                        </td>
+                        <td>
+                            <span data-bind="text: row.ref('bmemo')">{goods_info.get('data').get('bmemo')}</span>
+                        </td>
+                    </tr>
+                    '''
+                    goods_list += temp
 
-                    if detail_info.get('canSeeQt'):
-                        canSeeQt = detail_info.get('canSeeQt')
-                    else:
-                        canSeeQt = "无"
+                if detail_info.get('allowRisePrice'):
+                    allowRisePrice = "不限"
+                else:
+                    allowRisePrice = "限制"
+                if detail_info.get('supplierCertif') == "1":
+                    apply_require = "三证合一或传统五证"
+                elif detail_info.get('quoteCertif') == "":
+                    apply_require = "已盖章的报价单"
+                elif detail_info.get('supplierQualify'):
+                    apply_require = detail_info.get('supplierQualify')
+                else:
+                    apply_require = "无"
+
+                if detail_info.get('canSeeQt'):
+                    canSeeQt = detail_info.get('canSeeQt')
+                else:
+                    canSeeQt = "无"
 
-                    buyofferType_dict = {"1": "框架协议", "2": "普通合同", "3": "直接下单", "4": "价格调整", }
-                    buyofferType = buyofferType_dict.get(detail_info.get('buyofferType'))
+                buyofferType_dict = {"1": "框架协议", "2": "普通合同", "3": "直接下单", "4": "价格调整", }
+                buyofferType = buyofferType_dict.get(detail_info.get('buyofferType'))
 
-                    onOfflineType_dict = {"0": "线上", "1": "混合", "2": "线下"}
-                    onOfflineType = onOfflineType_dict.get(detail_info.get('onOfflineType'))
+                onOfflineType_dict = {"0": "线上", "1": "混合", "2": "线下"}
+                onOfflineType = onOfflineType_dict.get(detail_info.get('onOfflineType'))
 
-                    if detail_info.get('qtexpiredate'):
-                        qtexpiredate = time.strftime("%Y-%m-%d %H:%M:%S",
-                                                     time.localtime(int(str(detail_info.get('qtexpiredate'))[:-3])))
-                    else:
-                        qtexpiredate = ""
+                if detail_info.get('qtexpiredate'):
+                    qtexpiredate = time.strftime("%Y-%m-%d %H:%M:%S",
+                                                 time.localtime(int(str(detail_info.get('qtexpiredate'))[:-3])))
+                else:
+                    qtexpiredate = ""
 
-                    html = f'''
-                        <div class="ibox">
+                html = f'''
+                    <div class="ibox">
 
-                            <div class="ibox-filter-title pull-left">
-                                <span data-bind="i18n: 'cali-infor'">询价单详情</span>
-                                <span class="vbillcodeCss" data-bind="visible: quotationInfoDataTable.ref('vBuyOfferBillCode')()" style="">(询价单号:</span>
-                                <span class="vbillcodeCss" data-bind="text: quotationInfoDataTable.ref('vBuyOfferBillCode')">{detail_info.get('vBuyOfferBillCode')}</span>
-                                <span class="vbillcodeCss" data-bind="visible: quotationInfoDataTable.ref('vBuyOfferBillCode')()" style="">)</span>
-                            </div>
+                        <div class="ibox-filter-title pull-left">
+                            <span data-bind="i18n: 'cali-infor'">询价单详情</span>
+                            <span class="vbillcodeCss" data-bind="visible: quotationInfoDataTable.ref('vBuyOfferBillCode')()" style="">(询价单号:</span>
+                            <span class="vbillcodeCss" data-bind="text: quotationInfoDataTable.ref('vBuyOfferBillCode')">{detail_info.get('vBuyOfferBillCode')}</span>
+                            <span class="vbillcodeCss" data-bind="visible: quotationInfoDataTable.ref('vBuyOfferBillCode')()" style="">)</span>
+                        </div>
 
-                            <div class="ibox-content">
-                                <div class="row">
-                                    <div class="col-md-8">
-                                        <h4 class="iform-sub-title">
-                                            <span class="order-title" data-bind="text: quotationInfoDataTable.ref('subject')">{detail_info.get('subject')}</span>
-                                            <span class="label label-danger" data-bind="text: quotationInfoDataTable.ref('statusName')">{detail_info.get('statusName')}</span>
-                                        </h4>
-                                    </div>
+                        <div class="ibox-content">
+                            <div class="row">
+                                <div class="col-md-8">
+                                    <h4 class="iform-sub-title">
+                                        <span class="order-title" data-bind="text: quotationInfoDataTable.ref('subject')">{detail_info.get('subject')}</span>
+                                        <span class="label label-danger" data-bind="text: quotationInfoDataTable.ref('statusName')">{detail_info.get('statusName')}</span>
+                                    </h4>
                                 </div>
+                            </div>
 
-                                <div class="row">
-                                    <div class="col-md-12">
-                                        <h4 class="i-form-title-nomal">物资信息<span>(</span><span data-bind="text:quotationDetailDataTable.rows().length">{len(goods_info_list)}</span><span>项)</span></h4>
-                                    </div>
+                            <div class="row">
+                                <div class="col-md-12">
+                                    <h4 class="i-form-title-nomal">物资信息<span>(</span><span data-bind="text:quotationDetailDataTable.rows().length">{len(goods_info_list)}</span><span>项)</span></h4>
                                 </div>
-                                <div class="row">
-                                    <div class="col-md-12">
-                                        <div id="myTabContent" class="tab-content">
-                                            <table class="yc-table ml-table" style="table-layout:fixed;">
-                                                <thead>
-                                                    <tr>
-                                                        <td width="5%" data-bind="click: orderByNum" style="cursor: pointer;padding-left: 5px;">行号</td>
-                                                        <td width="20%" data-bind="click: orderByName" style="cursor: pointer;">
-                                                            <span data-bind="i18n: 'cali-shopname'">物料名称</span> /
-                                                            <span data-bind="i18n: 'inquiry-drscrt'">物料需求描述</span>
-                                                        </td>
-                                                        <td width="10%">物料编码</td>
-                                                        <td width="10%" align="right" class="p-r-m" data-bind="i18n: 'cali-shopnumber'">采购数量</td>
-                                                        <td width="10%">ERP相关信息</td>
-                                                        <td width="15%">
-                                                            <div data-bind="i18n: 'inquiry-org'">需求组织</div>
-                                                        </td>
-                                                        <td width="10%">
-                                                            项目名称
-                                                        </td>
-                                                        <td width="15%">
-                                                            <div>收货组织</div>-
-                                                            <div>收货信息</div>
-                                                        </td>
-                                                        <td width="12%" data-bind="i18n: 'inquiry-time'">需求时间</td>
-                                                        <td width="13%">备注</td>
-                                                    </tr>
-                                                </thead>
-                                                <tbody data-bind="">
-                                                    {goods_list}
-                                                </tbody>
-                                            </table>
-                                        </div>
+                            </div>
+                            <div class="row">
+                                <div class="col-md-12">
+                                    <div id="myTabContent" class="tab-content">
+                                        <table class="yc-table ml-table" style="table-layout:fixed;">
+                                            <thead>
+                                                <tr>
+                                                    <td width="5%" data-bind="click: orderByNum" style="cursor: pointer;padding-left: 5px;">行号</td>
+                                                    <td width="20%" data-bind="click: orderByName" style="cursor: pointer;">
+                                                        <span data-bind="i18n: 'cali-shopname'">物料名称</span> /
+                                                        <span data-bind="i18n: 'inquiry-drscrt'">物料需求描述</span>
+                                                    </td>
+                                                    <td width="10%">物料编码</td>
+                                                    <td width="10%" align="right" class="p-r-m" data-bind="i18n: 'cali-shopnumber'">采购数量</td>
+                                                    <td width="10%">ERP相关信息</td>
+                                                    <td width="15%">
+                                                        <div data-bind="i18n: 'inquiry-org'">需求组织</div>
+                                                    </td>
+                                                    <td width="10%">
+                                                        项目名称
+                                                    </td>
+                                                    <td width="15%">
+                                                        <div>收货组织</div>-
+                                                        <div>收货信息</div>
+                                                    </td>
+                                                    <td width="12%" data-bind="i18n: 'inquiry-time'">需求时间</td>
+                                                    <td width="13%">备注</td>
+                                                </tr>
+                                            </thead>
+                                            <tbody data-bind="">
+                                                {goods_list}
+                                            </tbody>
+                                        </table>
                                     </div>
                                 </div>
                             </div>
-                            <div class="sub-form-warp pull-left">
-                                <div class="row">
-                                    <div class="col-md-12">
-                                        <h4 class="i-form-title" data-bind="i18n: 'inquiry-prorequire'">采购要求</h4>
-                                    </div>
+                        </div>
+                        <div class="sub-form-warp pull-left">
+                            <div class="row">
+                                <div class="col-md-12">
+                                    <h4 class="i-form-title" data-bind="i18n: 'inquiry-prorequire'">采购要求</h4>
                                 </div>
-                                <div class="row">
-                                    <div class="col-md-12">
-                                        <div class="grid simple">
-                                            <div class="grid-body no-border">
-                                                <form class="form-horizontal" role="form">
-                                                    <div class="form-group">
-                                                        <label class="col-sm-6 col-md-3 control-label font-nomal"><span data-bind="i18n: 'useredit-purAgent'">采购员</span>:</label>
-                                                        <div class="col-sm-4 col-md-8 line-height-34">
-                                                            <div data-bind="text:quotationInfoDataTable.ref('corpSubAccountName')">{detail_info.get('corpSubAccountName')}</div>
-                                                        </div>
+                            </div>
+                            <div class="row">
+                                <div class="col-md-12">
+                                    <div class="grid simple">
+                                        <div class="grid-body no-border">
+                                            <form class="form-horizontal" role="form">
+                                                <div class="form-group">
+                                                    <label class="col-sm-6 col-md-3 control-label font-nomal"><span data-bind="i18n: 'useredit-purAgent'">采购员</span>:</label>
+                                                    <div class="col-sm-4 col-md-8 line-height-34">
+                                                        <div data-bind="text:quotationInfoDataTable.ref('corpSubAccountName')">{detail_info.get('corpSubAccountName')}</div>
                                                     </div>
-                                                    <div class="form-group">
-                                                    <label class="col-sm-6 col-md-3 control-label font-nomal"><span data-bind="i18n: 'inquiry-offerendtime'">报价截止时间</span>:</label>
-                                                        <div class="col-sm-12 col-md-8 line-height-34">
-                                                            <span data-bind="text: quotationInfoDataTable.ref('qtexpiredate')">{qtexpiredate}</span> 
-                                                            <label u-meta="" title="2022-08-07 15:37:40"></label>
-                                                        </div>
+                                                </div>
+                                                <div class="form-group">
+                                                <label class="col-sm-6 col-md-3 control-label font-nomal"><span data-bind="i18n: 'inquiry-offerendtime'">报价截止时间</span>:</label>
+                                                    <div class="col-sm-12 col-md-8 line-height-34">
+                                                        <span data-bind="text: quotationInfoDataTable.ref('qtexpiredate')">{qtexpiredate}</span> 
+                                                        <label u-meta="" title="2022-08-07 15:37:40"></label>
                                                     </div>
-                                                    <div class="form-group">
-                                                        <label class="col-sm-6 col-md-3 control-label font-nomal"><span data-bind="i18n: 'inquiry-recadress'">收货地址</span>:</label>
-                                                        <div class="col-sm-4 col-md-8 line-height-34">
-                                                            <span data-bind="text: quotationInfoDataTable.ref('addressName')">{detail_info.get('addressName')}</span>
-                                                            <span data-bind="text: quotationInfoDataTable.ref('receiveAddress')">{detail_info.get('receiveAddress')}</span>
-                                                        </div>
+                                                </div>
+                                                <div class="form-group">
+                                                    <label class="col-sm-6 col-md-3 control-label font-nomal"><span data-bind="i18n: 'inquiry-recadress'">收货地址</span>:</label>
+                                                    <div class="col-sm-4 col-md-8 line-height-34">
+                                                        <span data-bind="text: quotationInfoDataTable.ref('addressName')">{detail_info.get('addressName')}</span>
+                                                        <span data-bind="text: quotationInfoDataTable.ref('receiveAddress')">{detail_info.get('receiveAddress')}</span>
                                                     </div>
-                                                    <div class="form-group ">
-                                                        <label class="col-sm-12 col-md-3 control-label font-nomal">报价类型:</label>
-                                                        <div class="col-sm-12 col-md-8 line-height-34">
-                                                            <span data-bind="text: $root.renderOnOfflineType($root.quotationInfoDataTable.ref('onOfflineType')())">{onOfflineType}</span> 
-                                                        </div>
+                                                </div>
+                                                <div class="form-group ">
+                                                    <label class="col-sm-12 col-md-3 control-label font-nomal">报价类型:</label>
+                                                    <div class="col-sm-12 col-md-8 line-height-34">
+                                                        <span data-bind="text: $root.renderOnOfflineType($root.quotationInfoDataTable.ref('onOfflineType')())">{onOfflineType}</span> 
                                                     </div>
-                                                    <div class="form-group">
-                                                        <label class="col-sm-6 col-md-3 control-label font-nomal"><span data-bind="i18n: 'quotations.quo-request'">报价要求</span>:</label>
-                                                        <div class="col-sm-12 col-md-8 line-height-34">
-                                                            <span data-bind="if: quotationInfoDataTable.ref('canSeeQt')()">{canSeeQt}</span>
-                                                        </div>
+                                                </div>
+                                                <div class="form-group">
+                                                    <label class="col-sm-6 col-md-3 control-label font-nomal"><span data-bind="i18n: 'quotations.quo-request'">报价要求</span>:</label>
+                                                    <div class="col-sm-12 col-md-8 line-height-34">
+                                                        <span data-bind="if: quotationInfoDataTable.ref('canSeeQt')()">{canSeeQt}</span>
                                                     </div>
-                                                    <div class="form-group">
-                                                        <label class="col-sm-12 col-md-3 control-label font-nomal">询价币种:</label>
-                                                        <div class="col-sm-12 col-md-8 line-height-34">
-                                                            <span data-bind="text:quotationInfoDataTable.ref('currency_name')()">{detail_info.get('currency_name')}</span>
-                                                        </div>
+                                                </div>
+                                                <div class="form-group">
+                                                    <label class="col-sm-12 col-md-3 control-label font-nomal">询价币种:</label>
+                                                    <div class="col-sm-12 col-md-8 line-height-34">
+                                                        <span data-bind="text:quotationInfoDataTable.ref('currency_name')()">{detail_info.get('currency_name')}</span>
                                                     </div>
-                                                    <div class="form-group">
-                                                        <label class="col-sm-12 col-md-3 control-label font-nomal">询价类型:</label>
-                                                        <div class="col-sm-12 col-md-8 line-height-34">
-                                                            <span data-bind="text: $root.renderBuyofferType($root.quotationInfoDataTable.ref('buyofferType')())">{buyofferType}</span>
-                                                        </div>
+                                                </div>
+                                                <div class="form-group">
+                                                    <label class="col-sm-12 col-md-3 control-label font-nomal">询价类型:</label>
+                                                    <div class="col-sm-12 col-md-8 line-height-34">
+                                                        <span data-bind="text: $root.renderBuyofferType($root.quotationInfoDataTable.ref('buyofferType')())">{buyofferType}</span>
                                                     </div>
-                                                    <div class="form-group" data-bind="" style="">
-                                                        <label class="col-sm-12 col-md-3 control-label font-nomal">是否限制报价涨价:</label>
-                                                        <div class="col-sm-12 col-md-8 line-height-34">
-                                                            <span data-bind="text: $root.renderAllowRisePrice($root.quotationInfoDataTable.ref('allowRisePrice')())">{allowRisePrice}</span>
-                                                        </div>
+                                                </div>
+                                                <div class="form-group" data-bind="" style="">
+                                                    <label class="col-sm-12 col-md-3 control-label font-nomal">是否限制报价涨价:</label>
+                                                    <div class="col-sm-12 col-md-8 line-height-34">
+                                                        <span data-bind="text: $root.renderAllowRisePrice($root.quotationInfoDataTable.ref('allowRisePrice')())">{allowRisePrice}</span>
                                                     </div>
-                                                    <div class="form-group">
-                                                        <label class="col-sm-6 col-md-3 control-label font-nomal"><span data-bind="i18n: 'inquiry-paymethod'">付款方式</span>:</label>
-                                                        <div class="col-sm-12 col-md-9" style="margin-top: 6px;">
-                                                            <span class="label-text label-context ">
-                                                            <y-showdetail params="text:quotationInfoDataTable.ref('payMethod')"><div class="y-relative">
-                                                            <div data-bind="" class="y-showdetail-single">{detail_info.get('payMethod')}</div>
-                                                            </div>
-                                                            </y-showdetail>
-                                                            </span>
+                                                </div>
+                                                <div class="form-group">
+                                                    <label class="col-sm-6 col-md-3 control-label font-nomal"><span data-bind="i18n: 'inquiry-paymethod'">付款方式</span>:</label>
+                                                    <div class="col-sm-12 col-md-9" style="margin-top: 6px;">
+                                                        <span class="label-text label-context ">
+                                                        <y-showdetail params="text:quotationInfoDataTable.ref('payMethod')"><div class="y-relative">
+                                                        <div data-bind="" class="y-showdetail-single">{detail_info.get('payMethod')}</div>
                                                         </div>
+                                                        </y-showdetail>
+                                                        </span>
                                                     </div>
-                                                    <div class="form-group">
-                                                        <label class="col-sm-6 col-md-3 control-label font-nomal"><span data-bind="i18n: 'inquiry-delterm'">交货条件</span>:</label>
-                                                        <div class="col-sm-12 col-md-8" style="margin-top: 6px;">
-                                                            <span class="label-text label-context ">
-                                                            <y-showdetail params="text:quotationInfoDataTable.ref('deliveryMethod')"><div class="y-relative">
-                                                            <div data-bind="" class="y-showdetail-single">{detail_info.get('deliveryMethod')}</div>
-                                                            <div class="y-text-link y-showdetail-singlemore" data-bind="" style="display: none;">
-                                                            </div>
-                                                            </div>
-                                                            </y-showdetail>
-                                                            </span>
+                                                </div>
+                                                <div class="form-group">
+                                                    <label class="col-sm-6 col-md-3 control-label font-nomal"><span data-bind="i18n: 'inquiry-delterm'">交货条件</span>:</label>
+                                                    <div class="col-sm-12 col-md-8" style="margin-top: 6px;">
+                                                        <span class="label-text label-context ">
+                                                        <y-showdetail params="text:quotationInfoDataTable.ref('deliveryMethod')"><div class="y-relative">
+                                                        <div data-bind="" class="y-showdetail-single">{detail_info.get('deliveryMethod')}</div>
+                                                        <div class="y-text-link y-showdetail-singlemore" data-bind="" style="display: none;">
                                                         </div>
-                                                    </div>
-                                                    <div class="form-group hidden" data-bind="customfeature: customHeadField2Visible">
-                                                        <label class="col-sm-6 col-md-3 control-label font-nomal">
-                                                        <span data-bind="customfeature: customHeadField2Label">自定义项2</span>:</label>
-                                                        <div class="col-sm-12 col-md-8" style="margin-top: 7px;">
-                                                            <span data-bind="text: quotationInfoDataTable.ref('field2')">{detail_info.get('field2')}</span>
                                                         </div>
+                                                        </y-showdetail>
+                                                        </span>
                                                     </div>
-                                                    <div class="form-group">
-                                                        <label class="col-sm-6 col-md-3 control-label font-nomal p-l-zero"><span data-bind="i18n: 'inquiry-suprequire'">对供应商要求</span>:</label>
-                                                        <div class="col-md-8 col-sm-8 line-height-34">
-                                                            <span class="m-r-sm" data-bind="">{apply_require}</span>
-                                                        </div>
+                                                </div>
+                                                <div class="form-group hidden" data-bind="customfeature: customHeadField2Visible">
+                                                    <label class="col-sm-6 col-md-3 control-label font-nomal">
+                                                    <span data-bind="customfeature: customHeadField2Label">自定义项2</span>:</label>
+                                                    <div class="col-sm-12 col-md-8" style="margin-top: 7px;">
+                                                        <span data-bind="text: quotationInfoDataTable.ref('field2')">{detail_info.get('field2')}</span>
                                                     </div>
+                                                </div>
+                                                <div class="form-group">
+                                                    <label class="col-sm-6 col-md-3 control-label font-nomal p-l-zero"><span data-bind="i18n: 'inquiry-suprequire'">对供应商要求</span>:</label>
+                                                    <div class="col-md-8 col-sm-8 line-height-34">
+                                                        <span class="m-r-sm" data-bind="">{apply_require}</span>
+                                                    </div>
+                                                </div>
 
-                                                    <div class="form-group">
-                                                        <label class="col-sm-6 col-md-3 control-label font-nomal"><span data-bind="i18n: 'supfile-Remarks'">备注</span>:</label>
-                                                        <div class="col-sm-12 col-md-8" style="margin-top: 6px;">
-                                                            <span class="label-text label-context "><y-showdetail params="text:quotationInfoDataTable.ref('memo')"><div class="y-relative">
-                                                              <div data-bind="" class="y-showdetail-single">{detail_info.get('memo')}</div>
-                                                            </div>
-                                                            </y-showdetail></span>
+                                                <div class="form-group">
+                                                    <label class="col-sm-6 col-md-3 control-label font-nomal"><span data-bind="i18n: 'supfile-Remarks'">备注</span>:</label>
+                                                    <div class="col-sm-12 col-md-8" style="margin-top: 6px;">
+                                                        <span class="label-text label-context "><y-showdetail params="text:quotationInfoDataTable.ref('memo')"><div class="y-relative">
+                                                          <div data-bind="" class="y-showdetail-single">{detail_info.get('memo')}</div>
                                                         </div>
+                                                        </y-showdetail></span>
                                                     </div>
-                                                </form>
-                                            </div>
+                                                </div>
+                                            </form>
                                         </div>
                                     </div>
                                 </div>
                             </div>
-                            <div class="sub-form-warp pull-right">
-                                <div class="row">
-                                    <div class="col-md-12">
-                                        <h4 class="i-form-title" data-bind="i18n: 'inquiry-contactinfor'">联系方式</h4>
-                                    </div>
+                        </div>
+                        <div class="sub-form-warp pull-right">
+                            <div class="row">
+                                <div class="col-md-12">
+                                    <h4 class="i-form-title" data-bind="i18n: 'inquiry-contactinfor'">联系方式</h4>
                                 </div>
-                                <div class="row">
-                                    <div class="col-md-12">
-                                        <div class="grid simple">
-                                            <div class="grid-body no-border">
-                                                <form class="form-horizontal" role="form">
-                                                    <div class="form-group">
-                                                        <label class="col-sm-6 col-md-3 control-label font-nomal"><span data-bind="i18n: 'inquiry-contact'">联系人</span>:</label>
-                                                        <div class="col-sm-6 col-md-8 line-height-34" data-bind="visible: contactInformationShow" style="">
-                                                            <div data-bind="text: quotationInfoDataTable.ref('contact')">{detail_info.get('contact')}</div>
-                                                        </div>
+                            </div>
+                            <div class="row">
+                                <div class="col-md-12">
+                                    <div class="grid simple">
+                                        <div class="grid-body no-border">
+                                            <form class="form-horizontal" role="form">
+                                                <div class="form-group">
+                                                    <label class="col-sm-6 col-md-3 control-label font-nomal"><span data-bind="i18n: 'inquiry-contact'">联系人</span>:</label>
+                                                    <div class="col-sm-6 col-md-8 line-height-34" data-bind="visible: contactInformationShow" style="">
+                                                        <div data-bind="text: quotationInfoDataTable.ref('contact')">{detail_info.get('contact')}</div>
                                                     </div>
-                                                    <div class="form-group">
-                                                        <label class="col-sm-6 col-md-3 control-label font-nomal"><span data-bind="i18n: 'inquiry-phone'">联系电话</span>:</label>
-                                                        <div class="col-sm-6 col-md-8 line-height-34" data-bind="visible: contactInformationShow" style="">
-                                                            <div data-bind="text:  quotationInfoDataTable.ref('phone')">{detail_info.get('phone')}</div>
-                                                        </div>
+                                                </div>
+                                                <div class="form-group">
+                                                    <label class="col-sm-6 col-md-3 control-label font-nomal"><span data-bind="i18n: 'inquiry-phone'">联系电话</span>:</label>
+                                                    <div class="col-sm-6 col-md-8 line-height-34" data-bind="visible: contactInformationShow" style="">
+                                                        <div data-bind="text:  quotationInfoDataTable.ref('phone')">{detail_info.get('phone')}</div>
                                                     </div>
-                                                    <div class="form-group">
-                                                        <label class="col-sm-6 col-md-3 control-label font-nomal"><span data-bind="i18n: 'inquiry-email'">邮件地址</span>:</label>
-                                                        <div class="col-sm-4 col-md-8 line-height-34" data-bind="visible: contactInformationShow" style="">
-                                                            <span data-bind="text: quotationInfoDataTable.ref('email')">{detail_info.get('email')}</span>
-                                                        </div>
+                                                </div>
+                                                <div class="form-group">
+                                                    <label class="col-sm-6 col-md-3 control-label font-nomal"><span data-bind="i18n: 'inquiry-email'">邮件地址</span>:</label>
+                                                    <div class="col-sm-4 col-md-8 line-height-34" data-bind="visible: contactInformationShow" style="">
+                                                        <span data-bind="text: quotationInfoDataTable.ref('email')">{detail_info.get('email')}</span>
                                                     </div>
-                                                </form>
-                                            </div>
+                                                </div>
+                                            </form>
                                         </div>
                                     </div>
                                 </div>
                             </div>
                         </div>
-                    '''
+                    </div>
+                '''
 
-                    item["contenthtml"] = html.replace('None', '').replace('null', '')
+                item["contenthtml"] = html.replace('None', '').replace('null', '')
 
-                    item = format_fileds(item)
+                item = format_fileds(item)
 
-                    try:
-                        self.zt_details.insert_one(item)
-                        logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
-                    except DuplicateKeyError:
-                        logger.warning(f"[重复采集]{item['title']}-{item['publishtime']}")
+                try:
+                    self.zt_details.insert_one(item)
+                    logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
+                except DuplicateKeyError:
+                    logger.warning(f"[重复采集]{item['title']}-{item['publishtime']}")
 
     def fetch_request(self, item):
         cookies = self.get_cookies()
@@ -440,6 +433,12 @@ class Details:
                 if response and res_code == 200:
                     self.detail_get(response, item=item)
                     return True
+                elif res_code == 306:
+                    try:
+                        os.remove('./yyc_ck.json')
+                    except OSError:
+                        pass
+                    raise ValueError("cookie失效!重新登录")
                 else:
                     time.sleep(3)
             except Exception as e:
@@ -452,8 +451,7 @@ class Details:
     def start(self, limit=1):
         logger.debug("********** 详情页采集开始 **********")
 
-        with self.db_name.find({"parser_name": "ztpc_yyc_cgxj", "is_crawl": False, "failed": False},
-                               sort=[('publishtime', -1)]).limit(limit) as cursor:
+        with self.db_name.find({"parser_name": "ztpc_yyc_cgxj", "is_crawl": False, "failed": False}).limit(limit) as cursor:
            data_list = [dd for dd in cursor]
        for item in data_list:
             # logger.debug(item)
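
start() now pulls the pending batch without the publishtime sort, so tasks come back in natural (insertion) order instead of newest-first; on a large queue that avoids a sort the collection may have no index for. The same pull as a standalone helper, with the filter taken verbatim from the hunk (the pymongo cursor doubles as a context manager, so it is closed on exit):

    def pending_tasks(coll, limit: int = 1) -> list:
        """Fetch up to `limit` uncrawled, unfailed yyc tasks in natural order."""
        query = {"parser_name": "ztpc_yyc_cgxj", "is_crawl": False, "failed": False}
        with coll.find(query).limit(limit) as cursor:
            return list(cursor)
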

+ 1 - 1
lzz_theme/yyc/yyc_ck.json

@@ -1 +1 @@
-{"at": "d9ef19ee-3981-43c0-8c41-8783620b9b2e", "CASPRIVACY": "true", "JSESSIONID": "A8D9185FC182D23A74A30947C6D3A8BC", "_yht_code_uuid": "1a2d4339-30cc-4c22-b0c0-6fb0a039354b", "acw_tc": "1a0c639617286353790481318e0045daee6ec17190e5bb6482c591adee504c", "redirect_url": "https%3A%2F%2Fyc.yonyoucloud.com%2Fyuncai%2FSSO%2Flogin.jsp%3Fr%3DL3l1bmNhaS9wb3J0YWw%26", "yht_default_country": "86", "yht_default_login_type": "mobile", "TGC": "eyJhbGciOiJIUzUxMiJ9.ZXlKaGJHY2lPaUprYVhJaUxDSmxibU1pT2lKQk1USTRRMEpETFVoVE1qVTJJbjAuLkRpU2I1OEMwVjJCTVFKR0hja2I3b3cuQkoxQWtZX0l0aGU3SjVqWnV3dGRCRUtyR25NUzRHX0JmNXIxdDBab1dOV1Y2dWw3TTZ0SDFObzBHZ1czT0IyNF8xS2lOT191UGFEdnM2aUV3YWZsSDRub18yTXQ1OWt4eE9wMHc5WGk0SE1kTVFTTzlNRkpEdGlrT2M1clp3a3pJWjR4empJR0dzZlNWZ2dkVFNsN1lSc0c5MjFDd0FQOTBYeTRINUlCSzZ1ZHlzU253dFJKNFVzdUdPc0xFWUhtVGFJUG9iWDQ0dTQxUFlkaVJPRHpBa0dOOFdPVndNSUozcTMwNnp5cEtEdy5ISl9vVHg1Q3ZOelZMUGdQY2hrU25R.G3bKixsKqLg5dfKSepKfEIC7lhJREM9vDDfoPdglPSHyipmx2N_YBFQr8RBnojJ2okH5qj96cnOROJ3O3jZyJg", "HWWAFSESID": "b50ef790f03767fe23", "HWWAFSESTIME": "1728635380696", "m_biztier_sign": "", "tenantid": "sfg8fw7k", "token": "d2ViLDM2MDAsOXFGQXBneDJ3WDlWazhTajNQZUY0d1FMOUNyTmU1UEZZcEVuWmg1K0dScjlqdmJ3MEQ4KzZ4bVVoYm1MR0Nua1MrbG92NVg2dzZqa1phRDRIWThnelE9PQ", "u_logints": "1728635383197", "u_usercode": "4a7ecbb9-8ca7-4b0c-b9d6-7fad20374565", "ucuserid": "4a7ecbb9-8ca7-4b0c-b9d6-7fad20374565", "yht_tenantinfo": "sfg8fw7k", "yht_username": "ST-85368487-dFH5BCAbrrwlPSPAGvDn-online__4a7ecbb9-8ca7-4b0c-b9d6-7fad20374565", "yht_usertoken": "4jRatoX0Ka5BeZSAFIpd1y3m%2BPjxtymzZPB9XXMXP61Ns8P9TUR74PnGS6iuqvjSluIDH0JFZJeGdxi6rqX9vQ%3D%3D"}
+{"at": "7d4ad986-5a99-4c7b-831d-b37f362e932d", "CASPRIVACY": "true", "JSESSIONID": "F30F44761B9C433B79395D903BC4107D", "_yht_code_uuid": "84e9f91d-1588-4fa0-944c-d21753321a7b", "acw_tc": "276082a017434638222263777e08ba224eac08a4c4507ffbdf4c0722b44f85", "redirect_url": "https%3A%2F%2Fyc.yonyoucloud.com%2Fyuncai%2FSSO%2Flogin.jsp%3Fr%3DL3l1bmNhaS9wb3J0YWw%26", "yht_default_country": "86", "yht_default_login_type": "mobile", "TGC": "eyJhbGciOiJIUzUxMiJ9.ZXlKaGJHY2lPaUprYVhJaUxDSmxibU1pT2lKQk1USTRRMEpETFVoVE1qVTJJbjAuLm9iOFdfX3I4Um5jc29udTRDeU9nWWcuTDdKSENPM25ybVRMejg3SzZVZmVsZC1BMjVJNUl6X0MxUG1LN191VXZwVkRiTkl2RkJmVHhMNkZoSHU1OFVqemRoRExMYS1kVUszUTZramRnVWdZVmpSSEdycm1fbmwtUEg0SUV4cmFoN0NoMHlPa0dtS29rdTZnNFJCZlI2R2xoTGRqOEk1SmhfS2x2R2JzM2xwbUJ4bnVIMjFoWVF4NVpkOHlIVFNQRGFlZ3lwNUVJbGhPOVJaZGlIZ29YdU9DV0NNeGhjOU4xUmp3dUJKcWg4dzAyM1psWHlJNXF6S1UyQUoyOUlGRU1ydy4tcGxZSy1VNWozS1FJbXJpb05WVmN3.RC1N-ZCjPhL5VrjSJl09Vwgew2JllSG95dC9N_dWhXVe0b0oAXii3rWW9O6aBIPZUD_-jRinpAlM1QbSThi_Zg", "HWWAFSESID": "aeaf773723d844176b", "HWWAFSESTIME": "1743463823899", "m_biztier_sign": "", "tenantid": "sfg8fw7k", "token": "d2ViLDM2MDAsaFdVRGxxMDZIRjBIZVZPSmpaNEtnMzJTbmkrVjFObC9MWVZhazR6SFo5Tk0vYldqdWw1YmVKcW1qYVF6cE1HYmJ0cjlPMEJkRFBacW9MSG5xdDA4cnc9PQ", "u_logints": "1743463824057", "u_usercode": "4a7ecbb9-8ca7-4b0c-b9d6-7fad20374565", "ucuserid": "4a7ecbb9-8ca7-4b0c-b9d6-7fad20374565", "yht_tenantinfo": "sfg8fw7k", "yht_username": "ST-45865926-uOzMXO7oKq3hwBsceflR-online__4a7ecbb9-8ca7-4b0c-b9d6-7fad20374565", "yht_usertoken": "aF%2FBoZO8QO0DytaTIR8tCLDIDsQ11t4VAcmJgZPy5FeN0muGmz8hBWvrg4oxhEsktfdWG59X59j94JnbXGnOug%3D%3D"}

File diff too large to display
+ 774 - 781
lzz_theme/yyc/yyc_zbgg_details.py


Some files are not shown because this commit changed too many files