Remove dead code

dzr, 1 week ago
Parent
Current commit
69040f53d2

+ 0 - 324
lzz_theme/qgzbgggsssyq/py_ssyq_details.py

@@ -1,324 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2024-08-06
----------
-@summary: 全国招标公告公示搜索引擎 (national bidding announcement search engine) - detail pages
----------
-@author: Lzz
-"""
-import sys
-import os
-
-sys.path.append(os.path.dirname(os.getcwd()))
-import json
-from utils.attachment import AttachmentDownloader
-from threading import Timer
-from parsel import Selector
-from utils.tools import *  # wildcard import; also supplies requests, execjs, time, random, re, logger, DuplicateKeyError used below
-
-
-
-class Details:
-
-    def __init__(self):
-        self.proxy = get_proxy(socks5h=True)
-        self.db_table = Mongo_client().py_spider
-        self.db_name = self.db_table.theme_list
-        self.zt_details = self.db_table.data_bak
-        self.rds = Redis_client()
-        self.redis_key = "ztpc_ssyq_msg"
-        self.delete_key = ""
-        self.end_state = False
-        self.headers = {
-            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-            "Pragma": "no-cache",
-            "Upgrade-Insecure-Requests": "1",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
-        }
-
-
-    def get_time__2652(self, page=None, cid=None, rid=None):
-        with open('./ssyq_pm.js', 'r') as fr:
-            ex_js = fr.read()
-        ctx = execjs.compile(ex_js)
-
-        return ctx.call('tm', page, cid, rid)
-
-    def get_type_1017(self, page):
-        with open('./ssyq.js', 'r') as fr:
-            ex_js = fr.read()
-        ctx = execjs.compile(ex_js)
-        return ctx.call('type_1017_dt', page)
-
-    def get_type_1017_f(self, href):
-        with open('./ssyq.js', 'r') as fr:
-            ex_js = fr.read()
-        ctx = execjs.compile(ex_js)
-        return ctx.call('type_1017_file', href)
-
-    def detail_get(self, response, item, new_url):
-        response.encoding = response.apparent_encoding
-        root = Selector(text=response.text)
-
-        if "来源渠道:必联电子招标投标平台" in response.text:
-            # the PDF carries a 必联 (BiLian) watermark; flag the record so it is not pushed to the save service
-            item["sendflag"] = "true"
-
-        dd = root.xpath('//div[@class="mian_list_03"]/@index').extract_first()
-
-        cookies = response.cookies.get_dict()
-        headers2 = {
-            "Accept": "*/*",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-            "Content-Length": "0",
-            "Origin": "https://bulletin.cebpubservice.com",
-            "Pragma": "no-cache",
-            "Referer": new_url,
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
-            "X-Requested-With": "XMLHttpRequest",
-        }
-
-        url2 = "https://bulletin.cebpubservice.com/details/permission/getSecretKey"
-        params = {
-            "time__2652": self.get_time__2652()
-        }
-        res = requests.post(url2, headers=headers2, cookies=cookies, params=params,
-                            timeout=30, proxies=self.proxy, verify=False)
-
-        ex_js = '''
-            CryptoJS = require("crypto-js")
-
-            function decryptByDES(ciphertext, key) {
-                    var keyHex = CryptoJS.enc.Utf8.parse("Ctpsp@884*");
-                    var decrypted = CryptoJS.DES.decrypt({
-                        ciphertext: CryptoJS.enc.Base64.parse(ciphertext)
-                    }, keyHex, {
-                        mode: CryptoJS.mode.ECB,
-                        padding: CryptoJS.pad.Pkcs7
-                    });
-                    return decrypted.toString(CryptoJS.enc.Utf8);
-            }
-            '''
-        ctx = execjs.compile(ex_js)
-        pm = ctx.call('decryptByDES', res.text.replace('"', ''))
-
-        item["contenthtml"] = "详情请访问原网页!"
-
-        attachments = {}
-        ffid = json.loads(pm).get('data')
-        f_org = f"/details/bulletin/getBulletin/{ffid}/{dd}"
-
-        for i in range(5):
-            file_url = f"https://bulletin.cebpubservice.com/details/bulletin/getBulletin/{ffid}/{dd}"
-            headers = {
-                "Accept": "*/*",
-                "Accept-Language": "zh-CN,zh;q=0.9",
-                "Cache-Control": "no-cache",
-                "Connection": "keep-alive",
-                "Pragma": "no-cache",
-                "Referer": "https://bulletin.cebpubservice.com/resource/ceb/js/pdfjs-dist/web/viewer.html",
-                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
-            }
-            f_tm = self.get_time__2652(f_org)
-            params = {
-                "time__2652": f"{f_tm}"
-            }
-            attachment = AttachmentDownloader().fetch_attachment(
-                file_name=item["title"], file_type="pdf", download_url=file_url,
-                proxies=self.proxy,headers=headers,params=params, is_check=True)
-            if attachment.get('size'):
-                attachments[str(len(attachments) + 1)] = attachment
-                break
-            time.sleep(random.randint(3, 6))
-            self.proxy = get_proxy(socks5h=True)
-            if i == 4:
-                raise FileNotFoundError("附件下载失败!")
-
-        if attachments:
-            item['projectinfo'] = {"attachments": attachments}
-
-        item = format_fileds(item)
-
-        try:
-            self.zt_details.insert_one(item)
-            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
-        except DuplicateKeyError:
-            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
-
-    def decrypt_by_des(self, text: str):
-        ex_js = '''
-        CryptoJS = require("crypto-js")
-        function en_str(t) {
-            var e = CryptoJS.enc.Utf8.parse("1qaz@wsx3e")
-              , i = CryptoJS.DES.decrypt({
-                ciphertext: CryptoJS.enc.Base64.parse(t)
-            }, e, {
-                mode: CryptoJS.mode.ECB,
-                padding: CryptoJS.pad.Pkcs7
-            });
-            return i.toString(CryptoJS.enc.Utf8)
-        }
-        '''
-        ctx = execjs.compile(ex_js)
-        data_org = ctx.call('en_str', text.replace('"', ''))
-        data_org = json.loads(data_org)  # plaintext is JSON; the previous eval() (true/false/null all replaced by 1) corrupted values
-        return data_org
-
-    def get_url(self, parse_url):
-        uid = "".join(re.findall('uuid=(.*?)&', parse_url))
-        headers = {
-            "Accept": "application/json, text/plain, */*",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-            "Pragma": "no-cache",
-            "Referer": "https://ctbpsp.com/",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
-        }
-
-        url = f"https://ctbpsp.com/cutominfoapi/bulletin/{uid}/uid/0"
-
-        retry = 0
-        data_org = None
-        while (retry := retry + 1) < 5:
-            params = {
-                "type__1017": self.get_type_1017(uid)
-            }
-            try:
-                res = requests.get(url, headers=headers, params=params, proxies=get_QGIP(), timeout=30)
-                data_org = self.decrypt_by_des(res.text.replace('"', ""))
-                break
-            except Exception:  # request or decrypt failed; retry with a fresh proxy from get_QGIP()
-                continue
-        if data_org is None:
-            raise RuntimeError("bulletin API request failed after retries")
-
-        new_href = data_org.get('data').get('pdfUrl')
-        pub_time = data_org.get('data').get('noticeSendTimeStr', '')
-        pbtime = pub_time.replace('年', '-').replace('月', '-').replace('日', '')
-        if "bulletinPDF" not in new_href:
-            new_href = data_org.get('data').get('noticeUrl')
-        return new_href,pbtime
-
-    def new_parse(self,item, pdfurl):
-
-        item["contenthtml"] = "详情请访问原网页!"
-
-        attachments = {}
-
-        headers = {
-            "Accept": "*/*",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-            "Pragma": "no-cache",
-            "Referer": f"https://ctbpsp.com/web_pdf/pdfjs-dist/web/viewer.html?file={pdfurl}",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
-        }
-
-        for _ in range(5):
-            f_tm = self.get_type_1017_f(pdfurl)
-            params = {
-                "type__1017": f"{f_tm}"
-            }
-            attachment = AttachmentDownloader().fetch_attachment(
-                file_name=item["title"], file_type="pdf", download_url=pdfurl,
-                proxies=get_QGIP(),headers=headers,params=params, is_check=True)
-            if attachment.get('size'):
-                attachments[str(len(attachments) + 1)] = attachment
-                break
-            time.sleep(random.randint(3, 6))
-            if _ == 4:
-                raise FileNotFoundError("附件下载失败!")
-
-        if attachments:
-            item['projectinfo'] = {"attachments": attachments}
-
-        item = format_fileds(item)
-
-        try:
-            self.zt_details.insert_one(item)
-            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
-        except DuplicateKeyError:
-            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
-
-
-    def fetch_request(self, url):
-        response = requests.get(url=url, headers=self.headers,
-                                proxies=self.proxy, timeout=(30, 60), verify=False)
-        return response
-
-    def deal_request(self, item):
-        response = None
-        retry_times = 0
-        org_item = item.copy()
-        while (retry_times := retry_times + 1) < 5:
-            try:
-                new_url,pub_time = self.get_url(item['parse_url'])
-                if "bulletinPDF" in new_url:
-                    try:
-                        date_to_timestamp(item['publishtime'])
-                    except:
-                        item['publishtime'] = pub_time
-                    self.new_parse(item=item,pdfurl=new_url)
-                    return True
-                else:
-                    response = self.fetch_request(new_url)
-                    if response is not None and response.status_code == 200:
-                        self.detail_get(response, item=item, new_url=new_url)
-                        time.sleep(random.random())
-                        return True
-            except Exception as e:
-                item = org_item
-                logger.exception(f"{item['href']} 采集异常:{e}")
-                time.sleep(random.randint(5,10))
-                self.proxy = get_proxy(socks5h=True)
-        logger.warning(f"[采集失败]{item['href']}")
-        return False
-
-    def countSec(self):
-        for count in range(5, 0, -1):
-            print(f'\r{count} 秒 后结束任务', end='')
-            time.sleep(1)
-        print('\r任务结束')
-
-    def de_redis_key(self):
-        self.end_state = True
-        self.rds.hdel(self.redis_key, self.delete_key)
-        logger.warning("当前数据未采集成功,数据已回填!")
-        self.countSec()
-
-    def start(self, limit=1):
-        logger.debug("********** 详情页采集开始 **********")
-        time.sleep(random.random())
-        count = 0
-        ts = Timer(1195, self.de_redis_key)  # watchdog: after 1195 s, release the in-flight key and stop the run
-        ts.start()  # start the watchdog
-        with self.db_name.find({"parser_name": "ztpc_qgzbgggsssyq", "failed": False, "is_crawl": False},
-                               no_cursor_timeout=True) as data_list:
-            for item in data_list:
-                # logger.debug(item)
-                if self.end_state:
-                    break
-                if count >= limit:
-                    break
-                unicode_key = md5value(item.get('href') + item.get('title'))
-                if not self.rds.hexists(self.redis_key, unicode_key):  # dedupe on all fields except the dynamic ones
-                    self.rds.hset(self.redis_key, unicode_key, '')
-                    self.delete_key = unicode_key
-                    count += 1
-                    update_id = item["_id"]
-                    retry = item["retry"]
-                    if self.deal_request(item):
-                        self.db_name.update_one({"_id": update_id}, {"$set": {"is_crawl": True}})
-                    else:
-                        retry += 1
-                        self.db_name.update_one({"_id": update_id}, {"$set": {"failed": True, "retry": retry}})
-
-        logger.debug("********** 详情页采集结束 **********")
-        ts.cancel()  # script finished within the time limit; cancel the watchdog
-
-if __name__ == "__main__":
-    Details().start(limit=300)
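
Review note: the two execjs round-trips above (keys "Ctpsp@884*" and "1qaz@wsx3e") only run CryptoJS DES/ECB with PKCS7 padding, which can be done natively in Python. A minimal sketch, assuming the cryptography package is available; it relies on CryptoJS using only the first 8 bytes of the UTF-8 key, and on 3DES with one 8-byte key repeated degenerating to single DES:

import base64
from cryptography.hazmat.primitives import padding
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes

def des_ecb_decrypt(ciphertext_b64: str, key: str) -> str:
    key8 = key.encode("utf-8")[:8]  # CryptoJS DES reads only the leading 8 key bytes
    decryptor = Cipher(algorithms.TripleDES(key8 * 3), modes.ECB()).decryptor()
    padded = decryptor.update(base64.b64decode(ciphertext_b64)) + decryptor.finalize()
    unpadder = padding.PKCS7(64).unpadder()  # DES block size is 64 bits
    return (unpadder.update(padded) + unpadder.finalize()).decode("utf-8")

Usage would mirror the execjs calls: des_ecb_decrypt(res.text.replace('"', ''), "Ctpsp@884*") for the getSecretKey response, and des_ecb_decrypt(res.text.replace('"', ''), "1qaz@wsx3e") inside decrypt_by_des.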

+ 0 - 323
lzz_theme/qgzbgggsssyq/py_ssyq_details2.py

@@ -1,323 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2024-08-06
----------
-@summary: 全国招标公告公示搜索引擎 (national bidding announcement search engine) - detail pages
----------
-@author: Lzz
-"""
-import sys
-import os
-
-sys.path.append(os.path.dirname(os.getcwd()))
-import json
-from utils.attachment import AttachmentDownloader
-from threading import Timer
-from parsel import Selector
-from utils.tools import *
-
-
-class Details:
-
-    def __init__(self):
-        self.proxy = get_proxy(socks5h=True)
-        self.db_table = Mongo_client().py_spider
-        self.db_name = self.db_table.theme_list
-        self.zt_details = self.db_table.data_bak
-        self.rds = Redis_client()
-        self.redis_key = "ztpc_ssyq_msg"
-        self.delete_key = ""
-        self.end_state = False
-        self.headers = {
-            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-            "Pragma": "no-cache",
-            "Upgrade-Insecure-Requests": "1",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
-        }
-
-
-    def get_time__2652(self, page=None, cid=None, rid=None):
-        with open('./ssyq_pm.js', 'r') as fr:
-            ex_js = fr.read()
-        ctx = execjs.compile(ex_js)
-
-        return ctx.call('tm', page, cid, rid)
-
-    def get_type_1017(self, page):
-        with open('./ssyq.js', 'r') as fr:
-            ex_js = fr.read()
-        ctx = execjs.compile(ex_js)
-        return ctx.call('type_1017_dt', page)
-
-    def get_type_1017_f(self, href):
-        with open('./ssyq.js', 'r') as fr:
-            ex_js = fr.read()
-        ctx = execjs.compile(ex_js)
-        return ctx.call('type_1017_file', href)
-
-    def detail_get(self, response, item, new_url):
-        response.encoding = response.apparent_encoding
-        root = Selector(text=response.text)
-
-        if "来源渠道:必联电子招标投标平台" in response.text:
-            # the PDF carries a 必联 (BiLian) watermark; flag the record so it is not pushed to the save service
-            item["sendflag"] = "true"
-
-        dd = root.xpath('//div[@class="mian_list_03"]/@index').extract_first()
-
-        cookies = response.cookies.get_dict()
-        headers2 = {
-            "Accept": "*/*",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-            "Content-Length": "0",
-            "Origin": "https://bulletin.cebpubservice.com",
-            "Pragma": "no-cache",
-            "Referer": new_url,
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
-            "X-Requested-With": "XMLHttpRequest",
-        }
-
-        url2 = "https://bulletin.cebpubservice.com/details/permission/getSecretKey"
-        params = {
-            "time__2652": self.get_time__2652()
-        }
-        res = requests.post(url2, headers=headers2, cookies=cookies, params=params,
-                            timeout=30, proxies=self.proxy, verify=False)
-
-        ex_js = '''
-            CryptoJS = require("crypto-js")
-
-            function decryptByDES(ciphertext, key) {
-                    var keyHex = CryptoJS.enc.Utf8.parse("Ctpsp@884*");
-                    var decrypted = CryptoJS.DES.decrypt({
-                        ciphertext: CryptoJS.enc.Base64.parse(ciphertext)
-                    }, keyHex, {
-                        mode: CryptoJS.mode.ECB,
-                        padding: CryptoJS.pad.Pkcs7
-                    });
-                    return decrypted.toString(CryptoJS.enc.Utf8);
-            }
-            '''
-        ctx = execjs.compile(ex_js)
-        pm = ctx.call('decryptByDES', res.text.replace('"', ''))
-
-        item["contenthtml"] = "详情请访问原网页!"
-
-        attachments = {}
-        ffid = json.loads(pm).get('data')
-        f_org = f"/details/bulletin/getBulletin/{ffid}/{dd}"
-
-        for i in range(5):
-            file_url = f"https://bulletin.cebpubservice.com/details/bulletin/getBulletin/{ffid}/{dd}"
-            headers = {
-                "Accept": "*/*",
-                "Accept-Language": "zh-CN,zh;q=0.9",
-                "Cache-Control": "no-cache",
-                "Connection": "keep-alive",
-                "Pragma": "no-cache",
-                "Referer": "https://bulletin.cebpubservice.com/resource/ceb/js/pdfjs-dist/web/viewer.html",
-                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
-            }
-            f_tm = self.get_time__2652(f_org)
-            params = {
-                "time__2652": f"{f_tm}"
-            }
-            attachment = AttachmentDownloader().fetch_attachment(
-                file_name=item["title"], file_type="pdf", download_url=file_url,
-                proxies=self.proxy,headers=headers,params=params, is_check=True)
-            if attachment.get('size'):
-                attachments[str(len(attachments) + 1)] = attachment
-                break
-            time.sleep(random.randint(3, 6))
-            self.proxy = get_proxy(socks5h=True)
-            if i == 4:
-                raise FileNotFoundError("附件下载失败!")
-
-
-        if attachments:
-            item['projectinfo'] = {"attachments": attachments}
-
-        item = format_fileds(item)
-
-        try:
-            self.zt_details.insert_one(item)
-            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
-        except DuplicateKeyError:
-            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
-
-    def decrypt_by_des(self, text: str):
-        ex_js = '''
-        CryptoJS = require("crypto-js")
-        function en_str(t) {
-            var e = CryptoJS.enc.Utf8.parse("1qaz@wsx3e")
-              , i = CryptoJS.DES.decrypt({
-                ciphertext: CryptoJS.enc.Base64.parse(t)
-            }, e, {
-                mode: CryptoJS.mode.ECB,
-                padding: CryptoJS.pad.Pkcs7
-            });
-            return i.toString(CryptoJS.enc.Utf8)
-        }
-        '''
-        ctx = execjs.compile(ex_js)
-        data_org = ctx.call('en_str', text.replace('"', ''))
-        data_org = json.loads(data_org)  # plaintext is JSON; the previous eval() (true/false/null all replaced by 1) corrupted values
-        return data_org
-
-    def get_url(self, parse_url):
-        uid = "".join(re.findall('uuid=(.*?)&', parse_url))
-        headers = {
-            "Accept": "application/json, text/plain, */*",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-            "Pragma": "no-cache",
-            "Referer": "https://ctbpsp.com/",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
-        }
-
-        url = f"https://ctbpsp.com/cutominfoapi/bulletin/{uid}/uid/0"
-
-        retry = 0
-        data_org = None
-        while (retry := retry + 1) < 5:
-            params = {
-                "type__1017": self.get_type_1017(uid)
-            }
-            try:
-                res = requests.get(url, headers=headers, params=params, proxies=get_QGIP(), timeout=30)
-                data_org = self.decrypt_by_des(res.text.replace('"', ""))
-                break
-            except Exception:  # request or decrypt failed; retry with a fresh proxy from get_QGIP()
-                continue
-        if data_org is None:
-            raise RuntimeError("bulletin API request failed after retries")
-
-        new_href = data_org.get('data').get('pdfUrl')
-        pub_time = data_org.get('data').get('noticeSendTimeStr', '')
-        pbtime = pub_time.replace('年', '-').replace('月', '-').replace('日', '')
-        if "bulletinPDF" not in new_href:
-            new_href = data_org.get('data').get('noticeUrl')
-        return new_href,pbtime
-
-    def new_parse(self,item, pdfurl):
-
-        item["contenthtml"] = "详情请访问原网页!"
-
-        attachments = {}
-
-        headers = {
-            "Accept": "*/*",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-            "Pragma": "no-cache",
-            "Referer": f"https://ctbpsp.com/web_pdf/pdfjs-dist/web/viewer.html?file={pdfurl}",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
-        }
-
-        for _ in range(5):
-            f_tm = self.get_type_1017_f(pdfurl)
-            params = {
-                "type__1017": f"{f_tm}"
-            }
-            attachment = AttachmentDownloader().fetch_attachment(
-                file_name=item["title"], file_type="pdf", download_url=pdfurl,
-                proxies=get_QGIP(),headers=headers,params=params, is_check=True)
-            if attachment.get('size'):
-                attachments[str(len(attachments) + 1)] = attachment
-                break
-            if _ == 4:
-                raise FileNotFoundError("附件下载失败!")
-
-        if attachments:
-            item['projectinfo'] = {"attachments": attachments}
-
-        item = format_fileds(item)
-
-        try:
-            self.zt_details.insert_one(item)
-            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
-        except DuplicateKeyError:
-            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
-
-
-    def fetch_request(self, url):
-        response = requests.get(url=url, headers=self.headers,
-                                proxies=self.proxy, timeout=(30, 60), verify=False)
-        return response
-
-    def deal_request(self, item):
-        response = None
-        retry_times = 0
-        org_item = item.copy()
-        while (retry_times := retry_times + 1) < 5:
-            try:
-                new_url,pub_time = self.get_url(item['parse_url'])
-                if "bulletinPDF" in new_url:
-                    try:
-                        date_to_timestamp(item['publishtime'])
-                    except:
-                        item['publishtime'] = pub_time
-                    self.new_parse(item=item,pdfurl=new_url)
-                    return True
-                else:
-                    response = self.fetch_request(new_url)
-                    if response is not None and response.status_code == 200:
-                        self.detail_get(response, item=item, new_url=new_url)
-                        time.sleep(random.random())
-                        return True
-            except Exception as e:
-                item = org_item
-                logger.exception(f"{item['href']} 采集异常:{e}")
-                time.sleep(random.randint(5,10))
-                self.proxy = get_proxy(socks5h=True)
-        logger.warning(f"[采集失败]{item['href']}")
-        return False
-
-    def countSec(self):
-        for count in range(5, 0, -1):
-            print(f'\r{count} 秒 后结束任务', end='')
-            time.sleep(1)
-        print('\r任务结束')
-
-    def de_redis_key(self):
-        self.end_state = True
-        self.rds.hdel(self.redis_key, self.delete_key)
-        logger.warning("当前数据未采集成功,数据已回填!")
-        self.countSec()
-
-    def start(self, limit=1):
-        logger.debug("********** 详情页采集开始 **********")
-        time.sleep(random.random())
-        count = 0
-        ts = Timer(1195, self.de_redis_key)  # watchdog: after 1195 s, release the in-flight key and stop the run
-        ts.start()  # start the watchdog
-        with self.db_name.find({"parser_name": "ztpc_qgzbgggsssyq", "failed": False, "is_crawl": False},
-                               no_cursor_timeout=True) as data_list:
-            for item in data_list:
-                # logger.debug(item)
-                if self.end_state:
-                    break
-                if count >= limit:
-                    break
-                unicode_key = md5value(item.get('href') + item.get('title'))
-                if not self.rds.hexists(self.redis_key, unicode_key):  # dedupe on all fields except the dynamic ones
-                    self.rds.hset(self.redis_key, unicode_key, '')
-                    self.delete_key = unicode_key
-                    count += 1
-                    update_id = item["_id"]
-                    retry = item["retry"]
-                    if self.deal_request(item):
-                        self.db_name.update_one({"_id": update_id}, {"$set": {"is_crawl": True}})
-                    else:
-                        retry += 1
-                        self.db_name.update_one({"_id": update_id}, {"$set": {"failed": True, "retry": retry}})
-
-        logger.debug("********** 详情页采集结束 **********")
-        ts.cancel()  # script finished within the time limit; cancel the watchdog
-
-if __name__ == "__main__":
-    Details().start(limit=1000)
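
Review note: py_ssyq_details.py, py_ssyq_details2.py, py_ssyq_details3.py and py_ssyq_details4.py are near-identical copies that differ essentially only in the hard-coded start-up limit (300 vs 1000). A single entry point taking the limit from the command line would have avoided the duplication; a sketch, assuming the Details class above:

import sys

if __name__ == "__main__":
    # default mirrors the original hard-coded value; run e.g. `python py_ssyq_details.py 1000`
    Details().start(limit=int(sys.argv[1]) if len(sys.argv) > 1 else 300)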

+ 0 - 323
lzz_theme/qgzbgggsssyq/py_ssyq_details3.py

@@ -1,323 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2024-08-06
----------
-@summary: 全国招标公告公示搜索引擎 (national bidding announcement search engine) - detail pages
----------
-@author: Lzz
-"""
-import sys
-import os
-
-sys.path.append(os.path.dirname(os.getcwd()))
-import json
-from utils.attachment import AttachmentDownloader
-from threading import Timer
-from parsel import Selector
-from utils.tools import *
-
-
-
-class Details:
-
-    def __init__(self):
-        self.proxy = get_proxy(socks5h=True)
-        self.db_table = Mongo_client().py_spider
-        self.db_name = self.db_table.theme_list
-        self.zt_details = self.db_table.data_bak
-        self.rds = Redis_client()
-        self.redis_key = "ztpc_ssyq_msg"
-        self.delete_key = ""
-        self.end_state = False
-        self.headers = {
-            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-            "Pragma": "no-cache",
-            "Upgrade-Insecure-Requests": "1",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
-        }
-
-
-    def get_time__2652(self, page=None, cid=None, rid=None):
-        with open('./ssyq_pm.js', 'r') as fr:
-            ex_js = fr.read()
-        ctx = execjs.compile(ex_js)
-
-        return ctx.call('tm', page, cid, rid)
-
-    def get_type_1017(self, page):
-        with open('./ssyq.js', 'r') as fr:
-            ex_js = fr.read()
-        ctx = execjs.compile(ex_js)
-        return ctx.call('type_1017_dt', page)
-
-    def get_type_1017_f(self, href):
-        with open('./ssyq.js', 'r') as fr:
-            ex_js = fr.read()
-        ctx = execjs.compile(ex_js)
-        return ctx.call('type_1017_file', href)
-
-    def detail_get(self, response, item, new_url):
-        response.encoding = response.apparent_encoding
-        root = Selector(text=response.text)
-
-        if "来源渠道:必联电子招标投标平台" in response.text:
-            # the PDF carries a 必联 (BiLian) watermark; flag the record so it is not pushed to the save service
-            item["sendflag"] = "true"
-
-        dd = root.xpath('//div[@class="mian_list_03"]/@index').extract_first()
-
-        cookies = response.cookies.get_dict()
-        headers2 = {
-            "Accept": "*/*",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-            "Content-Length": "0",
-            "Origin": "https://bulletin.cebpubservice.com",
-            "Pragma": "no-cache",
-            "Referer": new_url,
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
-            "X-Requested-With": "XMLHttpRequest",
-        }
-
-        url2 = "https://bulletin.cebpubservice.com/details/permission/getSecretKey"
-        params = {
-            "time__2652": self.get_time__2652()
-        }
-        res = requests.post(url2, headers=headers2, cookies=cookies, params=params,
-                            timeout=30, proxies=self.proxy, verify=False)
-
-        ex_js = '''
-            CryptoJS = require("crypto-js")
-
-            function decryptByDES(ciphertext, key) {
-                    var keyHex = CryptoJS.enc.Utf8.parse("Ctpsp@884*");
-                    var decrypted = CryptoJS.DES.decrypt({
-                        ciphertext: CryptoJS.enc.Base64.parse(ciphertext)
-                    }, keyHex, {
-                        mode: CryptoJS.mode.ECB,
-                        padding: CryptoJS.pad.Pkcs7
-                    });
-                    return decrypted.toString(CryptoJS.enc.Utf8);
-            }
-            '''
-        ctx = execjs.compile(ex_js)
-        pm = ctx.call('decryptByDES', res.text.replace('"', ''))
-
-        item["contenthtml"] = "详情请访问原网页!"
-
-        attachments = {}
-        ffid = json.loads(pm).get('data')
-        f_org = f"/details/bulletin/getBulletin/{ffid}/{dd}"
-
-        for i in range(5):
-            file_url = f"https://bulletin.cebpubservice.com/details/bulletin/getBulletin/{ffid}/{dd}"
-            headers = {
-                "Accept": "*/*",
-                "Accept-Language": "zh-CN,zh;q=0.9",
-                "Cache-Control": "no-cache",
-                "Connection": "keep-alive",
-                "Pragma": "no-cache",
-                "Referer": "https://bulletin.cebpubservice.com/resource/ceb/js/pdfjs-dist/web/viewer.html",
-                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
-            }
-            f_tm = self.get_time__2652(f_org)
-            params = {
-                "time__2652": f"{f_tm}"
-            }
-            attachment = AttachmentDownloader().fetch_attachment(
-                file_name=item["title"], file_type="pdf", download_url=file_url,
-                proxies=self.proxy,headers=headers,params=params, is_check=True)
-            if attachment.get('size'):
-                attachments[str(len(attachments) + 1)] = attachment
-                break
-            time.sleep(random.randint(3, 6))
-            self.proxy = get_proxy(socks5h=True)
-            if i == 4:
-                raise FileNotFoundError("附件下载失败!")
-
-        if attachments:
-            item['projectinfo'] = {"attachments": attachments}
-
-        item = format_fileds(item)
-
-        try:
-            self.zt_details.insert_one(item)
-            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
-        except DuplicateKeyError:
-            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
-
-    def decrypt_by_des(self, text: str):
-        ex_js = '''
-        CryptoJS = require("crypto-js")
-        function en_str(t) {
-            var e = CryptoJS.enc.Utf8.parse("1qaz@wsx3e")
-              , i = CryptoJS.DES.decrypt({
-                ciphertext: CryptoJS.enc.Base64.parse(t)
-            }, e, {
-                mode: CryptoJS.mode.ECB,
-                padding: CryptoJS.pad.Pkcs7
-            });
-            return i.toString(CryptoJS.enc.Utf8)
-        }
-        '''
-        ctx = execjs.compile(ex_js)
-        data_org = ctx.call('en_str', text.replace('"', ''))
-        data_org = json.loads(data_org)  # plaintext is JSON; the previous eval() (true/false/null all replaced by 1) corrupted values
-        return data_org
-
-    def get_url(self, parse_url):
-        uid = "".join(re.findall('uuid=(.*?)&', parse_url))
-        headers = {
-            "Accept": "application/json, text/plain, */*",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-            "Pragma": "no-cache",
-            "Referer": "https://ctbpsp.com/",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
-        }
-
-        url = f"https://ctbpsp.com/cutominfoapi/bulletin/{uid}/uid/0"
-
-        retry = 0
-        data_org = None
-        while (retry := retry + 1) < 5:
-            params = {
-                "type__1017": self.get_type_1017(uid)
-            }
-            try:
-                res = requests.get(url, headers=headers, params=params, proxies=get_QGIP(), timeout=30)
-                data_org = self.decrypt_by_des(res.text.replace('"', ""))
-                break
-            except Exception:  # request or decrypt failed; retry with a fresh proxy from get_QGIP()
-                continue
-        if data_org is None:
-            raise RuntimeError("bulletin API request failed after retries")
-
-        new_href = data_org.get('data').get('pdfUrl')
-        pub_time = data_org.get('data').get('noticeSendTimeStr', '')
-        pbtime = pub_time.replace('年', '-').replace('月', '-').replace('日', '')
-        if "bulletinPDF" not in new_href:
-            new_href = data_org.get('data').get('noticeUrl')
-        return new_href,pbtime
-
-    def new_parse(self,item, pdfurl):
-
-        item["contenthtml"] = "详情请访问原网页!"
-
-        attachments = {}
-
-        headers = {
-            "Accept": "*/*",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-            "Pragma": "no-cache",
-            "Referer": f"https://ctbpsp.com/web_pdf/pdfjs-dist/web/viewer.html?file={pdfurl}",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
-        }
-
-        for _ in range(5):
-            f_tm = self.get_type_1017_f(pdfurl)
-            params = {
-                "type__1017": f"{f_tm}"
-            }
-            attachment = AttachmentDownloader().fetch_attachment(
-                file_name=item["title"], file_type="pdf", download_url=pdfurl,
-                proxies=get_QGIP(),headers=headers,params=params, is_check=True)
-            if attachment.get('size'):
-                attachments[str(len(attachments) + 1)] = attachment
-                break
-            if _ == 4:
-                raise FileNotFoundError("附件下载失败!")
-
-        if attachments:
-            item['projectinfo'] = {"attachments": attachments}
-
-        item = format_fileds(item)
-
-        try:
-            self.zt_details.insert_one(item)
-            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
-        except DuplicateKeyError:
-            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
-
-
-    def fetch_request(self, url):
-        response = requests.get(url=url, headers=self.headers,
-                                proxies=self.proxy, timeout=(30, 60), verify=False)
-        return response
-
-    def deal_request(self, item):
-        response = None
-        retry_times = 0
-        org_item = item.copy()
-        while (retry_times := retry_times + 1) < 5:
-            try:
-                new_url,pub_time = self.get_url(item['parse_url'])
-                if "bulletinPDF" in new_url:
-                    try:
-                        date_to_timestamp(item['publishtime'])
-                    except:
-                        item['publishtime'] = pub_time
-                    self.new_parse(item=item,pdfurl=new_url)
-                    return True
-                else:
-                    response = self.fetch_request(new_url)
-                    if response is not None and response.status_code == 200:
-                        self.detail_get(response, item=item, new_url=new_url)
-                        time.sleep(random.random())
-                        return True
-            except Exception as e:
-                item = org_item
-                logger.exception(f"{item['href']} 采集异常:{e}")
-                time.sleep(random.randint(5,10))
-                self.proxy = get_proxy(socks5h=True)
-        logger.warning(f"[采集失败]{item['href']}")
-        return False
-
-    def countSec(self):
-        for count in range(5, 0, -1):
-            print(f'\r{count} 秒 后结束任务', end='')
-            time.sleep(1)
-        print('\r任务结束')
-
-    def de_redis_key(self):
-        self.end_state = True
-        self.rds.hdel(self.redis_key, self.delete_key)
-        logger.warning("当前数据未采集成功,数据已回填!")
-        self.countSec()
-
-    def start(self, limit=1):
-        logger.debug("********** 详情页采集开始 **********")
-        time.sleep(random.random())
-        count = 0
-        ts = Timer(1195, self.de_redis_key)  # watchdog: after 1195 s, release the in-flight key and stop the run
-        ts.start()  # start the watchdog
-        with self.db_name.find({"parser_name": "ztpc_qgzbgggsssyq", "failed": False, "is_crawl": False},
-                               no_cursor_timeout=True) as data_list:
-            for item in data_list:
-                # logger.debug(item)
-                if self.end_state:
-                    break
-                if count >= limit:
-                    break
-                unicode_key = md5value(item.get('href') + item.get('title'))
-                if not self.rds.hexists(self.redis_key, unicode_key):  # dedupe on all fields except the dynamic ones
-                    self.rds.hset(self.redis_key, unicode_key, '')
-                    self.delete_key = unicode_key
-                    count += 1
-                    update_id = item["_id"]
-                    retry = item["retry"]
-                    if self.deal_request(item):
-                        self.db_name.update_one({"_id": update_id}, {"$set": {"is_crawl": True}})
-                    else:
-                        retry += 1
-                        self.db_name.update_one({"_id": update_id}, {"$set": {"failed": True, "retry": retry}})
-
-        logger.debug("********** 详情页采集结束 **********")
-        ts.cancel()  # script finished within the time limit; cancel the watchdog
-
-if __name__ == "__main__":
-    Details().start(limit=300)
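
Review note: the Timer(1195, self.de_redis_key) in start() is a watchdog, not a scheduler. If a run is still going after ~20 minutes, de_redis_key sets end_state (breaking the cursor loop) and deletes the last in-flight dedupe key from the Redis hash so the item can be retried by a later run; on normal completion ts.cancel() skips that rollback. The same fail-safe in isolation, as a sketch with hypothetical names:

from threading import Timer

class InFlightWatchdog:
    """Sketch: roll back the in-flight dedupe key if a run exceeds its deadline."""

    def __init__(self, rds, redis_key, seconds=1195):
        self.rds = rds                # any client exposing hdel(), e.g. redis.Redis
        self.redis_key = redis_key
        self.delete_key = ""          # key of the item currently being crawled
        self.expired = False          # the crawl loop polls this flag and breaks
        self._timer = Timer(seconds, self._on_timeout)

    def _on_timeout(self):
        self.expired = True
        self.rds.hdel(self.redis_key, self.delete_key)  # re-queue the unfinished item

    def start(self):
        self._timer.start()

    def cancel(self):
        self._timer.cancel()          # normal completion: keep the dedupe mark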

+ 0 - 323
lzz_theme/qgzbgggsssyq/py_ssyq_details4.py

@@ -1,323 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2024-08-06
----------
-@summary: 全国招标公告公示搜索引擎 (national bidding announcement search engine) - detail pages
----------
-@author: Lzz
-"""
-import sys
-import os
-
-sys.path.append(os.path.dirname(os.getcwd()))
-import json
-from utils.attachment import AttachmentDownloader
-from threading import Timer
-from parsel import Selector
-from utils.tools import *
-
-
-
-class Details:
-
-    def __init__(self):
-        self.proxy = get_proxy(socks5h=True)
-        self.db_table = Mongo_client().py_spider
-        self.db_name = self.db_table.theme_list
-        self.zt_details = self.db_table.data_bak
-        self.rds = Redis_client()
-        self.redis_key = "ztpc_ssyq_msg"
-        self.delete_key = ""
-        self.end_state = False
-        self.headers = {
-            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-            "Pragma": "no-cache",
-            "Upgrade-Insecure-Requests": "1",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
-        }
-
-
-    def get_time__2652(self, page=None, cid=None, rid=None):
-        with open('./ssyq_pm.js', 'r') as fr:
-            ex_js = fr.read()
-        ctx = execjs.compile(ex_js)
-
-        return ctx.call('tm', page, cid, rid)
-
-    def get_type_1017(self, page):
-        with open('./ssyq.js', 'r') as fr:
-            ex_js = fr.read()
-        ctx = execjs.compile(ex_js)
-        return ctx.call('type_1017_dt', page)
-
-    def get_type_1017_f(self, href):
-        with open('./ssyq.js', 'r') as fr:
-            ex_js = fr.read()
-        ctx = execjs.compile(ex_js)
-        return ctx.call('type_1017_file', href)
-
-    def detail_get(self, response, item, new_url):
-        response.encoding = response.apparent_encoding
-        root = Selector(text=response.text)
-
-        if "来源渠道:必联电子招标投标平台" in response.text:
-            # the PDF carries a 必联 (BiLian) watermark; flag the record so it is not pushed to the save service
-            item["sendflag"] = "true"
-
-        dd = root.xpath('//div[@class="mian_list_03"]/@index').extract_first()
-
-        cookies = response.cookies.get_dict()
-        headers2 = {
-            "Accept": "*/*",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-            "Content-Length": "0",
-            "Origin": "https://bulletin.cebpubservice.com",
-            "Pragma": "no-cache",
-            "Referer": new_url,
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
-            "X-Requested-With": "XMLHttpRequest",
-        }
-
-        url2 = "https://bulletin.cebpubservice.com/details/permission/getSecretKey"
-        params = {
-            "time__2652": self.get_time__2652()
-        }
-        res = requests.post(url2, headers=headers2, cookies=cookies, params=params,
-                            timeout=30, proxies=self.proxy, verify=False)
-
-        ex_js = '''
-            CryptoJS = require("crypto-js")
-
-            function decryptByDES(ciphertext, key) {
-                    var keyHex = CryptoJS.enc.Utf8.parse("Ctpsp@884*");
-                    var decrypted = CryptoJS.DES.decrypt({
-                        ciphertext: CryptoJS.enc.Base64.parse(ciphertext)
-                    }, keyHex, {
-                        mode: CryptoJS.mode.ECB,
-                        padding: CryptoJS.pad.Pkcs7
-                    });
-                    return decrypted.toString(CryptoJS.enc.Utf8);
-            }
-            '''
-        ctx = execjs.compile(ex_js)
-        pm = ctx.call('decryptByDES', res.text.replace('"', ''))
-
-        item["contenthtml"] = "详情请访问原网页!"
-
-        attachments = {}
-        ffid = json.loads(pm).get('data')
-        f_org = f"/details/bulletin/getBulletin/{ffid}/{dd}"
-
-        for i in range(5):
-            file_url = f"https://bulletin.cebpubservice.com/details/bulletin/getBulletin/{ffid}/{dd}"
-            headers = {
-                "Accept": "*/*",
-                "Accept-Language": "zh-CN,zh;q=0.9",
-                "Cache-Control": "no-cache",
-                "Connection": "keep-alive",
-                "Pragma": "no-cache",
-                "Referer": "https://bulletin.cebpubservice.com/resource/ceb/js/pdfjs-dist/web/viewer.html",
-                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
-            }
-            f_tm = self.get_time__2652(f_org)
-            params = {
-                "time__2652": f"{f_tm}"
-            }
-            attachment = AttachmentDownloader().fetch_attachment(
-                file_name=item["title"], file_type="pdf", download_url=file_url,
-                proxies=self.proxy,headers=headers,params=params, is_check=True)
-            if attachment.get('size'):
-                attachments[str(len(attachments) + 1)] = attachment
-                break
-            time.sleep(random.randint(3, 6))
-            self.proxy = get_proxy(socks5h=True)
-            if i == 4:
-                raise FileNotFoundError("附件下载失败!")
-
-        if attachments:
-            item['projectinfo'] = {"attachments": attachments}
-
-        item = format_fileds(item)
-
-        try:
-            self.zt_details.insert_one(item)
-            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
-        except DuplicateKeyError:
-            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
-
-    def decrypt_by_des(self, text: str):
-        ex_js = '''
-        CryptoJS = require("crypto-js")
-        function en_str(t) {
-            var e = CryptoJS.enc.Utf8.parse("1qaz@wsx3e")
-              , i = CryptoJS.DES.decrypt({
-                ciphertext: CryptoJS.enc.Base64.parse(t)
-            }, e, {
-                mode: CryptoJS.mode.ECB,
-                padding: CryptoJS.pad.Pkcs7
-            });
-            return i.toString(CryptoJS.enc.Utf8)
-        }
-        '''
-        ctx = execjs.compile(ex_js)
-        data_org = ctx.call('en_str', text.replace('"', ''))
-        data_org = json.loads(data_org)  # plaintext is JSON; the previous eval() (true/false/null all replaced by 1) corrupted values
-        return data_org
-
-    def get_url(self, parse_url):
-        uid = "".join(re.findall('uuid=(.*?)&', parse_url))
-        headers = {
-            "Accept": "application/json, text/plain, */*",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-            "Pragma": "no-cache",
-            "Referer": "https://ctbpsp.com/",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
-        }
-
-        url = f"https://ctbpsp.com/cutominfoapi/bulletin/{uid}/uid/0"
-
-        retry = 0
-        data_org = None
-        while (retry := retry + 1) < 5:
-            params = {
-                "type__1017": self.get_type_1017(uid)
-            }
-            try:
-                res = requests.get(url, headers=headers, params=params, proxies=get_QGIP(), timeout=30)
-                data_org = self.decrypt_by_des(res.text.replace('"', ""))
-                break
-            except Exception:  # request or decrypt failed; retry with a fresh proxy from get_QGIP()
-                continue
-        if data_org is None:
-            raise RuntimeError("bulletin API request failed after retries")
-
-        new_href = data_org.get('data').get('pdfUrl')
-        pub_time = data_org.get('data').get('noticeSendTimeStr', '')
-        pbtime = pub_time.replace('年', '-').replace('月', '-').replace('日', '')
-        if "bulletinPDF" not in new_href:
-            new_href = data_org.get('data').get('noticeUrl')
-        return new_href,pbtime
-
-    def new_parse(self,item, pdfurl):
-
-        item["contenthtml"] = "详情请访问原网页!"
-
-        attachments = {}
-
-        headers = {
-            "Accept": "*/*",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-            "Pragma": "no-cache",
-            "Referer": f"https://ctbpsp.com/web_pdf/pdfjs-dist/web/viewer.html?file={pdfurl}",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
-        }
-
-        for _ in range(5):
-            f_tm = self.get_type_1017_f(pdfurl)
-            params = {
-                "type__1017": f"{f_tm}"
-            }
-            attachment = AttachmentDownloader().fetch_attachment(
-                file_name=item["title"], file_type="pdf", download_url=pdfurl,
-                proxies=get_QGIP(),headers=headers,params=params, is_check=True)
-            if attachment.get('size'):
-                attachments[str(len(attachments) + 1)] = attachment
-                break
-            if _ == 4:
-                raise FileNotFoundError("附件下载失败!")
-
-        if attachments:
-            item['projectinfo'] = {"attachments": attachments}
-
-        item = format_fileds(item)
-
-        try:
-            self.zt_details.insert_one(item)
-            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
-        except DuplicateKeyError:
-            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
-
-
-    def fetch_request(self, url):
-        response = requests.get(url=url, headers=self.headers,
-                                proxies=self.proxy, timeout=(30, 60), verify=False)
-        return response
-
-    def deal_request(self, item):
-        response = None
-        retry_times = 0
-        org_item = item.copy()
-        while (retry_times := retry_times + 1) < 5:
-            try:
-                new_url,pub_time = self.get_url(item['parse_url'])
-                if "bulletinPDF" in new_url:
-                    try:
-                        date_to_timestamp(item['publishtime'])
-                    except:
-                        item['publishtime'] = pub_time
-                    self.new_parse(item=item,pdfurl=new_url)
-                    return True
-                else:
-                    response = self.fetch_request(new_url)
-                    if response is not None and response.status_code == 200:
-                        self.detail_get(response, item=item, new_url=new_url)
-                        time.sleep(random.random())
-                        return True
-            except Exception as e:
-                item = org_item
-                logger.exception(f"{item['href']} 采集异常:{e}")
-                time.sleep(random.randint(5,10))
-                self.proxy = get_proxy(socks5h=True)
-        logger.warning(f"[采集失败]{item['href']}")
-        return False
-
-    def countSec(self):
-        for count in range(5, 0, -1):
-            print(f'\r{count} 秒 后结束任务', end='')
-            time.sleep(1)
-        print('\r任务结束')
-
-    def de_redis_key(self):
-        self.end_state = True
-        self.rds.hdel(self.redis_key, self.delete_key)
-        logger.warning("当前数据未采集成功,数据已回填!")
-        self.countSec()
-
-    def start(self, limit=1):
-        logger.debug("********** 详情页采集开始 **********")
-        time.sleep(random.random())
-        count = 0
-        ts = Timer(1195, self.de_redis_key)  # watchdog: after 1195 s, release the in-flight key and stop the run
-        ts.start()  # start the watchdog
-        with self.db_name.find({"parser_name": "ztpc_qgzbgggsssyq", "failed": False, "is_crawl": False},
-                               no_cursor_timeout=True) as data_list:
-            for item in data_list:
-                # logger.debug(item)
-                if self.end_state:
-                    break
-                if count >= limit:
-                    break
-                unicode_key = md5value(item.get('href') + item.get('title'))
-                if not self.rds.hexists(self.redis_key, unicode_key):  # dedupe on all fields except the dynamic ones
-                    self.rds.hset(self.redis_key, unicode_key, '')
-                    self.delete_key = unicode_key
-                    count += 1
-                    update_id = item["_id"]
-                    retry = item["retry"]
-                    if self.deal_request(item):
-                        self.db_name.update_one({"_id": update_id}, {"$set": {"is_crawl": True}})
-                    else:
-                        retry += 1
-                        self.db_name.update_one({"_id": update_id}, {"$set": {"failed": True, "retry": retry}})
-
-        logger.debug("********** 详情页采集结束 **********")
-        ts.cancel()  # script finished within the time limit; cancel the watchdog
-
-if __name__ == "__main__":
-    Details().start(limit=300)

+ 0 - 300
lzz_theme/qgzbgggsssyq/py_ssyq_details_bu.py

@@ -1,300 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2024-08-06
----------
-@summary: 全国招标公告公示搜索引擎 (national bidding announcement search engine) - detail pages
----------
-@author: Lzz
-"""
-import sys
-import os
-
-sys.path.append(os.path.dirname(os.getcwd()))
-import json
-from utils.attachment import AttachmentDownloader
-from threading import Timer
-from parsel import Selector
-from utils.tools import *
-
-
-
-
-class Details:
-
-    def __init__(self):
-        self.proxy = get_proxy(socks5h=True)
-        self.db_table = Mongo_client().py_spider
-        self.db_name = self.db_table.theme_list
-        self.zt_details = self.db_table.data_bak
-        self.rds = Redis_client()
-        self.redis_key = "ztpc_ssyq_msg"
-        self.delete_key = ""
-        self.end_state = False
-        self.headers = {
-            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-            "Pragma": "no-cache",
-            "Upgrade-Insecure-Requests": "1",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
-        }
-
-
-    def get_time__2652(self, page=None, cid=None, rid=None):
-        with open('./ssyq_pm.js', 'r') as fr:
-            ex_js = fr.read()
-        ctx = execjs.compile(ex_js)
-
-        return ctx.call('tm', page, cid, rid)
-
-    def get_type_1017(self, page):
-        with open('./ssyq.js', 'r') as fr:
-            ex_js = fr.read()
-        ctx = execjs.compile(ex_js)
-        return ctx.call('type_1017_dt', page)
-
-    def get_type_1017_f(self, href):
-        with open('./ssyq.js', 'r') as fr:
-            ex_js = fr.read()
-        ctx = execjs.compile(ex_js)
-        return ctx.call('type_1017_file', href)
-
-    def detail_get(self, response, item, new_url):
-        response.encoding = response.apparent_encoding
-        root = Selector(text=response.text)
-
-        if "来源渠道:必联电子招标投标平台" in response.text:
-            # the PDF carries a 必联 (BiLian) watermark; flag the record so it is not pushed to the save service
-            item["sendflag"] = "true"
-
-        dd = root.xpath('//div[@class="mian_list_03"]/@index').extract_first()
-
-        cookies = response.cookies.get_dict()
-        headers2 = {
-            "Accept": "*/*",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-            "Content-Length": "0",
-            "Origin": "https://bulletin.cebpubservice.com",
-            "Pragma": "no-cache",
-            "Referer": new_url,
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
-            "X-Requested-With": "XMLHttpRequest",
-        }
-
-        url2 = "https://bulletin.cebpubservice.com/details/permission/getSecretKey"
-        params = {
-            "time__2652": self.get_time__2652()
-        }
-        res = requests.post(url2, headers=headers2, cookies=cookies, params=params,
-                            timeout=30, proxies=self.proxy, verify=False)
-
-        ex_js = '''
-            CryptoJS = require("crypto-js")
-
-            function decryptByDES(ciphertext, key) {
-                    var keyHex = CryptoJS.enc.Utf8.parse("Ctpsp@884*");
-                    var decrypted = CryptoJS.DES.decrypt({
-                        ciphertext: CryptoJS.enc.Base64.parse(ciphertext)
-                    }, keyHex, {
-                        mode: CryptoJS.mode.ECB,
-                        padding: CryptoJS.pad.Pkcs7
-                    });
-                    return decrypted.toString(CryptoJS.enc.Utf8);
-            }
-            '''
-        ctx = execjs.compile(ex_js)
-        pm = ctx.call('decryptByDES', res.text.replace('"', ''))
-
-        item["contenthtml"] = "详情请访问原网页!"
-
-        attachments = {}
-        ffid = json.loads(pm).get('data')
-        f_org = f"/details/bulletin/getBulletin/{ffid}/{dd}"
-
-        for i in range(5):
-            file_url = f"https://bulletin.cebpubservice.com/details/bulletin/getBulletin/{ffid}/{dd}"
-            headers = {
-                "Accept": "*/*",
-                "Accept-Language": "zh-CN,zh;q=0.9",
-                "Cache-Control": "no-cache",
-                "Connection": "keep-alive",
-                "Pragma": "no-cache",
-                "Referer": "https://bulletin.cebpubservice.com/resource/ceb/js/pdfjs-dist/web/viewer.html",
-                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
-            }
-            f_tm = self.get_time__2652(f_org)
-            params = {
-                "time__2652": f"{f_tm}"
-            }
-            attachment = AttachmentDownloader().fetch_attachment(
-                file_name=item["title"], file_type="pdf", download_url=file_url,
-                proxies=self.proxy,headers=headers,params=params, is_check=True)
-            if attachment.get('size'):
-                attachments[str(len(attachments) + 1)] = attachment
-                break
-            time.sleep(random.randint(3, 6))
-            self.proxy = get_proxy(socks5h=True)
-            if i == 4:
-                raise FileNotFoundError("附件下载失败!")
-
-        if attachments:
-            item['projectinfo'] = {"attachments": attachments}
-
-        item = format_fileds(item)
-
-        try:
-            self.zt_details.insert_one(item)
-            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
-        except DuplicateKeyError:
-            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
-
-    def decrypt_by_des(self, text: str):
-        ex_js = '''
-        CryptoJS = require("crypto-js")
-        function en_str(t) {
-            var e = CryptoJS.enc.Utf8.parse("1qaz@wsx3e")
-              , i = CryptoJS.DES.decrypt({
-                ciphertext: CryptoJS.enc.Base64.parse(t)
-            }, e, {
-                mode: CryptoJS.mode.ECB,
-                padding: CryptoJS.pad.Pkcs7
-            });
-            return i.toString(CryptoJS.enc.Utf8)
-        }
-        '''
-        ctx = execjs.compile(ex_js)
-        data_org = ctx.call('en_str', text.replace('"', ''))
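-        # JS true/false/null literals are mashed to 1 so the decrypted JSON-like string
-        # can be eval'd as a Python dict; only the 'data' fields are read downstream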
-        data_org = eval(data_org.replace('true', '1').replace('false', '1').replace('null', '1'))
-        return data_org
-
-    def get_url(self, parse_url):
-        uid = "".join(re.findall('uuid=(.*?)&', parse_url))
-        headers = {
-            "Accept": "application/json, text/plain, */*",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-            "Pragma": "no-cache",
-            "Referer": "https://ctbpsp.com/",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
-        }
-
-        url = f"https://ctbpsp.com/cutominfoapi/bulletin/{uid}/uid/0"
-
-        data_org = None
-        retry = 0
-        while (retry := retry + 1) < 5:
-            params = {
-                "type__1017": self.get_type_1017(uid)
-            }
-            try:
-                res = requests.get(url, headers=headers, params=params, proxies=get_QGIP(), timeout=30)
-                data_org = self.decrypt_by_des(res.text.replace('"', ""))
-                break
-            except Exception:
-                pass
-        if data_org is None:
-            # every retry failed; fail fast instead of hitting a NameError below
-            raise ValueError("详情接口请求失败!")
-
-        new_href = data_org.get('data').get('pdfUrl')
-        pub_time = data_org.get('data').get('noticeSendTimeStr', '')
-        pbtime = pub_time.replace('年', '-').replace('月', '-').replace('日','')
-        if "bulletinPDF" not in new_href:
-            new_href = data_org.get('data').get('noticeUrl')
-        return new_href, pbtime
-
-    def new_parse(self, item, pdfurl):
-
-        item["contenthtml"] = "详情请访问原网页!"
-
-        attachments = {}
-
-        headers = {
-            "Accept": "*/*",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-            "Pragma": "no-cache",
-            "Referer": f"https://ctbpsp.com/web_pdf/pdfjs-dist/web/viewer.html?file={pdfurl}",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
-        }
-
-        for _ in range(5):
-            f_tm = self.get_type_1017_f(pdfurl)
-            params = {
-                "type__1017": f"{f_tm}"
-            }
-            attachment = AttachmentDownloader().fetch_attachment(
-                file_name=item["title"], file_type="pdf", download_url=pdfurl,
-                proxies=get_QGIP(),headers=headers,params=params, is_check=True)
-            if attachment.get('size'):
-                attachments[str(len(attachments) + 1)] = attachment
-                break
-            if _ == 4:
-                raise FileNotFoundError("附件下载失败!")
-
-        if attachments:
-            item['projectinfo'] = {"attachments": attachments}
-
-        item = format_fileds(item)
-
-        try:
-            self.zt_details.insert_one(item)
-            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
-        except DuplicateKeyError:
-            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
-
-
-    def fetch_request(self, url):
-        response = requests.get(url=url, headers=self.headers,
-                                proxies=self.proxy, timeout=(30, 60), verify=False)
-        return response
-
-    def deal_request(self, item):
-        response = None
-        retry_times = 0
-        org_item = item.copy()
-        while (retry_times := retry_times + 1) < 5:
-            try:
-                new_url, pub_time = self.get_url(item['parse_url'])
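-                # direct PDF links go through new_parse; everything else is fetched as an HTML detail page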
-                if "bulletinPDF" in new_url:
-                    try:
-                        date_to_timestamp(item['publishtime'])
-                    except:
-                        item['publishtime'] = pub_time
-                    self.new_parse(item=item, pdfurl=new_url)
-                    return True
-                else:
-                    response = self.fetch_request(new_url)
-                    if response is not None and response.status_code == 200:
-                        self.detail_get(response, item=item, new_url=new_url)
-                        time.sleep(random.random())
-                        return True
-            except Exception as e:
-                item = org_item
-                logger.exception(f"{item['href']} 采集异常:{e}")
-                time.sleep(random.randint(5,10))
-                self.proxy = get_proxy(socks5h=True)
-        logger.warning(f"[采集失败]{item['href']}")
-        return False
-
-    def start(self, limit=1):
-        logger.debug("********** 详情页采集开始 **********")
-
-        with self.db_name.find({"parser_name": "ztpc_qgzbgggsssyq", "failed": True, "is_crawl": False,
-                                "retry":{"$lt":3}}, no_cursor_timeout=True).limit(limit) as cursor:
-            data_list = [dd for dd in cursor]
-        for item in data_list:
-            # logger.debug(item)
-            update_id = item["_id"]
-            retry = item["retry"]
-            if self.deal_request(item):
-                self.db_name.update_one({"_id": update_id}, {"$set": {"is_crawl": True}})
-            else:
-                retry += 1
-                self.db_name.update_one({"_id": update_id}, {"$set": {"failed": True, "retry": retry}})
-
-        logger.debug("********** 详情页采集结束 **********")
-
-if __name__ == "__main__":
-    Details().start(limit=2000)

+ 0 - 208
lzz_theme/qgzbgggsssyq/py_ssyq_list.py

@@ -1,208 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2024-10-29
----------
-@summary: 全国招标公告公示搜索引擎 - list page
----------
-@author: Lzz
-"""
-import sys
-import os
-
-sys.path.append(os.path.dirname(os.getcwd()))
-from utils.tools import *
-from utils.RedisDB import RedisFilter
-import requests
-import warnings
-import ast
-
-warnings.filterwarnings('ignore')
-
-
-class Spider:
-
-    def __init__(self):
-        self.py_spider = Mongo_client().py_spider
-        self.zb_list = self.py_spider.theme_list
-        self.RDS = RedisFilter()
-        self.real_cont = 0
-        self.headers = {
-            "Accept": "application/json, text/plain, */*",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "no-cache",
-            # "Connection": "keep-alive",
-            # "Pragma": "no-cache",
-            "Referer": "https://ctbpsp.com/",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
-        }
-
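-    # acw_sc__v2 looks like the Aliyun WAF challenge cookie: arg1 from the intercept
-    # page is unshuffled, then hex-XORed with a fixed mask to produce arg2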
-    def get_acw_sc_v2(self, html):
-        try:
-            arg1 = "".join(re.findall("arg1='(.*?)'", html))
-            if arg1:
-                js_script = '''
-                    function getAcw_sc__v2(obt_arg1) {
-                        String["prototype"]["hexXor"] = function (_0x4e08d8) {
-                            var _0x5a5d3b = '';
-                            for (var _0xe89588 = 0x0; _0xe89588 < this["length"] && _0xe89588 < _0x4e08d8["length"]; _0xe89588 += 2) {
-                                var _0x401af1 = parseInt(this["slice"](_0xe89588, _0xe89588 + 2), 16);
-                                var _0x105f59 = parseInt(_0x4e08d8["slice"](_0xe89588, _0xe89588 + 2), 16);
-                                var _0x189e2c = (_0x401af1 ^ _0x105f59)["toString"](16);
-                                if (_0x189e2c["length"] == 1) {
-                                    _0x189e2c = '0' + _0x189e2c;
-                                }
-                                _0x5a5d3b += _0x189e2c;
-                            }
-                            return _0x5a5d3b;
-                        };
-                        String["prototype"]["unsbox"] = function () {
-                            var _0x4b082b = [15, 35,29, 24, 33, 16, 1, 38, 10, 9, 19, 31, 40, 27, 22, 23, 25, 13, 6, 11, 39, 18, 20, 8, 14, 21, 32, 26, 2, 30, 7, 4, 17, 5, 3, 28, 34, 37, 12, 36];
-                            var _0x4da0dc = [];
-                            var _0x12605e = '';
-                            for (var _0x20a7bf = 0x0; _0x20a7bf < this["length"]; _0x20a7bf++) {
-                                var _0x385ee3 = this[_0x20a7bf];
-                                for (var _0x217721 = 0; _0x217721 < _0x4b082b["length"]; _0x217721++) {
-                                    if (_0x4b082b[_0x217721] == _0x20a7bf + 1) {
-                                        _0x4da0dc[_0x217721] = _0x385ee3;
-                                    }
-                                }
-                            }
-                            _0x12605e = _0x4da0dc["join"]('');
-                            return _0x12605e;
-                        };
-
-                        var _0x5e8b26 = "3000176000856006061501533003690027800375";
-                        // var arg1 = "0A5F01F50F9BC66FB28038F18B99B7B10CFF4667"
-                        var arg1 = obt_arg1
-                        var _0x23a392 = arg1["unsbox"]();
-                        arg2 = _0x23a392["hexXor"](_0x5e8b26);
-                        return arg2
-                    }
-                '''
-                ctx = execjs.compile(js_script)
-                arg2 = ctx.call('getAcw_sc__v2', arg1)
-                return {"acw_sc__v2": arg2}
-            else:
-                return {}
-        except:
-            return {}
-
-    def get_type_1017(self, page):
-        with open('./ssyq.js', 'r') as fr:
-            ex_js = fr.read()
-        ctx = execjs.compile(ex_js)
-        return ctx.call('type_1017_lt', page)
-
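-    # list API responses arrive as Base64 DES(ECB) ciphertext under the static key "1qaz@wsx3e"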
-    def get_data(self, text: str):
-        ex_js = '''
-        CryptoJS = require("crypto-js")
-        function en_str(t) {
-            var e = CryptoJS.enc.Utf8.parse("1qaz@wsx3e")
-              , i = CryptoJS.DES.decrypt({
-                ciphertext: CryptoJS.enc.Base64.parse(t)
-            }, e, {
-                mode: CryptoJS.mode.ECB,
-                padding: CryptoJS.pad.Pkcs7
-            });
-            return i.toString(CryptoJS.enc.Utf8)
-        }
-        '''
-        ctx = execjs.compile(ex_js)
-        data_org = ctx.call('en_str', text)
-        return data_org
-
-    def fetch(self, page):
-        url = f"https://ctbpsp.com/cutominfoapi/recommand/type/5/pagesize/10/currentpage/{page}"
-        params = {
-            "type__1017": self.get_type_1017(page)
-        }
-        response = requests.get(url, headers=self.headers, params=params, proxies=get_QGIP(), verify=False)
-        text = response.content.decode().replace('"', "")
-        ret = self.get_data(text)
-        if not ret:
-            raise ValueError('数据内容为空.')
-        return ret
-
-    @staticmethod
-    def data_extract(data):
-        data_info = data.replace('true', '1').replace('false', '1').replace('null', '1')
-        iter_data = ast.literal_eval(data_info)
-        if iter_data.get('data') == 1:
-            # {'success': 1, 'data': 1, 'errorMessage': ''}
-            raise TypeError('数据获取失败.')
-
-        data_list = iter_data['data'].get('dataList')
-        page_size = iter_data['data']['pageSize']
-        return data_list, page_size
-
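-    # dedup key is title+href via the shared Redis filter; only unseen rows are inserted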
-    def parse(self, data_info, page):
-        info_list, _ = self.data_extract(data_info)
-
-        results_list = []
-        for info in info_list:
-            hid = info.get('bulletinID')
-            did = info.get('dataSource')
-            href = f"http://ctbpsp.com/#/bulletinDetail?uuid={hid}&inpvalue=&dataSource={did}&tenderAgency="
-            title = info.get('noticeName').strip()
-            create_time = info.get('noticeSendTime')
-            reginProvince = info.get('reginProvince').replace("省", "").replace("市", "")
-
-            dedup = [title + href]
-            if not self.RDS.data_filter(dedup):
-                item = {
-                    "site": "中国招标投标公共服务平台",
-                    "channel": "全国招标公告公示搜索引擎",
-                    "spidercode": "a_qgzbgggsssyq_qbgg",
-                    "area": reginProvince,
-                    "city": "",
-                    "district": "",
-                    "href": href,
-                    "title": title,
-                    "publishtime": create_time,
-                    "parse_url": href,
-                    "parser_name": "ztpc_qgzbgggsssyq",
-                    "is_mixed": False,
-                    "is_theme": True,
-                    "retry": 0,
-                    "comeintime": int2long(time.time()),
-                    "is_crawl": False,
-                    "failed": False,
-                    "iscompete": True,
-                    "sendflag": "false",
-                    "T": "bidding",
-                    "infoformat": 1,
-                    "type": "",
-                    "publishdept": "",
-                    "_d": "comeintime",
-                }
-
-                self.zb_list.insert_one(item)
-                self.RDS.data_save_redis(dedup)
-                results_list.append(item)
-
-        logger.info(f' *** 第{page}页采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
-        self.real_cont += len(results_list)
-        return results_list
-
-    def crawl(self, page):
-        retry = 0
-        while (retry := retry + 1) < 5:
-            try:
-                data_info = self.fetch(page=page)
-                self.parse(data_info=data_info, page=page)
-                time.sleep(random.random())
-                return
-            except Exception as e:
-                logger.error(f"第{page}页 采集异常:{e}")
-                time.sleep(3)
-
-    def start(self, crawl_page):
-        logger.debug("********** 列表页开始 **********")
-        for page in range(1, crawl_page + 1):
-            self.crawl(page=page)
-            logger.info(f"当前已采集 {self.real_cont} 条数据")
-        logger.debug("********** 列表页结束 **********")
-
-
-if __name__ == '__main__':
-    Spider().start(100)

+ 0 - 208
lzz_theme/qgzbgggsssyq/py_ssyq_list_bu.py

@@ -1,208 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2024-08-06
----------
-@summary: 全国招标公告公示搜索引擎 - list page
----------
-@author: Lzz
-"""
-import sys
-import os
-
-sys.path.append(os.path.dirname(os.getcwd()))
-from utils.tools import *
-from utils.RedisDB import RedisFilter
-import requests
-import warnings
-import ast
-
-warnings.filterwarnings('ignore')
-
-
-class Spider:
-
-    def __init__(self):
-        self.py_spider = Mongo_client().py_spider
-        self.zb_list = self.py_spider.theme_list
-        self.RDS = RedisFilter()
-        self.real_cont = 0
-        self.headers = {
-            "Accept": "application/json, text/plain, */*",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "no-cache",
-            # "Connection": "keep-alive",
-            # "Pragma": "no-cache",
-            "Referer": "https://ctbpsp.com/",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
-        }
-
-    def get_acw_sc_v2(self, html):
-        try:
-            arg1 = "".join(re.findall("arg1='(.*?)'", html))
-            if arg1:
-                js_script = '''
-                    function getAcw_sc__v2(obt_arg1) {
-                        String["prototype"]["hexXor"] = function (_0x4e08d8) {
-                            var _0x5a5d3b = '';
-                            for (var _0xe89588 = 0x0; _0xe89588 < this["length"] && _0xe89588 < _0x4e08d8["length"]; _0xe89588 += 2) {
-                                var _0x401af1 = parseInt(this["slice"](_0xe89588, _0xe89588 + 2), 16);
-                                var _0x105f59 = parseInt(_0x4e08d8["slice"](_0xe89588, _0xe89588 + 2), 16);
-                                var _0x189e2c = (_0x401af1 ^ _0x105f59)["toString"](16);
-                                if (_0x189e2c["length"] == 1) {
-                                    _0x189e2c = '0' + _0x189e2c;
-                                }
-                                _0x5a5d3b += _0x189e2c;
-                            }
-                            return _0x5a5d3b;
-                        };
-                        String["prototype"]["unsbox"] = function () {
-                            var _0x4b082b = [15, 35,29, 24, 33, 16, 1, 38, 10, 9, 19, 31, 40, 27, 22, 23, 25, 13, 6, 11, 39, 18, 20, 8, 14, 21, 32, 26, 2, 30, 7, 4, 17, 5, 3, 28, 34, 37, 12, 36];
-                            var _0x4da0dc = [];
-                            var _0x12605e = '';
-                            for (var _0x20a7bf = 0x0; _0x20a7bf < this["length"]; _0x20a7bf++) {
-                                var _0x385ee3 = this[_0x20a7bf];
-                                for (var _0x217721 = 0; _0x217721 < _0x4b082b["length"]; _0x217721++) {
-                                    if (_0x4b082b[_0x217721] == _0x20a7bf + 1) {
-                                        _0x4da0dc[_0x217721] = _0x385ee3;
-                                    }
-                                }
-                            }
-                            _0x12605e = _0x4da0dc["join"]('');
-                            return _0x12605e;
-                        };
-
-                        var _0x5e8b26 = "3000176000856006061501533003690027800375";
-                        // var arg1 = "0A5F01F50F9BC66FB28038F18B99B7B10CFF4667"
-                        var arg1 = obt_arg1
-                        var _0x23a392 = arg1["unsbox"]();
-                        arg2 = _0x23a392["hexXor"](_0x5e8b26);
-                        return arg2
-                    }
-                '''
-                ctx = execjs.compile(js_script)
-                arg2 = ctx.call('getAcw_sc__v2', arg1)
-                return {"acw_sc__v2": arg2}
-            else:
-                return {}
-        except:
-            return {}
-
-    def get_type_1017(self, page):
-        with open('./ssyq.js', 'r') as fr:
-            ex_js = fr.read()
-        ctx = execjs.compile(ex_js)
-        return ctx.call('type_1017_lt', page)
-
-    def get_data(self, text: str):
-        ex_js = '''
-        CryptoJS = require("crypto-js")
-        function en_str(t) {
-            var e = CryptoJS.enc.Utf8.parse("1qaz@wsx3e")
-              , i = CryptoJS.DES.decrypt({
-                ciphertext: CryptoJS.enc.Base64.parse(t)
-            }, e, {
-                mode: CryptoJS.mode.ECB,
-                padding: CryptoJS.pad.Pkcs7
-            });
-            return i.toString(CryptoJS.enc.Utf8)
-        }
-        '''
-        ctx = execjs.compile(ex_js)
-        data_org = ctx.call('en_str', text)
-        return data_org
-
-    def fetch(self, page):
-        url = f"https://ctbpsp.com/cutominfoapi/recommand/type/5/pagesize/10/currentpage/{page}"
-        params = {
-            "type__1017": self.get_type_1017(page)
-        }
-        response = requests.get(url, headers=self.headers, params=params, proxies=get_QGIP(), verify=False)
-        text = response.content.decode().replace('"', "")
-        ret = self.get_data(text)
-        if not ret:
-            raise ValueError('数据内容为空.')
-        return ret
-
-    @staticmethod
-    def data_extract(data):
-        data_info = data.replace('true', '1').replace('false', '1').replace('null', '1')
-        iter_data = ast.literal_eval(data_info)
-        if iter_data.get('data') == 1:
-            # {'success': 1, 'data': 1, 'errorMessage': ''}
-            raise TypeError('数据获取失败.')
-
-        data_list = iter_data['data'].get('dataList')
-        page_size = iter_data['data']['pageSize']
-        return data_list, page_size
-
-    def parse(self, data_info, page):
-        info_list, _ = self.data_extract(data_info)
-
-        results_list = []
-        for info in info_list:
-            hid = info.get('bulletinID')
-            did = info.get('dataSource')
-            href = f"http://ctbpsp.com/#/bulletinDetail?uuid={hid}&inpvalue=&dataSource={did}&tenderAgency="
-            title = info.get('noticeName').strip()
-            create_time = info.get('noticeSendTime')
-            reginProvince = info.get('reginProvince').replace("省", "").replace("市", "")
-
-            dedup = [title + href]
-            if not self.RDS.data_filter(dedup):
-                item = {
-                    "site": "中国招标投标公共服务平台",
-                    "channel": "全国招标公告公示搜索引擎",
-                    "spidercode": "a_qgzbgggsssyq_qbgg",
-                    "area": reginProvince,
-                    "city": "",
-                    "district": "",
-                    "href": href,
-                    "title": title,
-                    "publishtime": create_time,
-                    "parse_url": href,
-                    "parser_name": "ztpc_qgzbgggsssyq",
-                    "is_mixed": False,
-                    "is_theme": True,
-                    "retry": 0,
-                    "comeintime": int2long(int(time.time())),
-                    "is_crawl": False,
-                    "failed": False,
-                    "iscompete": True,
-                    "sendflag": "false",
-                    "T": "bidding",
-                    "infoformat": 1,
-                    "type": "",
-                    "publishdept": "",
-                    "_d": "comeintime",
-                }
-
-                self.zb_list.insert_one(item)
-                self.RDS.data_save_redis(dedup)
-                results_list.append(item)
-
-        logger.info(f' *** 第{page}页采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
-        self.real_cont += len(results_list)
-        return results_list
-
-    def crawl(self, page):
-        retry = 0
-        while (retry := retry + 1) < 5:
-            try:
-                data_info = self.fetch(page=page)
-                self.parse(data_info=data_info, page=page)
-                time.sleep(random.random())
-                return
-            except Exception as e:
-                logger.error(f"第{page}页 采集异常:{e}")
-                time.sleep(3)
-
-    def start(self, crawl_page):
-        logger.debug("********** 列表页开始 **********")
-        for page in range(1, crawl_page + 1):
-            self.crawl(page=page)
-            logger.info(f"当前已采集 {self.real_cont} 条数据")
-        logger.debug("********** 列表页结束 **********")
-
-
-if __name__ == '__main__':
-    Spider().start(1000)

+ 0 - 280
lzz_theme/qgzbgggsssyq/sscrawl_details.py

@@ -1,280 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2024-09-18
----------
-@summary: 全国招标公告公示搜索引擎 - detail page
----------
-@author: Lzz
-"""
-import sys
-import os
-
-sys.path.append(os.path.dirname(os.getcwd()))
-import json
-from utils.attachment import AttachmentDownloader
-from parsel import Selector
-from utils.tools import *
-
-
-
-class dt_Spider:
-
-    def __init__(self):
-        self.proxy = get_proxy(socks5h=True)
-        self.db_table = Mongo_client().py_spider
-        self.zt_details = self.db_table.data_bak
-        self.headers = {
-            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-            "Pragma": "no-cache",
-            "Upgrade-Insecure-Requests": "1",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
-        }
-
-
-    def get_time__2652(self, page=None, cid=None, rid=None):
-        with open('./ssyq_pm.js', 'r') as fr:
-            ex_js = fr.read()
-        ctx = execjs.compile(ex_js)
-
-        return ctx.call('tm', page, cid, rid)
-
-    def get_type_1017(self, page):
-        with open('./ssyq.js', 'r') as fr:
-            ex_js = fr.read()
-        ctx = execjs.compile(ex_js)
-        return ctx.call('type_1017_dt', page)
-
-    def get_type_1017_f(self, href):
-        with open('./ssyq.js', 'r') as fr:
-            ex_js = fr.read()
-        ctx = execjs.compile(ex_js)
-        return ctx.call('type_1017_file', href)
-
-    def detail_get(self, response, item, new_url):
-        response.encoding = response.apparent_encoding
-        root = Selector(text=response.text)
-
-        if "来源渠道:必联电子招标投标平台" in response.text:
-            # the PDF carries a "必联" platform watermark, so flag the record to skip the save service
-            item["sendflag"] = "true"
-
-        dd = root.xpath('//div[@class="mian_list_03"]/@index').extract_first()
-
-        cookies = response.cookies.get_dict()
-        headers2 = {
-            "Accept": "*/*",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-            "Content-Length": "0",
-            "Origin": "https://bulletin.cebpubservice.com",
-            "Pragma": "no-cache",
-            "Referer": new_url,
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
-            "X-Requested-With": "XMLHttpRequest",
-        }
-
-        url2 = "https://bulletin.cebpubservice.com/details/permission/getSecretKey"
-        params = {
-            "time__2652": self.get_time__2652()
-        }
-        res = requests.post(url2, headers=headers2, cookies=cookies, params=params,
-                            timeout=30, proxies=self.proxy, verify=False)
-
-        ex_js = '''
-            CryptoJS = require("crypto-js")
-
-            function decryptByDES(ciphertext, key) {
-                    var keyHex = CryptoJS.enc.Utf8.parse("Ctpsp@884*");
-                    var decrypted = CryptoJS.DES.decrypt({
-                        ciphertext: CryptoJS.enc.Base64.parse(ciphertext)
-                    }, keyHex, {
-                        mode: CryptoJS.mode.ECB,
-                        padding: CryptoJS.pad.Pkcs7
-                    });
-                    return decrypted.toString(CryptoJS.enc.Utf8);
-            }
-            '''
-        ctx = execjs.compile(ex_js)
-        pm = ctx.call('decryptByDES', res.text.replace('"', ''))
-
-        item["contenthtml"] = "详情请访问原网页!"
-
-        attachments = {}
-        ffid = json.loads(pm).get('data')
-        f_org = f"/details/bulletin/getBulletin/{ffid}/{dd}"
-
-        for i in range(5):
-            file_url = f"https://bulletin.cebpubservice.com/details/bulletin/getBulletin/{ffid}/{dd}"
-            headers = {
-                "Accept": "*/*",
-                "Accept-Language": "zh-CN,zh;q=0.9",
-                "Cache-Control": "no-cache",
-                "Connection": "keep-alive",
-                "Pragma": "no-cache",
-                "Referer": "https://bulletin.cebpubservice.com/resource/ceb/js/pdfjs-dist/web/viewer.html",
-                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
-            }
-            f_tm = self.get_time__2652(f_org)
-            params = {
-                "time__2652": f"{f_tm}"
-            }
-            attachment = AttachmentDownloader().fetch_attachment(
-                file_name=item["title"], file_type="pdf", download_url=file_url,
-                proxies=self.proxy, headers=headers, params=params, is_check=True)
-            if attachment.get('size'):
-                attachments[str(len(attachments) + 1)] = attachment
-                break
-            if i == 4:
-                raise FileNotFoundError("附件下载失败!")
-            time.sleep(random.randint(3, 6))
-            self.proxy = get_proxy(socks5h=True)
-
-        if attachments:
-            item['projectinfo'] = {"attachments": attachments}
-
-        item = format_fileds(item)
-
-        try:
-            self.zt_details.insert_one(item)
-            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
-        except DuplicateKeyError:
-            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
-
-    def decrypt_by_des(self, text: str):
-        ex_js = '''
-        CryptoJS = require("crypto-js")
-        function en_str(t) {
-            var e = CryptoJS.enc.Utf8.parse("1qaz@wsx3e")
-              , i = CryptoJS.DES.decrypt({
-                ciphertext: CryptoJS.enc.Base64.parse(t)
-            }, e, {
-                mode: CryptoJS.mode.ECB,
-                padding: CryptoJS.pad.Pkcs7
-            });
-            return i.toString(CryptoJS.enc.Utf8)
-        }
-        '''
-        ctx = execjs.compile(ex_js)
-        data_org = ctx.call('en_str', text.replace('"', ''))
-        data_org = eval(data_org.replace('true', '1').replace('false', '1').replace('null', '1'))
-        return data_org
-
-    def get_url(self, parse_url):
-        uid = "".join(re.findall('uuid=(.*?)&', parse_url))
-        headers = {
-            "Accept": "application/json, text/plain, */*",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-            "Pragma": "no-cache",
-            "Referer": "https://ctbpsp.com/",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
-        }
-
-        url = f"https://ctbpsp.com/cutominfoapi/bulletin/{uid}/uid/0"
-
-        data_org = None
-        retry = 0
-        while (retry := retry + 1) < 3:
-            params = {
-                "type__1017": self.get_type_1017(uid)
-            }
-            try:
-                res = requests.get(url, headers=headers, params=params, proxies=get_QGIP(), timeout=30)
-                data_org = self.decrypt_by_des(res.text.replace('"', ""))
-                break
-            except Exception:
-                pass
-        if data_org is None:
-            # every retry failed; fail fast instead of hitting a NameError below
-            raise ValueError("详情接口请求失败!")
-
-        new_href = data_org.get('data').get('pdfUrl')
-        pub_time = data_org.get('data').get('noticeSendTimeStr', '')
-        pbtime = pub_time.replace('年', '-').replace('月', '-').replace('日', '')
-        if "bulletinPDF" not in new_href:
-            new_href = data_org.get('data').get('noticeUrl')
-        return new_href, pbtime
-
-    def new_parse(self, item, pdfurl):
-
-        item["contenthtml"] = "详情请访问原网页!"
-
-        attachments = {}
-
-        headers = {
-            "Accept": "*/*",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-            "Pragma": "no-cache",
-            "Referer": f"https://ctbpsp.com/web_pdf/pdfjs-dist/web/viewer.html?file={pdfurl}",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
-        }
-
-
-        f_tm = self.get_type_1017_f(pdfurl)
-        params = {
-            "type__1017": f"{f_tm}"
-        }
-        attachment = AttachmentDownloader().fetch_attachment(
-            file_name=item["title"], file_type="pdf", download_url=pdfurl,
-            proxies=get_QGIP(), headers=headers, params=params, is_check=True)
-        if attachment.get('size'):
-            attachments[str(len(attachments) + 1)] = attachment
-        else:
-            raise FileNotFoundError("附件下载失败!")
-
-        if attachments:
-            item['projectinfo'] = {"attachments": attachments}
-
-        item = format_fileds(item)
-
-        try:
-            self.zt_details.insert_one(item)
-            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
-        except DuplicateKeyError:
-            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
-
-
-    def fetch_request(self, url):
-        response = requests.get(url=url, headers=self.headers,
-                                proxies=self.proxy, timeout=(30, 60), verify=False)
-        return response
-
-    def deal_request(self, item):
-        retry_times = 0
-        while (retry_times := retry_times + 1) < 3:
-            try:
-                new_url, pub_time = self.get_url(item['href'])
-                if "bulletinPDF" in new_url:
-                    try:
-                        date_to_timestamp(item['publishtime'])
-                    except:
-                        item['publishtime'] = pub_time
-                    self.new_parse(item=item, pdfurl=new_url)
-                    return True
-                else:
-                    response = self.fetch_request(new_url)
-                    if response is not None and response.status_code == 200:
-                        self.detail_get(response, item=item, new_url=new_url)
-                        time.sleep(random.random())
-                        return True
-            except Exception as e:
-                logger.error(f"{item['href']} 采集异常:{e}")
-                time.sleep(random.randint(5,10))
-                self.proxy = get_proxy(socks5h=True)
-        logger.warning(f"[采集失败]{item['href']}")
-        return False
-
-    def start(self, item: dict):
-        logger.debug(f"********** {item['title']} 详情页采集开始 **********")
-
-        rst = self.deal_request(item)
-
-        logger.debug(f"********** {item['title']} 详情页采集结束 **********")
-
-        return rst
-
-# if __name__ == "__main__":
-#     dt_Spider().start({})

+ 0 - 195
lzz_theme/qgzbgggsssyq/sscrawl_list.py

@@ -1,195 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2024-09-18
----------
-@summary: 全国招标公告公示搜索引擎 - list page
----------
-@author: Lzz
-"""
-import sys
-import os
-
-sys.path.append(os.path.dirname(os.getcwd()))
-from utils.tools import *
-import requests
-import warnings
-from urllib.parse import quote
-from sscrawl_details import dt_Spider
-
-
-warnings.filterwarnings('ignore')
-
-
-class Spider:
-
-    def __init__(self):
-        self.headers = {
-            "Accept": "application/json, text/plain, */*",
-            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
-            "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-            "Pragma": "no-cache",
-            "Referer": "https://ctbpsp.com/",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
-        }
-
-    def get_data(self, text: str):
-        ex_js = '''
-        CryptoJS = require("crypto-js")
-        function en_str(t) {
-            var e = CryptoJS.enc.Utf8.parse("1qaz@wsx3e")
-              , i = CryptoJS.DES.decrypt({
-                ciphertext: CryptoJS.enc.Base64.parse(t)
-            }, e, {
-                mode: CryptoJS.mode.ECB,
-                padding: CryptoJS.pad.Pkcs7
-            });
-            return i.toString(CryptoJS.enc.Utf8)
-        }
-        '''
-        ctx = execjs.compile(ex_js)
-        data_org = ctx.call('en_str', text)
-        return data_org
-
-    def get_acw_sc_v2(self, html):
-        try:
-            arg1 = "".join(re.findall("arg1='(.*?)'", html))
-            if arg1:
-                js_script = '''
-                    function getAcw_sc__v2(obt_arg1) {
-                        String["prototype"]["hexXor"] = function (_0x4e08d8) {
-                            var _0x5a5d3b = '';
-                            for (var _0xe89588 = 0x0; _0xe89588 < this["length"] && _0xe89588 < _0x4e08d8["length"]; _0xe89588 += 2) {
-                                var _0x401af1 = parseInt(this["slice"](_0xe89588, _0xe89588 + 2), 16);
-                                var _0x105f59 = parseInt(_0x4e08d8["slice"](_0xe89588, _0xe89588 + 2), 16);
-                                var _0x189e2c = (_0x401af1 ^ _0x105f59)["toString"](16);
-                                if (_0x189e2c["length"] == 1) {
-                                    _0x189e2c = '0' + _0x189e2c;
-                                }
-                                _0x5a5d3b += _0x189e2c;
-                            }
-                            return _0x5a5d3b;
-                        };
-                        String["prototype"]["unsbox"] = function () {
-                            var _0x4b082b = [15, 35,29, 24, 33, 16, 1, 38, 10, 9, 19, 31, 40, 27, 22, 23, 25, 13, 6, 11, 39, 18, 20, 8, 14, 21, 32, 26, 2, 30, 7, 4, 17, 5, 3, 28, 34, 37, 12, 36];
-                            var _0x4da0dc = [];
-                            var _0x12605e = '';
-                            for (var _0x20a7bf = 0x0; _0x20a7bf < this["length"]; _0x20a7bf++) {
-                                var _0x385ee3 = this[_0x20a7bf];
-                                for (var _0x217721 = 0; _0x217721 < _0x4b082b["length"]; _0x217721++) {
-                                    if (_0x4b082b[_0x217721] == _0x20a7bf + 1) {
-                                        _0x4da0dc[_0x217721] = _0x385ee3;
-                                    }
-                                }
-                            }
-                            _0x12605e = _0x4da0dc["join"]('');
-                            return _0x12605e;
-                        };
-
-                        var _0x5e8b26 = "3000176000856006061501533003690027800375";
-                        // var arg1 = "0A5F01F50F9BC66FB28038F18B99B7B10CFF4667"
-                        var arg1 = obt_arg1
-                        var _0x23a392 = arg1["unsbox"]();
-                        arg2 = _0x23a392["hexXor"](_0x5e8b26);
-                        return arg2
-                    }
-                '''
-                ctx = execjs.compile(js_script)
-                arg2 = ctx.call('getAcw_sc__v2', arg1)
-                return {"acw_sc__v2": arg2}
-            else:
-                return {}
-        except:
-            return {}
-
-    def get_type_1017(self,typm):
-        with open('./ssyq.js', 'r') as fr:
-            ex_js = fr.read()
-        ctx = execjs.compile(ex_js)
-        return ctx.call('type_1017_ss',typm)
-
-    def fetch(self, keyword):
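-        # the signed URL must match byte-for-byte, so the query string is built by hand
-        # rather than letting requests re-encode the keyword (dict version kept for reference)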
-        # url = "https://ctbpsp.com/cutominfoapi/searchkeyword"
-        # params = {
-        #     "keyword": keyword,
-        #     "uid": "0",
-        #     "PageSize": "10",
-        #     "CurrentPage": "1",
-        #     "searchType": "0",
-        #     "bulletinType": "5",
-        #     "type__1017": self.get_type_1017(quote(keyword))
-        # }
-        furl = f"https://ctbpsp.com/cutominfoapi/searchkeyword?keyword={keyword}&uid=0&PageSize=10&CurrentPage=1&searchType=0&bulletinType=5&type__1017={self.get_type_1017(quote(keyword, safe='/'))}"
-
-        response = requests.get(furl, headers=self.headers, proxies=get_QGIP(), verify=False)
-        data_info = self.get_data(response.text.replace('"', ""))
-        if "error while performing request" in data_info:
-            raise ValueError("错误请求!")
-
-        return data_info
-
-    def parse(self, data_info):
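-        # keyword backfill targets a single bulletin, so only the first hit is returned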
-        data_info = data_info.replace('true', '1').replace('false', '1').replace('null', '1')
-        info_list = eval(data_info).get('data').get('dataList')
-        for info in info_list:
-            hid = info.get('bulletinID')
-            did = info.get('dataSource')
-            href = f"http://ctbpsp.com/#/bulletinDetail?uuid={hid}&inpvalue=&dataSource={did}&tenderAgency="
-            title = info.get('noticeName').replace('<em>','').replace('</em>','').strip()
-            create_time = info.get('noticeSendTime')
-            reginProvince = info.get('reginProvince').replace("省", "").replace("市", "")
-
-            item = {
-                "site": "中国招标投标公共服务平台",
-                "channel": "全国招标公告公示搜索引擎",
-                "spidercode": "a_qgzbgggsssyq_qbgg",
-                "area": reginProvince,
-                "city": "",
-                "district": "",
-                "href": href,
-                "title": title,
-                "publishtime": create_time,
-                "parser_name": "ztpc_qgzbgggsssyq",
-                "is_mixed": False,
-                "comeintime": int2long(int(time.time())),
-            }
-
-            return item
-        # reachable only when info_list is empty; otherwise the loop returns the first hit
-        return []
-
-    def crawl(self, keyword):
-        retry = 0
-        while (retry := retry + 1) < 5:
-            try:
-                data_info = self.fetch(keyword=keyword)
-                list_item = self.parse(data_info=data_info)
-                time.sleep(random.random())
-                if not list_item:
-                    logger.warning(f"{keyword} 补录失败!")
-                    return None
-                rr = dt_Spider().start(list_item)
-                if rr:
-                    logger.info(f" {keyword} 已补录")
-                    return list_item
-                else:
-                    logger.warning(f"{keyword} 补录失败!")
-                    return None
-
-            except Exception as e:
-                logger.error(f"{keyword} 采集异常:{e}")
-                time.sleep(3)
-
-    def start(self, keyword: str):
-        # logger.debug("********** 补录开始 **********")
-
-        list_item = self.crawl(keyword)
-
-        # logger.debug("********** 补录结束 **********")
-        return list_item
-
-
-# if __name__ == '__main__':
-#     Spider().start('局电务公司海城站6502继电联锁改造工程断路器询价采购')

+ 0 - 310
lzz_theme/qgzbgggsssyq/ssyq.js

@@ -1,310 +0,0 @@
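-// Note: the aliases below are obfuscator output; each one merely wraps a single
-// primitive operator (+, -, <<, |, &, <) or a function call.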
-DJlBm = function (l1, l2) {
-    return l1 < l2;
-}
-nwYaV = function (l1, l2) {
-    return l1 + l2;
-}
-EGMkl = function (l1, l2) {
-    return l1 == l2;
-}
-EVrnI = function (l1, l2) {
-    return l1 - l2;
-}
-tYNKY = function (l1, l2) {
-    return l1(l2);
-}
-TbTmg = function (l1, l2) {
-    return l1 < l2;
-}
-MGEpz = function (l1, l2) {
-    return l1 | l2;
-}
-fQAyr = function (l1, l2) {
-    return l1 << l2;
-}
-OQJcB = function (l1, l2) {
-    return l1 & l2;
-}
-hDFsv = function (l1, l2) {
-    return l1 == l2;
-}
-DqGCc = function (l1, l2) {
-    return l1(l2);
-}
-rIHaw = function (l1, l2) {
-    return l1 < l2;
-}
-ZqRgc = function (l1, l2) {
-    return l1 | l2;
-}
-QyFsL = function (l1, l2) {
-    return l1 << l2;
-}
-vIlmM = function (l1, l2) {
-    return l1 == l2;
-}
-yFDAW = function (l1, l2) {
-    return l1(l2);
-}
-lJnvD = function (l1, l2) {
-    return l1 | l2;
-}
-hPpla = function (l1, l2) {
-    return l1 - l2;
-}
-LvJaY = function (l1, l2) {
-    return l1 < l2;
-}
-UPtKX = function (l1, l2) {
-    return l1(l2);
-}
-qoTvE = function (l1, l2) {
-    return l1 == l2;
-}
-ggJMq = function (l1, l2) {
-    return l1 - l2;
-}
-shWCy = function (l1, l2) {
-    return l1(l2);
-}
-IMCME = function (l1, l2) {
-    return l1 | l2;
-}
-VYdec = function (l1, l2) {
-    return l1 == l2;
-}
-XZzQS = function (l1, l2) {
-    return l1 - l2;
-}
-vMKQx = function (l1, l2) {
-    return l1(l2);
-}
-SPVsD = function (l1, l2) {
-    return l1 < l2;
-}
-cRdDu = function (l1, l2) {
-    return l1 < l2;
-}
-avGeL = function (l1, l2) {
-    return l1 | l2;
-}
-UjRAq = function (l1, l2) {
-    return l1 << l2;
-}
-jmDqy = function (l1, l2) {
-    return l1 & l2;
-}
-FaATM = function (l1, l2) {
-    return l1 == l2;
-}
-jBEMc = function (l1, l2) {
-    return l1 - l2;
-}
-HPcNa = function (l1, l2) {
-    return l1 == l2;
-}
-vwnaY = function (l1, l2) {
-    return l1 | l2;
-}
-tenBS = function (l1, l2) {
-    return l1 << l2;
-}
-KHkFx = function (l1, l2) {
-    return l1 & l2;
-}
-OZwQ = function (l1, l2) {
-    return l1 == l2;
-}
-XZzQS = function (l1, l2) {
-    return l1 - l2;
-}
-xzUJc = function (l1, l2) {
-    return l1(l2);
-}
-CYlhh = function (l1, l2) {
-    return l1 == l2;
-}
-BePag = function (l1, l2) {
-    return l1 - l2;
-}
-shWCy = function (l1, l2) {
-    return l1(l2);
-}
-zhKex = function (l1, l2) {
-    return l1 + l2;
-}
-dnoNS = function (l1, l2) {
-    return l1 + l2;
-}
-pWyDJ = function (l1, l2) {
-    return l1 + l2;
-}
-nwYaV = function (l1, l2) {
-    return l1 + l2;
-}
-dDYin = function (l1, l2) {
-    return l1(l2);
-}
-pGrZJ = function (lO, lX) {
-    return dDYin(lO, lX);
-}
-EzIKl = function (l1, l2) {
-    return l1 < l2;
-}
-QrNFe = function (lO, lX) {
-    return EzIKl(lO, lX);
-}
-iYbmv = function (l1, l2) {
-    return l1 + l2;
-}
-YCNgr = function (lO, lX) {
-    return iYbmv(lO, lX);
-}
-nwYaV = function (l1, l2) {
-    return l1 + l2;
-}
-lHHHe = function (lO, lX) {
-    return nwYaV(lO, lX);
-}
-imaAM = function (l1, l2) {
-    return l1 - l2;
-}
-IFaDs = function (lO, lX) {
-    return imaAM(lO, lX);
-}
-bXDKh = function (l1, l2) {
-    return l1 << l2;
-}
-fhOjT = function (lO, lX) {
-    return bXDKh(lO, lX);
-}
-
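-// sig() appears to reduce the encodeURIComponent-encoded URL to a 32-bit rolling
-// hash (per-character shift/subtract/add, with |= 0 forcing int32 overflow).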
-sig = function (lO) {
-    for (var lX = 0xe59 + -0x2602 + 0x17a9, lW = pGrZJ(encodeURIComponent, lO), le = 0x4d * -0x5d + 0x2ab + -0x29 * -0x9e; QrNFe(le, lW['length']); le++)
-        lX = YCNgr(lHHHe(IFaDs(fhOjT(lX, -0x1cdc + 0x13dd + -0x2a * -0x37), lX), 0xdf2 + -0x26bd + 0x1a59), lW['charCodeAt'](le)),
-            lX |= 0x3 * 0x4a3 + 0x130d + 0x1 * -0x20f6;
-    return lX;
-}
-
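-// uu() resembles lz-string's compress loop: LZW codes are emitted lX bits (here 6)
-// at a time through the custom 64-character alphabet supplied as lW.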
-function uu(lO, lX, lW) {
-
-    for (var le, lm, lq, li, lA = {}, lS = {}, lI = '', lj = 0x2b * 0x1 + 0x22d * 0x8 + -0x5db * 0x3, ld = 0x19bc + 0x1 * -0x25e7 + 0xc2e, lf = -0x1ba5 + -0xb * -0xfc + -0x3b * -0x49, lF = [], lE = 0x15cd * 0x1 + 0x2b3 * -0xc + 0x1 * 0xa97, lJ = -0x1ccc + -0x1 * -0x147 + 0x1 * 0x1b85, lG = -0x1 * 0x1d81 + -0x1 * 0x270a + 0x448b; DJlBm(lG, lO['length']); lG += -0x1852 + -0x495 + -0x19 * -0x128)
-        if (lq = lO.charAt(lG),
-        Object['prototype']['hasOwnProp' + 'erty']['call'](lA, lq) || (lA[lq] = ld++,
-            lS[lq] = !(0x185 + 0x1 * -0x14ef + 0xe * 0x163)),
-            li = nwYaV(lI, lq),
-            Object['prototype']['hasOwnProp' + 'erty']['call'](lA, li))
-            lI = li;
-        else {
-            if (Object['prototype']['hasOwnProp' + 'erty']['call'](lS, lI)) {
-                if (DJlBm(lI['charCodeAt'](0x4db + 0xb * 0x175 + 0x129 * -0x12), 0xd * 0x1bf + 0x1 * -0x15c + 0x29 * -0x7f)) {
-                    for (le = -0x136c + 0x13c5 + -0x1 * 0x59; SPVsD(le, lf); le++)
-                        lE <<= -0x17e2 + 0x1 * -0x1943 + 0x3126,
-                            EGMkl(lJ, EVrnI(lX, -0x2 * 0x7b5 + -0x7 * -0x1d5 + 0xa6 * 0x4)) ? (lJ = 0x109c + -0x2571 + 0x14d5,
-                                lF['push'](tYNKY(lW, lE)),
-                                lE = -0x124d + -0x2bf * -0x1 + 0x16a * 0xb) : lJ++;
-                    for (lm = lI['charCodeAt'](0x205f + 0x350 + -0x23af),
-                             le = -0x1fd + 0xe8 * -0x19 + 0x18a5; TbTmg(le, 0x1a4d * -0x1 + 0x1 * -0x319 + 0x1d6e); le++)
-                        lE = MGEpz(fQAyr(lE, -0x762 + 0x137d + 0x1 * -0xc1a), OQJcB(-0x10a8 + 0xe5 * -0x26 + 0x32a7, lm)),
-                            hDFsv(lJ, EVrnI(lX, -0x1601 + 0x136b + 0x297)) ? (lJ = 0x24bd + 0x1c38 + -0x40f5,
-                                lF['push'](DqGCc(lW, lE)),
-                                lE = 0x1 * 0x1da9 + -0x14ca + -0x8df) : lJ++,
-                            lm >>= -0x1 * -0x8b + -0x2079 * -0x1 + 0xb01 * -0x3;
-                } else {
-                    for (lm = 0x20ca + -0xa3 * 0x1 + -0x2026,
-                             le = -0x9fc + 0x290 + -0x1 * -0x76c; rIHaw(le, lf); le++)
-                        lE = ZqRgc(QyFsL(lE, -0x371 * 0xb + 0xfe5 + 0x1 * 0x15f7), lm),
-                            vIlmM(lJ, ggJMq(lX, 0x13ab + 0x1ead + -0x3257)) ? (lJ = -0x11c3 * -0x1 + 0x149b + -0x265e,
-                                lF['push'](shWCy(lW, lE)),
-                                lE = 0x1a * 0x137 + 0x11 * -0x220 + -0xe * -0x53) : lJ++,
-                            lm = -0xc59 + 0x16de + 0x1 * -0xa85;
-                    for (lm = lI['charCodeAt'](0x85a + -0x8c1 + -0x67 * -0x1),
-                             le = -0xafa + 0x9 * 0x3bd + 0x16ab * -0x1; DJlBm(le, -0x14f6 * 0x1 + -0x25f + 0x1765 * 0x1); le++)
-                        lE = IMCME(QyFsL(lE, 0x1ea9 + -0x2 * -0xfe5 + 0x1f39 * -0x2), OQJcB(-0x10d0 + 0x922 * -0x2 + 0x2315, lm)),
-                            VYdec(lJ, XZzQS(lX, -0x21e5 + -0x8fc + -0x1 * -0x2ae2)) ? (lJ = -0x127b + -0x1 * -0x1e2f + -0xbb4,
-                                lF['push'](vMKQx(lW, lE)),
-                                lE = -0x1d25 + 0x6ac + 0x1679) : lJ++,
-                            lm >>= 0x1d14 * 0x1 + 0x1dea + -0x1 * 0x3afd;
-                }
-                qoTvE(0x1bc5 + 0x18ef * -0x1 + -0x2d6, --lj) && (lj = Math['pow'](-0x1 * 0x1cd5 + -0x9 * -0x290 + -0x1ed * -0x3, lf),
-                    lf++),
-                    delete lS[lI];
-            } else {
-                for (lm = lA[lI],
-                         le = -0xff2 + -0x25c3 + 0x35b5; LvJaY(le, lf); le++)
-                    lE = lJnvD(fQAyr(lE, 0x1 * 0x131c + 0x234d + -0x3668), OQJcB(0x14b6 + -0x1 * -0x189b + 0x8 * -0x5aa, lm)),
-                        vIlmM(lJ, hPpla(lX, 0x113 * 0x1f + -0xc7c + -0x14d0)) ? (lJ = -0x2281 + -0xb9 * -0x5 + 0x1ee4,
-                            lF['push'](UPtKX(lW, lE)),
-                            lE = -0xd29 + 0x102f + -0x306) : lJ++,
-                        lm >>= -0x201a + 0x849 + 0x2 * 0xbe9;
-            }
-            vIlmM(0x146f + -0x1645 + -0xeb * -0x2, --lj) && (lj = Math['pow'](-0x6bd * 0x5 + 0x555 + 0x1 * 0x1c5e, lf),
-                lf++),
-                lA[li] = ld++,
-                lI = yFDAW(String, lq);
-        }
-    for (lm = lA[lI],
-             le = 0xe6 + 0x2c * -0x15 + 0x15b * 0x2; cRdDu(le, lf); le++)
-        lE = avGeL(UjRAq(lE, 0x1ac2 + 0x10a3 + 0x4 * -0xad9), jmDqy(0xff3 + 0xfe2 + -0x2a * 0xc2, lm)),
-            FaATM(lJ, jBEMc(lX, -0xe41 + -0xcf1 + 0x1b33)) ? (lJ = -0x1ca * -0xb + -0xb52 + -0x85c,
-                lF['push'](yFDAW(lW, lE)),
-                lE = 0x168b + -0x1 * -0x1bb9 + -0x1922 * 0x2) : lJ++,
-            lm >>= -0x1026 + 0x973 * -0x2 + 0x9 * 0x3e5;
-
-    HPcNa(0x2 * -0x19c + -0x25e4 + 0x291c, --lj) && (lj = Math['pow'](0x241d + 0x1f8a + -0x1 * 0x43a5, lf),
-        lf++)
-
-    for (lm = -0x1b08 + 0x1 * 0x89b + 0xd * 0x16b,
-             le = -0x25 * -0x71 + 0x2661 + 0x1b5b * -0x2; cRdDu(le, lf); le++)
-        lE = vwnaY(tenBS(lE, 0x2297 + 0x3d0 + -0x2666), KHkFx(0xdce + -0x17fb + 0xa2e, lm)),
-            OZwQ(lJ, XZzQS(lX, 0x11eb + -0x2142 + 0xf58 * 0x1)) ? (lJ = 0xd14 + -0x25ad + 0x3 * 0x833,
-                lF['push'](xzUJc(lW, lE)),
-                lE = 0x1 * 0x225b + -0x1693 + 0x68 * -0x1d) : lJ++,
-            lm >>= 0x4 * 0x7f2 + 0x21e2 + -0x41a9;
-    for (; ;) {
-        if (lE <<= -0x20d * -0xa + -0x128c * 0x2 + 0x1097,
-            CYlhh(lJ, BePag(lX, 0xb3e + 0x6 * -0x4f6 + 0x1287))) {
-            lF['push'](shWCy(lW, lE));
-            break;
-        }
-        lJ++;
-    }
-    return lF.join("")
-}
-
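-// type_1017 concatenates "hash|0|<ms timestamp>|1" and compresses it with uu()
-// to produce the signature appended to every API request.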
-function type_1017(href) {
-    lm = href
-    lS = 0x1 * 0x2051 + -0x23a2 * 0x1 + 0x351
-    lO = zhKex(dnoNS(dnoNS(pWyDJ(nwYaV(sig(lm), '|'), lS), '|'), new Date().getTime()), '|1')
-    lX = 6
-    lW = function (lA) {
-        return "DGi0YA7BemWnQjCl4+bR3f8SKIF9tUz/xhr2oEOgPpac=61ZqwTudLkM5vHyNXsVJ".charAt(lA);
-    }
-    return uu(lO, lX, lW)
-}
-
-function type_1017_lt(page) {
-    const url = "https://ctbpsp.com/cutominfoapi/recommand/type/5/pagesize/10/currentpage/" + page
-    return type_1017(url)
-}
-
-function type_1017_fl(page, type) {
-    const url = "https://ctbpsp.com/cutominfoapi/recommand/type/" + type + "/pagesize/10/currentpage/" + page
-    return type_1017(url)
-}
-
-function type_1017_ss(key) {
-    const url = "https://ctbpsp.com/cutominfoapi/searchkeyword?keyword=" + key + "&uid=0&PageSize=10&CurrentPage=1&searchType=0&bulletinType=5"
-    return type_1017(url)
-}
-
-function type_1017_dt(hid) {
-    const url = "https://ctbpsp.com/cutominfoapi/bulletin/" + hid + "/uid/0"
-    return type_1017(url)
-}
-
-function type_1017_file(hid) {
-    return type_1017(hid)
-}
-
-// console.log(type_1017(5))

+ 0 - 230
lzz_theme/qgzbgggsssyq/ssyq_list.py

@@ -1,230 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2024-10-29
----------
-@summary: 全国招标公告公示搜索引擎 - list page - [region + industry + notice type]
----------
-@author: Lzz
-"""
-import sys
-import os
-
-sys.path.append(os.path.dirname(os.getcwd()))
-from utils.RedisDB import RedisFilter
-from utils.tools import *
-from datetime import datetime
-import warnings
-import json
-import ast
-
-warnings.filterwarnings('ignore')
-
-
-class Spider:
-
-    def __init__(self):
-        self.py_spider = Mongo_client().py_spider
-        self.zb_list = self.py_spider.theme_list
-        self.RDS = RedisFilter()
-        self.real_cont = 0
-        self.paginate = True
-        self.headers = {
-            "Accept": "application/json, text/plain, */*",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Cache-Control": "no-cache",
-            # "Connection": "keep-alive",
-            # "Pragma": "no-cache",
-            "Referer": "https://ctbpsp.com/",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
-        }
-
-    def get_acw_sc_v2(self, html):
-        try:
-            arg1 = "".join(re.findall("arg1='(.*?)'", html))
-            if arg1:
-                js_script = '''
-                    function getAcw_sc__v2(obt_arg1) {
-                        String["prototype"]["hexXor"] = function (_0x4e08d8) {
-                            var _0x5a5d3b = '';
-                            for (var _0xe89588 = 0x0; _0xe89588 < this["length"] && _0xe89588 < _0x4e08d8["length"]; _0xe89588 += 2) {
-                                var _0x401af1 = parseInt(this["slice"](_0xe89588, _0xe89588 + 2), 16);
-                                var _0x105f59 = parseInt(_0x4e08d8["slice"](_0xe89588, _0xe89588 + 2), 16);
-                                var _0x189e2c = (_0x401af1 ^ _0x105f59)["toString"](16);
-                                if (_0x189e2c["length"] == 1) {
-                                    _0x189e2c = '0' + _0x189e2c;
-                                }
-                                _0x5a5d3b += _0x189e2c;
-                            }
-                            return _0x5a5d3b;
-                        };
-                        String["prototype"]["unsbox"] = function () {
-                            var _0x4b082b = [15, 35,29, 24, 33, 16, 1, 38, 10, 9, 19, 31, 40, 27, 22, 23, 25, 13, 6, 11, 39, 18, 20, 8, 14, 21, 32, 26, 2, 30, 7, 4, 17, 5, 3, 28, 34, 37, 12, 36];
-                            var _0x4da0dc = [];
-                            var _0x12605e = '';
-                            for (var _0x20a7bf = 0x0; _0x20a7bf < this["length"]; _0x20a7bf++) {
-                                var _0x385ee3 = this[_0x20a7bf];
-                                for (var _0x217721 = 0; _0x217721 < _0x4b082b["length"]; _0x217721++) {
-                                    if (_0x4b082b[_0x217721] == _0x20a7bf + 1) {
-                                        _0x4da0dc[_0x217721] = _0x385ee3;
-                                    }
-                                }
-                            }
-                            _0x12605e = _0x4da0dc["join"]('');
-                            return _0x12605e;
-                        };
-
-                        var _0x5e8b26 = "3000176000856006061501533003690027800375";
-                        // var arg1 = "0A5F01F50F9BC66FB28038F18B99B7B10CFF4667"
-                        var arg1 = obt_arg1
-                        var _0x23a392 = arg1["unsbox"]();
-                        arg2 = _0x23a392["hexXor"](_0x5e8b26);
-                        return arg2
-                    }
-                '''
-                ctx = execjs.compile(js_script)
-                arg2 = ctx.call('getAcw_sc__v2', arg1)
-                return {"acw_sc__v2": arg2}
-            else:
-                return {}
-        except:
-            return {}
-
-    def get_type_1017(self, page, tp):
-        with open('./ssyq.js', 'r') as fr:
-            ex_js = fr.read()
-        ctx = execjs.compile(ex_js)
-        return ctx.call('type_1017_fl', page, tp)
-
-    @staticmethod
-    def decrypto_data(text):
-        ex_js = '''
-        CryptoJS = require("crypto-js")
-        function en_str(t) {
-            var e = CryptoJS.enc.Utf8.parse("1qaz@wsx3e")
-              , i = CryptoJS.DES.decrypt({
-                ciphertext: CryptoJS.enc.Base64.parse(t)
-            }, e, {
-                mode: CryptoJS.mode.ECB,
-                padding: CryptoJS.pad.Pkcs7
-            });
-            return i.toString(CryptoJS.enc.Utf8)
-        }
-        '''
-        ctx = execjs.compile(ex_js)
-        return ctx.call('en_str', text)
-
-    def fetch(self, page, param):
-        logger.info(f'{param}|页码|{page}|发起请求')
-        url = f"https://ctbpsp.com/cutominfoapi/recommand/type/{param['type']}/pagesize/10/currentpage/{page}"
-        params = {
-            "type__1017": self.get_type_1017(page, param['type']),
-            "province": param['province'],
-            "industry": param['industry'],
-        }
-        response = requests.get(url, headers=self.headers, params=params, proxies=get_QGIP(), timeout=60, verify=False)
-        text = response.content.decode().replace('"', '')  # strip the JSON quotes around the base64 ciphertext
-        ret = self.decrypto_data(text)
-        if not ret:
-            raise ValueError('请求结果数据为空!')
-        return ret
-
-    def parse(self, data_info, page, param):
-        now_ts = int(datetime.now().replace(hour=0, minute=0, second=0, microsecond=0).timestamp())
-
-        # map JS literals (true/false/null) to a parseable placeholder before ast.literal_eval; the parser below only checks the null-data case
-        data_info = data_info.replace('true', '1').replace('false', '1').replace('null', '1')
-        iter_data = ast.literal_eval(data_info)
-        if iter_data.get('data') == 1:
-            # {'success': 1, 'data': 1, 'errorMessage': ''}
-            raise TypeError('解析数据失败!')
-
-        info_list = iter_data['data']['dataList']
-        page_size = iter_data['data']['pageSize']
-
-        results_list = []
-        for info in info_list:
-            hid = info.get('bulletinID')
-            did = info.get('dataSource')
-            href = f"http://ctbpsp.com/#/bulletinDetail?uuid={hid}&inpvalue=&dataSource={did}&tenderAgency="
-            title = info.get('noticeName').strip()
-            create_time = info.get('noticeSendTime')
-            reginProvince = info.get('reginProvince').replace("省", "").replace("市", "")
-
-            pb_time = int(datetime.strptime(create_time, "%Y-%m-%d").timestamp())
-            if pb_time < now_ts:
-                # logger.info(f'当前{page}页{param["province"]}-{param["industry"]}-{param["typeName"]}--发布时间小于当前时间')
-                logger.info('当日暂无新数据')
-                self.paginate = False
-                return
-
-            dedup = [title + href]
-            if not self.RDS.data_filter(dedup):
-                item = {
-                    "site": "中国招标投标公共服务平台",
-                    "channel": "全国招标公告公示搜索引擎",
-                    "spidercode": "a_qgzbgggsssyq_qbgg",
-                    "area": reginProvince,
-                    "city": "",
-                    "district": "",
-                    "href": href,
-                    "title": title,
-                    "publishtime": create_time,
-                    "parse_url": href,
-                    "parser_name": "ztpc_qgzbgggsssyq",
-                    "is_mixed": False,
-                    "is_theme": True,
-                    "retry": 0,
-                    "comeintime": int2long(time.time()),
-                    "is_crawl": False,
-                    "failed": False,
-                    "iscompete": True,
-                    "sendflag": "false",
-                    "T": "bidding",
-                    "infoformat": 1,
-                    "type": "",
-                    "publishdept": "",
-                    "_d": "comeintime",
-                }
-                self.zb_list.insert_one(item)
-                self.RDS.data_save_redis(dedup)
-                results_list.append(item)
-
-        logger.info(f' *** 第{page}页{param["province"]}-{param["industry"]}-{param["typeName"]}采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
-        self.real_cont += len(results_list)
-        if page_size < 10:
-            logger.info(f'单页数据量小于10|{page_size}')
-            # logger.info(f'当前{page}页{param["province"]}-{param["industry"]}-{param["typeName"]}--每页条数小于10')
-            self.paginate = False
-
-        return results_list
-
-    def crawl(self, page, param):
-        retry = 0
-        while (retry := retry + 1) < 10:
-            try:
-                data_info = self.fetch(page=page, param=param)
-                self.parse(data_info=data_info, page=page, param=param)
-                time.sleep(random.random())
-                return
-            except Exception as e:
-                logger.error(f"第{page}页|采集异常|{e}")
-                time.sleep(random.randint(3, 7))
-
-    def start(self, crawl_page):
-        logger.debug("********** 列表页开始 **********")
-        with open("./param.json", "r", encoding="utf-8") as f:
-            json_text = f.read()
-
-        for param in json.loads(json_text):
-            self.paginate = True
-            for page in range(1, crawl_page + 1):
-                if not self.paginate:
-                    break
-
-                self.crawl(page=page, param=param)
-                logger.info(f"当前已采集 {self.real_cont} 条数据")
-
-        logger.debug("********** 列表页结束 **********")
-
-
-if __name__ == '__main__':
-    Spider().start(1000)

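For reference, the obfuscated getAcw_sc__v2 challenge deleted above reduces to a fixed 40-slot permutation ("unsbox") followed by a byte-wise hex XOR against a constant mask, so the execjs round-trip is avoidable. A pure-Python sketch (acw_sc_v2 is a hypothetical name; arg1 is the 40-character value served by the anti-bot page):

MASK = "3000176000856006061501533003690027800375"
BOX = [15, 35, 29, 24, 33, 16, 1, 38, 10, 9, 19, 31, 40, 27, 22, 23, 25, 13, 6,
       11, 39, 18, 20, 8, 14, 21, 32, 26, 2, 30, 7, 4, 17, 5, 3, 28, 34, 37, 12, 36]

def acw_sc_v2(arg1: str) -> str:
    # unsbox: output slot j takes the character at 1-based position BOX[j] of arg1
    shuffled = "".join(arg1[pos - 1] for pos in BOX)
    # hexXor: XOR the two hex strings one byte (two hex digits) at a time
    return "".join(
        "{:02x}".format(int(shuffled[i:i + 2], 16) ^ int(MASK[i:i + 2], 16))
        for i in range(0, min(len(shuffled), len(MASK)), 2)
    )

The result is sent back as the acw_sc__v2 cookie, matching the dict the deleted method returned.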
+ 0 - 55
lzz_theme/qgzbgggsssyq/ssyq_main.py

@@ -1,55 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2024-09-18
----------
-@summary:
----------
-@author: lzz
-"""
-import time
-
-from loguru import logger
-from pymongo import MongoClient
-
-from spider_search import SearchSpider
-
-# mgo = MongoClient('127.0.0.1', port=27080, username="", password="")
-mgo = MongoClient('172.17.4.87', port=27080, username="", password="")
-theme_list = mgo['py_spider']['theme_list']
-
-# Create the spider instance
-search = SearchSpider()
-
-
-def start(limit):
-    logger.debug(f"uuid失效数据,补采开始 {limit} 条")
-
-    query = {
-        "site": "中国招标投标公共服务平台",
-        "failed": True,
-        "is_crawl": False,
-        "retry": {"$gte": 4, "$lte": 10}
-    }
-    sort = [("publishtime", -1)]
-    p = {"title": 1, "retry": 1, "_id": 1}
-    with theme_list.find(query, projection=p, sort=sort, limit=limit) as cursor:
-        task_items = list(cursor)
-
-    for item in task_items:
-        _id = item['_id']
-        title = "".join(item['title'].split())  # collapse all whitespace
-        result = search.spider(title)
-        if result is True:
-            theme_list.update_one({"_id": _id}, {"$set": {"is_crawl": True, "failed": False}})
-        else:
-            retry = item["retry"] + 1
-            theme_list.update_one({"_id": _id}, {"$set": {"retry": retry}})
-            logger.error(f"{title}|补采失败")
-
-        time.sleep(1)
-
-    logger.debug("uuid失效数据,补采完成!")
-
-
-if __name__ == '__main__':
-    start(200)

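A note on the retry bump above: the read-then-update pattern can drop increments if two backfill processes handle the same document. pymongo can do the bump atomically server-side (a sketch against the same collection):

    theme_list.update_one({"_id": _id}, {"$inc": {"retry": 1}})

With $inc, concurrent workers cannot overwrite each other's counts.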
File diff too large to display
+ 0 - 15
lzz_theme/qgzbgggsssyq/ssyq_pm.js


+ 0 - 461
lzz_theme/utils/webdriver.py

@@ -1,461 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2024-01-19
----------
-@summary: Remote selenium service
----------
-@author: dzr
-"""
-import os
-import queue
-import threading
-
-from selenium import webdriver
-from selenium.webdriver.chrome.remote_connection import ChromeRemoteConnection
-from selenium.webdriver.firefox.remote_connection import FirefoxRemoteConnection
-from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
-
-
-
-# Browser rendering defaults
-WEBDRIVER = dict(
-    pool_size=1,  # number of browser instances
-    load_images=False,  # whether to load images
-    user_agent=None,  # string, or zero-argument callable returning a user_agent
-    proxy=None,  # xxx.xxx.xx.xxx:xxxx, or zero-argument callable returning a proxy address
-    headless=False,  # whether to run headless
-    driver_type="FIREFOX",  # CHROME or FIREFOX
-    timeout=30,  # request timeout in seconds
-    window_size=(1280, 800),  # window size
-    executable_path=None,  # browser binary path; None uses the default location
-    render_time=0,  # render wait: seconds to wait after opening a page before reading its source
-    custom_argument=["--ignore-certificate-errors"],  # extra browser arguments
-    usages_local_driver=True,  # whether to use a local driver instead of the remote service
-    server_addr="http://192.168.3.182:8899/wd/hub",  # selenium remote service address
-    version="",  # remote browser version
-    service_log_path=os.devnull  # service log path
-)
-
-from loguru import logger
-from utils.tools import Singleton
-
-DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
-
-
-class WebDriver(RemoteWebDriver):
-    """浏览器采集 - selenium"""
-    CHROME = "CHROME"
-    FIREFOX = "FIREFOX"
-
-    def __init__(
-        self,
-        load_images=True,
-        user_agent=None,
-        proxy=None,
-        driver_type=CHROME,
-        timeout=20,
-        headless=False,
-        usages_local_driver=False,
-        window_size=(1024, 800),
-        server_addr=None,
-        version=None,
-        custom_argument=None,
-        executable_path=None,
-        service_log_path=None,
-        **kwargs
-    ):
-        """
-        webdirver 封装,支持 chrome 和 firefox
-        Args:
-            load_images: 是否加载图片
-            user_agent: 字符串 或 无参函数,返回值为user_agent
-            proxy: xxx.xxx.xxx.xxx:xxxx 或 无参函数,返回值为代理地址
-            headless: 是否启用无头模式
-            driver_type: CHROME 或 FIREFOX...
-            timeout: 请求超时时间
-            window_size: # 窗口大小
-            executable_path: 浏览器路径,默认为默认路径
-            server_addr: 远程服务地址
-            usages_local_driver: 是否使用本地驱动
-            service_log_path: selenium service 日志路径
-            version: 浏览器版本
-            **kwargs:
-        """
-        self._load_images = load_images or WEBDRIVER["load_images"]
-        self._user_agent = user_agent or DEFAULT_USERAGENT
-        self._proxy = proxy or WEBDRIVER["proxy"]
-        self._headless = headless or WEBDRIVER["headless"]
-        self._usages_local_driver = usages_local_driver or WEBDRIVER["usages_local_driver"]
-        self._timeout = timeout or WEBDRIVER["timeout"]
-        self._window_size = window_size or WEBDRIVER["window_size"]
-        self._executable_path = executable_path or WEBDRIVER["executable_path"]
-        self._custom_argument = custom_argument or WEBDRIVER["custom_argument"]
-        self._server_addr = server_addr or WEBDRIVER["server_addr"]
-        self._version = version or WEBDRIVER["version"]
-        self._service_log_path = service_log_path or WEBDRIVER["service_log_path"]
-
-        if driver_type == WebDriver.CHROME:
-            self.driver = self.chrome_driver()
-
-        elif driver_type == WebDriver.FIREFOX:
-            self.driver = self.firefox_driver()
-
-        else:
-            raise TypeError(
-                "dirver_type must be one of CHROME or FIREFOX, but received {}".format(
-                    type(driver_type)
-                )
-            )
-
-        # driver.get(url) can hang without returning or raising; a page-load timeout stops the program from blocking forever.
-        self.driver.set_page_load_timeout(self._timeout)
-        # script execution timeout (same value as the page-load timeout)
-        self.driver.set_script_timeout(self._timeout)
-
-        self._is_remote = not self._usages_local_driver
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if exc_val:
-            logger.error(exc_val)
-
-        self.quit()
-        return False
-
-    def __getattr__(self, name):
-        if self.driver:
-            return getattr(self.driver, name)
-        else:
-            raise AttributeError
-
-    def get_driver(self):
-        return self.driver
-
-    def local_firefox_driver(self):
-        firefox_profile = webdriver.FirefoxProfile()
-        firefox_options = webdriver.FirefoxOptions()
-        firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX
-        firefox_profile.set_preference("dom.webdriver.enabled", False)
-
-        if self._proxy:
-            proxy = self._proxy() if callable(self._proxy) else self._proxy
-            proxy = proxy.replace("socks5://", "")
-            # use a socks5 proxy
-            ip, port = proxy.split(":")
-            firefox_profile.set_preference('network.proxy.type', 1)  # 0 = no proxy, 1 = manual proxy
-            firefox_profile.set_preference('network.proxy.socks', ip)
-            firefox_profile.set_preference('network.proxy.socks_port', int(port))
-
-        if self._user_agent:
-            firefox_profile.set_preference(
-                "general.useragent.override",
-                self._user_agent() if callable(
-                    self._user_agent) else self._user_agent,
-            )
-
-        if not self._load_images:
-            firefox_profile.set_preference("permissions.default.image", 2)
-
-        if self._headless:
-            firefox_options.add_argument("--headless")
-            firefox_options.add_argument("--disable-gpu")
-
-        # add custom browser arguments
-        if self._custom_argument:
-            for arg in self._custom_argument:
-                firefox_options.add_argument(arg)
-
-        if self._executable_path:
-            driver = webdriver.Firefox(
-                capabilities=firefox_capabilities,
-                options=firefox_options,
-                firefox_profile=firefox_profile,
-                executable_path=self._executable_path,
-                service_log_path=self._service_log_path
-            )
-        else:
-            driver = webdriver.Firefox(
-                capabilities=firefox_capabilities,
-                options=firefox_options,
-                firefox_profile=firefox_profile,
-                service_log_path=self._service_log_path
-            )
-
-        if self._window_size:
-            driver.set_window_size(*self._window_size)
-
-        return driver
-
-    def remote_firefox_driver(self):
-        firefox_options = webdriver.FirefoxOptions()
-        desired_capabilities = firefox_options.to_capabilities()
-        firefox_options.set_preference("dom.webdriver.enabled", False)
-
-        if self._version:
-            desired_capabilities['version'] = self._version
-
-        if self._proxy:
-            proxy = self._proxy() if callable(self._proxy) else self._proxy
-            proxy = proxy.replace("socks5://", "")
-            # use a socks5 proxy
-            ip, port = proxy.split(":")
-            firefox_options.set_preference('network.proxy.type', 1)  # 0 = no proxy, 1 = manual proxy
-            firefox_options.set_preference('network.proxy.socks', ip)
-            firefox_options.set_preference('network.proxy.socks_port', int(port))
-
-        if self._user_agent:
-            firefox_options.set_preference(
-                "general.useragent.override",
-                self._user_agent() if callable(self._user_agent) else self._user_agent,
-            )
-
-        if not self._load_images:
-            firefox_options.set_preference("permissions.default.image", 2)
-
-        if self._headless:
-            firefox_options.add_argument("--headless")
-            firefox_options.add_argument("--disable-gpu")
-
-        if self._custom_argument:
-            for arg in self._custom_argument:
-                firefox_options.add_argument(arg)
-
-        executor = FirefoxRemoteConnection(remote_server_addr=self._server_addr)
-        browser = webdriver.Remote(
-            command_executor=executor,
-            desired_capabilities=desired_capabilities,
-            options=firefox_options
-        )
-
-        if self._window_size:
-            browser.set_window_size(*self._window_size)
-
-        return browser
-
-    def firefox_driver(self):
-        if self._usages_local_driver:
-            return self.local_firefox_driver()
-        return self.remote_firefox_driver()
-
-    def remote_chrome_driver(self):
-        chrome_options = webdriver.ChromeOptions()
-        desired_capabilities = chrome_options.to_capabilities()
-        # Important: disable the automation flags so sites cannot easily detect Selenium
-        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
-        chrome_options.add_experimental_option("useAutomationExtension", False)
-        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
-        # required when running inside docker
-        chrome_options.add_argument('--no-sandbox')
-        chrome_options.add_argument('--disable-extensions')
-        chrome_options.add_argument('--disable-dev-shm-usage')
-
-        if self._version:
-            desired_capabilities['version'] = self._version
-
-        if self._proxy:
-            chrome_options.add_argument(
-                "--proxy-server={}".format(
-                    self._proxy() if callable(self._proxy) else self._proxy
-                )
-            )
-
-        if self._user_agent:
-            chrome_options.add_argument(
-                "user-agent={}".format(
-                    self._user_agent()
-                    if callable(self._user_agent)
-                    else self._user_agent
-                )
-            )
-
-        if not self._load_images:
-            chrome_options.add_experimental_option(
-                "prefs", {"profile.managed_default_content_settings.images": 2}
-            )
-
-        if self._headless:
-            chrome_options.add_argument("--headless")
-            chrome_options.add_argument("--disable-gpu")
-
-        if self._window_size:
-            chrome_options.add_argument(
-                "--window-size={},{}".format(self._window_size[0], self._window_size[1])
-            )
-
-        # add custom browser arguments
-        if self._custom_argument:
-            for arg in self._custom_argument:
-                chrome_options.add_argument(arg)
-
-        browser = webdriver.Remote(
-            command_executor=ChromeRemoteConnection(
-                remote_server_addr=self._server_addr,
-                keep_alive=True),
-            desired_capabilities=desired_capabilities,
-            options=chrome_options
-        )
-
-        # hide browser automation fingerprints by injecting stealth.min.js via CDP
-        with open(os.path.join(os.path.dirname(__file__), "./js/stealth.min.js")) as f:
-            js = f.read()
-            params = {
-                'cmd': 'Page.addScriptToEvaluateOnNewDocument',
-                'params': {'source': js}
-            }
-            response = browser.execute("executeCdpCommand", params)['value']
-        return browser
-
-    def local_chrome_driver(self):
-        chrome_options = webdriver.ChromeOptions()
-        # Important: disable the automation flags so sites cannot easily detect Selenium
-        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
-        chrome_options.add_experimental_option("useAutomationExtension", False)
-        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
-        # required when running inside docker
-        chrome_options.add_argument("--no-sandbox")
-        chrome_options.add_argument('--disable-extensions')
-        chrome_options.add_argument('--disable-dev-shm-usage')
-
-        if self._proxy:
-            chrome_options.add_argument(
-                "--proxy-server={}".format(
-                    self._proxy() if callable(self._proxy) else self._proxy
-                )
-            )
-
-        if self._user_agent:
-            chrome_options.add_argument(
-                "user-agent={}".format(
-                    self._user_agent()
-                    if callable(self._user_agent)
-                    else self._user_agent
-                )
-            )
-
-        if not self._load_images:
-            chrome_options.add_experimental_option(
-                "prefs", {"profile.managed_default_content_settings.images": 2}
-            )
-
-        if self._headless:
-            chrome_options.add_argument("--headless")
-            chrome_options.add_argument("--disable-gpu")
-
-        if self._window_size:
-            chrome_options.add_argument(
-                "--window-size={},{}".format(self._window_size[0], self._window_size[1])
-            )
-
-        # add custom browser arguments
-        if self._custom_argument:
-            for arg in self._custom_argument:
-                chrome_options.add_argument(arg)
-
-        if self._executable_path:
-            driver = webdriver.Chrome(
-                chrome_options=chrome_options,
-                executable_path=self._executable_path,
-                service_log_path=self._service_log_path
-            )
-        else:
-            driver = webdriver.Chrome(
-                chrome_options=chrome_options,
-                service_log_path=self._service_log_path
-            )
-
-        # hide browser automation fingerprints by injecting stealth.min.js via CDP
-        with open(os.path.join(os.path.dirname(__file__), "./js/stealth.min.js")) as f:
-            js = f.read()
-            driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js})
-
-        return driver
-
-    def chrome_driver(self):
-        if self._usages_local_driver:
-            return self.local_chrome_driver()
-        return self.remote_chrome_driver()
-
-    @property
-    def cookies(self):
-        cookies_json = {}
-        for cookie in self.driver.get_cookies():
-            cookies_json[cookie["name"]] = cookie["value"]
-        return cookies_json
-
-    @cookies.setter
-    def cookies(self, val: dict):
-        """
-        Set cookies
-        Args:
-            val: {"key":"value", "key2":"value2"}
-
-        Returns:
-
-        """
-        for key, value in val.items():
-            self.driver.add_cookie({"name": key, "value": value})
-
-    def quit(self):
-        try:
-            self.get_driver().quit()
-        except Exception:
-            # We don't care about the message because something probably has gone wrong
-            pass
-
-    # def __del__(self):
-    #     if self.driver:
-    #         self.driver.quit()
-
-
-@Singleton
-class WebDriverPool:
-    def __init__(self, pool_size=5, **kwargs):
-        self.queue = queue.Queue(maxsize=pool_size)
-        self.kwargs = kwargs
-        self.lock = threading.RLock()
-        self.driver_count = 0
-
-    @property
-    def is_full(self):
-        return self.driver_count >= self.queue.maxsize
-
-    def get(self, user_agent: str = None, proxy: str = None) -> WebDriver:
-        """
-        Get a webdriver.
-        A new webdriver instance is created with the user_agent and proxy arguments below.
-        Args:
-            user_agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36
-            proxy: xxx.xxx.xxx.xxx
-        Returns:
-
-        """
-        if not self.is_full:
-            with self.lock:
-                if not self.is_full:
-                    kwargs = self.kwargs.copy()
-                    if user_agent:
-                        kwargs["user_agent"] = user_agent
-                    if proxy:
-                        kwargs["proxy"] = proxy
-                    driver = WebDriver(**kwargs)
-                    self.queue.put(driver)
-                    self.driver_count += 1
-
-        driver = self.queue.get()
-
-        return driver
-
-    def put(self, driver):
-        self.queue.put(driver)
-
-    def remove(self, driver):
-        driver.quit()
-        self.driver_count -= 1
-
-    def close(self):
-        while not self.queue.empty():
-            driver = self.queue.get()
-            driver.quit()
-            self.driver_count -= 1

Some files were not shown because too many files changed in this diff