1 周之前 · 60bfae9c27
--- a/lzz_theme/zgdtjtgsdzswpt_m30/dtpy_details.py
+++ b/lzz_theme/zgdtjtgsdzswpt_m30/dtpy_details.py
@@ -0,0 +1,192 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2024-09-19
			
 
				+---------
			
 
				+@summary: 中国大唐集团公司电子商务平台 - 详情页
			
 
				+---------
			
 
				+@author: Lzz
			
 
				+"""
			
 
				+import sys
			
 
				+import os
			
 
				+sys.path.append(os.path.dirname(os.getcwd()))
			
 
				+
			
 
				+from utils.attachment import AttachmentDownloader
			
 
				+from utils.tools import *
			
 
				+import warnings
			
 
				+from parsel import Selector
			
 
				+
			
 
				+warnings.filterwarnings('ignore')
			
 
				+
			
 
				+
			
 
				+class Details:
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        self.proxy = get_proxy()
			
 
				+        self.db_table = Mongo_client().py_spider
			
 
				+        self.db_name = self.db_table.theme_list
			
 
				+        self.zt_details = self.db_table.data_bak
			
 
				+        self.cookies = {}
			
 
				+        self.headers = {
			
 
				+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
			
 
				+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
			
 
				+            "Cache-Control": "no-cache",
			
 
				+            "Connection": "keep-alive",
			
 
				+            "Pragma": "no-cache",
			
 
				+            "Upgrade-Insecure-Requests": "1",
			
 
				+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
			
 
				+        }
			
 
				+
			
 
				+    def get_acw_sc_v2(self, html):
			
 
				+        try:
			
 
				+            arg1 = "".join(re.findall("arg1='(.*?)'", html))
			
 
				+            if arg1:
			
 
				+                js_script = '''
			
 
				+                    function getAcw_sc__v2(obt_arg1) {
			
 
				+                        String["prototype"]["hexXor"] = function (_0x4e08d8) {
			
 
				+                            var _0x5a5d3b = '';
			
 
				+                            for (var _0xe89588 = 0x0; _0xe89588 < this["length"] && _0xe89588 < _0x4e08d8["length"]; _0xe89588 += 2) {
			
 
				+                                var _0x401af1 = parseInt(this["slice"](_0xe89588, _0xe89588 + 2), 16);
			
 
				+                                var _0x105f59 = parseInt(_0x4e08d8["slice"](_0xe89588, _0xe89588 + 2), 16);
			
 
				+                                var _0x189e2c = (_0x401af1 ^ _0x105f59)["toString"](16);
			
 
				+                                if (_0x189e2c["length"] == 1) {
			
 
				+                                    _0x189e2c = '0' + _0x189e2c;
			
 
				+                                }
			
 
				+                                _0x5a5d3b += _0x189e2c;
			
 
				+                            }
			
 
				+                            return _0x5a5d3b;
			
 
				+                        };
			
 
				+                        String["prototype"]["unsbox"] = function () {
			
 
				+                            var _0x4b082b = [15, 35,29, 24, 33, 16, 1, 38, 10, 9, 19, 31, 40, 27, 22, 23, 25, 13, 6, 11, 39, 18, 20, 8, 14, 21, 32, 26, 2, 30, 7, 4, 17, 5, 3, 28, 34, 37, 12, 36];
			
 
				+                            var _0x4da0dc = [];
			
 
				+                            var _0x12605e = '';
			
 
				+                            for (var _0x20a7bf = 0x0; _0x20a7bf < this["length"]; _0x20a7bf++) {
			
 
				+                                var _0x385ee3 = this[_0x20a7bf];
			
 
				+                                for (var _0x217721 = 0; _0x217721 < _0x4b082b["length"]; _0x217721++) {
			
 
				+                                    if (_0x4b082b[_0x217721] == _0x20a7bf + 1) {
			
 
				+                                        _0x4da0dc[_0x217721] = _0x385ee3;
			
 
				+                                    }
			
 
				+                                }
			
 
				+                            }
			
 
				+                            _0x12605e = _0x4da0dc["join"]('');
			
 
				+                            return _0x12605e;
			
 
				+                        };
			
 
				+
			
 
				+                        var _0x5e8b26 = "3000176000856006061501533003690027800375";
			
 
				+                        // var arg1 = "0A5F01F50F9BC66FB28038F18B99B7B10CFF4667"
			
 
				+                        var arg1 = obt_arg1
			
 
				+                        var _0x23a392 = arg1["unsbox"]();
			
 
				+                        arg2 = _0x23a392["hexXor"](_0x5e8b26);
			
 
				+                        return arg2
			
 
				+                    }
			
 
				+                '''
			
 
				+                ctx = execjs.compile(js_script)
			
 
				+                arg2 = ctx.call('getAcw_sc__v2', arg1)
			
 
				+                return {"acw_sc__v2": arg2}
			
 
				+            else:
			
 
				+                return {}
			
 
				+        except:
			
 
				+            return {}
			
 
				+
			
 
				+    def detail_get(self, response, item):
			
 
				+
			
 
				+        root = Selector(response.text)
			
 
				+        html = root.xpath('//div[@id="container"]').extract_first("")
			
 
				+
			
 
				+        rm_list = ['//h1','//div[@class="block"]']
			
 
				+        html = remove_htmldata(rm_list, html, root)
			
 
				+
			
 
				+        item["contenthtml"] = html
			
 
				+
			
 
				+        attachments = {}
			
 
				+        file_url = "".join(re.findall('"\+getGgUrl\((.*?)\)\+"#page=1" />', response.text, re.S))
			
 
				+        if file_url:
			
 
				+            file_type = "pdf"
			
 
				+
			
 
				+            attachment = AttachmentDownloader().fetch_attachment(
			
 
				+                file_name=item['title'],
			
 
				+                file_type=file_type,
			
 
				+                download_url=file_url,
			
 
				+                proxies=self.proxy
			
 
				+            )
			
 
				+            attachments[str(len(attachments) + 1)] = attachment
			
 
				+
			
 
				+        if attachments:
			
 
				+            item['projectinfo'] = {"attachments": attachments}
			
 
				+
			
 
				+        item = format_fileds(item)
			
 
				+
			
 
				+        try:
			
 
				+            self.zt_details.insert_one(item)
			
 
				+            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
			
 
				+        except DuplicateKeyError:
			
 
				+            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
			
 
				+
			
 
				+    def fetch_request(self, item):
			
 
				+        session = requests.Session()
			
 
				+        session.proxies = get_QGIP()
			
 
				+
			
 
				+        retry = 0
			
 
				+        while (retry := retry + 1) < 3:
			
 
				+            response = session.get(url=item['href'], headers=self.headers, cookies=self.cookies, timeout=30)
			
 
				+
			
 
				+            self.cookies.update(session.cookies.get_dict())
			
 
				+
			
 
				+            arg1_ck = self.get_acw_sc_v2(response.text)
			
 
				+
			
 
				+            if "滑动验证页面" in response.text:
			
 
				+                logger.warning("滑动验证页面")
			
 
				+                return None
			
 
				+
			
 
				+            elif arg1_ck:
			
 
				+                logger.warning("arg1_ck")
			
 
				+                self.cookies.update(arg1_ck)
			
 
				+            else:
			
 
				+                return response
			
 
				+
			
 
				+        return None
			
 
				+
			
 
				+    def deal_request(self, item):
			
 
				+        retry_times = 0
			
 
				+        org_item = item.copy()
			
 
				+        while retry_times < 3:
			
 
				+            retry_times += 1
			
 
				+            try:
			
 
				+                response = self.fetch_request(item)
			
 
				+                if response is not None and response.status_code == 200:
			
 
				+                    self.detail_get(response, item=item)
			
 
				+                    time.sleep(random.random())
			
 
				+                    return True
			
 
				+                else:
			
 
				+                    time.sleep(2)
			
 
				+            except Exception as e:
			
 
				+                item = org_item
			
 
				+                logger.error(f"{item['href']} 采集异常：{e}")
			
 
				+                time.sleep(2)
			
 
				+
			
 
				+        logger.warning(f"[采集失败]{item['href']}")
			
 
				+        return False
			
 
				+
			
 
				+    def start(self, limit=1):
			
 
				+        logger.debug("********** 详情页采集开始 **********")
			
 
				+        time.sleep(random.random())
			
 
				+        count = 0
			
 
				+        with self.db_name.find({"parser_name": "ztpc_zgdtjt", "failed": False, "is_crawl": False},
			
 
				+                               no_cursor_timeout=True) as data_lsit:
			
 
				+            for item in data_lsit:
			
 
				+                # logger.debug(item)
			
 
				+                if count >= limit:
			
 
				+                    break
			
 
				+                count += 1
			
 
				+                update_id = item["_id"]
			
 
				+                retry = item["retry"]
			
 
				+                if self.deal_request(item):
			
 
				+                    self.db_name.update_one({"_id": update_id}, {"$set": {"is_crawl": True}})
			
 
				+                else:
			
 
				+                    retry += 1
			
 
				+                    self.db_name.update_one({"_id": update_id}, {"$set": {"failed": True, "retry": retry}})
			
 
				+
			
 
				+        logger.debug("********** 详情页采集结束 **********")
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    Details().start(limit=200)
			
--- a/lzz_theme/zgdtjtgsdzswpt_m30/dtpy_spider_m30.py
+++ b/lzz_theme/zgdtjtgsdzswpt_m30/dtpy_spider_m30.py
@@ -0,0 +1,231 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2024-09-19
			
 
				+---------
			
 
				+@summary: 中国大唐集团公司电子商务平台 - 列表页
			
 
				+---------
			
 
				+@author: Lzz
			
 
				+"""
			
 
				+import sys
			
 
				+import os
			
 
				+
			
 
				+import requests
			
 
				+
			
 
				+sys.path.append(os.path.dirname(os.getcwd()))
			
 
				+from collections import namedtuple
			
 
				+from utils.RedisDB import RedisFilter
			
 
				+from utils.tools import *
			
 
				+import warnings
			
 
				+
			
 
				+
			
 
				+
			
 
				+warnings.filterwarnings('ignore')
			
 
				+
			
 
				+
			
 
				+
			
 
				+class Crawl_Zgdt:
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        self.py_spider = Mongo_client().py_spider
			
 
				+        self.zb_list = self.py_spider.theme_list
			
 
				+        self.RDS = RedisFilter()
			
 
				+        self.real_cont = 0
			
 
				+        self.proxy = get_proxy()
			
 
				+        self.params = {}
			
 
				+        self.cookies = {}
			
 
				+
			
 
				+    def get_acw_sc_v2(self, html):
			
 
				+        try:
			
 
				+            arg1 = "".join(re.findall("arg1='(.*?)'", html))
			
 
				+            if arg1:
			
 
				+                js_script = '''
			
 
				+                    function getAcw_sc__v2(obt_arg1) {
			
 
				+                        String["prototype"]["hexXor"] = function (_0x4e08d8) {
			
 
				+                            var _0x5a5d3b = '';
			
 
				+                            for (var _0xe89588 = 0x0; _0xe89588 < this["length"] && _0xe89588 < _0x4e08d8["length"]; _0xe89588 += 2) {
			
 
				+                                var _0x401af1 = parseInt(this["slice"](_0xe89588, _0xe89588 + 2), 16);
			
 
				+                                var _0x105f59 = parseInt(_0x4e08d8["slice"](_0xe89588, _0xe89588 + 2), 16);
			
 
				+                                var _0x189e2c = (_0x401af1 ^ _0x105f59)["toString"](16);
			
 
				+                                if (_0x189e2c["length"] == 1) {
			
 
				+                                    _0x189e2c = '0' + _0x189e2c;
			
 
				+                                }
			
 
				+                                _0x5a5d3b += _0x189e2c;
			
 
				+                            }
			
 
				+                            return _0x5a5d3b;
			
 
				+                        };
			
 
				+                        String["prototype"]["unsbox"] = function () {
			
 
				+                            var _0x4b082b = [15, 35,29, 24, 33, 16, 1, 38, 10, 9, 19, 31, 40, 27, 22, 23, 25, 13, 6, 11, 39, 18, 20, 8, 14, 21, 32, 26, 2, 30, 7, 4, 17, 5, 3, 28, 34, 37, 12, 36];
			
 
				+                            var _0x4da0dc = [];
			
 
				+                            var _0x12605e = '';
			
 
				+                            for (var _0x20a7bf = 0x0; _0x20a7bf < this["length"]; _0x20a7bf++) {
			
 
				+                                var _0x385ee3 = this[_0x20a7bf];
			
 
				+                                for (var _0x217721 = 0; _0x217721 < _0x4b082b["length"]; _0x217721++) {
			
 
				+                                    if (_0x4b082b[_0x217721] == _0x20a7bf + 1) {
			
 
				+                                        _0x4da0dc[_0x217721] = _0x385ee3;
			
 
				+                                    }
			
 
				+                                }
			
 
				+                            }
			
 
				+                            _0x12605e = _0x4da0dc["join"]('');
			
 
				+                            return _0x12605e;
			
 
				+                        };
			
 
				+
			
 
				+                        var _0x5e8b26 = "3000176000856006061501533003690027800375";
			
 
				+                        // var arg1 = "0A5F01F50F9BC66FB28038F18B99B7B10CFF4667"
			
 
				+                        var arg1 = obt_arg1
			
 
				+                        var _0x23a392 = arg1["unsbox"]();
			
 
				+                        arg2 = _0x23a392["hexXor"](_0x5e8b26);
			
 
				+                        return arg2
			
 
				+                    }
			
 
				+                '''
			
 
				+                ctx = execjs.compile(js_script)
			
 
				+                arg2 = ctx.call('getAcw_sc__v2', arg1)
			
 
				+                return {"acw_sc__v2": arg2}
			
 
				+            else:
			
 
				+                return {}
			
 
				+        except:
			
 
				+            return {}
			
 
				+
			
 
				+    def fetch_list_page(self, page, menu):
			
 
				+        logger.debug(f' *** {menu.channel} 开始采集第{page}页 ***')
			
 
				+
			
 
				+        session = requests.Session()
			
 
				+
			
 
				+        session.proxies = get_QGIP()
			
 
				+        # session.verify = False
			
 
				+
			
 
				+        headers = {
			
 
				+            "Accept": "application/json, text/javascript, */*; q=0.01",
			
 
				+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
			
 
				+            "Cache-Control": "no-cache",
			
 
				+            "Connection": "keep-alive",
			
 
				+            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
			
 
				+            "Origin": "https://www.cdt-ec.com",
			
 
				+            "Pragma": "no-cache",
			
 
				+            "Referer": "https://www.cdt-ec.com/notice/webpage/jsp/more.jsp",
			
 
				+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
			
 
				+            "X-Requested-With": "XMLHttpRequest",
			
 
				+        }
			
 
				+
			
 
				+        if menu.data == "x":
			
 
				+            data = {
			
 
				+                "page": f"{page}",
			
 
				+                "limit": "90",
			
 
				+                "messagetype": f"{menu.type}",
			
 
				+                "startDate": "",
			
 
				+                "endDate": ""
			
 
				+            }
			
 
				+        else:
			
 
				+            data = {
			
 
				+                "page": f"{page}",
			
 
				+                "limit": "90",
			
 
				+                "messagetype": f"{menu.type}",
			
 
				+                "title": "",
			
 
				+                "code": "",
			
 
				+                "purchase_en": "",
			
 
				+                "startDate": "",
			
 
				+                "endDate": ""
			
 
				+            }
			
 
				+
			
 
				+        retry = 0
			
 
				+        while (retry := retry + 1) < 3:
			
 
				+            url = f'https://www.cdt-ec.com/notice/{menu.req_param}'
			
 
				+            resp = session.post(url, headers=headers, cookies=self.cookies, data=data, timeout=20)
			
 
				+            self.cookies.update(session.cookies.get_dict())
			
 
				+
			
 
				+            arg1_ck = self.get_acw_sc_v2(resp.text)
			
 
				+
			
 
				+            if "滑动验证页面" in resp.text:
			
 
				+                logger.warning("滑动验证页面")
			
 
				+                return None
			
 
				+
			
 
				+            elif arg1_ck:
			
 
				+                logger.warning("arg1_ck")
			
 
				+                self.cookies.update(arg1_ck)
			
 
				+            else:
			
 
				+                return resp
			
 
				+
			
 
				+        return None
			
 
				+
			
 
				+    def parser_list_page(self, response, page, menu):
			
 
				+        results_list = []
			
 
				+        info_list = response.json().get('data') or []
			
 
				+        for info in info_list:
			
 
				+            href_id = info.get('id')
			
 
				+            href = f'https://www.cdt-ec.com/notice/{menu.href_param}?id={href_id}'
			
 
				+            title = info.get('message_title') or info.get('title')
			
 
				+            create_time = info.get('publish_time') or info.get('push_time')
			
 
				+            if "." in create_time:
			
 
				+                create_time = create_time.split(".")[0]
			
 
				+
			
 
				+            dedup = md5value(title + href)
			
 
				+
			
 
				+            if not self.RDS.data_filter(dedup):
			
 
				+                item = {
			
 
				+                    "site": "中国大唐集团公司电子商务平台",
			
 
				+                    "channel": menu.channel,
			
 
				+                    "spidercode": menu.code,
			
 
				+                    "area": "全国",
			
 
				+                    "city": "",
			
 
				+                    "district": "",
			
 
				+                    "href": href,
			
 
				+                    "title": title,
			
 
				+                    "publishtime": create_time,
			
 
				+                    "parse_url": href,
			
 
				+                    "parser_name": "ztpc_zgdtjt",
			
 
				+                    "is_mixed": False,
			
 
				+                    "is_theme": True,
			
 
				+                    "retry": 0,
			
 
				+                    "comeintime": int2long(int(time.time())),
			
 
				+                    "is_crawl": False,
			
 
				+                    "failed": False,
			
 
				+                }
			
 
				+
			
 
				+                self.zb_list.insert_one(item)
			
 
				+                self.RDS.data_save_redis(dedup)
			
 
				+                results_list.append(item)
			
 
				+
			
 
				+        logger.info(f' *** 第{page}页采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
			
 
				+
			
 
				+        return results_list
			
 
				+
			
 
				+    def crawl_list_spider(self, page, menu):
			
 
				+        retry = 0
			
 
				+        while (retry := retry + 1) < 10:
			
 
				+            try:
			
 
				+                logger.debug(f"{menu.channel}_第{page}页 start")
			
 
				+                response = self.fetch_list_page(page=page, menu=menu)
			
 
				+                if response is not None and response.status_code == 200:
			
 
				+                    informations = self.parser_list_page(response=response, page=page, menu=menu)
			
 
				+                    crawl_num = len(informations)
			
 
				+                    self.real_cont += crawl_num
			
 
				+                    logger.info(f"{menu.channel}_第 {page} 页 end, 当前已采集 {self.real_cont} 条数据")
			
 
				+                    time.sleep(random.randint(1,4))
			
 
				+                    return
			
 
				+                else:
			
 
				+                    time.sleep(2)
			
 
				+
			
 
				+            except Exception as e:
			
 
				+                logger.error(e)
			
 
				+                time.sleep(2)
			
 
				+
			
 
				+    def start(self, menus):
			
 
				+        logger.debug(" 采集开始 》》》 ")
			
 
				+        for menu in menus:
			
 
				+            page = menu.crawl_page
			
 
				+            for page in range(1, page + 1):
			
 
				+                self.crawl_list_spider(page, menu)
			
 
				+        logger.debug(" 《 《 《 采集结束 ")
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    Menu = namedtuple('Menu', ['channel', 'code', 'type', 'href_param', 'req_param', 'data', 'crawl_page'])
			
 
				+
			
 
				+    menus = [
			
 
				+        Menu('招标公告-中标候选人公示', 'a_zgdtjtgsdzswpt_zbgg_zbhxrgs', '2', 'moreController/moreall', 'moreController/getList', 'x', 1),
			
 
				+        Menu('招标公告-终止公告', 'a_zgdtjtgsdzswpt_zbgg_zzgg', '21', 'moreController/moreall', 'moreController/getList', 'x', 1),
			
 
				+        Menu('招标公告-流标公告', 'a_zgdtjtgsdzswpt_zbgg_lbgg', '22', 'moreController/moreall', 'moreController/getList', 'x', 1),
			
 
				+        Menu('非招标公告-终止公告', 'a_zgdtjtgsdzswpt_fzbgg_zzgg', '23', 'moreController/xjdhtml', 'moreController/getList', 'x', 1),
			
 
				+    ]
			
 
				+
			
 
				+    Crawl_Zgdt().start(menus)
			
 
				+
			
--- a/lzz_theme/zgdtjtgsdzswpt_m30/start_m30.sh
+++ b/lzz_theme/zgdtjtgsdzswpt_m30/start_m30.sh
@@ -0,0 +1,6 @@
 
				+#!/bin/bash
			
 
				+
			
 
				+ps -ef |grep "dtpy_spider_m30.py" |grep -v grep |awk '{print $2}' |xargs kill -9
			
 
				+ps -ef |grep "dtpy_details.py" |grep -v grep |awk '{print $2}' |xargs kill -9
			
 
				+nohup python3 dtpy_spider_m30.py > log/dtpy_spider_m30.out 2>&1 &
			
 
				+nohup python3 dtpy_details.py > log/dtpy_details.out 2>&1 &