瀏覽代碼

按照中国移动要求改采集频率

dzr 1 周之前
父節點
當前提交
60bfae9c27

+ 192 - 0
lzz_theme/zgdtjtgsdzswpt_m30/dtpy_details.py

@@ -0,0 +1,192 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-09-19
+---------
+@summary: 中国大唐集团公司电子商务平台 - 详情页
+---------
+@author: Lzz
+"""
+import sys
+import os
+sys.path.append(os.path.dirname(os.getcwd()))
+
+from utils.attachment import AttachmentDownloader
+from utils.tools import *
+import warnings
+from parsel import Selector
+
+warnings.filterwarnings('ignore')
+
+
+class Details:
+    """Detail-page crawler for the China Datang Group e-commerce platform.
+
+    Pending records are read from ``py_spider.theme_list``; each detail page is
+    fetched (answering the ``acw_sc__v2`` cookie challenge when one is served),
+    parsed, and stored in ``py_spider.data_bak``.
+    """
+
+    # Constants carried over from the obfuscated JavaScript this class used to
+    # execute: the XOR key and the 1-based character order applied to ``arg1``.
+    _ACW_KEY = "3000176000856006061501533003690027800375"
+    _ACW_ORDER = (
+        15, 35, 29, 24, 33, 16, 1, 38, 10, 9, 19, 31, 40, 27, 22, 23, 25, 13, 6, 11,
+        39, 18, 20, 8, 14, 21, 32, 26, 2, 30, 7, 4, 17, 5, 3, 28, 34, 37, 12, 36,
+    )
+    # The first request may be answered with the challenge page; the second one
+    # is sent with the computed cookie.
+    _FETCH_ATTEMPTS = 2
+    # Number of times one record is tried before it is reported as failed.
+    _DETAIL_ATTEMPTS = 3
+
+    def __init__(self):
+        self.proxy = get_proxy()
+        self.db_table = Mongo_client().py_spider
+        self.db_name = self.db_table.theme_list  # source: pending list records
+        self.zt_details = self.db_table.data_bak  # target: crawled detail records
+        self.cookies = {}  # shared across requests so a solved challenge is reused
+        self.headers = {
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "Pragma": "no-cache",
+            "Upgrade-Insecure-Requests": "1",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
+        }
+
+    def get_acw_sc_v2(self, html):
+        """Compute the ``acw_sc__v2`` cookie for the challenge found in *html*.
+
+        The page embeds ``arg1='<hex>'``. Its characters are reordered with
+        ``_ACW_ORDER`` and the result is XOR-ed, two hex digits at a time, with
+        ``_ACW_KEY``. This is a pure-Python port of the JavaScript that was
+        previously run through ``execjs``, so no JS runtime is needed.
+
+        :param html: response body to inspect
+        :return: ``{"acw_sc__v2": value}`` when a challenge is present, ``{}``
+            when there is none or ``arg1`` cannot be decoded
+        """
+        try:
+            arg1 = "".join(re.findall("arg1='(.*?)'", html))
+            if not arg1:
+                return {}
+
+            # "unsbox": output slot j takes input character _ACW_ORDER[j]
+            # (1-based); slots pointing past the input are left out.
+            shuffled = "".join(arg1[pos - 1] for pos in self._ACW_ORDER if pos <= len(arg1))
+
+            # "hexXor": XOR byte by byte, each byte written as two lowercase hex digits.
+            span = min(len(shuffled), len(self._ACW_KEY))
+            pieces = []
+            for i in range(0, span, 2):
+                xored = int(shuffled[i:i + 2], 16) ^ int(self._ACW_KEY[i:i + 2], 16)
+                pieces.append(format(xored, "02x"))
+            return {"acw_sc__v2": "".join(pieces)}
+        except (TypeError, ValueError) as e:
+            # TypeError: html is not text; ValueError: arg1 is not hexadecimal.
+            logger.warning(f"acw_sc__v2 计算失败:{e}")
+            return {}
+
+    def detail_get(self, response, item):
+        """Parse one detail page and store the finished record.
+
+        :param response: successful detail-page response
+        :param item: list record being completed; it is modified in place
+            before being passed through ``format_fileds`` and inserted
+        """
+        root = Selector(response.text)
+        html = root.xpath('//div[@id="container"]').extract_first("")
+
+        # presumably strips the nodes matched by these XPaths from the body
+        # (title and "block" div) — confirm against utils.tools.remove_htmldata
+        rm_list = ['//h1', '//div[@class="block"]']
+        html = remove_htmldata(rm_list, html, root)
+
+        item["contenthtml"] = html
+
+        attachments = {}
+        # Raw string: the pattern contains backslash escapes that are not valid
+        # Python string escapes.
+        file_url = "".join(re.findall(r'"\+getGgUrl\((.*?)\)\+"#page=1" />', response.text, re.S))
+        if file_url:
+            attachment = AttachmentDownloader().fetch_attachment(
+                file_name=item['title'],
+                file_type="pdf",
+                download_url=file_url,
+                proxies=self.proxy
+            )
+            attachments[str(len(attachments) + 1)] = attachment
+
+        if attachments:
+            item['projectinfo'] = {"attachments": attachments}
+
+        item = format_fileds(item)
+
+        try:
+            self.zt_details.insert_one(item)
+            logger.info(f"[采集成功]{item['title']}-{item['publishtime']}")
+        except DuplicateKeyError:
+            logger.info(f"[重复采集]{item['title']}-{item['publishtime']}")
+
+    def fetch_request(self, item):
+        """Download the detail page for *item*, answering the cookie challenge.
+
+        :param item: list record; ``item['href']`` is the page URL
+        :return: the response, or ``None`` when the slider verification page is
+            served or the challenge is still present after the last attempt
+        """
+        # The context manager closes the session's connections on every exit
+        # path; the body has already been read, so the response stays usable.
+        with requests.Session() as session:
+            session.proxies = get_QGIP()
+
+            for _ in range(self._FETCH_ATTEMPTS):
+                response = session.get(url=item['href'], headers=self.headers, cookies=self.cookies, timeout=30)
+                self.cookies.update(session.cookies.get_dict())
+
+                if "滑动验证页面" in response.text:
+                    logger.warning("滑动验证页面")
+                    return None
+
+                arg1_ck = self.get_acw_sc_v2(response.text)
+                if not arg1_ck:
+                    return response
+
+                # Challenge page: remember the computed cookie and request again.
+                logger.warning("arg1_ck")
+                self.cookies.update(arg1_ck)
+
+        return None
+
+    def deal_request(self, item):
+        """Fetch and store one record, retrying on failure.
+
+        :param item: list record to process
+        :return: ``True`` when the detail was stored, ``False`` after all
+            attempts failed
+        """
+        org_item = item.copy()
+        for _ in range(self._DETAIL_ATTEMPTS):
+            try:
+                response = self.fetch_request(item)
+                if response is not None and response.status_code == 200:
+                    self.detail_get(response, item=item)
+                    time.sleep(random.random())
+                    return True
+            except Exception as e:
+                logger.error(f"{org_item['href']} 采集异常:{e}")
+                # Start the next attempt from a fresh copy so the pristine
+                # backup is never modified by a half-finished attempt.
+                item = org_item.copy()
+            time.sleep(2)
+
+        logger.warning(f"[采集失败]{org_item['href']}")
+        return False
+
+    def start(self, limit=1):
+        """Process up to *limit* pending records from the list collection.
+
+        Records that succeed are flagged ``is_crawl``; records that fail are
+        flagged ``failed`` and get their ``retry`` counter incremented.
+
+        :param limit: maximum number of records handled in this run
+        """
+        logger.debug("********** 详情页采集开始 **********")
+        time.sleep(random.random())
+        query = {"parser_name": "ztpc_zgdtjt", "failed": False, "is_crawl": False}
+        with self.db_name.find(query, no_cursor_timeout=True) as cursor:
+            for count, item in enumerate(cursor):
+                if count >= limit:
+                    break
+                update_id = item["_id"]
+                # Tolerate records written without a retry counter.
+                retry = item.get("retry", 0)
+                if self.deal_request(item):
+                    self.db_name.update_one({"_id": update_id}, {"$set": {"is_crawl": True}})
+                else:
+                    self.db_name.update_one({"_id": update_id}, {"$set": {"failed": True, "retry": retry + 1}})
+
+        logger.debug("********** 详情页采集结束 **********")
+
+
+if __name__ == "__main__":
+    # Script entry point: handle at most 200 pending detail records per run.
+    detail_crawler = Details()
+    detail_crawler.start(limit=200)

+ 231 - 0
lzz_theme/zgdtjtgsdzswpt_m30/dtpy_spider_m30.py

@@ -0,0 +1,231 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-09-19
+---------
+@summary: 中国大唐集团公司电子商务平台 - 列表页
+---------
+@author: Lzz
+"""
+import sys
+import os
+
+import requests
+
+sys.path.append(os.path.dirname(os.getcwd()))
+from collections import namedtuple
+from utils.RedisDB import RedisFilter
+from utils.tools import *
+import warnings
+
+
+
+warnings.filterwarnings('ignore')
+
+
+
+class Crawl_Zgdt:
+    """List-page crawler for the China Datang Group e-commerce platform.
+
+    Each configured menu is paged through the site's list API; records not yet
+    seen (according to the Redis filter) are inserted into
+    ``py_spider.theme_list`` for the detail crawler to pick up.
+    """
+
+    # Constants carried over from the obfuscated JavaScript this class used to
+    # execute: the XOR key and the 1-based character order applied to ``arg1``.
+    _ACW_KEY = "3000176000856006061501533003690027800375"
+    _ACW_ORDER = (
+        15, 35, 29, 24, 33, 16, 1, 38, 10, 9, 19, 31, 40, 27, 22, 23, 25, 13, 6, 11,
+        39, 18, 20, 8, 14, 21, 32, 26, 2, 30, 7, 4, 17, 5, 3, 28, 34, 37, 12, 36,
+    )
+    # The first request may be answered with the challenge page; the second one
+    # is sent with the computed cookie.
+    _FETCH_ATTEMPTS = 2
+    # Number of times one list page is tried before giving up.
+    _PAGE_ATTEMPTS = 9
+
+    def __init__(self):
+        self.py_spider = Mongo_client().py_spider
+        self.zb_list = self.py_spider.theme_list  # target collection for list records
+        self.RDS = RedisFilter()  # de-duplication filter
+        self.real_cont = 0  # running total of newly stored records
+        self.proxy = get_proxy()
+        self.params = {}
+        self.cookies = {}  # shared across requests so a solved challenge is reused
+
+    def get_acw_sc_v2(self, html):
+        """Compute the ``acw_sc__v2`` cookie for the challenge found in *html*.
+
+        The page embeds ``arg1='<hex>'``. Its characters are reordered with
+        ``_ACW_ORDER`` and the result is XOR-ed, two hex digits at a time, with
+        ``_ACW_KEY``. This is a pure-Python port of the JavaScript that was
+        previously run through ``execjs``, so no JS runtime is needed.
+
+        :param html: response body to inspect
+        :return: ``{"acw_sc__v2": value}`` when a challenge is present, ``{}``
+            when there is none or ``arg1`` cannot be decoded
+        """
+        try:
+            arg1 = "".join(re.findall("arg1='(.*?)'", html))
+            if not arg1:
+                return {}
+
+            # "unsbox": output slot j takes input character _ACW_ORDER[j]
+            # (1-based); slots pointing past the input are left out.
+            shuffled = "".join(arg1[pos - 1] for pos in self._ACW_ORDER if pos <= len(arg1))
+
+            # "hexXor": XOR byte by byte, each byte written as two lowercase hex digits.
+            span = min(len(shuffled), len(self._ACW_KEY))
+            pieces = []
+            for i in range(0, span, 2):
+                xored = int(shuffled[i:i + 2], 16) ^ int(self._ACW_KEY[i:i + 2], 16)
+                pieces.append(format(xored, "02x"))
+            return {"acw_sc__v2": "".join(pieces)}
+        except (TypeError, ValueError) as e:
+            # TypeError: html is not text; ValueError: arg1 is not hexadecimal.
+            logger.warning(f"acw_sc__v2 计算失败:{e}")
+            return {}
+
+    def fetch_list_page(self, page, menu):
+        """Request one list page for *menu*, answering the cookie challenge.
+
+        :param page: 1-based page number
+        :param menu: menu tuple; ``type``, ``data`` and ``req_param`` are used
+        :return: the response, or ``None`` when the slider verification page is
+            served or the challenge is still present after the last attempt
+        """
+        logger.debug(f' *** {menu.channel} 开始采集第{page}页 ***')
+
+        headers = {
+            "Accept": "application/json, text/javascript, */*; q=0.01",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
+            "Origin": "https://www.cdt-ec.com",
+            "Pragma": "no-cache",
+            "Referer": "https://www.cdt-ec.com/notice/webpage/jsp/more.jsp",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
+            "X-Requested-With": "XMLHttpRequest",
+        }
+
+        # Both payload variants share these fields; menus whose ``data`` flag is
+        # not "x" additionally send three empty search fields. Key order matches
+        # the original payloads.
+        data = {
+            "page": f"{page}",
+            "limit": "90",
+            "messagetype": f"{menu.type}",
+        }
+        if menu.data != "x":
+            data.update({"title": "", "code": "", "purchase_en": ""})
+        data.update({"startDate": "", "endDate": ""})
+
+        url = f'https://www.cdt-ec.com/notice/{menu.req_param}'
+
+        # The context manager closes the session's connections on every exit
+        # path; the body has already been read, so the response stays usable.
+        with requests.Session() as session:
+            session.proxies = get_QGIP()
+
+            for _ in range(self._FETCH_ATTEMPTS):
+                resp = session.post(url, headers=headers, cookies=self.cookies, data=data, timeout=20)
+                self.cookies.update(session.cookies.get_dict())
+
+                if "滑动验证页面" in resp.text:
+                    logger.warning("滑动验证页面")
+                    return None
+
+                arg1_ck = self.get_acw_sc_v2(resp.text)
+                if not arg1_ck:
+                    return resp
+
+                # Challenge page: remember the computed cookie and request again.
+                logger.warning("arg1_ck")
+                self.cookies.update(arg1_ck)
+
+        return None
+
+    def parser_list_page(self, response, page, menu):
+        """Store the new records found in one list-page response.
+
+        :param response: JSON response whose ``data`` field lists the records
+        :param page: page number, used for logging only
+        :param menu: menu tuple; ``href_param``, ``channel`` and ``code`` are used
+        :return: list of the records that were inserted
+        """
+        results_list = []
+        info_list = response.json().get('data') or []
+        for info in info_list:
+            href_id = info.get('id')
+            href = f'https://www.cdt-ec.com/notice/{menu.href_param}?id={href_id}'
+            title = info.get('message_title') or info.get('title')
+            create_time = info.get('publish_time') or info.get('push_time')
+
+            # A record without title or publish time cannot be keyed or dated.
+            # Skip it instead of raising, which would abort the rest of the
+            # page and repeat on every retry.
+            if not title or not create_time:
+                logger.warning(f"{href} 缺少标题或发布时间, 已跳过")
+                continue
+
+            # Drop the fractional-seconds suffix, if any.
+            create_time = create_time.split(".")[0]
+
+            dedup = md5value(title + href)
+            if self.RDS.data_filter(dedup):
+                continue
+
+            item = {
+                "site": "中国大唐集团公司电子商务平台",
+                "channel": menu.channel,
+                "spidercode": menu.code,
+                "area": "全国",
+                "city": "",
+                "district": "",
+                "href": href,
+                "title": title,
+                "publishtime": create_time,
+                "parse_url": href,
+                "parser_name": "ztpc_zgdtjt",
+                "is_mixed": False,
+                "is_theme": True,
+                "retry": 0,
+                "comeintime": int2long(int(time.time())),
+                "is_crawl": False,
+                "failed": False,
+            }
+
+            self.zb_list.insert_one(item)
+            self.RDS.data_save_redis(dedup)
+            results_list.append(item)
+
+        logger.info(f' *** 第{page}页采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')
+
+        return results_list
+
+    def crawl_list_spider(self, page, menu):
+        """Fetch and store one list page, retrying on failure.
+
+        :param page: 1-based page number
+        :param menu: menu tuple describing the channel
+        """
+        for _ in range(self._PAGE_ATTEMPTS):
+            try:
+                logger.debug(f"{menu.channel}_第{page}页 start")
+                response = self.fetch_list_page(page=page, menu=menu)
+                if response is not None and response.status_code == 200:
+                    informations = self.parser_list_page(response=response, page=page, menu=menu)
+                    self.real_cont += len(informations)
+                    logger.info(f"{menu.channel}_第 {page} 页 end, 当前已采集 {self.real_cont} 条数据")
+                    time.sleep(random.randint(1, 4))
+                    return
+            except Exception as e:
+                logger.error(e)
+            time.sleep(2)
+
+        # Make an exhausted page visible in the log instead of ending silently.
+        logger.warning(f"[采集失败]{menu.channel}_第{page}页")
+
+    def start(self, menus):
+        """Crawl pages ``1..menu.crawl_page`` of every menu in *menus*.
+
+        :param menus: iterable of menu tuples
+        """
+        logger.debug(" 采集开始 》》》 ")
+        for menu in menus:
+            for page in range(1, menu.crawl_page + 1):
+                self.crawl_list_spider(page, menu)
+        logger.debug(" 《 《 《 采集结束 ")
+
+
+if __name__ == '__main__':
+    # One entry per channel: display name, spider code, site message type,
+    # detail-page path, list-API path, payload flag and number of pages.
+    Menu = namedtuple('Menu', 'channel code type href_param req_param data crawl_page')
+
+    list_api = 'moreController/getList'
+    tender_detail = 'moreController/moreall'
+    non_tender_detail = 'moreController/xjdhtml'
+
+    menus = [
+        Menu(channel='招标公告-中标候选人公示', code='a_zgdtjtgsdzswpt_zbgg_zbhxrgs', type='2',
+             href_param=tender_detail, req_param=list_api, data='x', crawl_page=1),
+        Menu(channel='招标公告-终止公告', code='a_zgdtjtgsdzswpt_zbgg_zzgg', type='21',
+             href_param=tender_detail, req_param=list_api, data='x', crawl_page=1),
+        Menu(channel='招标公告-流标公告', code='a_zgdtjtgsdzswpt_zbgg_lbgg', type='22',
+             href_param=tender_detail, req_param=list_api, data='x', crawl_page=1),
+        Menu(channel='非招标公告-终止公告', code='a_zgdtjtgsdzswpt_fzbgg_zzgg', type='23',
+             href_param=non_tender_detail, req_param=list_api, data='x', crawl_page=1),
+    ]
+
+    spider = Crawl_Zgdt()
+    spider.start(menus)
+

+ 6 - 0
lzz_theme/zgdtjtgsdzswpt_m30/start_m30.sh

@@ -0,0 +1,6 @@
+#!/bin/bash
+# Restart the list spider and the detail spider as background jobs.
+
+# The log paths are relative and the spiders derive their import path from the
+# current working directory, so always run from the directory of this script.
+cd "$(dirname "$0")" || exit 1
+mkdir -p log
+
+# Stop previous instances; -r keeps xargs from running kill with no PIDs.
+ps -ef |grep "dtpy_spider_m30.py" |grep -v grep |awk '{print $2}' |xargs -r kill -9
+ps -ef |grep "dtpy_details.py" |grep -v grep |awk '{print $2}' |xargs -r kill -9
+
+nohup python3 dtpy_spider_m30.py > log/dtpy_spider_m30.out 2>&1 &
+nohup python3 dtpy_details.py > log/dtpy_details.out 2>&1 &