|
@@ -0,0 +1,231 @@
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+"""
|
|
|
+Created on 2024-09-19
|
|
|
+---------
|
|
|
+@summary: 中国大唐集团公司电子商务平台 - 列表页
|
|
|
+---------
|
|
|
+@author: Lzz
|
|
|
+"""
|
|
|
+import sys
|
|
|
+import os
|
|
|
+
|
|
|
+import requests
|
|
|
+
|
|
|
+sys.path.append(os.path.dirname(os.getcwd()))
|
|
|
+from collections import namedtuple
|
|
|
+from utils.RedisDB import RedisFilter
|
|
|
+from utils.tools import *
|
|
|
+import warnings
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+warnings.filterwarnings('ignore')
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
class Crawl_Zgdt:
    """List-page crawler for https://www.cdt-ec.com notice channels.

    For every configured menu it POSTs to the site's list API, answers the
    ``acw_sc__v2`` cookie challenge when the site serves one, and stores each
    not-yet-seen notice in the ``theme_list`` collection, using the Redis
    filter for de-duplication.
    """

    # Constants of the ``acw_sc__v2`` challenge, taken from the challenge
    # script this crawler previously executed through a JS runtime:
    # a 1-based shuffle table applied to ``arg1`` and the hex key the
    # shuffled value is XOR-ed with.
    _ACW_SHUFFLE = (
        15, 35, 29, 24, 33, 16, 1, 38, 10, 9, 19, 31, 40, 27, 22, 23, 25, 13, 6, 11,
        39, 18, 20, 8, 14, 21, 32, 26, 2, 30, 7, 4, 17, 5, 3, 28, 34, 37, 12, 36,
    )
    _ACW_XOR_KEY = "3000176000856006061501533003690027800375"

    # Attempt budgets. The values match what the former
    # ``while (retry := retry + 1) < N`` loops actually executed.
    _FETCH_ATTEMPTS = 2
    _PAGE_ATTEMPTS = 9

    def __init__(self):
        """Open the Mongo/Redis handles and initialise the per-run state."""
        self.py_spider = Mongo_client().py_spider
        self.zb_list = self.py_spider.theme_list
        self.RDS = RedisFilter()
        self.real_cont = 0  # running total of newly stored records
        self.proxy = get_proxy()
        self.params = {}
        self.cookies = {}  # cookies carried across requests, incl. acw_sc__v2

    def get_acw_sc_v2(self, html):
        """Compute the ``acw_sc__v2`` cookie for a challenge page.

        The challenge page embeds ``arg1='<hex>'``. The cookie value is that
        string shuffled by ``_ACW_SHUFFLE`` and XOR-ed, two hex digits at a
        time, with ``_ACW_XOR_KEY``. This is a pure-Python port of the site's
        script, so no JavaScript runtime is required.

        Args:
            html: Response body to inspect.

        Returns:
            ``{"acw_sc__v2": value}`` when ``html`` carries a hexadecimal
            ``arg1``; ``{}`` when it does not (not a challenge page, or the
            value is not hex and therefore cannot be decoded).
        """
        if not isinstance(html, str):
            return {}

        arg1 = "".join(re.findall(r"arg1='(.*?)'", html))
        if not arg1 or not re.fullmatch(r"[0-9a-fA-F]+", arg1):
            return {}

        # Output slot j receives the character at 1-based position
        # _ACW_SHUFFLE[j]; positions beyond the input are skipped, which is
        # what the script's sparse-array join produced.
        size = len(arg1)
        shuffled = "".join(arg1[pos - 1] for pos in self._ACW_SHUFFLE if pos <= size)

        key = self._ACW_XOR_KEY
        digits = []
        for start in range(0, min(len(shuffled), len(key)), 2):
            left = int(shuffled[start:start + 2], 16)
            right = int(key[start:start + 2], 16)
            digits.append(f"{left ^ right:02x}")
        return {"acw_sc__v2": "".join(digits)}

    def fetch_list_page(self, page, menu):
        """Request one list page, answering the cookie challenge if served.

        Args:
            page: 1-based page number to request.
            menu: Menu record; ``channel``, ``type``, ``req_param`` and
                ``data`` are read here.

        Returns:
            The response of the first attempt that is neither a slider
            verification page nor a cookie challenge, or ``None`` when the
            slider page is served or every attempt was challenged.
        """
        logger.debug(f' *** {menu.channel} 开始采集第{page}页 ***')

        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "Origin": "https://www.cdt-ec.com",
            "Pragma": "no-cache",
            "Referer": "https://www.cdt-ec.com/notice/webpage/jsp/more.jsp",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest",
        }

        # Both form variants share these fields; the long variant adds three
        # empty search fields. Key order is kept as in the original forms.
        data = {
            "page": f"{page}",
            "limit": "90",
            "messagetype": f"{menu.type}",
        }
        if menu.data != "x":
            data.update({"title": "", "code": "", "purchase_en": ""})
        data.update({"startDate": "", "endDate": ""})

        url = f'https://www.cdt-ec.com/notice/{menu.req_param}'

        # The context manager releases the session's pooled connections on
        # every exit path. The body is fully read before post() returns, so
        # the response stays usable after the session is closed.
        with requests.Session() as session:
            session.proxies = get_QGIP()

            for _ in range(self._FETCH_ATTEMPTS):
                resp = session.post(url, headers=headers, cookies=self.cookies, data=data, timeout=20)
                self.cookies.update(session.cookies.get_dict())

                if "滑动验证页面" in resp.text:
                    # Slider captcha: cannot be solved here, give up on this call.
                    logger.warning("滑动验证页面")
                    return None

                challenge_cookie = self.get_acw_sc_v2(resp.text)
                if not challenge_cookie:
                    return resp

                # Cookie challenge: store the answer and ask again.
                logger.warning("arg1_ck")
                self.cookies.update(challenge_cookie)

        return None

    def parser_list_page(self, response, page, menu):
        """Store the unseen notices of one list response.

        Records lacking an id, a title or a publish time are skipped with a
        warning instead of raising, so one malformed record no longer aborts
        the rest of the page.

        Args:
            response: List API response whose JSON body holds the records
                under ``data``.
            page: Page number, used for logging only.
            menu: Menu record; ``channel``, ``code`` and ``href_param`` are
                read here.

        Returns:
            The list of items inserted during this call.
        """
        results_list = []
        info_list = response.json().get('data') or []

        for info in info_list:
            href_id = info.get('id')
            title = info.get('message_title') or info.get('title')
            create_time = info.get('publish_time') or info.get('push_time')

            if href_id is None or not title or not create_time:
                logger.warning(f"{menu.channel}_第{page}页 跳过字段缺失的记录: {info}")
                continue

            href = f'https://www.cdt-ec.com/notice/{menu.href_param}?id={href_id}'
            # Drop a fractional-seconds suffix such as ".0" when present.
            create_time = str(create_time).split(".", 1)[0]

            dedup = md5value(title + href)
            if self.RDS.data_filter(dedup):
                continue  # already collected

            item = {
                "site": "中国大唐集团公司电子商务平台",
                "channel": menu.channel,
                "spidercode": menu.code,
                "area": "全国",
                "city": "",
                "district": "",
                "href": href,
                "title": title,
                "publishtime": create_time,
                "parse_url": href,
                "parser_name": "ztpc_zgdtjt",
                "is_mixed": False,
                "is_theme": True,
                "retry": 0,
                "comeintime": int2long(int(time.time())),
                "is_crawl": False,
                "failed": False,
            }

            self.zb_list.insert_one(item)
            self.RDS.data_save_redis(dedup)
            results_list.append(item)

        logger.info(f' *** 第{page}页采集完毕 - 共{len(info_list)}条 - 入库{len(results_list)}条 ***')

        return results_list

    def crawl_list_spider(self, page, menu):
        """Fetch and store one page, retrying on any failure.

        Makes up to ``_PAGE_ATTEMPTS`` attempts, pausing 2 seconds after each
        failed one. A page that never succeeds is reported with an error log
        rather than being dropped silently.

        Args:
            page: 1-based page number.
            menu: Menu record describing the channel.
        """
        for _ in range(self._PAGE_ATTEMPTS):
            try:
                logger.debug(f"{menu.channel}_第{page}页 start")
                response = self.fetch_list_page(page=page, menu=menu)
                if response is not None and response.status_code == 200:
                    informations = self.parser_list_page(response=response, page=page, menu=menu)
                    self.real_cont += len(informations)
                    logger.info(f"{menu.channel}_第 {page} 页 end, 当前已采集 {self.real_cont} 条数据")
                    # Random pause between successful pages.
                    time.sleep(random.randint(1, 4))
                    return
            except Exception as e:
                # Boundary handler: one bad page must not stop the whole run.
                logger.error(f"{menu.channel}_第{page}页 采集异常: {e!r}")
            time.sleep(2)

        logger.error(f"{menu.channel}_第{page}页 连续{self._PAGE_ATTEMPTS}次采集失败, 放弃该页")

    def start(self, menus):
        """Crawl pages ``1..menu.crawl_page`` of every menu, in order.

        Args:
            menus: Iterable of Menu records.
        """
        logger.debug(" 采集开始 》》》 ")
        for menu in menus:
            for page in range(1, menu.crawl_page + 1):
                self.crawl_list_spider(page, menu)
        logger.debug(" 《 《 《 采集结束 ")
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
    # One Menu per notice channel. `type` is sent as the `messagetype` form
    # field, `req_param` / `href_param` are the list-API and detail-page
    # paths under /notice/, `data` selects the form variant ('x' = short
    # form) and `crawl_page` is the number of pages to walk.
    Menu = namedtuple(
        'Menu',
        ['channel', 'code', 'type', 'href_param', 'req_param', 'data', 'crawl_page'],
    )

    menus = [
        Menu(
            channel='招标公告-中标候选人公示',
            code='a_zgdtjtgsdzswpt_zbgg_zbhxrgs',
            type='2',
            href_param='moreController/moreall',
            req_param='moreController/getList',
            data='x',
            crawl_page=1,
        ),
        Menu(
            channel='招标公告-终止公告',
            code='a_zgdtjtgsdzswpt_zbgg_zzgg',
            type='21',
            href_param='moreController/moreall',
            req_param='moreController/getList',
            data='x',
            crawl_page=1,
        ),
        Menu(
            channel='招标公告-流标公告',
            code='a_zgdtjtgsdzswpt_zbgg_lbgg',
            type='22',
            href_param='moreController/moreall',
            req_param='moreController/getList',
            data='x',
            crawl_page=1,
        ),
        Menu(
            channel='非招标公告-终止公告',
            code='a_zgdtjtgsdzswpt_fzbgg_zzgg',
            type='23',
            href_param='moreController/xjdhtml',
            req_param='moreController/getList',
            data='x',
            crawl_page=1,
        ),
    ]

    crawler = Crawl_Zgdt()
    crawler.start(menus)
|
|
|
+
|