@@ -1,13 +1,16 @@
 # -*- coding: utf-8 -*-
 """
-Created on 2025-02-11
+Created on 2025-04-25
 ---------
 @summary: 全军武器装备采购信息网 - 详情页
 ---------
 @author: Lzz
 """
+import json
+import random
 import sys
 import os
+import time
 
 sys.path.append(os.path.dirname(os.getcwd()))
 from utils.attachment import AttachmentDownloader
@@ -25,14 +28,20 @@ class Details:
         self.db_name = self.db_table.theme_list
         self.zt_details = self.db_table.data_bak
         self.proxy = None
+        self.cookies = None
+        self.count = 0
 
     def detail_get(self, response, item):
         response.encoding = response.apparent_encoding
+        if '页面将在<span id="minnum">3</span>秒后跳转' in response.text:
+            logger.warning(" <<< cookies过期 >>> ")
+            return "500"
         root = Selector(response.text)
 
         html = root.xpath('//div[@id="content"]|//div[@class="secret"]').extract_first("")
-        rl_list = ['//span[@id="demandPv"]', '点击次数:', '//div[@class="right"]',
-                   '//div[@id="demandDocking"]', '未经授权,严禁转载']
+        rl_list = ['//span[@id="demandPv"]', '点击次数:', '//div[@class="right "]', '//span[@id="pv"]',
+                   '//div[@id="demandDocking"]', '(未经授权,严禁转载)', '未经授权,严禁转载',
+                   '//div[@class="right"]', ]
         html = remove_htmldata(rl_list, html, root)
 
         html2 = "".join(re.findall("htmlDecode\('(.*?)'\)\);", response.text, re.S))
@@ -76,13 +85,17 @@ class Details:
             "Upgrade-Insecure-Requests": "1",
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
         }
-        vv= item.get('publishtime').replace("-","").replace(":","").replace(" ","")
-        response = requests.get(url=item.get("parse_url")+f"?v={vv}", headers=headers,
+
+        vv = item.get('publishtime').replace("-", "").replace(":", "").replace(" ", "")
+        response = requests.get(url=item.get("parse_url") + f"?v={vv}", headers=headers, cookies=self.cookies,
                                 proxies=self.proxy, timeout=(30, 30), verify=False)
         time.sleep(1)
         return response
 
     def deal_request(self, item):
+        if not self.search_key(item.get('title')):
+            return True
+
         retry_times = 0
         org_item = item.copy()
         while retry_times < 5:
@@ -90,7 +103,9 @@ class Details:
            response = self.fetch_request(item)
            res_code = response.status_code
            if response and res_code == 200:
-                self.detail_get(response, item=item)
+                rr = self.detail_get(response, item=item)
+                if rr and rr == "500":
+                    return "500"
                return True
            else:
                retry_times += 1
@@ -103,10 +118,46 @@ class Details:
         logger.warning(f"[采集失败]{item.get('competehref')}")
         return False
 
+    def search_key(self, title):
+        keywords_list = ['数据中心', '视频会议系统', '大数据', '虚拟化', '服务器', '交换机', '防火墙', '入侵检测',
+                         '云计算',
+                         '学习室', '网络', '云桌面', '智慧教室', '网络设备', '路由器', '负载均衡', 'SDN', '国产化改造',
+                         '智能管理平台', 'IMC', '存储', '分布式', '网络资源管理', '人工智能', '数据中心', '信息化',
+                         '云办公', '磁盘阵列', 'GPU', '硬件建设', '超算', '高算', '模拟中心', '机房设备', '高性能',
+                         '条件建设', '配套系统', '平台建设', '数据', '通用平台', '数据采集', '智慧卫勤', '推演', '孪生',
+                         '仿真', '兵棋', '智慧校园', '数档', 'AI', '大模型', '模拟', '态势', '数据治理', '财务数据',
+                         '云办公', '入侵防御', '机房设备', '模拟中心', 'RoCE', 'HPC']
+
+        for keyword in keywords_list:
+            if keyword in title:
+                return True
+
+        return False
+
     def start(self, limit=1):
         logger.debug("********** 详情页采集开始 **********")
-
-        with self.db_name.find({"parser_name": "ztpc_qjwqzbcgxxw", "is_crawl": False, "failed": False},
+        # time.sleep(random.randint(10,60))
+        with open('./cookies_info.json', 'r') as fr:
+            cks = json.loads(fr.read())
+            now_time = int(time.time())
+            login_time = cks.get('login_time')
+            total_count = cks.get('total_count')
+            self.cookies = cks.get('cookies')
+            self.count = cks.get('perday_count')
+
+        if login_time + 520000 < now_time:
+            logger.warning(" <<< cookies失效,更换cookies >>> ")
+            return
+        if total_count > 3000:
+            logger.warning(" <<< 本次登录采集上限,重新登陆继续采集 >>> ")
+            return
+
+        if self.count > 500:
+            logger.warning(" <<< 今日采集上限,明日继续采集 >>> ")
+            return
+
+        # with self.db_name.find({"parser_name": "ztpc_qjwqzbcgxxw", "is_crawl": False, "failed": False},
+        with self.db_name.find({"parser_name": "ztpc_qjwqzbcgxxw", "is_crawl": False,},
                                sort=[('publishtime', -1)]).limit(limit) as cursor:
             data_lsit = [dd for dd in cursor]
             for item in data_lsit:
@@ -114,13 +165,26 @@ class Details:
                 update_id = item["_id"]
                 result = self.deal_request(item)
                 if result is True:
+                    self.count += 1
                     self.db_name.update_one({"_id": update_id}, {"$set": {"is_crawl": True}})
+                elif result == "500":
+                    break
                 else:
                     self.db_name.update_one({"_id": update_id}, {"$set": {"failed": True}})
                 time.sleep(random.randint(5, 10))
 
+        total_count += self.count
+        new_info = {
+            "cookies": self.cookies,
+            "total_count": total_count,
+            "perday_count": self.count,
+            "login_time": login_time
+        }
+        with open('./cookies_info.json', 'w+') as fw:
+            fw.write(json.dumps(new_info))
+
         logger.debug("********** 详情页采集结束 **********")
 
 
 if __name__ == "__main__":
-    Details().start(limit=50)
+    Details().start(limit=random.randint(60,100))
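
Note: start() now depends on ./cookies_info.json, which this diff only reads and rewrites; the file is presumably seeded by a separate login step outside this patch. The snippet below is a minimal sketch (not part of the patch, file layout inferred from the keys start() uses) showing the assumed structure, with illustrative values only:

    # seed_cookies_info.py -- hypothetical helper, not part of this patch
    import json
    import time

    seed = {
        "cookies": {},                    # requests-style cookie dict captured after login
        "login_time": int(time.time()),   # unix seconds; start() treats it as stale after 520000 s
        "total_count": 0,                 # items fetched under the current login; start() stops past 3000
        "perday_count": 0,                # items fetched today; start() stops past 500
    }

    with open("./cookies_info.json", "w") as fw:
        fw.write(json.dumps(seed))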