|
@@ -2,141 +2,182 @@
|
|
# 中国招标投标公共服务平台
|
|
# 中国招标投标公共服务平台
|
|
# @CreatDate : 4/11/2021 上午 10:04
|
|
# @CreatDate : 4/11/2021 上午 10:04
|
|
# @Author : 马国鹏
|
|
# @Author : 马国鹏
|
|
-# @File : qgzb_spider.py
|
|
|
|
-import sys
|
|
|
|
-sys.path.append('/mnt/FworkSpider')
|
|
|
|
-
|
|
|
|
-import datetime
|
|
|
|
|
|
+import json
|
|
import time
|
|
import time
|
|
from collections import namedtuple
|
|
from collections import namedtuple
|
|
|
|
|
|
import requests
|
|
import requests
|
|
|
|
|
|
-from utils.databases import redis_cluster, mongo_table, int2long
|
|
|
|
|
|
+from utils.databases import mongo_table, int2long, redis_client
|
|
from utils.log import logger
|
|
from utils.log import logger
|
|
from utils.tools import redis_exists, redis_set
|
|
from utils.tools import redis_exists, redis_set
|
|
-from feapder.network.proxy_pool import swordfish_proxy
|
|
|
|
|
|
|
|
-Menu = namedtuple('Menu', ['channel', 'code', 'id', 'crawl_page', "businessKeyWord"])
|
|
|
|
|
|
# One crawl channel: display name (``channel``), spider code (``code``), the
# ``businessType`` form value sent to the search endpoint (``type``) and the
# ``businessKeyWord`` stored on every saved item.
Menu = namedtuple('Menu', 'channel code type businessKeyWord')
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def socks_proxy(timeout=10):
    """Fetch a proxy configuration from the Jianyu proxy service.

    :param timeout: seconds to wait for the proxy service before giving up.
        Without a timeout ``requests`` may block indefinitely, which would
        stall the whole crawl on an unresponsive proxy service.
    :return: the ``data`` field of the service's JSON reply (``None`` when the
        field is absent); callers pass it as ``proxies=`` to ``requests``.
    :raises requests.RequestException: on connection errors or timeout.
    :raises ValueError: when the reply body is not valid JSON.
    """
    url = 'http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch'
    # NOTE(review): the Basic-auth credential is hard-coded in source; consider
    # loading it from configuration or the environment instead.
    headers = {"Authorization": 'Basic amlhbnl1MDAxOjEyM3F3ZSFB'}
    proxy = requests.get(url, headers=headers, timeout=timeout).json()
    proxies = proxy.get('data')
    logger.info(f"切换代理:{proxies}")
    return proxies
|
|
|
|
+
|
|
|
|
|
|
|
|
def date_to_timestamp(date, time_format="%Y-%m-%d %H:%M:%S"):
    """
    @summary: Convert a formatted date string into a Unix timestamp.
    ---------
    @param date: date string such as "2011-09-28 10:00:00"
    @param time_format: ``time.strptime`` format that ``date`` is written in
    ---------
    @result: the timestamp as an int; the string is interpreted as local
             time because the conversion goes through ``time.mktime``
    """
    parsed = time.strptime(date, time_format)
    seconds = time.mktime(parsed)
    return int(seconds)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
class CebPubServiceListPageSpider:
    """List-page spider for the site "中国招标投标公共服务平台" (cebpubservice.com).

    For every configured channel in ``self.menus`` the spider pages through
    the site's search endpoint, turns each returned record into a list item
    and inserts the items whose ``href`` has not been seen before (checked
    through redis) into the table obtained from
    ``mongo_table('py_spider', 'zgzb_list')``.
    """

    # Value sent as ``bulletinIssnTime`` ("2天" literally means "2 days").
    # Presumably limits results to the last two days -- TODO confirm.
    publish_window = '2天'
    # Rows requested per page (sent as the ``row`` form field).
    page_size = 1000
    # Upper bound on the number of pages requested per channel.
    max_pages = 9
    # How many times one page is re-requested when the reply is not JSON.
    max_retries = 10
    # Seconds to wait between two such attempts.
    retry_delay = 1

    def __init__(self):
        # Channels to crawl; see ``Menu`` for the meaning of each field.
        self.menus = [
            Menu('未按数据规范-招标公告', 'a_zgzbtbggfwpt_wasjgf_zbgg', '招标公告', 'tenderBulletin'),
            Menu('未按数据规范-开标记录', 'a_zgzbtbggfwpt_wasjgf_kbjl', '开标记录', 'openBidRecord'),
            Menu('未按数据规范-评标公示', 'a_zgzbtbggfwpt_wasjgf_pbgs', '评标公示', 'winCandidateBulletin'),
            Menu('未按数据规范-中标公告', 'a_zgzbtbggfwpt_wasjgf_zhbgg', '中标公告', 'winBidBulletin'),
            # Disabled channels, kept for reference:
            # Menu('未按数据规范-签约履行', 'a_zgzbtbggfwpt_wasjgf_qylx', "签约履行", "tenderBulletin"),
            # Menu('未按数据规范-招标项目', 'a_zgzbtbggfwpt_wasjgf_zbxm', '招标项目', 'tenderProject'),  # abolished
        ]
        self.crawl_list = mongo_table('py_spider', 'zgzb_list')
        self.r = redis_client()
        self.url = 'http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/getStringMethod.do'

    def start(self):
        """Crawl the list pages of every configured channel."""
        headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,sq;q=0.7',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Origin': 'http://www.cebpubservice.com',
            'Pragma': 'no-cache',
            'X-Requested-With': 'XMLHttpRequest',
            "Referer": "http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/getSearch.do?tabledivIds=searchTabLi2",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3877.400 QQBrowser/10.8.4506.400"
        }
        for menu in self.menus:
            self._crawl_menu(menu, headers)

    def _crawl_menu(self, menu, headers):
        """Request and parse the list pages of one channel, page by page."""
        business_type = menu.type
        for page in range(1, self.max_pages + 1):
            msg = f'{business_type}-第{page}页'
            data = {
                'searchName': '',
                'searchArea': '',
                'searchIndustry': '',
                'centerPlat': '',
                'businessType': business_type,
                'searchTimeStart': '',
                'searchTimeStop': '',
                'timeTypeParam': '',
                'bulletinIssnTime': self.publish_window,
                'bulletinIssnTimeStart': '',
                'bulletinIssnTimeStop': '',
                'pageNo': page,
                'row': self.page_size,
            }
            response = self.request(data, headers, msg=msg)
            if not response:
                logger.info(f'{msg}-接口无数据')
                break

            resp_json = response.json()
            # Guard against a JSON reply that is not an object (e.g. a list).
            items = resp_json.get("object") if isinstance(resp_json, dict) else None
            if not items:
                logger.info(f'{msg}-无列表数据')
                break

            # ``returnlist`` / ``page`` may be missing or null in the reply.
            return_list = items.get("returnlist") or []
            logger.info(f"{msg}-采集{len(return_list)}条数据")
            total_page = (items.get("page") or {}).get("totalPage", 0)
            logger.info(f'{business_type}-共{total_page}页')
            if not return_list:
                # An empty page means there is nothing further to fetch.
                break
            self.parse(return_list, menu)

    def request(self, data, headers, **kwargs):
        """POST one search form through a freshly fetched proxy.

        :param data: form fields of the search request
        :param headers: HTTP headers of the request
        :param kwargs: ``msg`` -- label used in the log lines
        :return: the ``requests.Response`` once its body parses as JSON, or
            ``None`` when the request fails or every retry returned a
            non-JSON body
        """
        msg = kwargs.get('msg')
        logger.info(f"开始请求{msg}")
        for _ in range(self.max_retries):
            try:
                response = requests.post(
                    self.url,
                    headers=headers,
                    data=data,
                    proxies=socks_proxy(),
                    timeout=5,
                )
                logger.info(f'{msg}--请求成功')
                response.json()  # only checks that the body is valid JSON
                return response
            except ValueError:
                # Every JSON decode error raised by ``Response.json()`` (the
                # stdlib one, simplejson's and requests' own) is a ValueError.
                logger.error(f"{msg}--代理受限,等待重试")
                # Back off before asking for another proxy instead of
                # spinning in a tight loop.
                time.sleep(self.retry_delay)
            except Exception as e:
                logger.error(f"{msg}--请求失败")
                logger.exception(f'异常原因:{e}')
                return None
        logger.error(f"{msg}--重试次数已达上限({self.max_retries}),放弃请求")
        return None

    @staticmethod
    def _split_region(region):
        """Split a whitespace separated region name into ``(area, city)``.

        Missing parts become ``''``; tokens after the second are ignored.
        """
        parts = (region or '').split()
        area = parts[0] if parts else ''
        city = parts[1] if len(parts) > 1 else ''
        return area, city

    @staticmethod
    def _build_href(record):
        """Join the three record identifiers with ``&``.

        Returns ``None`` when any identifier is missing.
        """
        parts = [
            record.get("businessId"),
            record.get("tenderProjectCode"),
            record.get("transactionPlatfCode"),
        ]
        if any(part is None for part in parts):
            return None
        return "&".join(str(part) for part in parts)

    def parse(self, items, menu):
        """Convert list records into items and store the unseen ones.

        Records that lack a title, an identifier or a parsable
        ``receiveTime`` are skipped so that one malformed record cannot abort
        the whole crawl.

        :param items: records of one list page (dict-like objects)
        :param menu: the ``Menu`` the records were requested for
        """
        for record in items:
            title = record.get("businessObjectName")
            if not title:
                continue

            href = self._build_href(record)
            if href is None:
                logger.error(f"{menu.channel} >>> {title} --缺少业务标识,跳过")
                continue

            publish_time = record.get("receiveTime")
            try:
                publish_ts = date_to_timestamp(publish_time, '%Y-%m-%d')
            except (TypeError, ValueError):
                logger.error(f"{menu.channel} >>> {title} --发布时间异常:{publish_time},跳过")
                continue

            area, city = self._split_region(record.get('regionName'))

            item = {
                "schemaVersion": record.get("schemaVersion"),
                "type": record.get("type"),
                "businessKeyWord": menu.businessKeyWord,
                "rowGuid": record.get("rowGuid"),
                "title": title,
                "href": href,
                "site": "中国招标投标公共服务平台",
                "channel": menu.channel,
                "spidercode": menu.code,
                "area": area,
                "city": city,
                "district": "",
                "comeintime": int2long(int(time.time())),
                "publishtime": publish_time,
                "l_np_publishtime": int2long(publish_ts),
                "detail": "",
                "contenthtml": "",
                "T": "bidding",
                "sendflag": "false",
                "iscompete": True,
                "_d": "comeintime",
                "publishdept": "",
                "infoformat": 1
            }
            # ``href`` doubles as the de-duplication key kept in redis.
            if not redis_exists(href, self.r):
                result = self.crawl_list.insert_one(item)
                redis_set(href, self.r)
                msg = f"{item['title']} - ObjectId('{result.inserted_id}')"
                logger.info(f"{menu.channel} >>> {msg} --上传成功")
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
if __name__ == '__main__':
|