|
@@ -73,19 +73,13 @@ class Details(feapder.AirSpider):
|
|
|
"businessId": encode_info(businessid),
|
|
|
}
|
|
|
detail_url = 'http://www.cebpubservice.com/ctpsp_iiss/SecondaryAction/findDetails.do'
|
|
|
- yield feapder.Request(
|
|
|
- url=detail_url,
|
|
|
- item=item,
|
|
|
- method="POST",
|
|
|
- data=data,
|
|
|
- callback=self.detail_get,
|
|
|
- timeout=5,
|
|
|
- use_session=True,
|
|
|
- count=0)
|
|
|
+ yield feapder.Request(url=detail_url, item=item, method="POST",
|
|
|
+ data=data, callback=self.detail_get,
|
|
|
+ timeout=5, use_session=True, count=0)
|
|
|
|
|
|
def download_midware(self, request):
|
|
|
request.proxies = self.proxy
|
|
|
- log.info(request.item.get("title"))
|
|
|
+ # log.info(request.item.get("title") + ' {}'.format(request.proxies))
|
|
|
request.headers = {
|
|
|
"Host": "www.cebpubservice.com",
|
|
|
"Accept": "application/json, text/javascript, */*; q=0.01",
|
|
@@ -115,42 +109,34 @@ class Details(feapder.AirSpider):
|
|
|
count=0)
|
|
|
elif '滑动验证页面' in response.text:
|
|
|
log.info('开始过滑块验证')
|
|
|
+ '''尝试代理池获取通过滑块验证的cookies会话信息,进行采集'''
|
|
|
cookies = self.cookie_pool.create_cookies(proxies=self.proxy.get("http"))
|
|
|
+ # print("cookies >>> ", cookies)
|
|
|
count = request.count
|
|
|
if count > 4:
|
|
|
return
|
|
|
- if cookies is None or len(cookies) <= 1:
|
|
|
+
|
|
|
+ if not cookies or len(cookies) <= 1:
|
|
|
self.proxy = swordfish_proxy()
|
|
|
- else:
|
|
|
- request.session.cookies.update(cookies)
|
|
|
- yield feapder.Request(
|
|
|
- url=request.url,
|
|
|
- item=request.item,
|
|
|
- method="POST",
|
|
|
- data=request.data,
|
|
|
- callback=self.detail_get,
|
|
|
- timeout=5,
|
|
|
- use_session=True,
|
|
|
- count=count + 1)
|
|
|
+
|
|
|
+ request.session.cookies.update(cookies)
|
|
|
+ yield feapder.Request(url=request.url, item=request.item,
|
|
|
+ method="POST", data=request.data,
|
|
|
+ callback=self.detail_get, timeout=5,
|
|
|
+ use_session=True, count=count + 1)
|
|
|
else:
|
|
|
try:
|
|
|
response.json
|
|
|
except Exception as e:
|
|
|
- log.info(e)
|
|
|
+ log.warning(f"状态码:{response.status_code} {e.__class__.__name__}:{e.args[0]}")
|
|
|
self.proxy = swordfish_proxy()
|
|
|
cookies = self.cookie_pool.create_cookies(proxies=self.proxy.get("http"))
|
|
|
request.session.cookies.update(cookies)
|
|
|
- yield feapder.Request(
|
|
|
- url=request.url,
|
|
|
- item=request.item,
|
|
|
- method="POST",
|
|
|
- data=request.data,
|
|
|
- callback=self.detail_get,
|
|
|
- timeout=5,
|
|
|
- use_session=True,
|
|
|
- cookies=cookies,
|
|
|
- count=0)
|
|
|
-
|
|
|
+ yield feapder.Request(url=request.url, item=request.item,
|
|
|
+ method="POST", data=request.data,
|
|
|
+ callback=self.detail_get, timeout=5,
|
|
|
+ use_session=True, cookies=cookies,
|
|
|
+ count=0)
|
|
|
else:
|
|
|
item = request.item
|
|
|
tenderprojectcode = item.get("href").split("&")[1]
|
|
@@ -177,23 +163,27 @@ class Details(feapder.AirSpider):
|
|
|
item["area"] = "全国"
|
|
|
item["city"] = ""
|
|
|
|
|
|
+ # 页面上暂无数据
|
|
|
if detail_info is None or detail_info == []:
|
|
|
+ # response.json = {'message': '', 'success': True, 'object': {'tenderProject': []}}
|
|
|
businessKeyWords = response.json.get("object").keys()
|
|
|
for key in businessKeyWords:
|
|
|
businesskeyword = key
|
|
|
+
|
|
|
detail_info = response.json.get("object").get(businesskeyword)
|
|
|
- if detail_info is None or detail_info == []:
|
|
|
+ if not detail_info:
|
|
|
+ taskid = item.pop("_id")
|
|
|
_uuid = businessid + tenderprojectcode
|
|
|
item["href"] = "http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/showDetails.do#uuid=%s" % _uuid
|
|
|
item["sendflag"] = "true"
|
|
|
- item["comeintime"] = int(time.time())
|
|
|
- result = self.to_db.add("data_bak", item)
|
|
|
- self.to_db.update(self.db_name, {"timeout": 3}, {"_id": item["_id"]})
|
|
|
- log.info("mongo add _id:{}<空结果".format(item.get('title')))
|
|
|
+ item["comeintime"] = int2long(int(time.time()))
|
|
|
+ self.to_db.add("data_bak", item)
|
|
|
+ log.info("mongo {}-{}--空结果".format(item["_id"], item.get('title')))
|
|
|
+ self.to_db.update(self.db_name, {"timeout": 3}, {"_id": taskid})
|
|
|
+ return
|
|
|
|
|
|
if businesskeyword == "tenderProject":
|
|
|
item["contenthtml"] = splicing(detail_info)
|
|
|
- pass
|
|
|
else:
|
|
|
detail_info = detail_info[0]
|
|
|
item["contenthtml"] = detail_info.get("bulletinContent")
|
|
@@ -208,13 +198,13 @@ class Details(feapder.AirSpider):
|
|
|
item["href"] = "http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/showDetails.do#uuid=%s" % _uuid
|
|
|
if text_search(item["detail"]).total == 0:
|
|
|
item["sendflag"] = "true"
|
|
|
+
|
|
|
item["comeintime"] = int2long(int(time.time()))
|
|
|
taskid = item.pop("_id")
|
|
|
try:
|
|
|
- result = self.to_db.add("data_bak", item)
|
|
|
- log.info("_id:{}".format(str(item['_id'])))
|
|
|
+ self.to_db.add("data_bak", item)
|
|
|
+ log.info("mongo {}-{}-{}--上传成功".format(str(item['_id']), item.get('title'), item.get('publishtime')))
|
|
|
self.to_db.update(self.db_name, {"timeout": 2}, {"_id": taskid})
|
|
|
- log.info("mongo add _id:{},{}".format(item.get('title'), item.get('publishtime')))
|
|
|
print("抓取成功")
|
|
|
except:
|
|
|
item["_id"] = taskid
|
|
@@ -223,12 +213,12 @@ class Details(feapder.AirSpider):
|
|
|
|
|
|
def exception_request(self, request, response):
|
|
|
if response is None:
|
|
|
- item = request.item
|
|
|
self.proxy = swordfish_proxy()
|
|
|
log.info("切换代理")
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
- spider = Details(thread_count=1)
|
|
|
- spider.start()
|
|
|
- spider.join()
|
|
|
+ while True:
|
|
|
+ spider = Details(thread_count=1)
|
|
|
+ spider.start()
|
|
|
+ spider.join()
|