dongzhaorui 2 years ago
parent
commit
bd4108a4c8
2 changed files with 39 additions and 50 deletions
  1. + 2 - 3
      zgztb_cookie/detail_firefox.py
  2. + 37 - 47
      zgztb_cookie/detail_normol.py

+ 2 - 3
zgztb_cookie/detail_firefox.py

@@ -95,10 +95,9 @@ class Details(feapder.Spider):
 
         taskid = item.pop("_id")
         try:
-            result = self.to_db.add("data_bak", item)
-            log.info(f"_id:{str(item['_id'])}")
+            self.to_db.add("data_bak", item)
+            log.info("mongo {}-{}-{}--上传成功".format(str(item['_id']), item.get('title'), item.get('publishtime')))
             self.to_db.update(self.db_name, {"timeout": 3}, {"_id": taskid})
-            log.info(f"mongo add _id:{item.get('title')},{item.get('publishtime')}")
             print("抓取成功")
         except:
             item["_id"] = taskid

+ 37 - 47
zgztb_cookie/detail_normol.py

@@ -73,19 +73,13 @@ class Details(feapder.AirSpider):
                 "businessId": encode_info(businessid),
             }
             detail_url = 'http://www.cebpubservice.com/ctpsp_iiss/SecondaryAction/findDetails.do'
-            yield feapder.Request(
-                url=detail_url,
-                item=item,
-                method="POST",
-                data=data,
-                callback=self.detail_get,
-                timeout=5,
-                use_session=True,
-                count=0)
+            yield feapder.Request(url=detail_url, item=item, method="POST",
+                                  data=data, callback=self.detail_get,
+                                  timeout=5, use_session=True, count=0)
 
     def download_midware(self, request):
         request.proxies = self.proxy
-        log.info(request.item.get("title"))
+        # log.info(request.item.get("title") + ' {}'.format(request.proxies))
         request.headers = {
             "Host": "www.cebpubservice.com",
             "Accept": "application/json, text/javascript, */*; q=0.01",
@@ -115,42 +109,34 @@ class Details(feapder.AirSpider):
                     count=0)
         elif '滑动验证页面' in response.text:
             log.info('开始过滑块验证')
+            '''尝试代理池获取通过滑块验证的cookies会话信息,进行采集'''
             cookies = self.cookie_pool.create_cookies(proxies=self.proxy.get("http"))
+            # print("cookies >>> ", cookies)
             count = request.count
             if count > 4:
                 return
-            if cookies is None or len(cookies) <= 1:
+
+            if not cookies or len(cookies) <= 1:
                 self.proxy = swordfish_proxy()
-            else:
-                request.session.cookies.update(cookies)
-                yield feapder.Request(
-                    url=request.url,
-                    item=request.item,
-                    method="POST",
-                    data=request.data,
-                    callback=self.detail_get,
-                    timeout=5,
-                    use_session=True,
-                    count=count + 1)
+
+            request.session.cookies.update(cookies)
+            yield feapder.Request(url=request.url, item=request.item,
+                                  method="POST", data=request.data,
+                                  callback=self.detail_get, timeout=5,
+                                  use_session=True, count=count + 1)
         else:
             try:
                 response.json
             except Exception as e:
-                log.info(e)
+                log.warning(f"状态码:{response.status_code} {e.__class__.__name__}:{e.args[0]}")
                 self.proxy = swordfish_proxy()
                 cookies = self.cookie_pool.create_cookies(proxies=self.proxy.get("http"))
                 request.session.cookies.update(cookies)
-                yield feapder.Request(
-                    url=request.url,
-                    item=request.item,
-                    method="POST",
-                    data=request.data,
-                    callback=self.detail_get,
-                    timeout=5,
-                    use_session=True,
-                    cookies=cookies,
-                    count=0)
-
+                yield feapder.Request(url=request.url, item=request.item,
+                                      method="POST", data=request.data,
+                                      callback=self.detail_get, timeout=5,
+                                      use_session=True, cookies=cookies,
+                                      count=0)
             else:
                 item = request.item
                 tenderprojectcode = item.get("href").split("&")[1]
@@ -177,23 +163,27 @@ class Details(feapder.AirSpider):
                     item["area"] = "全国"
                     item["city"] = ""
 
+                # 页面上暂无数据
                 if detail_info is None or detail_info == []:
+                    #  response.json = {'message': '', 'success': True, 'object': {'tenderProject': []}}
                     businessKeyWords = response.json.get("object").keys()
                     for key in businessKeyWords:
                         businesskeyword = key
+
                     detail_info = response.json.get("object").get(businesskeyword)
-                    if detail_info is None or detail_info == []:
+                    if not detail_info:
+                        taskid = item.pop("_id")
                         _uuid = businessid + tenderprojectcode
                         item["href"] = "http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/showDetails.do#uuid=%s" % _uuid
                         item["sendflag"] = "true"
-                        item["comeintime"] = int(time.time())
-                        result = self.to_db.add("data_bak", item)
-                        self.to_db.update(self.db_name, {"timeout": 3}, {"_id": item["_id"]})
-                        log.info("mongo add _id:{}<空结果".format(item.get('title')))
+                        item["comeintime"] = int2long(int(time.time()))
+                        self.to_db.add("data_bak", item)
+                        log.info("mongo {}-{}--空结果".format(item["_id"], item.get('title')))
+                        self.to_db.update(self.db_name, {"timeout": 3}, {"_id": taskid})
+                        return
 
                 if businesskeyword == "tenderProject":
                     item["contenthtml"] = splicing(detail_info)
-                    pass
                 else:
                     detail_info = detail_info[0]
                     item["contenthtml"] = detail_info.get("bulletinContent")
@@ -208,13 +198,13 @@ class Details(feapder.AirSpider):
                 item["href"] = "http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/showDetails.do#uuid=%s" % _uuid
                 if text_search(item["detail"]).total == 0:
                     item["sendflag"] = "true"
+
                 item["comeintime"] = int2long(int(time.time()))
                 taskid = item.pop("_id")
                 try:
-                    result = self.to_db.add("data_bak", item)
-                    log.info("_id:{}".format(str(item['_id'])))
+                    self.to_db.add("data_bak", item)
+                    log.info("mongo {}-{}-{}--上传成功".format(str(item['_id']), item.get('title'), item.get('publishtime')))
                     self.to_db.update(self.db_name, {"timeout": 2}, {"_id": taskid})
-                    log.info("mongo add _id:{},{}".format(item.get('title'), item.get('publishtime')))
                     print("抓取成功")
                 except:
                     item["_id"] = taskid
@@ -223,12 +213,12 @@ class Details(feapder.AirSpider):
 
     def exception_request(self, request, response):
         if response is None:
-            item = request.item
             self.proxy = swordfish_proxy()
             log.info("切换代理")
 
 
 if __name__ == "__main__":
-    spider = Details(thread_count=1)
-    spider.start()
-    spider.join()
+    while True:
+        spider = Details(thread_count=1)
+        spider.start()
+        spider.join()