dzr 3 months ago
parent
commit
09e57767c8
1 changed file with 16 additions and 13 deletions
  1. 16 13
      lzz_theme/qgzbgggsssyq/ssyq_main.py

+ 16 - 13
lzz_theme/qgzbgggsssyq/ssyq_main.py

@@ -13,10 +13,11 @@ from pymongo import MongoClient
 
 from spider_search import SearchSpider
 
-# check_db = MongoClient('127.0.0.1', port=27080, username="", password="").py_spider
-check_db = MongoClient('172.17.4.87', port=27080, username="", password="").py_spider
+# mgo = MongoClient('127.0.0.1', port=27080, username="", password="")
+mgo = MongoClient('172.17.4.87', port=27080, username="", password="")
+theme_list = mgo['py_spider']['theme_list']
 
-theme_list = check_db.theme_list
+# 创建爬虫实例
 search = SearchSpider()
 
 
@@ -30,19 +31,21 @@ def start(limit):
         "retry": {"$gte": 4, "$lte": 10}
     }
     sort = [("publishtime", -1)]
-    with theme_list.find(query, no_cursor_timeout=True, sort=sort).limit(limit) as cursor:
-        data_lsit = [dd for dd in cursor]
+    p = {"title": 1, "retry": 1, "_id": 1}
+    with theme_list.find(query, projection=p, sort=sort, limit=limit) as cursor:
+        task_items = [doc for doc in cursor]
 
-    for info in data_lsit:
-        title = "".join(info.get('title').split()).strip()
-        retry = info.get('retry')
+    for item in task_items:
+        _id = item['_id']
+        title = "".join(item['title'].split()).strip()
         result = search.spider(title)
-        if result:
-            theme_list.update_one({"_id": info["_id"]}, {"$set": {"is_crawl": True, "failed": False}})
+        if result is True:
+            theme_list.update_one({"_id": _id}, {"$set": {"is_crawl": True, "failed": False}})
         else:
-            retry += 1
-            theme_list.update_one({"_id": info["_id"]}, {"$set": {"retry": retry}})
-            logger.warning(f"{title} 补采失败!")
+            retry = item["retry"] + 1
+            theme_list.update_one({"_id": _id}, {"$set": {"retry": retry}})
+            logger.error(f"{title}|补采失败")
+
         time.sleep(1)
 
     logger.debug("uuid失效数据,补采完成!")