|
@@ -13,10 +13,11 @@ from pymongo import MongoClient
|
|
|
|
|
|
from spider_search import SearchSpider
|
|
from spider_search import SearchSpider
|
|
|
|
|
|
-# check_db = MongoClient('127.0.0.1', port=27080, username="", password="").py_spider
|
|
|
|
-check_db = MongoClient('172.17.4.87', port=27080, username="", password="").py_spider
|
|
|
|
|
|
+# mgo = MongoClient('127.0.0.1', port=27080, username="", password="")
|
|
|
|
+mgo = MongoClient('172.17.4.87', port=27080, username="", password="")
|
|
|
|
+theme_list = mgo['py_spider']['theme_list']
|
|
|
|
|
|
-theme_list = check_db.theme_list
|
|
|
|
|
|
+# 创建爬虫实例
|
|
search = SearchSpider()
|
|
search = SearchSpider()
|
|
|
|
|
|
|
|
|
|
@@ -30,19 +31,21 @@ def start(limit):
|
|
"retry": {"$gte": 4, "$lte": 10}
|
|
"retry": {"$gte": 4, "$lte": 10}
|
|
}
|
|
}
|
|
sort = [("publishtime", -1)]
|
|
sort = [("publishtime", -1)]
|
|
- with theme_list.find(query, no_cursor_timeout=True, sort=sort).limit(limit) as cursor:
|
|
|
|
- data_lsit = [dd for dd in cursor]
|
|
|
|
|
|
+ p = {"title": 1, "retry": 1, "_id": 1}
|
|
|
|
+ with theme_list.find(query, projection=p, sort=sort, limit=limit) as cursor:
|
|
|
|
+ task_items = [doc for doc in cursor]
|
|
|
|
|
|
- for info in data_lsit:
|
|
|
|
- title = "".join(info.get('title').split()).strip()
|
|
|
|
- retry = info.get('retry')
|
|
|
|
|
|
+ for item in task_items:
|
|
|
|
+ _id = item['_id']
|
|
|
|
+ title = "".join(item['title'].split()).strip()
|
|
result = search.spider(title)
|
|
result = search.spider(title)
|
|
- if result:
|
|
|
|
- theme_list.update_one({"_id": info["_id"]}, {"$set": {"is_crawl": True, "failed": False}})
|
|
|
|
|
|
+ if result is True:
|
|
|
|
+ theme_list.update_one({"_id": _id}, {"$set": {"is_crawl": True, "failed": False}})
|
|
else:
|
|
else:
|
|
- retry += 1
|
|
|
|
- theme_list.update_one({"_id": info["_id"]}, {"$set": {"retry": retry}})
|
|
|
|
- logger.warning(f"{title} 补采失败!")
|
|
|
|
|
|
+ retry = item["retry"] + 1
|
|
|
|
+ theme_list.update_one({"_id": _id}, {"$set": {"retry": retry}})
|
|
|
|
+ logger.error(f"{title}|补采失败")
|
|
|
|
+
|
|
time.sleep(1)
|
|
time.sleep(1)
|
|
|
|
|
|
logger.debug("uuid失效数据,补采完成!")
|
|
logger.debug("uuid失效数据,补采完成!")
|