|
@@ -75,6 +75,7 @@ class DetailSpider:
|
|
|
):
|
|
|
self.crawl_tab = mongo_table(db, crawl_tab)
|
|
|
self.save_tab = mongo_table(db, save_tab)
|
|
|
+ self.save_url = mongo_table("editor", "source_url")
|
|
|
self.user = None
|
|
|
|
|
|
def _update_crawl_task(self, tid, **kwargs):
|
|
@@ -209,6 +210,10 @@ class DetailSpider:
|
|
|
|
|
|
html = page_source(valid_node)
|
|
|
'''检查原始页面内容'''
|
|
|
+ source_url = re.search('.*<a href=\"(.*?)\">点击查看内容', html)
|
|
|
+ if source_url:
|
|
|
+ self.save_url.insert_one({"site":"元博网", "source_url":source_url.group(1),
|
|
|
+ "comeintime":int2long(int(time.time()))})
|
|
|
CheckText(html)
|
|
|
item["contenthtml"] = html
|
|
|
special = {
|
|
@@ -219,6 +224,10 @@ class DetailSpider:
|
|
|
item["detail"] = cleaner(html, special)
|
|
|
item["comeintime"] = int2long(int(time.time()))
|
|
|
'''检查清洗之后的详情'''
|
|
|
+ source_url = re.search('.*<a href=\"(.*?)\">点击查看内容', item["detail"])
|
|
|
+ if source_url:
|
|
|
+ self.save_url.insert_one({"site": "元博网", "source_url": source_url.group(1),
|
|
|
+ "comeintime":int2long(int(time.time()))})
|
|
|
CheckText(item["detail"])
|
|
|
insert = {}
|
|
|
for key, val in item.items():
|