Procházet zdrojové kódy

元博网抽取寻源地址

lizongze před 1 rokem
rodič
revize
88e3f4ebd7

+ 1 - 0
ybw/config/conf.yaml

@@ -13,6 +13,7 @@ redis:
   pwd: "k5ZJR5KV4q7DRZ92DQ"
 #  host: 127.0.0.1
 #  port: !!int 6379
+#  pwd: ""
   db: !!int 1
 
 

+ 9 - 0
ybw/detail_spider.py

@@ -75,6 +75,7 @@ class DetailSpider:
     ):
         self.crawl_tab = mongo_table(db, crawl_tab)
         self.save_tab = mongo_table(db, save_tab)
+        self.save_url = mongo_table("editor", "source_url")
         self.user = None
 
     def _update_crawl_task(self, tid, **kwargs):
@@ -209,6 +210,10 @@ class DetailSpider:
 
             html = page_source(valid_node)
         '''检查原始页面内容'''
+        source_url = re.search('.*<a href=\"(.*?)\">点击查看内容', html)
+        if source_url:
+            self.save_url.insert_one({"site":"元博网", "source_url":source_url.group(1),
+                                      "comeintime":int2long(int(time.time()))})
         CheckText(html)
         item["contenthtml"] = html
         special = {
@@ -219,6 +224,10 @@ class DetailSpider:
         item["detail"] = cleaner(html, special)
         item["comeintime"] = int2long(int(time.time()))
         '''检查清洗之后的详情'''
+        source_url = re.search('.*<a href=\"(.*?)\">点击查看内容', item["detail"])
+        if source_url:
+            self.save_url.insert_one({"site": "元博网", "source_url": source_url.group(1),
+                                      "comeintime":int2long(int(time.time()))})
         CheckText(item["detail"])
         insert = {}
         for key, val in item.items():

+ 0 - 2
zbytb/crawler/spiders/DetailPageSpider.py

@@ -272,8 +272,6 @@ class CrawlDetailPageSpider:
             item = sc.crawl_task
             if len(item) == 0:
                 return False
-            # item_list = mongo_table('py_spider', 'zbytb_list').find({"count": 0, "crawl": False, "crawl_status": {"$exists": False}}).limit(1).sort("_id",1)
-            # for item in item_list:
             logger.info(f">>> {item['title']} - {item['competehref']}")
             self._lock_task(item)
             sc.spider_code = self.spider_code = item['spidercode']