dzr hai 5 meses
pai
achega
5ea8908665
Modificouse 1 ficheiro con 16 adicións e 11 borrados
  1. 16 11
      ybw/detail_spider.py

+ 16 - 11
ybw/detail_spider.py

@@ -68,10 +68,10 @@ def page_source(element: HtmlElement):
 class DetailSpider:
 
     def __init__(
-            self,
-            db: str,
-            crawl_tab: str,
-            save_tab: str,
+        self,
+        db: str,
+        crawl_tab: str,
+        save_tab: str,
     ):
         self.crawl_tab = mongo_table(db, crawl_tab)
         self.save_tab = mongo_table(db, save_tab)
@@ -90,16 +90,13 @@ class DetailSpider:
         self._update_crawl_task(task['_id'], **update)
 
     def json_request(self, fid, request_params):
-
         url = "https://www.chinabidding.cn/agency.info.Detail/show"
         params = {
             "fid": f"{fid}"
         }
-
         res = requests.get(url, params=params, **request_params)
         return res
 
-
     def crawl_request(self, item: dict):
         url = item['competehref']
         headers = {
@@ -212,8 +209,12 @@ class DetailSpider:
         '''检查原始页面内容'''
         source_url = re.search('.*<a href=\"(.*?)\">点击查看内容', html)
         if source_url:
-            self.save_url.insert_one({"site":"元博网", "title": item['title'], "source_url":source_url.group(1),
-                                      "comeintime":int2long(int(time.time()))})
+            self.save_url.insert_one({
+                "site": "元博网",
+                "title": item['title'],
+                "source_url": source_url.group(1),
+                "comeintime": int2long(int(time.time()))
+            })
         CheckText(html)
         item["contenthtml"] = html
         special = {
@@ -226,8 +227,12 @@ class DetailSpider:
         '''检查清洗之后的详情'''
         source_url = re.search('.*<a href=\"(.*?)\">点击查看内容', item["detail"])
         if source_url:
-            self.save_url.insert_one({"site": "元博网", "title": item['title'], "source_url": source_url.group(1),
-                                      "comeintime":int2long(int(time.time()))})
+            self.save_url.insert_one({
+                "site": "元博网",
+                "title": item['title'],
+                "source_url": source_url.group(1),
+                "comeintime": int2long(int(time.time()))
+            })
         CheckText(item["detail"])
         insert = {}
         for key, val in item.items():