Răsfoiți Sursa

采集时间格式统一

dongzhaorui 1 an în urmă
părinte
commit
9bec5940df
2 fișiere modificate, cu 26 de adăugiri și 14 ștergeri
  1. 18 10
      FworkSpider/items/njpc_item.py
  2. 8 4
      FworkSpider/items/spider_item.py

+ 18 - 10
FworkSpider/items/njpc_item.py

@@ -17,7 +17,8 @@ class DataNjpcItem(BaseDetailItem):
     }
     __ignore_attr__ = {
         'parse_url', 'parser_name', 'parse', 'deal_detail', 'files', 'proxies',
-        'ex_python', 'ex_js', 'request_params', 'comeintime', 'failed', 'error'
+        'ex_python', 'ex_js', 'request_params', 'comeintime', 'failed', 'error',
+        'failed_times'
     }
 
     def __init__(self, ignore=None, **kwargs):
@@ -106,16 +107,23 @@ class DataNjpcItem(BaseDetailItem):
             self.title = self.projectname
             log.debug("请检测 < title > 是否正确!")
 
-        if "-" in str(self.publishtime) and ":" in str(self.publishtime):
-            self.publishtime = int2long(tools.date_to_timestamp(self.publishtime))
-        elif "-" in str(self.publishtime) and ":" not in str(self.publishtime):
-            self.publishtime = int2long(tools.date_to_timestamp(self.publishtime, "%Y-%m-%d"))
-        elif len(str(self.publishtime)) == 10 or len(str(self.publishtime)) == 13: # 或许是时间戳
-            self.publishtime = int2long(int(str(self.publishtime)[:10]))
+        # 时间格式处理
+        cur_time = tools.get_current_date().split(' ')[-1]
+        if "-" in str(self.publishtime) and ":" not in str(self.publishtime):
+            self.publishtime = self.publishtime + " " + cur_time
+        elif "-" not in str(self.publishtime):
+            self.publishtime = tools.timestamp_to_date(int(str(self.publishtime)[:10]))
+            if "00:00:00" in self.publishtime:
+                self.publishtime = self.publishtime.split(' ')[0] + " " + cur_time
         else:
-            raise ValueError("发布时间格式不正确 -> %r " %(self.publishtime))
-
-        if isinstance(self.publishtime,type(self.comeintime)) and self.publishtime > self.comeintime:
+            if "-" in str(self.publishtime) and ":" in str(self.publishtime):
+                pass
+            else:
+                raise ValueError("发布时间格式不正确 -> %r " % (self.publishtime))
+        # 时间字符串转时间戳
+        self.publishtime = int2long(tools.date_to_timestamp(self.publishtime))
+
+        if isinstance(self.publishtime, type(self.comeintime)) and self.publishtime > self.comeintime:
             log.warning("发布时间大于当前时间,已设置当前时间为发布时间!")
             self.publishtime = int2long(tools.get_current_timestamp())
 

+ 8 - 4
FworkSpider/items/spider_item.py

@@ -21,7 +21,8 @@ class DataBakItem(BaseDetailItem):
     }
     __ignore_attr__ = {
         'parse_url', 'parser_name', 'parse', 'deal_detail', 'files', 'proxies',
-        'ex_python', 'ex_js', 'request_params', 'comeintime', 'failed', 'error'
+        'ex_python', 'ex_js', 'request_params', 'comeintime', 'failed', 'error',
+        'failed_times'
     }
 
     def __init__(self, ignore=None, **kwargs):
@@ -64,10 +65,13 @@ class DataBakItem(BaseDetailItem):
             self.s_title = self.title
             log.debug("请检测 < s_title > 是否正确!")
 
-        if ":" in self.publishtime:
-            self.l_np_publishtime = int2long(tools.date_to_timestamp(self.publishtime))
+        # 发布时间处理
+        cur_time = tools.get_current_date().split(' ')[-1]
+        if ":" not in self.publishtime:
+            self.publishtime = self.publishtime + " " + cur_time
         else:
-            self.l_np_publishtime = int2long(tools.date_to_timestamp(self.publishtime, "%Y-%m-%d"))
+            if "00:00:00" in self.publishtime:
+                self.publishtime = self.publishtime.split(' ')[0] + " " + cur_time
 
         if self.l_np_publishtime and self.l_np_publishtime > self.comeintime:
             log.warning("发布时间大于当前时间,已设置当前时间为发布时间!")