ソースを参照

更新业务处理流程

dongzhaorui 1 年前
コミット
3420cb7201
2 ファイル変更、73 行追加、42 行削除
  1. 39 21
      FworkSpider/items/njpc_item.py
  2. 34 21
      FworkSpider/items/spider_item.py

+ 39 - 21
FworkSpider/items/njpc_item.py

@@ -102,37 +102,37 @@ class DataNjpcItem(BaseDetailItem):
         kw = {k: v for k, v in kwargs.items() if k not in self.__ignore_attr__}
         self.__dict__.update(kw)
 
-    def pre_to_db(self):
-        if not self.title:
-            self.title = self.projectname
-            log.debug("请检测 < title > 是否正确!")
-
+    def handle_publish_time(self):
         # 时间格式处理
-        cur_time = tools.get_current_date().split(' ')[-1]
-        if "-" in str(self.publishtime) and ":" not in str(self.publishtime):
-            self.publishtime = self.publishtime + " " + cur_time
-        elif "-" not in str(self.publishtime):
-            self.publishtime = tools.timestamp_to_date(int(str(self.publishtime)[:10]))
-            if "00:00:00" in self.publishtime:
-                self.publishtime = self.publishtime.split(' ')[0] + " " + cur_time
+        publishtime = str(self.publishtime)
+        time_str = tools.get_current_date().split(' ')[-1]
+        if "-" in publishtime and ":" not in publishtime:
+            publishtime = publishtime + " " + time_str
+        elif "-" not in publishtime:
+            publishtime = tools.timestamp_to_date(int(publishtime[:10]))
+            if "00:00:00" in publishtime:
+                publishtime = publishtime.split(' ')[0] + " " + time_str
         else:
-            if "-" in str(self.publishtime) and ":" in str(self.publishtime):
+            if "-" in publishtime and ":" in publishtime:
                 pass
             else:
-                raise ValueError("发布时间格式不正确 -> %r " % (self.publishtime))
+                raise ValueError("发布时间格式不正确 -> %r " % self.publishtime)
+
         # 时间字符串转时间戳
-        self.publishtime = int2long(tools.date_to_timestamp(self.publishtime))
+        self.publishtime = int2long(tools.date_to_timestamp(publishtime))
+
+    def handle_publish_time_overdue(self):
+        """超期发布时间处理"""
+        if not isinstance(self.publishtime, int):
+            raise TypeError("发布时间类型不正确 -> %s " % type(self.publishtime))
 
-        if isinstance(self.publishtime, type(self.comeintime)) and self.publishtime > self.comeintime:
+        if self.publishtime > self.comeintime:
             log.warning("发布时间大于当前时间,已设置当前时间为发布时间!")
             self.publishtime = int2long(tools.get_current_timestamp())
 
-        if not self.projectname or not self.publishtime or not self.href:
-            log.warning(f"基础数据为空:\n 发布地址:{self.href}\n 发布时间:{self.publishtime}\n 标题:{self.projectname}")
-            self.dont_save = True
-
+    def handle_page_html(self):
         if not self.contenthtml:
-            log.warning(f"正文数据为空:\n 发布地址:{self.href}\n 发布时间:{self.publishtime}\n 标题:{self.projectname}")
+            log.warning(f"页面源码不能为空!\n 发布地址:{self.href}\n 发布时间:{self.publishtime}\n 标题:{self.projectname}")
             self.dont_save = True
         else:
             if not self.detail:
@@ -141,9 +141,27 @@ class DataNjpcItem(BaseDetailItem):
             if text_search(self.detail).total == 0:
                 self.sendflag = "true"
 
+    def check_data_validity(self):
+        if not self.dont_save:
+            if not self.projectname or not self.publishtime or not self.href:
+                log.warning(f"基础数据不能为空!\n 发布地址:{self.href}\n 发布时间:{self.publishtime}\n 标题:{self.projectname}")
+                self.dont_save = True
+
+    def cleanup(self):
         if not self.projectinfo:
             del self.projectinfo
 
+    def pre_to_db(self):
+        if not self.title:
+            self.title = self.projectname
+            log.debug("请检测 < title > 是否正确!")
+
+        self.handle_publish_time()
+        self.handle_publish_time_overdue()
+        self.handle_page_html()
+        self.check_data_validity()
+        self.cleanup()
+
 
 class NjpcListItem(BaseListItem):
 

+ 34 - 21
FworkSpider/items/spider_item.py

@@ -60,27 +60,35 @@ class DataBakItem(BaseDetailItem):
         kw = {k: v for k, v in kwargs.items() if k not in self.__ignore_attr__}
         self.__dict__.update(kw)
 
-    def pre_to_db(self):
-        if not self.s_title:
-            self.s_title = self.title
-            log.debug("请检测 < s_title > 是否正确!")
+    def cleanup(self):
+        # 竞品网站-详情页地址标识字段
+        if not self.competehref:
+            del self.competehref
+
+        # 详情无附件,不需要 projectinfo 字段
+        if not self.projectinfo:
+            del self.projectinfo
 
-        # 发布时间处理
-        cur_time = tools.get_current_date().split(' ')[-1]
-        if ":" not in self.publishtime:
-            self.publishtime = self.publishtime + " " + cur_time
+    def handle_publish_time(self):
+        time_str = tools.get_current_date().split(' ')[-1]
+        if ':' not in self.publishtime:
+            self.publishtime = self.publishtime + ' ' + time_str
         else:
-            if "00:00:00" in self.publishtime:
-                self.publishtime = self.publishtime.split(' ')[0] + " " + cur_time
+            if '00:00:00' in self.publishtime:
+                self.publishtime = self.publishtime.split(' ')[0] + ' ' + time_str
+
+        self.l_np_publishtime = tools.ensure_int64(tools.date_to_timestamp(self.publishtime))
 
+    def handle_publish_time_overdue(self):
+        """处理超期发布时间"""
         if self.l_np_publishtime and self.l_np_publishtime > self.comeintime:
             log.warning("发布时间大于当前时间,已设置当前时间为发布时间!")
             self.publishtime = tools.get_current_date()
             self.l_np_publishtime = int2long(tools.date_to_timestamp(self.publishtime))
 
-        # html处理正文
+    def handle_page_html(self):
         if not self.contenthtml:
-            log.warning(f"正文数据为空:\n 发布地址:{self.href}\n 发布时间:{self.publishtime}\n 标题:{self.title}")
+            log.warning(f"页面源码不能为空!\n 发布地址:{self.href}\n 发布时间:{self.publishtime}\n 标题:{self.title}")
             self.dont_save = True
         else:
             if not self.detail:
@@ -89,17 +97,22 @@ class DataBakItem(BaseDetailItem):
             if text_search(self.detail).total == 0:
                 self.sendflag = "true"   # 无内容数据,数据不入保存服务
 
-        if not self.title or not self.publishtime or not self.href:
-            log.warning(f"基础数据为空:\n 发布地址:{self.href}\n 发布时间:{self.publishtime}\n 标题:{self.title}")
-            self.dont_save = True
+    def check_data_validity(self):
+        if not self.dont_save:
+            if not self.title or not self.publishtime or not self.href:
+                log.warning(f"基础数据不能为空!\n 发布地址:{self.href}\n 发布时间:{self.publishtime}\n 标题:{self.title}")
+                self.dont_save = True
 
-        # 竞品网站-详情页地址标识字段
-        if not self.competehref:
-            del self.competehref
+    def pre_to_db(self):
+        if not self.s_title:
+            self.s_title = self.title
+            log.debug("请检测 < s_title > 是否正确!")
 
-        # 详情无附件,不需要 projectinfo 字段
-        if not self.projectinfo:
-            del self.projectinfo
+        self.handle_publish_time()
+        self.handle_publish_time_overdue()
+        self.handle_page_html()
+        self.check_data_validity()
+        self.cleanup()
 
 
 class ExamineAndApproveItem(DataBakItem):