Răsfoiți Sursa

新增爬虫属性 - 混合采集标识

dongzhaorui 1 an în urmă
părinte
comite
e77e742203

+ 2 - 5
FworkSpider/feapder/core/parser_control.py

@@ -80,7 +80,6 @@ class PaserControl(threading.Thread):
             extract_count = 0  # 列表抽取总数量
             task_count = 0  # 详情任务总数量
             rel_count = 0  # 实际入库量
-            is_mixed = False  # 混合采集标识,False=列表页与详情页单独采集;True=列表页即详情页
 
             if parser.name == request.parser_name:
                 used_download_midware_enable = False
@@ -206,10 +205,8 @@ class PaserControl(threading.Thread):
                         elif isinstance(result, Item):
                             result_type = 2
 
-                            # 判断爬虫是否混合采集
-                            if "List" in parser.__business_type__ and hasattr(result, 'contenthtml'):
-                                is_mixed = True
-                            result.is_mixed = is_mixed
+                            # 添加属性 - 混合采集
+                            result.is_mixed = parser.is_mix
 
                             # 实际入库数量计数
                             if not self.is_duplicate(result):

+ 18 - 6
FworkSpider/feapder/core/spiders/spider.py

@@ -100,6 +100,11 @@ class Spider(
         DebugSpider.__name__ = cls.__name__
         return DebugSpider(*args, **kwargs)
 
+    @property
+    def is_mix(self):
+        """Collection mode of the spider.
+        True = mixed collection (list page + detail page handled together);
+        False = independent collection (list page and detail page handled separately).
+        This base implementation always reports False.
+        """
+        return False
+
 
 class DebugSpider(Spider):
     """
@@ -221,7 +226,7 @@ class DebugSpider(Spider):
 
 
 class BaseBusinessListSpider(Spider):
-    """列表页采集业务基类"""
+    """列表页采集基础爬虫"""
 
     __business_type__ = "List"
     __extract_count__ = 0
@@ -240,7 +245,7 @@ class BaseBusinessListSpider(Spider):
 
 
 class BaseBusinessDetailSpider(Spider):
-    """详情页采集业务基类"""
+    """详情页采集基础爬虫"""
 
     __business_type__ = "Detail"
     __business_setting__ = dict(
@@ -317,25 +322,32 @@ class BaseBusinessDetailSpider(Spider):
         return task_lst
 
 
+class MixBusinessSpider(BaseBusinessListSpider):
+    """Base spider for mixed collection (list page + detail page in one pass)."""

+    @property
+    def is_mix(self):
+        """Report True: this spider collects list and detail pages together."""
+        # Must be a property, like the attribute it overrides on Spider: the
+        # parser control reads `parser.is_mix` without calling it, so a plain
+        # method here would yield a bound method object instead of a bool.
+        return True
+
+
 class BiddingListSpider(BaseBusinessListSpider):
-    """标讯列表页采集业务类"""
+    """标讯列表页采集爬虫"""
 
     __business_type__ = "BiddingList"
 
 
 class BiddingDetailSpider(BaseBusinessDetailSpider):
-    """标讯详情页采集业务类"""
+    """标讯详情页采集爬虫"""
 
     __business_type__ = "BiddingDetail"
 
 
 class PlanToBuildListSpider(BaseBusinessListSpider):
-    """拟建列表页采集业务类"""
+    """拟建列表页采集爬虫"""
 
     __business_type__ = "PlanToBuildList"
 
 
 class PlanToBuildDetailSpider(BaseBusinessDetailSpider):
-    """拟建详情页采集业务类"""
+    """拟建详情页采集爬虫"""
 
     __business_type__ = "PlanToBuildDetail"