|
@@ -149,13 +149,25 @@ class BaseItem(Item):
|
|
|
"""数据采集基础类"""
|
|
|
|
|
|
def __init__(self, business_id=None, save=True, **kwargs):
    """Initialise the fields shared by every collected item.

    Args:
        business_id: Serial number of this record. When falsy, the result
            of ``tools.get_uuid()`` with the dashes removed is used instead.
        save: Forwarded to the parent initialiser as ``save``.
        **kwargs: Extra keyword arguments forwarded unchanged to the parent
            initialiser.
    """
    if not business_id:
        # No serial number supplied: fall back to a dash-free UUID.
        business_id = tools.get_uuid().replace('-', '')

    super(BaseItem, self).__init__(
        save=save,
        business_id=business_id,
        **kwargs
    )

    # Identification of the data source.
    self.site = ""  # site name (as defined by the data source)
    self.channel = ""  # channel name (as defined by the data source)
    self.spidercode = ""  # spider code (as defined by the data source)

    # Geographic attribution.
    self.area = "全国"  # province
    self.city = ""  # city
    self.district = ""  # district / county

    self.href = ""  # URL the data was collected from

    # Time the record entered storage, passed through tools.ensure_int64.
    current_timestamp = tools.get_current_timestamp()
    self.comeintime = tools.ensure_int64(current_timestamp)
|
|
|
+
|
|
|
@property
def save(self) -> bool:
    """Return the value stored under the ``"save"`` key of the instance dict.

    Raises:
        KeyError: If the instance dict has no ``"save"`` entry.
    """
    # Read the instance dict directly: ``self.save`` would resolve to this
    # property again rather than to the stored value.
    instance_fields = vars(self)
    return instance_fields["save"]
|
|
@@ -213,9 +225,21 @@ class BaseItem(Item):
|
|
|
|
|
|
class BaseListItem(BaseItem):
    """Base class for items collected from list pages."""

    def __init__(self, business_id=None, save=True, **kwargs):
        """Initialise the list item.

        The signature mirrors ``BaseItem.__init__`` so that callers can still
        pass ``business_id``, ``save`` and extra keyword arguments; a
        zero-argument constructor would reject them.

        Args:
            business_id: Forwarded to ``BaseItem.__init__``.
            save: Forwarded to ``BaseItem.__init__``.
            **kwargs: Extra keyword arguments forwarded to
                ``BaseItem.__init__``.
        """
        super(BaseListItem, self).__init__(
            business_id=business_id,
            save=save,
            **kwargs
        )

        # Extra request parameters to use when fetching the detail data.
        self.request_params = {}
|
|
|
|
|
|
|
|
|
class BaseDetailItem(BaseItem):
    """Base class for items collected from detail pages."""

    def __init__(self, **kwargs):
        """Initialise the detail item.

        Args:
            **kwargs: Keyword arguments forwarded to ``BaseItem.__init__``
                (for example ``business_id`` and ``save``). They were
                previously accepted here but never passed on.

        Note:
            The detail-page defaults below are assigned after the base
            initialiser has run, so they are always set on the instance.
        """
        # Forward the caller's arguments instead of silently discarding them.
        super(BaseDetailItem, self).__init__(**kwargs)

        self.title = ""  # detail page title
        self.contenthtml = ""  # raw source of the detail page
        self.detail = ""  # cleaned source of the detail page
        # Attachment information; the original comment refers to the Jianyu
        # data collection specification for the format.
        self.projectinfo = None

        # Whether this record is saved to the production store. Kept as the
        # string "false", not a boolean, exactly as in the original.
        self.sendflag = "false"
|