|
@@ -148,44 +148,38 @@ class UpdateItem(Item):
|
|
|
class BaseItem(Item):
|
|
|
"""数据采集基础类"""
|
|
|
|
|
|
- def __init__(self, business_id=None, save=True, **kwargs):
|
|
|
- super(BaseItem, self).__init__(
|
|
|
- save=save,
|
|
|
- business_id=business_id or tools.get_uuid().replace('-', ''), # 数据流水编号
|
|
|
- **kwargs
|
|
|
- )
|
|
|
-
|
|
|
- self.site = "" # 站点名称(数据源定义)
|
|
|
- self.channel = "" # 栏目名称(数据源定义)
|
|
|
- self.spidercode = "" # 爬虫代码(数据源定义)
|
|
|
+ def __init__(self, save=True, site='', channel='', spidercode='',
|
|
|
+ area='全国', city='', district='', href='', pyuuid=None):
|
|
|
+ """
|
|
|
|
|
|
- self.area = "全国" # 省
|
|
|
- self.city = "" # 市
|
|
|
- self.district = "" # 区/县
|
|
|
+ @param pyuuid: 采集数据唯一标识
|
|
|
+ @param save: 是否保存到数据库
|
|
|
+ @param site: 站点名称(数据源定义)
|
|
|
+ @param channel: 栏目名称(数据源定义)
|
|
|
+ @param spidercode: 爬虫代码(数据源定义)
|
|
|
+ @param area: 省, 默认:全国
|
|
|
+ @param city: 市
|
|
|
+ @param district: 区/县
|
|
|
+ @param href: 采集地址
|
|
|
+ """
|
|
|
+ super(BaseItem, self).__init__()
|
|
|
|
|
|
- self.href = "" # 采集地址
|
|
|
+ self.pyuuid = pyuuid or tools.get_uuid().replace('-', '')
|
|
|
self.comeintime = tools.ensure_int64(
|
|
|
tools.get_current_timestamp()
|
|
|
) # 入库时间
|
|
|
|
|
|
- @property
|
|
|
- def save(self) -> bool:
|
|
|
- return self.__dict__["save"]
|
|
|
+ self.save = save
|
|
|
|
|
|
- @save.setter
|
|
|
- def save(self, state: bool):
|
|
|
- """
|
|
|
- 是否持久化存储本数据条目
|
|
|
- 持久化存储 save=True;临时存储 save=False
|
|
|
+ self.site = site
|
|
|
+ self.channel = channel
|
|
|
+ self.spidercode = spidercode
|
|
|
|
|
|
- @param state: 持久化状态
|
|
|
- """
|
|
|
- self.__dict__["save"] = state
|
|
|
+ self.area = area
|
|
|
+ self.city = city
|
|
|
+ self.district = district
|
|
|
|
|
|
- @property
|
|
|
- def business_id(self):
|
|
|
- """本条目数据流水编号"""
|
|
|
- return self.__dict__["business_id"]
|
|
|
+ self.href = href
|
|
|
|
|
|
@property
|
|
|
def fingerprint(self):
|
|
@@ -234,12 +228,21 @@ class BaseListItem(BaseItem):
|
|
|
class BaseDetailItem(BaseItem):
|
|
|
"""详情数据采集基础类"""
|
|
|
|
|
|
- def __init__(self, **kwargs):
|
|
|
- super(BaseDetailItem, self).__init__()
|
|
|
+ def __init__(self, title='', contenthtml='', detail='', sendflag='false',
|
|
|
+ projectinfo=None, **kwargs):
|
|
|
+ """
|
|
|
+
|
|
|
+ @param title: 详情页标题
|
|
|
+ @param contenthtml: 详情页源码
|
|
|
+ @param detail: 清洗之后的详情页源码
|
|
|
+ @param sendflag: 该数据是否保存到正式库
|
|
|
+ @param projectinfo: 附件信息,格式详见剑鱼数据采集规范
|
|
|
+ """
|
|
|
+ super(BaseDetailItem, self).__init__(**kwargs)
|
|
|
|
|
|
- self.title = "" # 详情页标题
|
|
|
- self.contenthtml = "" # 详情页源码
|
|
|
- self.detail = "" # 清洗之后的详情页源码
|
|
|
- self.projectinfo = None # 附件信息,格式详见剑鱼数据采集规范
|
|
|
+ self.title = title
|
|
|
+ self.contenthtml = contenthtml
|
|
|
+ self.detail = detail
|
|
|
+ self.projectinfo = projectinfo
|
|
|
|
|
|
- self.sendflag = "false" # 该数据是否保存到正式库
|
|
|
+ self.sendflag = sendflag
|