# spider_item.py
  1. import feapder.utils.tools as tools
  2. from feapder.utils.log import log
  3. from items.base_item import SwordFishProjectItem
  4. from untils.check_data import CheckData
  5. from untils.tools import (
  6. int2long,
  7. substitute,
  8. text_search,
  9. )
  10. class DataBakItem(SwordFishProjectItem):
  11. """标讯数据"""
  12. def __init__(self):
  13. super(DataBakItem, self).__init__()
  14. self.spidercode = "" # 爬虫代码(编辑器爬虫平台定义)
  15. self.site = "" # 采集的站点(编辑器爬虫平台定义)
  16. self.channel = "" # 采集的版块(编辑器爬虫平台定义)
  17. self.title = "" # 文章标题
  18. self.s_title = "" # 详情页标题(有必填),默认提供列表页标题
  19. self.area = "全国" # 省
  20. self.city = "" # 市
  21. self.district = "" # 区/县
  22. self.publishtime = "" # 文章发布时间(列表页或者详情页发布时间)
  23. self.l_np_publishtime = "" # 发布时间的时间戳(秒级), 需定义为long型
  24. self.comeintime = "" # 入库时间戳(秒级), 需定义为long型
  25. self.contenthtml = "" # 详情页源码
  26. self.detail = "" # 详情页源码清洗之后的文本
  27. self.href = "" # 非竞品详情页地址
  28. self.competehref = None # 竞品详情页地址
  29. self.projectinfo = None # 附件信息,详见剑鱼招投标规范
  30. self.iscompete = True # 新爬虫
  31. self.sendflag = "false"
  32. self.T = "bidding"
  33. self.infoformat = 1
  34. # 默认设置
  35. self.type = ""
  36. self.publishdept = ""
  37. self._d = "comeintime"
  38. def pre_to_db(self):
  39. if not self.s_title:
  40. self.s_title = self.title
  41. log.debug("请检测 < s_title > 是否正确!")
  42. self.comeintime = int2long(tools.get_current_timestamp()) # 生成入库时间戳(秒级), 定义为long型
  43. if ":" in self.publishtime:
  44. self.l_np_publishtime = int2long(tools.date_to_timestamp(self.publishtime))
  45. else:
  46. self.l_np_publishtime = int2long(tools.date_to_timestamp(self.publishtime, "%Y-%m-%d"))
  47. # html处理正文
  48. if not self.contenthtml:
  49. log.warning(f"正文数据为空:\n 发布地址:{self.href}\n 发布时间:{self.publishtime}\n 标题:{self.title}")
  50. self.save = False
  51. else:
  52. if not self.detail:
  53. self.detail = substitute(self.contenthtml)
  54. if text_search(self.detail).total == 0:
  55. self.sendflag = "true" # 无内容数据,数据不入保存服务
  56. if not self.title or not self.publishtime or not self.href:
  57. log.warning(f"基础数据为空:\n 发布地址:{self.href}\n 发布时间:{self.publishtime}\n 标题:{self.title}")
  58. self.save = False
  59. # 竞品网站-详情页地址标识字段
  60. if not self.competehref:
  61. del self.competehref
  62. # 详情无附件,不需要 projectinfo 字段
  63. if not self.projectinfo:
  64. del self.projectinfo
  65. class ExamineAndApproveItem(DataBakItem):
  66. """审批数据"""
  67. def __init__(self):
  68. super(ExamineAndApproveItem, self).__init__()
  69. self.table_name = "data_bak"
  70. self.T = "bidding"
  71. self.infoformat = 2
  72. class PropertyRightItem(DataBakItem):
  73. """产权数据"""
  74. def __init__(self):
  75. super(PropertyRightItem, self).__init__()
  76. self.table_name = "data_bak"
  77. self.T = "bidding_other"
  78. self.infoformat = 3
  79. class MgpListItem(SwordFishProjectItem):
  80. def __init__(self):
  81. super(MgpListItem, self).__init__()
  82. self.spidercode = "" # 爬虫代码(编辑器爬虫平台定义)
  83. self.parse_url = "" # 详情爬虫访问地址
  84. self.parser_name = "" # 详情爬虫从MongoDB拉取任务的唯一标识,建议使用 spidercode 命名
  85. self.parse = "" # 详情爬虫解析回调方法名
  86. self.request_params = {} # 定义callback所需的参数,诸如render,headers,method,data,params等等,必须与requests请求的参数名称对应,否则无法识别
  87. self.proxies = True # 代理
  88. self.comeintime = int2long(tools.get_current_timestamp()) # 入库时间
  89. self.deal_detail = [] # 定义解析详情页主页内容的xpath列表
  90. self.ex_js = "" # 定义需要执行的js代码,包括但不限于script、文件路径等
  91. self.ex_python = None # 定义需要执行的python代码,生成params/date,如header和cookie特殊,最好使用特殊定义法
  92. self.files = False # 采集附件配置
  93. @property
  94. def item(self) -> dict:
  95. return self.__dict__["item"]
  96. @item.setter
  97. def item(self, data_item: DataBakItem):
  98. self.__dict__["item"] = data_item.to_dict
  99. def pre_to_db(self):
  100. self.spidercode = self.item["spidercode"]
  101. title = self.item.get("title")
  102. channel = self.item["channel"]
  103. if CheckData.channel(channel):
  104. code, reason = CheckData.title(title)
  105. if code == 10106:
  106. log.warning(f"{title}--不可入库,原因:{reason}")
  107. self.save = False