details_firefox.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on 2021-12-13 13:25:15
  4. ---------
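  5. @summary: Spider that loads pending detail-page tasks from MongoDB, requests each page with browser rendering enabled, and emits the extracted detail HTML as an item.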
  6. ---------
  7. @author: 马国鹏
  8. """
  9. import feapder
  10. from feapder.utils.tools import wechat_warning
  11. import execjs
  12. from items.spider_item import DataBakItem, MgpListItem
  13. from feapder.db.mongodb import MongoDB
  class FirefoxDetails(feapder.Spider):
      """Detail-page spider driven by task documents stored in MongoDB.

      Tasks are read from the ``mgp_list`` collection (documents whose
      ``parser_name`` is ``"details_firefox"``). Each task becomes a
      ``feapder.Request`` with ``render=True``; the parse callback copies the
      task's item fields into a ``DataBakItem`` and attaches the first HTML
      fragment matched by the task's xpath list. Requests that fail are
      turned back into ``MgpListItem`` records carrying an error
      classification and an incremented failure counter.
      """

      _to_db = None  # MongoDB client, created on first use by ``to_db``
      db_name = 'mgp_list'  # collection that holds the task documents
      # Sites for which a WeChat alert has already been sent. This is a
      # class-level list, so it is shared by every instance of the spider.
      send_list = []

      @property
      def to_db(self):
          """Return the MongoDB client, creating it on first access."""
          if self._to_db is None:
              self._to_db = MongoDB()
          return self._to_db

      def start_requests(self):
          """Yield one rendered request per pending task, then delete the task.

          Each task document supplies ``parse_url``, ``item``, ``deal_detail``,
          ``parse`` (an expression naming the callback), ``render_time`` and,
          optionally, ``request_params`` (extra ``Request`` keyword arguments)
          and ``ex_python`` (a code snippet run before the request is built).
          The task is deleted from the collection when the generator is
          resumed after the corresponding ``yield``.
          """
          # sort value -1: ordered by ``date`` descending.
          task_list = self.to_db.find(
              self.db_name, {"parser_name": "details_firefox"}, sort={"date": -1}
          )
          print(task_list)
          # The loop variables keep the names ``item`` and ``request_params``
          # because the exec'd snippet and the eval'd callback expression run
          # in this scope and may refer to them (presumably also to ``self``).
          for item in task_list:
              print(item)
              request_params = item.get("request_params")
              if request_params is None:
                  # A task without extra parameters must not break the
                  # ``**request_params`` unpacking below.
                  request_params = {}
              if item.get("ex_python"):
                  # SECURITY: executes code stored in the database. Only safe
                  # while the task collection is fully trusted.
                  exec(item.get("ex_python"))
              yield feapder.Request(
                  url=item.get("parse_url"),
                  item=item.get("item"),
                  deal_detail=item.get("deal_detail"),
                  **request_params,
                  # SECURITY: ``parse`` is evaluated as a Python expression
                  # (same trust assumption as ``ex_python`` above).
                  callback=eval(item.get("parse")),
                  base_info=item,
                  render=True,
                  render_time=item.get("render_time"),
              )
              self.to_db.delete(self.db_name, item)

      def detail_get(self, request, response):
          """Build a ``DataBakItem`` from the task item plus the page's detail HTML.

          ``request.deal_detail`` is iterated as a sequence of xpath
          expressions; the first one whose ``extract_first()`` is not ``None``
          provides ``contenthtml``. If the sequence is empty the content is
          ``''``; if nothing matches it is ``None``.
          """
          print(response.text)
          list_item = DataBakItem()
          for key, value in request.item.items():
              list_item[key] = value

          html = ''
          for xpath in request.deal_detail:
              html = response.xpath(xpath).extract_first()  # tender detail content
              if html is not None:
                  break
          list_item.contenthtml = html
          yield list_item

      @staticmethod
      def _error_kind(code):
          """Map an HTTP status code (0 = no response) to an error category."""
          if 200 <= code < 300:
              # The page was downloaded, so the failure happened while parsing.
              return 'analysis'
          if 300 <= code < 500:
              return 'download'
          if code >= 500:
              return "servers"
          # Anything below 200, including the 0 used for a missing response.
          return "timeout"

      @staticmethod
      def _alert_text(mgp):
          """Build the WeChat alert message for a repeatedly failing task."""
          # Assembled line by line so that source indentation can never end
          # up inside the message.
          return "\n".join([
              "`",
              f'您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。',
              f'> **爬虫名称:** {mgp.item.get("site")}',
              f'> **栏目名称:** {mgp.item.get("channel")}',
              f'> **爬虫代码:** {mgp.item.get("spidercode")}',
              f'> **爬虫等级:** {mgp.pri}',
              f'> **所属管理人员:** {mgp.author}',
              "请登录剑鱼爬虫管理平台查看详情。",
              "`",
          ])

      def failed_request(self, request, response):
          """Turn a failed request back into a task record with failure details.

          Per the original author's note, this runs once the request/parse
          retry limit has been exceeded, so the original task information can
          be saved again with its ``failed`` field updated. The yielded
          ``MgpListItem`` carries the status code, an error category and the
          incremented failure count. For tasks with priority above 5 a WeChat
          alert is sent when the failure count reaches 10, 30, 50, 100, 200 or
          anything above 200, as long as the number of alerts already recorded
          for that site equals ``pri - 5``.
          """
          # No response at all is recorded as code 0, which classifies as
          # "timeout". Reading ``status_code`` from ``None`` would raise.
          code = 0 if response is None else response.status_code

          mgp = MgpListItem()
          mgp.code = code
          mgp.error = self._error_kind(code)
          # NOTE(review): base_info is copied after code/error are set, so a
          # "code" or "error" key present in base_info replaces the fresh
          # values. Confirm this is intended.
          for key, value in request.base_info.items():
              mgp[key] = value
          mgp.failed += 1
          if mgp.pri is None:
              mgp.pri = 0

          # Alerting depends on the spider's priority.
          if mgp.pri > 5 and (mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200):
              site = mgp.item.get("site")
              if self.send_list.count(site) == mgp.pri - 5:
                  wechat_warning(self._alert_text(mgp))
                  self.send_list.append(site)
          yield mgp

      def end_callback(self):
          """Print a notice that the spider has finished."""
          print("爬虫结束")
  if __name__ == "__main__":
      # Script entry point: build the spider on its redis key, then run it.
      spider = FirefoxDetails(redis_key="magp:details:firefox")
      spider.start()