details_login.py

# -*- coding: utf-8 -*-
"""
Created on 2021-12-13 13:25:15
---------
@summary: Detail-page handling scheme that generates cookies with a limited
    validity period and reuses them; by default no IP restriction is applied.
---------
@author: 马国鹏
"""
import copy

import execjs  # not used directly here; kept from the original imports (may be needed by exec'd snippets)
import feapder
from feapder.db.mongodb import MongoDB
from feapder.utils.tools import wechat_warning

from items.spider_item import DataBakItem, MgpListItem
from untils.cookie_pool import LoginCookiePool, PageCookiePool  # PageCookiePool is the pool instantiated below


class Details(feapder.Spider):
    _to_db = None
    db_name = 'mgp_list'
    send_list = []

    # Lazily create the MongoDB connection
    @property
    def to_db(self):
        if not self._to_db:
            self._to_db = MongoDB()
        return self._to_db

    def start_requests(self):
        while True:
            # Pull pending detail tasks, newest first
            data_list = self.to_db.find(self.db_name, {"parser_name": "details_cookie"}, sort={"date": -1})
            for item in data_list:
                request_params = item.get("request_params")
                down_mid = copy.copy(item.get("down_mid"))
                key = down_mid.get("key")
                page_url = down_mid.get("page_url")
                cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
                down_mid["cookie_pool"] = cookie_pool
                print(down_mid)
                if item.get("ex_python"):
                    # Optional per-task setup snippet stored with the task
                    exec(item.get("ex_python"))
                yield feapder.Request(url=item.get("parse_url"), item=item.get("item"),
                                      deal_detail=item.get("deal_detail"), **request_params,
                                      callback=eval(item.get("parse")),  # e.g. "self.detail_get"
                                      base_info=item, down_mid=item.get("down_mid"))
                self.to_db.delete(self.db_name, item)
            break

    def detail_get(self, request, response):
        '''Handle HTML responses.'''
        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
            # Failure handling: when "text" is set and appears in response.text,
            # drop the current cookie and regenerate one.
            down_mid = copy.copy(request.down_mid)
            key = down_mid.get("key")
            page_url = down_mid.get("page_url")
            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
            cookie_pool.del_cookie(request.cookies)
            yield request
            return
        if response.code in request.down_mid.get("code"):
            # Failure handling: when the status code is one of the codes configured
            # as failures (i.e. not a normal response), drop the cookie and regenerate one.
            down_mid = copy.copy(request.down_mid)
            key = down_mid.get("key")
            page_url = down_mid.get("page_url")
            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
            cookie_pool.del_cookie(request.cookies)
            yield request
            return
        items = request.item
        list_item = DataBakItem()
        for key in items:
            list_item[key] = items[key]
        html = ''
        for xpath in request.deal_detail:
            html = response.xpath(xpath).extract_first()  # detail content of the notice
            if html is not None:
                break
        list_item.contenthtml = html
        yield list_item

    def detail_json(self, request, response):
        '''Handle JSON and other non-HTML responses.'''
        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
            # Failure handling: when "text" is set and appears in response.text,
            # drop the current cookie and regenerate one.
            down_mid = copy.copy(request.down_mid)
            key = down_mid.get("key")
            page_url = down_mid.get("page_url")
            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
            cookie_pool.del_cookie(request.cookies)
            yield request
            return
        if response.code in request.down_mid.get("code"):
            # Failure handling: when the status code is one of the configured
            # failure codes, drop the current cookie and regenerate one.
            down_mid = copy.copy(request.down_mid)
            key = down_mid.get("key")
            page_url = down_mid.get("page_url")
            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
            cookie_pool.del_cookie(request.cookies)
            yield request
            return
        items = request.item
        list_item = DataBakItem()
        for key in items:
            list_item[key] = items[key]
        # deal_detail here is a Python snippet stored with the task; run it in an
        # explicit namespace so the html it assigns can be read back
        # (exec cannot rebind a function local).
        scope = {"request": request, "response": response, "list_item": list_item, "html": ""}
        exec(request.deal_detail, globals(), scope)
        list_item.contenthtml = scope["html"]
        yield list_item

    def failed_request(self, request, response):
        '''After the request/parse retry limit is exceeded, write the original
        task back to Mongo with its failed counter increased.'''
        mgp = MgpListItem()
        items = request.base_info
        for key in items:
            mgp[key] = items[key]
        mgp.failed += 1
        print(f'......{mgp.failed}')
        if mgp.pri > 5:
            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
                    # Alert according to the spider's priority
                    info = f'''`
Your spider has tasks that failed to request/parse more than <font color="#FF0000">{mgp.failed}</font> times.
> **Spider name:** {mgp.item.get("site")}
> **Channel:** {mgp.item.get("channel")}
> **Spider code:** {mgp.item.get("spidercode")}
> **Owner:** {mgp.author}
Please log in to the 剑鱼 (Jianyu) spider management platform for details.
`'''
                    wechat_warning(info)
                    self.send_list.append(mgp.item.get("site"))
        yield mgp

    def end_callback(self):
        print("Spider finished")

    def download_midware(self, request):
        # Attach a cookie from the pool to every outgoing request
        down_mid = request.down_mid
        key = down_mid.get("key")
        page_url = down_mid.get("page_url")
        cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
        request.cookies = cookie_pool.get_cookie()
        return request


if __name__ == "__main__":
    Details(redis_key="magp:details1").start()
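
Note: start_requests only consumes fields that already appear in the code above. A task document in the mgp_list collection therefore needs roughly the shape sketched below; the keys mirror what the spider reads, while every concrete value is an invented placeholder rather than data from the real system. For a JSON detail page the same document would carry "parse": "self.detail_json" and a Python snippet string in deal_detail instead of a list of XPaths.

# Illustrative mgp_list task document (all values are placeholders)
example_task = {
    "parser_name": "details_cookie",                    # filter used by start_requests
    "date": "2021-12-13",                               # sort key (newest first)
    "parse_url": "https://example.com/detail/1.html",   # hypothetical detail URL
    "request_params": {},                               # extra kwargs forwarded to feapder.Request
    "parse": "self.detail_get",                         # resolved with eval() into the callback
    "deal_detail": ['//div[@class="detail-content"]'],  # XPaths tried in order by detail_get
    "item": {"site": "example-site", "channel": "example-channel", "spidercode": "a_example_detail"},
    "down_mid": {
        "key": "example:cookie_pool",                   # redis key of the cookie pool
        "page_url": "https://example.com/",             # page visited to generate cookies
        "text": "captcha",                              # marker text meaning the cookie failed
        "code": [403, 521],                             # status codes meaning the cookie failed
    },
    "ex_python": None,                                  # optional setup snippet run via exec()
    "failed": 0,                                        # retry counter used by failed_request
    "pri": 6,                                           # priority threshold used for alerting
    "author": "example-author",                         # shown in the wechat warning
}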
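
The spider relies on only a small surface of the cookie pool: the constructor with redis_key, page_url and selenium, get_cookie() in download_midware, and del_cookie() in the failure branches. The real implementation lives in untils/cookie_pool.py, which is not part of this file, so the outline below is an assumed sketch of that interface rather than the actual class.

# Assumed interface of untils.cookie_pool.PageCookiePool (sketch only, not the real code)
class PageCookiePool:
    def __init__(self, redis_key, page_url=None, selenium=False, **kwargs):
        # redis_key: where cookies are cached; page_url: page visited to create them;
        # selenium: whether a real browser is used to generate the cookie
        self.redis_key = redis_key
        self.page_url = page_url
        self.selenium = selenium

    def create_cookie(self):
        # Visit page_url (optionally via selenium) and return a fresh cookie dict
        raise NotImplementedError

    def get_cookie(self):
        # Return a cookie from the redis pool, creating one if the pool is empty
        raise NotImplementedError

    def del_cookie(self, cookies):
        # Remove an invalid cookie from the pool so it is not handed out again
        raise NotImplementedError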