details_cookie.py

# -*- coding: utf-8 -*-
"""
Created on 2021-12-13 13:25:15
---------
@summary: Detail-page handler that generates cookies with a limited lifetime
    and reuses them; no IP restriction is applied by default.
---------
@author: 马国鹏
"""
import sys

sys.path.append('/app/spiders/sword_feapder/FworkSpider')

import copy

import execjs  # kept: dynamically exec'd task snippets may rely on it
import feapder
from feapder.db.mongodb import MongoDB
from feapder.utils.tools import wechat_warning
from items.spider_item import DataBakItem, MgpListItem
from untils.cookie_pool import PageCookiePool


class Details(feapder.Spider):
    _to_db = None
    db_name = 'mgp_list'
    send_list = []

    # Lazily initialised MongoDB connection
    @property
    def to_db(self):
        if not self._to_db:
            self._to_db = MongoDB()
        return self._to_db
    def start_requests(self):
        while True:
            data_list = self.to_db.find(self.db_name, {"parser_name": "details_cookie"}, sort={"date": -1})
            for item in data_list:
                request_params = item.get("request_params")
                if item.get("ex_python"):
                    # Run any task-specific setup code shipped with the task document
                    exec(item.get("ex_python"))
                yield feapder.Request(url=item.get("parse_url"), item=item.get("item"),
                                      deal_detail=item.get("deal_detail"), **request_params,
                                      callback=eval(item.get("parse")), base_info=item,
                                      down_mid=item.get("down_mid"))
                self.to_db.delete(self.db_name, item)
            break
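
    # A task document in the `mgp_list` collection is expected to look roughly
    # like this sketch, judging from the fields read above (the values are
    # illustrative assumptions, not taken from real data):
    #
    #     {
    #         "parser_name": "details_cookie",
    #         "parse_url": "https://example.com/notice/123.html",
    #         "parse": "self.detail_get",
    #         "deal_detail": ['//div[@class="detail-content"]'],
    #         "request_params": {},
    #         "down_mid": {"key": "...", "page_url": "...", "text": None, "code": []},
    #         "item": {"site": "...", "channel": "...", "spidercode": "..."},
    #         "date": "2021-12-13",
    #     }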
    def detail_get(self, request, response):
        '''Handle HTML-format responses.'''
        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
            # Failure handling: when "text" is set and appears in response.text,
            # drop the current cookie and regenerate one.
            down_mid = copy.copy(request.down_mid)
            key = down_mid.get("key")
            page_url = down_mid.get("page_url")
            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
            cookie_pool.del_cookie(request.cookies)
            yield request  # re-queue the request with a fresh cookie
            return  # do not also emit a data item for the failed response
        if response.code in request.down_mid.get("code"):
            # Failure handling: the status code matches one of the failure codes
            # configured in down_mid, so drop the current cookie and regenerate one.
            down_mid = copy.copy(request.down_mid)
            key = down_mid.get("key")
            page_url = down_mid.get("page_url")
            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
            cookie_pool.del_cookie(request.cookies)
            yield request
            return
        items = request.item
        list_item = DataBakItem()
        for key in items:
            list_item[key] = items[key]
        html = ''
        for xpath in request.deal_detail:
            html = response.xpath(xpath).extract_first()  # tender detail content
            if html is not None:
                break
        list_item.contenthtml = html
        yield list_item
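
    # The down_mid dict drives the failure checks above. A minimal sketch,
    # assuming a captcha marker string and anti-bot status codes (the values
    # are illustrative, not from a real task):
    #
    #     {
    #         "key": "details_cookie_pool",            # Redis key of the cookie pool
    #         "page_url": "https://example.com/list",  # page used to mint cookies
    #         "text": "请输入验证码",                   # body substring marking a blocked response
    #         "code": [403, 521],                      # status codes treated as failures
    #     }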
    def detail_json(self, request, response):
        '''Handle JSON and other non-HTML responses.'''
        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
            # Failure handling: "text" is set and appears in response.text, so
            # drop the current cookie and regenerate one.
            down_mid = copy.copy(request.down_mid)
            key = down_mid.get("key")
            page_url = down_mid.get("page_url")
            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
            cookie_pool.del_cookie(request.cookies)
            yield request
            return
        if response.code in request.down_mid.get("code"):
            # Failure handling: the status code matches one of the configured
            # failure codes, so drop the current cookie and regenerate one.
            down_mid = copy.copy(request.down_mid)
            key = down_mid.get("key")
            page_url = down_mid.get("page_url")
            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
            cookie_pool.del_cookie(request.cookies)
            yield request
            return
        items = request.item
        list_item = DataBakItem()
        for key in items:
            list_item[key] = items[key]
        # exec() cannot rebind function locals in Python 3, so run the snippet
        # in an explicit namespace and read `html` back from it.
        scope = {"request": request, "response": response, "html": ''}
        exec(request.deal_detail, scope)
        list_item.contenthtml = scope.get("html")
        yield list_item
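
    # For detail_json, "deal_detail" is a Python source string that extracts
    # the content and assigns it to `html`. An illustrative snippet (the JSON
    # field names are assumptions):
    #
    #     html = response.json.get("data", {}).get("content")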
    def failed_request(self, request, response):
        '''After the request/parse retry limit is exceeded, write the original
        task back to mongo with an error classification and a bumped "failed" count.'''
        if response is None:
            code = 0
        else:
            code = response.status_code
        if 200 <= code < 300:
            err = 'analysis'
        elif 300 <= code < 400:
            err = 'download'
        elif 400 <= code < 500:
            err = 'download'
        elif 500 <= code:
            err = 'servers'
        else:
            err = 'timeout'
        mgp = MgpListItem()
        mgp.code = code
        mgp.error = err
        items = request.base_info
        for key in items:
            mgp[key] = items[key]
        mgp.failed += 1
        if mgp.pri is None:
            mgp.pri = 0
        if mgp.pri > 5:
            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
                    # Alert according to spider priority.
                    info = f'''`
Your spider has tasks that failed to request or parse more than <font color="#FF0000">{mgp.failed}</font> times.
> **Spider name:** {mgp.item.get("site")}
> **Channel name:** {mgp.item.get("channel")}
> **Spider code:** {mgp.item.get("spidercode")}
> **Spider level:** {mgp.pri}
> **Owner:** {mgp.author}
Please log in to the 剑鱼 spider management platform for details.
`'''
                    wechat_warning(info)
                    self.send_list.append(mgp.item.get("site"))
        yield mgp
    def end_callback(self):
        print("Spider finished")

    def download_midware(self, request):
        # Attach a pooled cookie to every outgoing request.
        down_mid = request.down_mid
        key = down_mid.get("key")
        page_url = down_mid.get("page_url")
        cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
        request.cookies = cookie_pool.get_cookie()
        return request
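
    # PageCookiePool (untils.cookie_pool) is used here as follows: get_cookie()
    # hands out a cookie dict from the Redis-backed pool keyed by down_mid["key"],
    # minting new cookies from page_url when needed, and del_cookie(cookies)
    # evicts a cookie that triggered a failure check. That contract is inferred
    # from the calls in this file, not from the pool's own source.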

if __name__ == "__main__":
    Details(redis_key="magp:details1").start()