detail_dtcookie.py 7.9 KB

# -*- coding: utf-8 -*-
"""
Created on 2021-12-13 13:25:15
---------
@summary: Detail-page handler that generates cookies with a limited validity
          period and uses them for detail requests; by default no IP restriction.
---------
@author: 马国鹏
"""
import sys

sys.path.append('/app/spiders/sword_feapder/FworkSpider')

import copy
from urllib.parse import urljoin

import execjs  # may be referenced by code strings exec'd from task documents
import feapder
from feapder.db.mongodb import MongoDB
from feapder.utils.tools import wechat_warning

from items.spider_item import DataBakItem, MgpListItem
from untils.attachment import AttachmentDownloader
from untils.cookie_pool import PageCookiePool  # cookie-pool classes are resolved via eval()
from dtcookie_pool import *
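
# The tasks consumed by start_requests() live in the ``mgp_list`` collection.
# A minimal sketch of such a task document is shown below. The field names are
# the ones this spider reads; the concrete values are illustrative assumptions
# only (URLs, xpaths and the cookie-pool expression are hypothetical, and the
# real PageCookiePool constructor arguments depend on untils.cookie_pool).
EXAMPLE_TASK = {
    "parser_name": "details",
    "parse_url": "http://www.hefei.gov.cn/example/detail.html",  # hypothetical detail URL
    "parse": "self.detail_get",                  # eval'd in start_requests to pick the callback
    "deal_detail": ['//div[@class="article"]'],  # xpaths tried in order by detail_get
    "request_params": {},                        # extra keyword arguments for feapder.Request
    "proxies": False,
    "ex_python": None,                           # optional code string exec'd before building the request
    "files": {                                   # attachment extraction config (passed as files_info)
        "list_xpath": '//div[@class="attachments"]//a',
        "url_xpath": "./@href",
        "name_xpath": "./text()",
        "files_type": ["pdf", "doc", "docx", "xls", "xlsx", "zip"],
        "url_key": "http",
        "host": "http://www.hefei.gov.cn",
    },
    "down_mid": {
        "cookie_pool": "PageCookiePool()",       # expression eval'd to build the pool; real arguments depend on untils.cookie_pool
        "code": [403, 521],                      # status codes that trigger cookie invalidation and a retry
    },
    "item": {"site": "合肥市人民政府", "channel": "...", "spidercode": "..."},
    "failed": 0,
    "pri": 1,
    "author": "马国鹏",
    "date": "2021-12-13",
}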


class Details(feapder.Spider):
    _to_db = None
    db_name = 'mgp_list'
    send_list = []

    # Lazily created MongoDB connection
    @property
    def to_db(self):
        if not self._to_db:
            self._to_db = MongoDB()
        return self._to_db

    def start_requests(self):
        while True:
            # Pull one pending detail task for the target site and turn it into a Request
            data_list = self.to_db.find(self.db_name, {"parser_name": "details", "item.site": "合肥市人民政府"},
                                        sort={"date": -1}, limit=1)
            for item in data_list:
                request_params = item.get("request_params")
                if item.get("ex_python"):
                    # Optional per-task setup code shipped with the task document
                    exec(item.get("ex_python"))
                if item.get("proxies"):
                    yield feapder.Request(url=item.get("parse_url"), item=item.get("item"),
                                          deal_detail=item.get("deal_detail"),
                                          callback=eval(item.get("parse")), base_info=item,
                                          files_info=item.get("files"),
                                          down_mid=item.get("down_mid"), **request_params)
                else:
                    yield feapder.Request(url=item.get("parse_url"), item=item.get("item"), files=item.get("files"),
                                          deal_detail=item.get("deal_detail"), down_mid=item.get("down_mid"),
                                          files_info=item.get("files"),
                                          callback=eval(item.get("parse")), base_info=item,
                                          proxies=False, **request_params)
                self.to_db.delete(self.db_name, item)
            break

    def detail_get(self, request, response):
        """Parse an HTML response."""
        if response.code in request.down_mid.get("code"):
            # Failure handling: when the status code is one of the configured bad codes,
            # drop the current cookie, regenerate one, and retry the request.
            down_mid = request.down_mid
            cookie_pool_class = down_mid.get("cookie_pool")
            cookie_pool = eval(cookie_pool_class)
            cookie_pool.del_cookie(request.cookies)
            yield request
            return

        items = request.item
        list_item = DataBakItem()
        for key in items:
            list_item.__setitem__(key, items[key])

        html = ''
        for xpath in request.deal_detail:
            html = response.xpath(xpath).extract_first()  # tender detail content
            if html is not None:
                break
        list_item.contenthtml = html

        if request.files_info:
            files_info = request.files_info
            files = response.xpath(files_info.get("list_xpath"))
            if len(files) > 0:
                attachments = {}
                for index, info in enumerate(files):
                    file_url = info.xpath(files_info.get("url_xpath")).extract_first()
                    file_name = info.xpath(files_info.get("name_xpath")).extract_first()
                    if files_info.get("host"):
                        file_url = urljoin(files_info.get("host"), file_url)
                    if not files_info.get("file_type"):
                        file_type = file_url.split("?")[0].split(".")[-1].lower()
                    else:
                        file_type = files_info.get("file_type")
                    if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
                        attachment = AttachmentDownloader().fetch_attachment(
                            file_name=file_name, file_type=file_type, download_url=file_url,
                            enable_proxy=False)
                        attachments[len(attachments) + 1] = attachment
                if len(attachments) > 0:
                    list_item.projectinfo = {"attachment": attachments}
        yield list_item

    def detail_json(self, request, response):
        """Parse a JSON (or other non-HTML) response."""
        if response.code in request.down_mid.get("code"):
            # Failure handling: on a configured bad status code, drop the current
            # cookie, regenerate one, and retry the request.
            down_mid = request.down_mid
            cookie_pool_class = down_mid.get("cookie_pool")
            cookie_pool = eval(cookie_pool_class)
            cookie_pool.del_cookie(request.cookies)
            yield request
            return

        items = request.item
        list_item = DataBakItem()
        for key in items:
            list_item.__setitem__(key, items[key])

        # deal_detail is a code string shipped with the task document and is expected
        # to populate ``html``. exec() cannot rebind a function local, so run it
        # against an explicit namespace and read ``html`` back from it.
        namespace = {"request": request, "response": response, "list_item": list_item, "html": ""}
        exec(request.deal_detail, globals(), namespace)
        list_item.contenthtml = namespace["html"]
        yield list_item

    def failed_request(self, request, response):
        """After retries/parsing attempts exceed the limit, write the original task
        back to MongoDB with its failure counter and error type updated."""
        if response is None:
            code = 0
        else:
            code = response.status_code
        if 200 <= code < 300:
            err = 'analysis'
        elif 300 <= code < 400:
            err = 'download'
        elif 400 <= code < 500:
            err = 'download'
        elif 500 <= code:
            err = 'servers'
        else:
            err = 'timeout'
        mgp = MgpListItem()
        mgp.code = code
        mgp.error = err
        items = request.base_info
        for key in items:
            mgp.__setitem__(key, items[key])
        mgp.failed += 1
        if mgp.pri is None:
            mgp.pri = 0
        if mgp.pri > 5:
            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
                    # Alert escalation based on the spider's priority level
                    info = f'''`
您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
> **爬虫名称:** {mgp.item.get("site")}
> **栏目名称:** {mgp.item.get("channel")}
> **爬虫代码:** {mgp.item.get("spidercode")}
> **爬虫等级:** {mgp.pri}
> **所属管理人员:** {mgp.author}
请登录剑鱼爬虫管理平台查看详情。
`'''
                    wechat_warning(info)
                    self.send_list.append(mgp.item.get("site"))
        yield mgp

    def end_callback(self):
        print("爬虫结束")

    def download_midware(self, request):
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Host": "www.hefei.gov.cn",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
        }
        # Build the cookie-pool object named in the task's down_mid config and
        # attach a fresh cookie to every outgoing request.
        down_mid = request.down_mid
        cookie_pool_class = down_mid.get("cookie_pool")
        cookie_pool = eval(cookie_pool_class)
        request.cookies = cookie_pool.get_cookie()
        request.headers = headers
        return request
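

# For reference: detail_get/detail_json and download_midware rely on exactly two
# methods of whatever object eval(down_mid["cookie_pool"]) returns. The class
# below is a minimal stand-in sketching that assumed interface; it is NOT the
# real untils.cookie_pool.PageCookiePool implementation.
class ExampleCookiePool:
    def get_cookie(self):
        # Return a cookie mapping to attach to an outgoing request.
        return {}

    def del_cookie(self, cookies):
        # Invalidate a cookie that produced one of the configured bad status codes,
        # so the next get_cookie() call hands out a fresh one.
        pass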


if __name__ == "__main__":
    Details(redis_key="magp:details1").start()