details_ces.py

# -*- coding: utf-8 -*-
"""
Created on 2021-12-13 13:25:15
---------
@summary: detail-page spider that consumes pending "details" tasks from the mgp_list mongo collection
---------
@author: 马国鹏
"""
import sys

sys.path.append('/app/spiders/sword_feapder/FworkSpider')

import time
from urllib.parse import urljoin

import execjs
import feapder
from feapder.db.mongodb import MongoDB
from feapder.utils.tools import wechat_warning

from items.spider_item import DataBakItem, MgpListItem
from untils.attachment import AttachmentDownloader

# NOTE: time and execjs look unused in this file, but the js / ex_python snippets
# eval'ed / exec'ed in start_requests may reference them, so they are kept.
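
# For reference only: a rough sketch of what a task document in the mgp_list
# collection is assumed to look like, inferred from the fields read below
# (the real schema is defined by the list spiders that enqueue these tasks):
#
#   {
#       "parser_name": "details",              # routes the task to this spider
#       "parse_url": "https://example.com/1",  # detail page to fetch (hypothetical url)
#       "parse": "self.detail_get",            # callback name, eval'ed in start_requests
#       "deal_detail": ['//div[@class="content"]'],   # xpaths tried for the body html
#       "item": {"spidercode": "...", "site": "...", "channel": "...", "publishtime": "..."},
#       "request_params": {},                  # extra request arguments
#       "proxies": False,                      # whether to request through a proxy
#       "files": None,                         # optional attachment-extraction rules
#       "js": None, "ex_python": None,         # optional snippets eval'ed / exec'ed first
#       "failed": 0, "pri": 0, "author": "..." # retry counter, priority, maintainer
#   }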


class Details(feapder.Spider):
    _to_db = None
    db_name = 'mgp_list'
    send_list = []

    # Define the mongo connection (created lazily on first use)
    @property
    def to_db(self):
        if not self._to_db:
            self._to_db = MongoDB()
        return self._to_db

    def start_requests(self):
        while True:
            # Pull the most recent pending "details" task for this spidercode
            data_list = self.to_db.find(
                self.db_name,
                {"parser_name": "details", "item.spidercode": "a_szsjzsczhcxpt_zbxx"},
                sort={"item.publishtime": -1},
                limit=1,
            )
            for item in data_list:
                print(item.get("item"))
                request_params = item.get("request_params")
                if item.get("js"):
                    eval(item.get("js"))
                if item.get("ex_python"):
                    exec(item.get("ex_python"))
                if item.get("proxies"):
                    yield feapder.Request(url=item.get("parse_url"), item=item.get("item"),
                                          files=item.get("files"),
                                          deal_detail=item.get("deal_detail"),
                                          callback=eval(item.get("parse")), base_info=item)
                else:
                    # Default attachment-extraction rules when the task does not carry its own
                    files = {'list_xpath': '//div[@class="info-article in active"]//div/a',
                             'url_xpath': './@href',
                             'name_xpath': './text()',
                             'files_type': ['zip', 'doxc', 'ftp', 'pdf', 'ddf'],
                             'url_key': 'http'}
                    yield feapder.Request(url=item.get("parse_url"), item=item.get("item"),
                                          files_info=files,
                                          deal_detail=item.get("deal_detail"),
                                          callback=eval(item.get("parse")), base_info=item,
                                          proxies=False)
                # Remove the task once it has been turned into a request
                self.to_db.delete(self.db_name, item)
            break  # only a single batch is processed per run
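
    # The files_info dict consumed by detail_get below supports, besides the keys
    # shown in the default above, a few optional overrides (inferred from the
    # .get() calls in detail_get; treat this as a sketch, not a formal schema):
    #   host       - base url joined onto relative attachment links
    #   file_type  - fixed extension used instead of guessing from the url / name
    #   file_name  - fixed display name used instead of the anchor text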

    def detail_get(self, request, response):
        items = request.item
        list_item = DataBakItem()
        for key in items:
            list_item.__setitem__(key, items[key])
        html = ''
        for xpath in request.deal_detail:
            html = response.xpath(xpath).extract_first()  # detail html of the bid announcement
            if html is not None:
                break
        list_item.contenthtml = html
        if request.files_info:
            files_info = request.files_info
            files = response.xpath(files_info.get("list_xpath"))
            if len(files) > 1:
                # Several attachments: download each one and store them keyed by index
                attachments = {}
                for index, info in enumerate(files):
                    file_url = info.xpath(files_info.get("url_xpath")).extract_first()
                    file_name = info.xpath(files_info.get("name_xpath")).extract_first()
                    if files_info.get("host"):
                        file_url = urljoin(files_info.get("host"), file_url)
                    if not files_info.get("file_type"):
                        file_type = file_url.split("?")[0].split(".")[-1].lower()
                    else:
                        file_type = files_info.get("file_type")
                    if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
                        attachment = AttachmentDownloader().fetch_attachment(
                            file_name=file_name, file_type=file_type, download_url=file_url,
                            enable_proxy=False)
                        attachments[index] = attachment
                list_item.projectinfo = attachments
            else:
                # Zero or one attachment: store the single attachment directly
                for info in files:
                    file_url = info.xpath(files_info.get("url_xpath")).extract_first()
                    file_name = info.xpath(files_info.get("name_xpath")).extract_first()
                    if files_info.get("host"):
                        file_url = urljoin(files_info.get("host"), file_url)
                    if files_info.get("file_name"):
                        file_name = files_info.get("file_name")
                    if files_info.get("file_type"):
                        file_type = files_info.get("file_type")
                    else:
                        file_type = file_name.split("?")[0].split(".")[-1]
                    if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
                        attachment = AttachmentDownloader().fetch_attachment(
                            file_name=file_name, file_type=file_type, download_url=file_url,
                            enable_proxy=False)
                        list_item.projectinfo = attachment
        yield list_item

    def detail_json(self, request, response):
        items = request.item
        list_item = DataBakItem()
        for key in items:
            list_item.__setitem__(key, items[key])
        # For json responses the task carries an extraction snippet instead of xpaths
        exec(request.deal_detail)
        yield list_item

    def detail_post(self, request, response):
        items = request.item
        list_item = DataBakItem()
        for key in items:
            list_item.__setitem__(key, items[key])
        exec(request.deal_detail)
        yield list_item

    def failed_request(self, request, response):
        """After the request / parse retry limit is exceeded, save the original task
        back to mongo with an error label and an increased failed counter."""
        if response is None:
            code = 0
        else:
            code = response.status_code
        err_dic = {"200": "analysis", "400": "download", "500": "servers", "300": "download"}
        if 200 <= code < 300:
            err = 'analysis'
        elif 300 <= code < 400:
            err = 'download'
        elif 400 <= code < 500:
            err = 'download'
        elif 500 <= code:
            err = 'servers'
        else:
            err = 'timeout'
        mgp = MgpListItem()
        mgp.code = code
        mgp.error = err
        items = request.base_info
        for key in items:
            mgp.__setitem__(key, items[key])
        mgp.failed += 1
        if mgp.pri is None:
            mgp.pri = 0
        if mgp.pri > 5:
            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
                # Alert frequency depends on the spider's priority
                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
                    info = f'''`
Your spider has tasks that failed more than <font color="#FF0000">{mgp.failed}</font> times during request or parsing.
> **Site:** {mgp.item.get("site")}
> **Channel:** {mgp.item.get("channel")}
> **Spider code:** {mgp.item.get("spidercode")}
> **Spider priority:** {mgp.pri}
> **Maintainer:** {mgp.author}
Please log in to the 剑鱼 spider management platform for details.
`'''
                    wechat_warning(info)
                    self.send_list.append(mgp.item.get("site"))
        yield mgp

    def end_callback(self):
        print("spider finished")


if __name__ == "__main__":
    Details(redis_key="magp:details1").start()
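
# For illustration only: a minimal sketch of how a list spider might enqueue a task
# that this detail spider later picks up. Field names follow the reads in
# start_requests; the url, xpaths and spidercode below are hypothetical placeholders,
# and persistence into mgp_list is assumed to be handled by the project's pipeline.
#
#   task = MgpListItem()
#   task.parser_name = "details"
#   task.parse = "self.detail_get"
#   task.parse_url = "https://example.com/notice/123.html"
#   task.deal_detail = ['//div[@class="detail-content"]']
#   task.item = {"spidercode": "a_example_site", "site": "example", "channel": "tenders",
#                "publishtime": "2021-12-13"}
#   task.proxies = False
#   yield task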