# detail_ztlbw.py
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on 2021-12-13 13:25:15
  4. ---------
  5. @summary:
  6. ---------
  7. @author: 马国鹏
  8. """
  9. import feapder
  10. from feapder.utils.log import Log
  11. from feapder.utils.tools import wechat_warning
  12. from items.spider_item import DataBakItem, MgpListItem
  13. from feapder.db.mongodb import MongoDB
  14. from login_pool.zglbw import ZglbwPool
  15. from untils.attachment import AttachmentDownloader
  16. Log().info("")
  17. class FirefoxDetails(feapder.Spider):
  18. _to_db = None
  19. db_name = 'mgp_list'
  20. send_list = []
  21. # 定义mongo链接
  22. @property
  23. def to_db(self):
  24. if not self._to_db:
  25. self._to_db = MongoDB()
  26. return self._to_db
  27. def start_requests(self):
  28. while True:
  29. data_lsit = self.to_db.find(self.db_name, {"parser_name": "details_ztlbw", "item.spidercode": "a_ztlbsww_jzxtp"},
  30. sort={"date": -1}, limit=1)
  31. print(data_lsit)
  32. for item in data_lsit:
  33. url = item.get("parse_url")
  34. url = "https://eproport.crecgec.com/#/notice/notice-detail?projectId=1484412339522916354&tenantId=1&indexnumber=0"
  35. cookie = ZglbwPool(table_userbase='zglbw', redis_key='zglbw')
  36. cookie = cookie.get_cookie().cookie
  37. yield feapder.Request(url=url, item=item.get("item"),
  38. callback=self.detail_get, base_info=item, render=True,
  39. render_time=3, proxies=False, cookies=cookie)
  40. self.to_db.delete(self.db_name, item)
  41. break
  42. def detail_get(self, request, response):
  43. items = request.item
  44. # print(items)
  45. list_item = DataBakItem()
  46. for key in items:
  47. list_item.__setitem__(key, items[key])
  48. html = ''
  49. xpath_list = ['//div[@class="ant-col ant-col-xs-6 ant-col-sm-6 ant-col-lg-12"][1]',
  50. '//div[@class="luban-bid-details ant-row ng-star-inserted"][2]',
  51. '//div[@class="login ng-star-inserted"]']
  52. for xpath in xpath_list:
  53. # import pdb
  54. # pdb.set_trace()
  55. html_one = response.xpath(xpath).extract_first()
  56. if html_one is not None:
  57. html += '\n' # 标书详细内容
  58. html += html_one # 拼接html
  59. print(html)
  60. list_item.contenthtml = html
  61. files_list = response.xpath("//iframe/@src").extract_first()
  62. file_url = files_list.split("file=")[-1]
  63. file_url = file_url.replace("%3A", ":").replace("%2F", "/").replace("%3F", "?").replace("%3D", "=")
  64. attachments = {}
  65. file_name = list_item.title
  66. attachment = AttachmentDownloader().fetch_attachment(
  67. file_name=file_name, file_type='pdf', download_url=file_url,
  68. enable_proxy=False)
  69. attachments["0"] = attachment
  70. list_item.projectinfo = {"attachments": attachments}
  71. yield list_item
  72. def failed_request(self, request, response):
  73. '''请求、解析次数超过上限后,将原信息重新保存至mongo,并修改failed字段'''
  74. if response is None:
  75. code = 0
  76. code = response.status_code
  77. err_dic = {"200": "analysis", "400": "download", "500": "servers", "300": "download"}
  78. if 200 <= code < 300:
  79. err = 'analysis'
  80. elif 300 <= code < 400:
  81. err = 'download'
  82. elif 400 <= code < 500:
  83. err = 'download'
  84. elif 500 <= code:
  85. err = "servers"
  86. else:
  87. err = "timeout"
  88. mgp = MgpListItem()
  89. mgp.code = code
  90. mgp.error = err
  91. items = request.base_info
  92. for key in items:
  93. mgp.__setitem__(key, items[key])
  94. mgp.failed += 1
  95. if mgp.pri is None:
  96. mgp.pri = 0
  97. if mgp.pri > 5:
  98. if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
  99. if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
  100. '''
  101. 根据爬虫优先级报警'''
  102. info = f'''`
  103. 您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
  104. > **爬虫名称:** {mgp.item.get("site")}
  105. > **栏目名称:** {mgp.item.get("channel")}
  106. > **爬虫代码:** {mgp.item.get("spidercode")}
  107. > **爬虫等级:** {mgp.pri}
  108. > **所属管理人员:** {mgp.author}
  109. 请登录剑鱼爬虫管理平台查看详情。
  110. `'''
  111. wechat_warning(info)
  112. self.send_list.append(mgp.item.get("site"))
  113. yield mgp
  114. def end_callback(self):
  115. print("爬虫结束")
  116. # wechat_warning(f"爬虫名称 爬虫结束\n共抓取{self.count}次详情页数据")
  117. # def download_midware(self, request):
  118. # request.proxies = self.prox_pool.get()
  119. # return request
  120. if __name__ == "__main__":
  121. FirefoxDetails(redis_key="magp:details:ztlbw").start()