# -*- coding: utf-8 -*-
"""
Created on 2024-03-22
---------
@summary: generic detail-page collector
---------
@author:
"""
# execjs/time/json/re look unused, but they stay importable for the
# task-supplied snippets run via eval()/exec() in start_requests.
import execjs
import time
import json
import re
from urllib.parse import urljoin

import feapder
from items.spider_item import DataBakItem
from untils.attachment import AttachmentDownloader
from untils.tools import remove_htmldata, extract_file_type
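

# Flow: start_requests pulls task dicts from RabbitMQ, builds one
# feapder.Request per task, and routes each response to the callback the
# task names (detail_get / detail_json / detail_post), which yields a
# DataBakItem.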
class Details(feapder.BiddingDetailSpider):

    def start_requests(self):
        data_list = self.get_tasks_by_rabbitmq(limit=500, timeout=60)
        for item in data_list:
            # guard against tasks that carry no request_params at all
            request_params = item.get("request_params") or {}
            timeout = request_params.pop('timeout', 10)
            if item.get("js"):
                # task-supplied expression evaluated before the request is built
                eval(item.get("js"))
            if item.get("ex_python"):
                # task-supplied python statements executed for per-task setup
                exec(item.get("ex_python"))
            if item.get("proxies"):
                yield feapder.Request(url=item.get("parse_url"),
                                      timeout=timeout,
                                      callback=eval(item.get("parse")),
                                      item=item,
                                      files_info=item.get("files"),
                                      deal_detail=item.get("deal_detail"),
                                      **request_params)
            else:
                yield feapder.Request(url=item.get("parse_url"),
                                      proxies=False,
                                      timeout=timeout,
                                      callback=eval(item.get("parse")),
                                      item=item,
                                      files_info=item.get("files"),
                                      deal_detail=item.get("deal_detail"),
                                      **request_params)
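
    # Illustrative task message consumed by start_requests (all values are
    # hypothetical; the key names are the ones read above):
    # {
    #     "parse_url": "http://www.example.com/notice/1.html",
    #     "parse": "self.detail_get",        # eval'd into the callback
    #     "request_params": {"timeout": 30},
    #     "proxies": True,
    #     "deal_detail": ["//div[@class='detail-content']"],
    #     "files": {...},                    # see the files_info note below
    # }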

    def detail_get(self, request, response):
        items = request.item
        data_item = DataBakItem(**items)
        html = ''
        for xpath in request.deal_detail:
            htmls = response.xpath(xpath).extract_first()  # bid-notice detail content
            if request.to_dict.get('conn_html', None):
                # concatenate every matching fragment
                if htmls is not None:
                    html += htmls
            else:
                # otherwise keep the first xpath that matches
                if htmls is not None:
                    html = htmls
                    break

        if request.to_dict.get('rm_list', None) and html:
            rm_list = request.rm_list
            html = remove_htmldata(rm_list, html, response)

        if request.to_dict.get('title_xpath', None):
            for sxpath in request.title_xpath:
                title = response.xpath(sxpath).extract_first("").strip()  # detail-page title
                if title:
                    data_item.title = title
                    break

        data_item.contenthtml = html

        attachments = {}
        if request.files_info:
            files_info = request.files_info
            files = response.xpath(files_info.get('list_xpath'))
            for info in files:
                file_url = info.xpath(files_info.get('url_xpath')).extract_first()
                file_name = info.xpath(files_info.get('name_xpath')).extract()
                if not file_url or not file_name:
                    continue

                file_name = ''.join(''.join(file_name).split()).strip()
                if files_info.get('host'):
                    file_url = urljoin(files_info.get('host'), file_url)

                if not files_info.get('file_type'):
                    file_type = extract_file_type(file_name, file_url)
                else:
                    file_type = files_info.get('file_type')

                fpx = request.get_proxies() or False

                # only download urls that pass the task's url_key filter
                if file_type and files_info.get('url_key') in file_url:
                    attachment = AttachmentDownloader().fetch_attachment(
                        file_name=file_name,
                        file_type=file_type,
                        download_url=file_url,
                        headers=request.to_dict.get('headers', None),
                        proxies=fpx,
                    )
                    attachments[str(len(attachments) + 1)] = attachment

        if len(attachments) > 0:
            data_item.projectinfo = {'attachments': attachments}

        yield data_item
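
    # Illustrative files_info shape driving the attachment loop above (values
    # are hypothetical):
    # {
    #     "list_xpath": "//div[@class='attachments']//a",
    #     "url_xpath": "./@href",
    #     "name_xpath": "./text()",
    #     "host": "http://www.example.com",  # optional base for relative urls
    #     "file_type": "pdf",                # optional, else inferred
    #     "url_key": "/file/",               # substring filter on file_url
    # }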

    def detail_json(self, request, response):
        items = request.item
        list_item = DataBakItem(**items)
        # the task-supplied snippet parses the JSON response into list_item
        exec(request.deal_detail)
        yield list_item

    def detail_post(self, request, response):
        items = request.item
        data_item = DataBakItem(**items)
        # same pattern as detail_json: the task script populates data_item
        exec(request.deal_detail)
        yield data_item
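
    # For these exec-based handlers the task's deal_detail holds a python
    # snippet run with `request`/`response` in scope; a hypothetical example:
    #     list_item.contenthtml = json.loads(response.text)["data"]["content"]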


if __name__ == "__main__":
    Details(redis_key="detail:normal_details", thread_count=10).start()