detail_firefox.py

# -*- coding: utf-8 -*-
"""
Created on 2023-08-07
---------
@summary: selenium-based FIREFOX detail collection
---------
@author:
"""
# time/json/re are not used directly in this file; they stay importable for
# task-supplied ex_python snippets executed in start_requests
import time
import json
import re
from urllib.parse import urljoin, quote

import feapder
from items.spider_item import DataBakItem
from untils.attachment import AttachmentDownloader
from untils.tools import remove_htmldata, extract_file_type

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Pragma": "no-cache",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
}
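
# Shared base headers for attachment downloads; detail_get below adds a
# Referer to a per-request copy so concurrent threads never mutate this dict.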

DRISSIONPAGE = dict(
    pool_size=10,  # number of browser tabs
    user_agent=None,  # user-agent string
    load_images=False,  # whether to load images
    proxy=None,  # xxx.xxx.xxx.xxx:xxxx
    headless=True,  # whether to run the browser headless
    timeout=30,  # request timeout (seconds)
    retry=1,  # retry count on connection failure
    interval=0.5,  # retry interval on connection failure (seconds)
    page_load=30,
    render_time=0,  # render duration, i.e. how long to wait for the page to finish loading
    window_size=(1024, 800),  # window size
    driver_type="chromium",
    load_mode="normal",  # page load strategy; one of "normal", "eager", "none"
    download_path=None,  # directory for downloaded files
    custom_argument=[
        "--no-sandbox",
        "--ignore-certificate-errors"
    ]
)


class Spider(feapder.BiddingDetailSpider):

    __custom_setting__ = dict(
        PROXY_EXTRACT_API="http://172.17.162.28:16001/sam",
        PROXY_POOL="feapder.network.proxy_pool.SpringBoardProxyPool",
        DRISSIONPAGE=DRISSIONPAGE
    )

    def start_requests(self):
        data_list = self.get_tasks_by_rabbitmq(limit=100, timeout=60)
        for item in data_list:
            request_params = item.get('request_params') or {}
            timeout = request_params.pop('timeout', 30)
            if item.get('ex_python'):
                # run task-supplied setup code (may use the time/json/re imports above)
                exec(item.get('ex_python'))
            yield feapder.Request(url=item.get('parse_url'),
                                  timeout=timeout,
                                  render=True,
                                  render_time=item.get('render_time', 5),
                                  callback=eval(item.get('parse')),  # resolve callback from the task, e.g. "self.detail_get"
                                  item=item,
                                  files_info=item.get('files'),
                                  deal_detail=item.get('deal_detail'),
                                  **request_params)
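
    # Sketch of the task message this spider appears to consume from RabbitMQ.
    # The keys mirror how `item` is read above; the sample values are
    # illustrative assumptions, not real queue data:
    #
    # {
    #     "parse_url": "http://example.com/notice/123.html",  # page to render
    #     "parse": "self.detail_get",                         # eval'd to the callback
    #     "render_time": 5,                                   # seconds to wait after load
    #     "request_params": {"timeout": 30},                  # extra Request kwargs
    #     "ex_python": None,                                  # optional setup code to exec
    #     "deal_detail": ["//div[@class='content']"],         # xpaths for the detail html
    #     "conn_html": False,                                 # concatenate all matches?
    #     "rm_list": [],                                      # fragments to strip from html
    #     "files": {...}                                      # attachment config, see the sketch after detail_get
    # }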

    def detail_get(self, request, response):
        items = request.item
        data_item = DataBakItem(**items)

        html = ''
        for xpath in request.deal_detail:
            htmls = response.xpath(xpath).extract_first()  # detail content of the bid notice
            if request.to_dict.get('conn_html', None):
                # concatenate every matched fragment
                if htmls is not None:
                    html += htmls
            else:
                # keep only the first matched fragment
                if htmls is not None:
                    html = htmls
                    break

        if request.to_dict.get('rm_list', None) and html:
            rm_list = request.rm_list
            html = remove_htmldata(rm_list, html, response)

        data_item.contenthtml = html

        attachments = {}
        if request.files_info:
            files_info = request.files_info
            files = response.xpath(files_info.get('list_xpath'))
            for index, info in enumerate(files):
                file_url = info.xpath(files_info.get('url_xpath')).extract_first()
                file_name = info.xpath(files_info.get('name_xpath')).extract()
                if not file_url or not file_name:
                    continue
                file_name = ''.join(''.join(file_name).split())
                if files_info.get('host'):
                    file_url = urljoin(files_info.get('host'), file_url)
                if not files_info.get('file_type'):
                    file_type = extract_file_type(file_name, file_url)
                else:
                    file_type = files_info.get('file_type')

                fpx = request.get_proxies() or False
                cookie_json = response.cookies.get_dict() or {}
                if file_type and files_info.get('url_key') in file_url:
                    # use a per-request copy so concurrent threads don't race on the shared headers dict
                    req_headers = dict(headers)
                    req_headers['Referer'] = quote(file_url, safe=';/?:@&=+$,', encoding='utf-8')
                    attachment = AttachmentDownloader().fetch_attachment(
                        file_name=file_name,
                        file_type=file_type,
                        download_url=file_url,
                        headers=req_headers,
                        proxies=fpx,
                        cookies=cookie_json)
                    attachments[str(len(attachments) + 1)] = attachment

        if attachments:
            data_item.projectinfo = {'attachments': attachments}

        yield data_item
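
    # Sketch of the `files` / files_info attachment config read in detail_get.
    # The keys match the .get() calls above; the sample values are assumptions:
    #
    # {
    #     "list_xpath": "//div[@class='content']//a[@href]",  # nodes holding attachments
    #     "url_xpath": "./@href",                             # url relative to each node
    #     "name_xpath": "./text()",                           # file name per node
    #     "host": "http://example.com",                       # base for urljoin, optional
    #     "file_type": None,                                  # fixed type, else inferred from name/url
    #     "url_key": "http"                                   # substring a url must contain
    # }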


if __name__ == "__main__":
    Spider(redis_key='detail:firefox', thread_count=10).start()