# -*- coding: utf-8 -*-
"""
Created on 2024-03-22
---------
@summary: general-purpose detail-page collector
---------
@author:
"""
import execjs  # available to task-supplied "js" snippets evaluated via eval() below
import time    # these stdlib modules are kept importable for task-supplied
import json    # "ex_python"/"deal_detail" snippets executed via exec()
import re
from urllib.parse import urljoin

import feapder

from items.spider_item import DataBakItem
from untils.attachment import AttachmentDownloader
from untils.tools import remove_htmldata, extract_file_type

class Details(feapder.BiddingDetailSpider):

    def start_requests(self):
        # Pull up to 500 pending detail tasks from the RabbitMQ queue.
        data_list = self.get_tasks_by_rabbitmq(limit=500, timeout=60)
        for item in data_list:
            request_params = item.get("request_params") or {}
            timeout = request_params.pop('timeout', 10)
            # Optional task-supplied hooks, evaluated before building the request.
            if item.get("js"):
                eval(item.get("js"))
            if item.get("ex_python"):
                exec(item.get("ex_python"))
            # Unless the task asks for proxies, pass proxies=False so the
            # framework skips its proxy pool.
            proxy_kwargs = {} if item.get("proxies") else {"proxies": False}
            yield feapder.Request(url=item.get("parse_url"),
                                  timeout=timeout,
                                  callback=eval(item.get("parse")),  # e.g. "self.detail_get"
                                  item=item,
                                  files_info=item.get("files"),
                                  deal_detail=item.get("deal_detail"),
                                  **proxy_kwargs,
                                  **request_params)
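
    # A minimal sketch of one task message as consumed above. The field names
    # come from start_requests/detail_get in this file; all values are
    # hypothetical examples:
    #
    #   {
    #       "parse_url": "http://www.example.com/notice/1.html",  # page to fetch
    #       "parse": "self.detail_get",          # callback, resolved via eval()
    #       "deal_detail": ["//div[@class='content']"],  # xpaths tried in order
    #       "request_params": {"timeout": 15},   # extra feapder.Request kwargs
    #       "proxies": True,                     # falsy -> request proxies disabled
    #       "files": {...},                      # see the files_info sketch below
    #   }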
    def detail_get(self, request, response):
        items = request.item
        data_item = DataBakItem(**items)
        html = ''
        for xpath in request.deal_detail:
            htmls = response.xpath(xpath).extract_first()  # detail content of the bid document
            if request.to_dict.get('conn_html', None):
                # conn_html: concatenate every matching fragment.
                if htmls is not None:
                    html += htmls
            else:
                # Default: keep the first xpath that matches and stop.
                if htmls is not None:
                    html = htmls
                    break
        # Strip task-specified noise nodes from the extracted html.
        if request.to_dict.get('rm_list', None) and html:
            rm_list = request.rm_list
            html = remove_htmldata(rm_list, html, response)
        # Optionally re-extract the title from the detail page itself.
        if request.to_dict.get('title_xpath', None):
            for sxpath in request.title_xpath:
                title = response.xpath(sxpath).extract_first("").strip()  # detail-page title
                if title:
                    data_item.title = title
                    break
        data_item.contenthtml = html

        # Download any attachments described by the task's files_info block.
        attachments = {}
        if request.files_info:
            files_info = request.files_info
            files = response.xpath(files_info.get('list_xpath'))
            # Reuse the request's proxies (if any) for the attachment downloads.
            fpx = request.get_proxies() or False
            for info in files:
                file_url = info.xpath(files_info.get('url_xpath')).extract_first()
                file_name = info.xpath(files_info.get('name_xpath')).extract()
                if not file_url or not file_name:
                    continue
                # Collapse all whitespace inside the file name.
                file_name = ''.join(''.join(file_name).split()).strip()
                if files_info.get('host'):
                    file_url = urljoin(files_info.get('host'), file_url)
                if not files_info.get('file_type'):
                    file_type = extract_file_type(file_name, file_url)
                else:
                    file_type = files_info.get('file_type')
                if file_type and files_info.get('url_key', '') in file_url:
                    attachment = AttachmentDownloader().fetch_attachment(
                        file_name=file_name,
                        file_type=file_type,
                        download_url=file_url,
                        headers=request.to_dict.get('headers', None),
                        proxies=fpx,
                    )
                    attachments[str(len(attachments) + 1)] = attachment
        if attachments:
            data_item.projectinfo = {'attachments': attachments}
        yield data_item
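
    # A hypothetical "files" (files_info) block for the attachment loop in
    # detail_get above; the key names are taken from the code, every value is
    # an illustrative assumption:
    #
    #   "files": {
    #       "list_xpath": "//div[@class='attach']//a",  # one node per attachment
    #       "url_xpath": "./@href",
    #       "name_xpath": "./text()",
    #       "host": "http://www.example.com",  # base used to resolve relative urls
    #       "file_type": "pdf",                # omit to infer from name/url
    #       "url_key": "attachment",           # substring the url must contain
    #   }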
    def detail_json(self, request, response):
        items = request.item
        list_item = DataBakItem(**items)
        # deal_detail here is a task-supplied Python snippet run via exec();
        # it is expected to populate ``list_item`` from the JSON response.
        exec(request.deal_detail)
        yield list_item

    def detail_post(self, request, response):
        items = request.item
        data_item = DataBakItem(**items)
        # Same exec() hook as detail_json, used for POST responses; the
        # snippet populates ``data_item``.
        exec(request.deal_detail)
        yield data_item
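
    # For detail_json / detail_post, ``deal_detail`` is a Python code string
    # executed with ``request``, ``response`` and the item in scope. A
    # hypothetical snippet (assumes a JSON body with a "content" field):
    #
    #   "deal_detail": "detail = json.loads(response.text)\n"
    #                  "list_item.contenthtml = detail['content']"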

if __name__ == "__main__":
    Details(redis_key="detail:normal_details", thread_count=10).start()