12345678910111213141516171819202122232425262728293031323334353637 |
- from urllib3 import get_host
- from common.log import logger
- from common.tools import html2element
def err_details(worker):
    """Log any exception held by ``worker`` and return ``worker`` unchanged.

    :param worker: object exposing an ``exception()`` method (presumably a
        ``concurrent.futures.Future``-like object -- confirm against callers).
    :return: the same ``worker`` object that was passed in.
    """
    failure = worker.exception()
    if not failure:
        return worker
    # NOTE(review): this runs outside an ``except`` block, so the logger may
    # have no active traceback to attach -- confirm this is intended.
    logger.exception("Worker return exception: {}".format(failure))
    return worker
def extract_base_url(url):
    """Build the ``scheme://host[:port]/`` root of ``url``.

    The port segment is omitted when ``get_host`` reports no port.

    Example call (not executed as a doctest)::

        base_url = extract_base_url('http://192.168.3.207:8080/')

    :param url: URL string handed to ``get_host``.
    :return: root URL string, always ending in ``/``.
    """
    scheme, host, port = get_host(url)
    if port is None:
        return f"{scheme}://{host}/"
    return f"{scheme}://{host}:{port}/"
def extract_domain(url):
    """Return the ``host`` or ``host:port`` part of ``url``.

    The port is appended only when ``get_host`` reports one.

    Example call (not executed as a doctest)::

        domain = extract_domain('http://192.168.3.207:8080/')

    :param url: URL string handed to ``get_host``.
    :return: ``"host"`` or ``"host:port"`` as a string.
    """
    _scheme, hostname, port_number = get_host(url)
    domain = f"{hostname}"
    if port_number is not None:
        domain = f"{hostname}:{port_number}"
    return domain
def extract_page_title(html):
    """Extract the page title from an HTML document.

    The document is parsed with ``html2element`` and the text nodes under
    ``/html/head/title`` are collected. When several text nodes match, the
    last one is used; when none match, an empty string is returned.

    :param html: HTML content accepted by ``html2element`` (presumably a
        string of markup -- confirm against that helper).
    :return: the title text with surrounding whitespace stripped, or ``""``
        if the document has no title text.
    """
    element = html2element(html)
    nodes = element.xpath('/html/head/title/text()')
    if not nodes:
        return ""
    # The previous code used ``"".format(nodes[-1])``, which has no
    # replacement field and therefore always produced an empty string when
    # more than one text node matched. Use the last node's text directly.
    return str(nodes[-1]).strip()
|