123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051 |
- import re
- from urllib3 import get_host
- from common.log import logger
- from common.tools import html2element
def err_details(worker):
    """Log the exception carried by ``worker``, if any, and return ``worker``.

    ``worker`` must expose an ``exception()`` method; presumably it is a
    ``concurrent.futures.Future`` and this runs as a done-callback --
    TODO confirm against the callers.

    :param worker: object whose ``exception()`` result is inspected.
    :return: the same ``worker`` object, unchanged.
    """
    failure = worker.exception()
    if not failure:
        # Nothing went wrong: hand the worker straight back.
        return worker
    logger.exception("Worker return exception: {}".format(failure))
    return worker
def extract_base_url(url):
    """Build the ``scheme://host[:port]/`` prefix of ``url``.

    The scheme, host and port are taken from ``get_host(url)``; the port
    segment is omitted when ``get_host`` reports no port.

    Example call: ``extract_base_url('http://192.168.3.207:8080/')``

    :param url: URL string to split.
    :return: base URL string ending with a slash.
    """
    scheme, host, port = get_host(url)
    if port is None:
        return f"{scheme}://{host}/"
    return f"{scheme}://{host}:{port}/"
def extract_domain(url):
    """Build the ``host[:port]`` part of ``url``.

    The host and port are taken from ``get_host(url)``; the scheme it
    returns is ignored, and the port is omitted when none is reported.

    Example call: ``extract_domain('http://192.168.3.207:8080/')``

    :param url: URL string to split.
    :return: ``host`` or ``host:port`` as a string.
    """
    _scheme, hostname, port_number = get_host(url)
    if port_number is None:
        return f"{hostname}"
    return f"{hostname}:{port_number}"
def extract_page_title(html):
    """Return the page title found in ``html``, stripped of outer whitespace.

    The markup is parsed with ``html2element`` and the text nodes under
    ``/html/head/title`` are selected via XPath.

    :param html: markup accepted by ``html2element`` (presumably an HTML
        string -- TODO confirm against that helper).
    :return: the stripped title text; ``""`` when no title text exists.
        When several title text nodes are present, the last one is used.
    """
    element = html2element(html)
    nodes = element.xpath('/html/head/title/text()')
    if not nodes:
        return ""
    # The previous code called "".format(nodes[-1]) for the multi-node case;
    # an empty template ignores its argument, so the title was always "".
    # Taking the last node also covers the single-node case.
    return str(nodes[-1]).strip()
# Compiled once at import time rather than on every valid_url() call.
_URL_PATTERN = re.compile(
    r'^(?:http|ftp)s?://'  # http://, https://, ftp:// or ftps://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    # \Z instead of $: "$" also matches just before a trailing newline,
    # which let values such as "http://example.com\n" pass validation.
    r'(?:/?|[/?]\S+)\Z', re.IGNORECASE)


def valid_url(url) -> bool:
    """Tell whether ``url`` is a well-formed http(s)/ftp(s) URL.

    The host may be a dotted domain name, ``localhost`` or a dotted-quad
    IPv4 address, optionally followed by ``:port`` and a path or query
    without whitespace. Matching is case-insensitive.

    :param url: candidate URL; anything that is not a ``str`` is rejected.
    :return: ``True`` when ``url`` matches the pattern, ``False`` otherwise.
    """
    if not isinstance(url, str):
        # None, bytes, numbers, ... can never be a valid URL string.
        return False
    return _URL_PATTERN.match(url) is not None
|