utils.py 1.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051
  1. import re
  2. from urllib3 import get_host
  3. from common.log import logger
  4. from common.tools import html2element
  5. def err_details(worker):
  6. worker_exception = worker.exception()
  7. if worker_exception:
  8. logger.exception("Worker return exception: {}".format(worker_exception))
  9. return worker
  10. def extract_base_url(url):
  11. """
  12. # >>> base_url = extract_base_url('http://192.168.3.207:8080/')
  13. """
  14. _s, _h, _p = get_host(url)
  15. return f"{_s}://{_h}/" if _p is None else f"{_s}://{_h}:{_p}/"
  16. def extract_domain(url):
  17. """
  18. # >>> base_url = extract_domain('http://192.168.3.207:8080/')
  19. """
  20. _, host, port = get_host(url)
  21. return f"{host}" if port is None else f"{host}:{port}"
  22. def extract_page_title(html):
  23. element = html2element(html)
  24. nodes = element.xpath('/html/head/title/text()')
  25. if len(nodes) > 1:
  26. return "".format(nodes[-1]).strip()
  27. return "".join(nodes).strip()
  28. def valid_url(url):
  29. """判断url格式畸形与否"""
  30. _regex = re.compile(
  31. r'^(?:http|ftp)s?://' # http:// or https://
  32. r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # domain...
  33. r'localhost|' # localhost...
  34. r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
  35. r'(?::\d+)?' # optional port
  36. r'(?:/?|[/?]\S+)$', re.IGNORECASE)
  37. return re.match(_regex, url) is not None