utils.py 2.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879
  1. import re
  2. from urllib.parse import urlencode, urljoin
  3. from urllib3 import get_host
  4. from common.log import logger
  5. from common.tools import html2element
  6. def err_details(worker):
  7. worker_exception = worker.exception()
  8. if worker_exception:
  9. logger.exception("Worker return exception: {}".format(worker_exception))
  10. return worker
  11. def extract_base_url(url):
  12. """
  13. # >>> base_url = extract_base_url('http://192.168.3.207:8080/')
  14. """
  15. _s, _h, _p = get_host(url)
  16. return f"{_s}://{_h}/" if _p is None else f"{_s}://{_h}:{_p}/"
  17. def extract_domain(url):
  18. """
  19. # >>> base_url = extract_domain('http://192.168.3.207:8080/')
  20. """
  21. _, host, port = get_host(url)
  22. return f"{host}" if port is None else f"{host}:{port}"
  23. def extract_page_title(html):
  24. element = html2element(html)
  25. nodes = element.xpath('/html/head/title/text()')
  26. if len(nodes) > 1:
  27. return "".format(nodes[-1]).strip()
  28. return "".join(nodes).strip()
  29. def is_url(url):
  30. """判断url格式畸形与否"""
  31. _regex = re.compile(
  32. r'^(?:http|ftp)s?://' # http:// or https://
  33. r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # domain...
  34. r'localhost|' # localhost...
  35. r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
  36. r'(?::\d+)?' # optional port
  37. r'(?:/?|[/?]\S+)$', re.IGNORECASE)
  38. return re.match(_regex, url) is not None
  39. def is_domain(domain):
  40. _regex = re.compile(
  41. r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # domain...
  42. r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
  43. r'(?::\d+)?', re.IGNORECASE)
  44. return re.match(_regex, domain) is not None
  45. def label_split(val):
  46. '~`!#$%^&*()_+-=|\';"":/.,?><~·!@#¥%……&*()——+-=“:’;、。,?》{《}】【\n\]\[ '
  47. result = re.split(r'[- _,,\\.|-「」【】??!!/、] *', val)
  48. result = [v for v in result if len(v) > 0]
  49. return result
  50. def get_url(url: str, parameters: dict):
  51. """
  52. 拼接url与所带参数
  53. :param url: 链接
  54. :param parameters: 参数
  55. :return: 拼接后的url
  56. """
  57. _data = '?' + urlencode(parameters)
  58. return urljoin(url, _data)