utils.py
import re
from urllib.parse import urlencode, urljoin

from urllib3 import get_host

from common.log import logger
from common.tools import html2element


def err_details(worker):
    """Log any exception raised inside a worker and return the worker."""
    worker_exception = worker.exception()
    if worker_exception:
        logger.exception("Worker returned exception: {}".format(worker_exception))
    return worker


def extract_base_url(url):
    """
    >>> extract_base_url('http://192.168.3.207:8080/')
    'http://192.168.3.207:8080/'
    """
    _s, _h, _p = get_host(url)
    return f"{_s}://{_h}/" if _p is None else f"{_s}://{_h}:{_p}/"


def parser_domain(val: str):
    """Split a domain-like string into labels; values starting with a digit (e.g. IPs) are kept whole."""
    if re.match(r'\d+', val) is None:
        return re.split(r'[\\.:]', val)
    return [val]
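

# For example, given the checks above:
#   parser_domain('www.example.com')  -> ['www', 'example', 'com']
#   parser_domain('192.168.3.207')    -> ['192.168.3.207']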


def extract_domain(url):
    """
    >>> extract_domain('http://192.168.3.207:8080/')
    '192.168.3.207:8080'
    """
    _, host, port = get_host(url)
    return f"{host}" if port is None else f"{host}:{port}"


def extract_page_title(html):
    """Extract the <html><head><title> text, with all whitespace removed."""
    element = html2element(html)
    nodes = element.xpath('/html/head/title/text()')
    if len(nodes) > 1:
        return "".join("{}".format(nodes[-1]).split())
    return "".join("".join(nodes).split())
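

# For example, assuming common.tools.html2element parses raw HTML into an lxml element:
#   extract_page_title('<html><head><title>My Page</title></head></html>')  -> 'MyPage'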


def is_url(url):
    """Check whether a URL is well-formed."""
    _regex = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return re.match(_regex, url) is not None


def is_domain(domain):
    """Check whether a string looks like a domain name or IP address, with an optional port."""
    _regex = re.compile(
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?', re.IGNORECASE)
    return re.match(_regex, domain) is not None
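

# For example, given the patterns above:
#   is_url('http://192.168.3.207:8080/login')  -> True
#   is_url('192.168.3.207:8080')               -> False  (missing scheme)
#   is_domain('www.example.com')               -> True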


def label_split(val):
    """Split a label string on common ASCII and full-width separators, dropping empty fragments."""
    # Separator characters this is meant to cover:
    # ~`!#$%^&*()_+-=|\';"":/.,?><~·!@#¥%……&*()——+-=“:’;、。,?》{《}】【\n\]\[
    result = re.split(r'[- _,,\\.|-「」【】??!!/、] *', val)
    result = [v for v in result if len(v) > 0]
    return result
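

# For example, given the split pattern above:
#   label_split('foo, bar_baz.qux')  -> ['foo', 'bar', 'baz', 'qux']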


def get_url(url: str, parameters: dict):
    """
    Append query parameters to a URL.

    :param url: the base URL
    :param parameters: the query parameters as a dict
    :return: the combined URL
    """
    _data = '?' + urlencode(parameters)
    return urljoin(url, _data)
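

if __name__ == "__main__":
    # A minimal usage sketch; the sample URL and parameters below are made-up
    # values for illustration only.
    sample = 'http://192.168.3.207:8080/login'
    print(extract_base_url(sample))   # -> http://192.168.3.207:8080/
    print(extract_domain(sample))     # -> 192.168.3.207:8080
    print(is_url(sample))             # -> True
    print(get_url(sample, {'next': '/home', 'page': 2}))
    # -> http://192.168.3.207:8080/login?next=%2Fhome&page=2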