selector.py 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on 2018-10-08 15:33:37
  4. ---------
  5. @summary: 重新定义 selector
  6. ---------
  7. @author: Boris
  8. @email: boris_liu@foxmail.com
  9. """
  10. import re
  11. import six
  12. from lxml import etree
  13. from parsel import Selector as ParselSelector
  14. from parsel import SelectorList as ParselSelectorList
  15. from w3lib.html import replace_entities as w3lib_replace_entities
  16. def extract_regex(regex, text, replace_entities=True, flags=0):
  17. """Extract a list of unicode strings from the given text/encoding using the following policies:
  18. * if the regex contains a named group called "extract" that will be returned
  19. * if the regex contains multiple numbered groups, all those will be returned (flattened)
  20. * if the regex doesn't contain any group the entire regex matching is returned
  21. """
  22. if isinstance(regex, six.string_types):
  23. regex = re.compile(regex, flags=flags)
  24. if "extract" in regex.groupindex:
  25. # named group
  26. try:
  27. extracted = regex.search(text).group("extract")
  28. except AttributeError:
  29. strings = []
  30. else:
  31. strings = [extracted] if extracted is not None else []
  32. else:
  33. # full regex or numbered groups
  34. strings = regex.findall(text)
  35. # strings = flatten(strings) # 这东西会把多维列表铺平
  36. if not replace_entities:
  37. return strings
  38. values = []
  39. for value in strings:
  40. if isinstance(value, (list, tuple)): # w3lib_replace_entities 不能接收list tuple
  41. values.append(
  42. [w3lib_replace_entities(v, keep=["lt", "amp"]) for v in value]
  43. )
  44. else:
  45. values.append(w3lib_replace_entities(value, keep=["lt", "amp"]))
  46. return values
  47. def create_root_node(text, parser_cls, base_url=None):
  48. """Create root node for text using given parser class.
  49. """
  50. body = text.strip().replace("\x00", "").encode("utf8") or b"<html/>"
  51. parser = parser_cls(recover=True, encoding="utf8", huge_tree=True)
  52. root = etree.fromstring(body, parser=parser, base_url=base_url)
  53. if root is None:
  54. root = etree.fromstring(b"<html/>", parser=parser, base_url=base_url)
  55. return root
  56. class SelectorList(ParselSelectorList):
  57. """
  58. The :class:`SelectorList` class is a subclass of the builtin ``list``
  59. class, which provides a few additional methods.
  60. """
  61. def re_first(self, regex, default=None, replace_entities=True, flags=re.S):
  62. """
  63. Call the ``.re()`` method for the first element in this list and
  64. return the result in an unicode string. If the list is empty or the
  65. regex doesn't match anything, return the default value (``None`` if
  66. the argument is not provided).
  67. By default, character entity references are replaced by their
  68. corresponding character (except for ``&amp;`` and ``&lt;``.
  69. Passing ``replace_entities`` as ``False`` switches off these
  70. replacements.
  71. """
  72. datas = self.re(regex, replace_entities=replace_entities, flags=flags)
  73. return datas[0] if datas else default
  74. def re(self, regex, replace_entities=True, flags=re.S):
  75. """
  76. Call the ``.re()`` method for each element in this list and return
  77. their results flattened, as a list of unicode strings.
  78. By default, character entity references are replaced by their
  79. corresponding character (except for ``&amp;`` and ``&lt;``.
  80. Passing ``replace_entities`` as ``False`` switches off these
  81. replacements.
  82. """
  83. datas = [
  84. x.re(regex, replace_entities=replace_entities, flags=flags) for x in self
  85. ]
  86. return datas[0] if len(datas) == 1 else datas
  87. class Selector(ParselSelector):
  88. selectorlist_cls = SelectorList
  89. def __str__(self):
  90. data = repr(self.get())
  91. return "<%s xpath=%r data=%s>" % (type(self).__name__, self._expr, data)
  92. __repr__ = __str__
  93. def __init__(self, text=None, *args, **kwargs):
  94. # 先将&nbsp; 转为空格,否则selector 会转为 \xa0
  95. if text:
  96. text = re.sub("&nbsp;", "\x20", text)
  97. super(Selector, self).__init__(text, *args, **kwargs)
  98. def re_first(self, regex, default=None, replace_entities=True, flags=re.S):
  99. """
  100. Apply the given regex and return the first unicode string which
  101. matches. If there is no match, return the default value (``None`` if
  102. the argument is not provided).
  103. By default, character entity references are replaced by their
  104. corresponding character (except for ``&amp;`` and ``&lt;``.
  105. Passing ``replace_entities`` as ``False`` switches off these
  106. replacements.
  107. """
  108. datas = self.re(regex, replace_entities=replace_entities, flags=flags)
  109. return datas[0] if datas else default
  110. def re(self, regex, replace_entities=True, flags=re.S):
  111. """
  112. Apply the given regex and return a list of unicode strings with the
  113. matches.
  114. ``regex`` can be either a compiled regular expression or a string which
  115. will be compiled to a regular expression using ``re.compile(regex)``.
  116. By default, character entity references are replaced by their
  117. corresponding character (except for ``&amp;`` and ``&lt;``.
  118. Passing ``replace_entities`` as ``False`` switches off these
  119. replacements.
  120. """
  121. return extract_regex(
  122. regex, self.get(), replace_entities=replace_entities, flags=flags
  123. )
  124. def _get_root(self, text, base_url=None):
  125. return create_root_node(text, self._parser, base_url=base_url)