123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155 |
- # -*- coding: utf-8 -*-
- """
- Created on 2018-10-08 15:33:37
- ---------
- @summary: 重新定义 selector
- ---------
- @author: Boris
- @email: boris_liu@foxmail.com
- """
- import re
- import six
- from lxml import etree
- from parsel import Selector as ParselSelector
- from parsel import SelectorList as ParselSelectorList
- from w3lib.html import replace_entities as w3lib_replace_entities
- def extract_regex(regex, text, replace_entities=True, flags=0):
- """Extract a list of unicode strings from the given text/encoding using the following policies:
- * if the regex contains a named group called "extract" that will be returned
- * if the regex contains multiple numbered groups, all those will be returned (flattened)
- * if the regex doesn't contain any group the entire regex matching is returned
- """
- if isinstance(regex, six.string_types):
- regex = re.compile(regex, flags=flags)
- if "extract" in regex.groupindex:
- # named group
- try:
- extracted = regex.search(text).group("extract")
- except AttributeError:
- strings = []
- else:
- strings = [extracted] if extracted is not None else []
- else:
- # full regex or numbered groups
- strings = regex.findall(text)
- # strings = flatten(strings) # 这东西会把多维列表铺平
- if not replace_entities:
- return strings
- values = []
- for value in strings:
- if isinstance(value, (list, tuple)): # w3lib_replace_entities 不能接收list tuple
- values.append(
- [w3lib_replace_entities(v, keep=["lt", "amp"]) for v in value]
- )
- else:
- values.append(w3lib_replace_entities(value, keep=["lt", "amp"]))
- return values
- def create_root_node(text, parser_cls, base_url=None):
- """Create root node for text using given parser class.
- """
- body = text.strip().replace("\x00", "").encode("utf8") or b"<html/>"
- parser = parser_cls(recover=True, encoding="utf8", huge_tree=True)
- root = etree.fromstring(body, parser=parser, base_url=base_url)
- if root is None:
- root = etree.fromstring(b"<html/>", parser=parser, base_url=base_url)
- return root
- class SelectorList(ParselSelectorList):
- """
- The :class:`SelectorList` class is a subclass of the builtin ``list``
- class, which provides a few additional methods.
- """
- def re_first(self, regex, default=None, replace_entities=True, flags=re.S):
- """
- Call the ``.re()`` method for the first element in this list and
- return the result in an unicode string. If the list is empty or the
- regex doesn't match anything, return the default value (``None`` if
- the argument is not provided).
- By default, character entity references are replaced by their
- corresponding character (except for ``&`` and ``<``.
- Passing ``replace_entities`` as ``False`` switches off these
- replacements.
- """
- datas = self.re(regex, replace_entities=replace_entities, flags=flags)
- return datas[0] if datas else default
- def re(self, regex, replace_entities=True, flags=re.S):
- """
- Call the ``.re()`` method for each element in this list and return
- their results flattened, as a list of unicode strings.
- By default, character entity references are replaced by their
- corresponding character (except for ``&`` and ``<``.
- Passing ``replace_entities`` as ``False`` switches off these
- replacements.
- """
- datas = [
- x.re(regex, replace_entities=replace_entities, flags=flags) for x in self
- ]
- return datas[0] if len(datas) == 1 else datas
- class Selector(ParselSelector):
- selectorlist_cls = SelectorList
- def __str__(self):
- data = repr(self.get())
- return "<%s xpath=%r data=%s>" % (type(self).__name__, self._expr, data)
- __repr__ = __str__
- def __init__(self, text=None, *args, **kwargs):
- # 先将 转为空格,否则selector 会转为 \xa0
- if text:
- text = re.sub(" ", "\x20", text)
- super(Selector, self).__init__(text, *args, **kwargs)
- def re_first(self, regex, default=None, replace_entities=True, flags=re.S):
- """
- Apply the given regex and return the first unicode string which
- matches. If there is no match, return the default value (``None`` if
- the argument is not provided).
- By default, character entity references are replaced by their
- corresponding character (except for ``&`` and ``<``.
- Passing ``replace_entities`` as ``False`` switches off these
- replacements.
- """
- datas = self.re(regex, replace_entities=replace_entities, flags=flags)
- return datas[0] if datas else default
- def re(self, regex, replace_entities=True, flags=re.S):
- """
- Apply the given regex and return a list of unicode strings with the
- matches.
- ``regex`` can be either a compiled regular expression or a string which
- will be compiled to a regular expression using ``re.compile(regex)``.
- By default, character entity references are replaced by their
- corresponding character (except for ``&`` and ``<``.
- Passing ``replace_entities`` as ``False`` switches off these
- replacements.
- """
- return extract_regex(
- regex, self.get(), replace_entities=replace_entities, flags=flags
- )
- def _get_root(self, text, base_url=None):
- return create_root_node(text, self._parser, base_url=base_url)
|