# data_spider / topic_spider

import re
from urllib.parse import urlencode, urljoin

from urllib3 import get_host

from common.log import logger
from common.tools import html2element


def err_details(worker):
    """Log the exception carried by a worker, if any, and hand the worker back.

    :param worker: object exposing an ``exception()`` method
        (presumably a ``concurrent.futures.Future`` passed to a done-callback
        -- TODO confirm against callers).
    :return: the same ``worker`` object, unchanged.
    """
    failure = worker.exception()
    if not failure:
        # Nothing went wrong (or the exception value is falsy): nothing to log.
        return worker
    logger.exception("Worker return exception: {}".format(failure))
    return worker


def extract_base_url(url):
    """Build the ``scheme://host[:port]/`` prefix of *url*.

    The URL is decomposed with ``urllib3.get_host``; the port is only
    included when ``get_host`` reports one.

    Example (not executed as a doctest)::

        extract_base_url('http://192.168.3.207:8080/')

    :param url: URL string to decompose.
    :return: base URL string, always ending with ``/``.
    """
    scheme, host, port = get_host(url)
    origin = f"{scheme}://{host}"
    if port is not None:
        origin = f"{origin}:{port}"
    return f"{origin}/"


def parser_domain(val: str):
    """Break a domain-like string into its parts.

    A value that begins with a digit is returned untouched as a
    single-element list; any other value is split on backslash, ``.``
    and ``:``.

    :param val: string to split.
    :return: list of string parts.
    """
    begins_with_digit = re.match(r'\d+', val) is not None
    if begins_with_digit:
        return [val]
    return re.split(r'[\\.:]', val)


def extract_domain(url):
    """Return the ``host`` or ``host:port`` portion of *url*.

    The URL is decomposed with ``urllib3.get_host``; the scheme is
    discarded and the port is appended only when ``get_host`` reports one.

    Example (not executed as a doctest)::

        extract_domain('http://192.168.3.207:8080/')

    :param url: URL string to decompose.
    :return: ``"host"`` or ``"host:port"``.
    """
    _scheme, host, port = get_host(url)
    if port is None:
        return f"{host}"
    return f"{host}:{port}"


def extract_page_title(html):
    """Extract the page title from an HTML document with all whitespace removed.

    The text nodes under ``/html/head/title`` are collected. When more than
    one node is found, the last one is used; otherwise the nodes are joined.
    Every whitespace character is then stripped from the result.

    :param html: HTML content accepted by ``html2element``
        (presumably page source text -- TODO confirm against that helper).
    :return: the title without any whitespace, or ``""`` if no title exists.
    """
    element = html2element(html)
    nodes = element.xpath('/html/head/title/text()')
    if len(nodes) > 1:
        # The original used "".format(...), which always produced an empty
        # string; the last title node is what was meant to be formatted.
        title = "{}".format(nodes[-1])
    else:
        title = "".join(nodes)
    # split() with no argument drops every run of whitespace.
    return "".join(title.split())


def is_url(url):
    """Tell whether *url* is a well-formed (not malformed) URL.

    Accepted shape: an ``http``/``https``/``ftp``/``ftps`` scheme, followed by
    a dotted domain name, ``localhost`` or a dotted-quad address, an optional
    ``:port``, and an optional path/query containing no whitespace.
    Matching is case-insensitive.

    :param url: string to check.
    :return: ``True`` when the string matches, ``False`` otherwise.
    """
    scheme = r'^(?:http|ftp)s?://'
    host = (
        # dotted domain name ...
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'
        # ... or the literal localhost ...
        r'localhost|'
        # ... or a dotted-quad address
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
    )
    port = r'(?::\d+)?'
    tail = r'(?:/?|[/?]\S+)$'
    url_pattern = re.compile(scheme + host + port + tail, re.IGNORECASE)
    return url_pattern.match(url) is not None


def is_domain(domain):
    """Tell whether *domain* starts with a domain name or dotted-quad address.

    The string must begin with a dotted domain name or a dotted-quad
    address, optionally followed by ``:port``. The pattern is anchored only
    at the start, so any trailing text after the matched prefix is accepted.
    Matching is case-insensitive.

    :param domain: string to check.
    :return: ``True`` when the beginning of the string matches, else ``False``.
    """
    host = (
        # dotted domain name ...
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'
        # ... or a dotted-quad address
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
    )
    port = r'(?::\d+)?'
    domain_pattern = re.compile(host + port, re.IGNORECASE)
    return domain_pattern.match(domain) is not None


def label_split(val):
    """Split a label string into its non-empty pieces.

    The value is cut at any of the separator characters in the pattern below
    (hyphen, space, underscore, ASCII/full-width comma, backslash, dot, pipe,
    full-width hyphen, corner and lenticular brackets, ASCII/full-width
    question and exclamation marks, slash, ideographic comma), together with
    any spaces that directly follow the separator. Empty pieces are dropped.

    :param val: label text to split.
    :return: list of non-empty string pieces, in their original order.
    """
    # NOTE(review): the original body began with a bare, non-raw string that
    # became the accidental docstring and contained the invalid escapes \] and
    # \[ (a SyntaxWarning on recent Python). It appears to be a reference list
    # of candidate separators; it is kept here as a comment only:
    # ~`!#$%^&*()_+-=|\';"＂:/.,?><~·！@#￥%……&*（）——+-=“：’；、。，？》{《}】【\n\]\[
    pieces = re.split(r'[- _,，\\.|－「」【】?？！!/、] *', val)
    return [piece for piece in pieces if piece]


def get_url(url: str, parameters: dict):
    """
    Combine a URL with the query parameters it should carry.

    The parameters are URL-encoded, prefixed with ``?`` and resolved
    against *url* with ``urljoin``.

    :param url: the link
    :param parameters: the query parameters
    :return: the URL with the encoded parameters attached
    """
    query_string = urlencode(parameters)
    return urljoin(url, f"?{query_string}")