# DomAnalysis.py
  1. import re
  2. from bs4 import BeautifulSoup
  3. from crawler.analysis.FilterUrl import FilterUrl
  4. from crawler.utils import extract_fqdn
  5. FILTER_MODES = {
  6. 0: 'general_mode',
  7. 1: 'same_origin_mode',
  8. 2: 'non_origin_mode',
  9. }
  10. class DomAnalysis(FilterUrl):
  11. """
  12. Beautiful Soup将复杂HTML文档转换成一个复杂的树形结构,每个节点都是Python对象,所有对象可以归纳为4种:
  13. Tag
  14. NavigableString
  15. BeautifulSoup
  16. Comment
  17. """
  18. def __init__(self, dom: str, url: str, mode=0):
  19. """
  20. :param dom: 页面源码
  21. :param url: 当前访问网址
  22. :param mode: 过滤模式:同源模式=1;非同源模式=2;通用模式=0
  23. """
  24. self.pattern = re.compile("href=([a-zA-Z0-9'\"+?=.%/_]*)")
  25. self.soup = BeautifulSoup(dom, "lxml")
  26. self.mode = FILTER_MODES[mode]
  27. self.request_url = url
  28. self.domain = extract_fqdn(url)
  29. def show_html(self):
  30. #https://www.crummy.com/software/BeautifulSoup/bs3/documentation.zh.html 发现prettify是u字符
  31. print(self.soup.prettify().encode('utf-8', 'ignore'))
  32. def _is_input_with_onclick(self, tag):
  33. return (tag.name == 'input') and (tag.get('type')=='button') and tag.has_attr('onclick')
  34. def get_urls(self):
  35. urls = []
  36. # 静态页面链接解析 和 javascript动态解析
  37. for tag in self.soup.find_all('a'):
  38. if self.judge(tag.get('href')):
  39. href = self.urljoin(tag.get('href'))
  40. if self.filter(href) and href not in urls:
  41. urls.append(href)
  42. # 自动交互. 这里采用静态解析的思路提取交互式生成的链接
  43. for tag in self.soup.find_all():
  44. if self._is_input_with_onclick(tag):
  45. for item in re.findall(self.pattern, tag.get('onclick')):
  46. if self.judge(self.onclick_filter(item)):
  47. href = self.urljoin(self.onclick_filter(item))
  48. if self.filter(href) and href not in urls:
  49. urls.append(href)
  50. return urls
  51. def get_items(self):
  52. items = []
  53. def _extract():
  54. name = (tag.text if len(tag.text) != 0 else None or tag.parent.text)
  55. name = "".join(name.split())
  56. if len(name) > 50:
  57. name = "{:.50s}".format(name)
  58. if tag.get('href') is None:
  59. return
  60. try:
  61. href = self.urljoin(tag.get('href'))
  62. except ValueError:
  63. return
  64. data = {'title': name, 'href': href}
  65. if self.filter(href) and data not in items:
  66. items.append(data)
  67. for tag in self.soup.find_all('a'):
  68. if self.judge(tag.get('href')):
  69. _extract()
  70. for tag in self.soup.find_all():
  71. if self._is_input_with_onclick(tag):
  72. for item in re.findall(self.pattern, tag.get('onclick')):
  73. if self.judge(self.onclick_filter(item)):
  74. _extract()
  75. return items