import re

from bs4 import BeautifulSoup

from crawler.analysis.FilterUrl import FilterUrl


class DomAnalysis(FilterUrl):
    """Extract links from an HTML document parsed with Beautiful Soup.

    Beautiful Soup converts a complex HTML document into a tree in which
    every node is a Python object of one of four kinds: Tag,
    NavigableString, BeautifulSoup and Comment.

    Link validation and normalisation are delegated to ``self.judge``,
    ``self.filter`` and ``self.onclick_filter``, which are not defined in
    this class (presumably inherited from ``FilterUrl`` -- confirm there).
    """

    def __init__(self, isogeny: bool, dom: str, host=None, request_url=None):
        """Parse *dom* and record the context needed to resolve links.

        Args:
            isogeny: When truthy, *host* is required and stored on the
                instance; otherwise *request_url* is required and stored.
            dom: HTML markup to parse (with the ``lxml`` parser).
            host: Host address of the site; required when *isogeny* is truthy.
            request_url: URL of the current request; required when *isogeny*
                is falsy.

        Raises:
            TypeError: If the argument required by the selected mode is None.
        """
        self.soup = BeautifulSoup(dom, "lxml")
        # Captures the value that follows ``href=`` inside an onclick handler.
        self.pattern = re.compile("href=([a-zA-Z0-9'\"+?=.%/_]*)")
        self.isogeny = isogeny
        if self.isogeny:
            if host is None:
                raise TypeError(
                    '{} missing 1 required positional argument: {}'.format(
                        self.__class__.__name__, 'host')
                )
            self.host = host  # host address of the site
        else:
            if request_url is None:
                raise TypeError(
                    '{} missing 1 required positional argument: {}'.format(
                        self.__class__.__name__, 'request_url'
                    )
                )
            self.request_url = request_url  # URL of the current request

    def show_html(self):
        """Print the prettified document, encoded as UTF-8 bytes."""
        # prettify() returns unicode text (see
        # https://www.crummy.com/software/BeautifulSoup/bs3/documentation.zh.html),
        # hence the explicit encode.
        # NOTE(review): on Python 3 this prints the bytes repr (b'...');
        # confirm whether printing the text itself is what is wanted.
        print(self.soup.prettify().encode('utf-8', 'ignore'))

    def _is_input_with_onclick(self, tag):
        """Tell whether *tag* is an ``<input type="button">`` with ``onclick``."""
        return (
            tag.name == 'input'
            and tag.get('type') == 'button'
            and tag.has_attr('onclick')
        )

    def _onclick_links(self):
        """Yield ``(tag, link)`` for each accepted link in a button's onclick.

        *link* is the regex capture after ``self.onclick_filter``; only links
        for which ``self.judge`` is truthy are yielded.
        """
        for tag in self.soup.find_all(self._is_input_with_onclick):
            for captured in self.pattern.findall(tag.get('onclick')):
                link = self.onclick_filter(captured)
                if self.judge(link):
                    yield tag, link

    def _add_item(self, items, tag, raw_href):
        """Append ``{'title', 'href'}`` for *tag* / *raw_href* unless present.

        Nothing is appended when *raw_href* is None or ``self.filter``
        raises ValueError.
        """
        if raw_href is None:
            return
        try:
            href = self.filter(raw_href)
        except ValueError:
            return
        # Use the tag's own text, or the parent's text when it has none,
        # with all whitespace removed and capped at 50 characters.
        title = "".join((tag.text or tag.parent.text).split())[:50]
        entry = {'title': title, 'href': href}
        if entry not in items:
            items.append(entry)

    def get_urls(self):
        """Return the filtered URLs found in the document.

        Covers ``<a href>`` links and links embedded in the ``onclick``
        handlers of button inputs. Duplicates are kept.
        """
        urls = []
        # Static page links.
        for anchor in self.soup.find_all('a'):
            raw_href = anchor.get('href')
            if self.judge(raw_href):
                urls.append(self.filter(raw_href))
        # Automatic interaction: links that would be generated by clicking a
        # button are extracted by statically parsing the onclick handler.
        for _tag, link in self._onclick_links():
            urls.append(self.filter(link))
        return urls

    def get_items(self):
        """Return unique ``{'title': ..., 'href': ...}`` dicts for the links.

        Covers the same two link sources as ``get_urls``. Links whose
        ``self.filter`` call raises ValueError are skipped.
        """
        items = []
        for anchor in self.soup.find_all('a'):
            raw_href = anchor.get('href')
            if self.judge(raw_href):
                self._add_item(items, anchor, raw_href)
        # The onclick link is passed explicitly: a button input carries no
        # href attribute, so reading it from the tag would drop every link.
        for tag, link in self._onclick_links():
            self._add_item(items, tag, link)
        return items