12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576 |
- import re
- from bs4 import BeautifulSoup
- from crawler.analysis.FilterUrl import FilterUrl
class DomAnalysis(FilterUrl):
    """
    Collect candidate links from an HTML document parsed with Beautiful Soup.

    Beautiful Soup turns a complex HTML document into a tree structure in
    which every node is a Python object; all of those objects fall into four
    kinds:
        Tag
        NavigableString
        BeautifulSoup
        Comment

    Links are taken from two places: the ``href`` of ``<a>`` tags, and
    ``href=...`` fragments found inside the ``onclick`` attribute of
    ``<input type="button">`` tags.
    """

    def __init__(self, isogeny: bool, dom: str, addr: str):
        """
        Parse ``dom`` and store the request context on the instance.

        :param isogeny: stored as ``self.isogeny``; not read inside this class
            (presumably consumed by the inherited FilterUrl helpers -- confirm).
        :param dom: HTML markup, parsed with the "lxml" parser.
        :param addr: address the document was requested from.
        """
        # NOTE(review): the base-class initializer is not invoked here; confirm
        # that FilterUrl needs no initialisation of its own.
        # Captures the run of allowed characters (quotes included) that
        # directly follows "href=" inside an onclick handler.
        self.pattern = re.compile("href=([a-zA-Z0-9'\"+?=.%/_]*)")
        self.isogeny = isogeny
        self.soup = BeautifulSoup(dom, "lxml")
        self.addr = addr  # request address

    def show_html(self):
        """Print the prettified document, encoded as UTF-8."""
        # prettify() returns a unicode string (see
        # https://www.crummy.com/software/BeautifulSoup/bs3/documentation.zh.html),
        # hence the explicit encode; characters that cannot be encoded are dropped.
        # NOTE(review): under Python 3 this prints the bytes repr (b'...'),
        # not the markup itself -- confirm whether that is intended.
        print(self.soup.prettify().encode('utf-8', 'ignore'))

    def _is_input_with_onclick(self, tag):
        """Return True for ``<input type="button">`` tags that carry an ``onclick`` attribute."""
        return (tag.name == 'input') and (tag.get('type') == 'button') and tag.has_attr('onclick')

    def _iter_onclick_hrefs(self):
        """Yield ``(tag, raw_href)`` for each href in a button's onclick text that ``judge`` accepts."""
        for tag in self.soup.find_all():
            if not self._is_input_with_onclick(tag):
                continue
            for item in self.pattern.findall(tag.get('onclick')):
                raw_href = self.onclick_filter(item)
                if self.judge(raw_href):
                    yield tag, raw_href

    def get_urls(self):
        """
        Return the de-duplicated URLs found in the document, in discovery order.

        Anchors are scanned first, then onclick handlers of button inputs.
        A URL is kept only if ``judge`` accepts the raw href and ``filter``
        accepts the joined URL. Hrefs for which ``urljoin`` raises ValueError
        are skipped, matching what ``get_items`` does.

        :return: list of joined URLs.
        """
        urls = []
        # Companion set for O(1) duplicate checks; assumes urljoin returns a
        # hashable value (a str URL) -- TODO confirm against FilterUrl.
        seen = set()

        def _collect(raw_href):
            try:
                href = self.urljoin(raw_href)
            except ValueError:
                return
            if self.filter(href) and href not in seen:
                seen.add(href)
                urls.append(href)

        # Links written directly in the markup.
        for tag in self.soup.find_all('a'):
            raw_href = tag.get('href')
            if self.judge(raw_href):
                _collect(raw_href)

        # Interaction-driven links: recovered by statically parsing onclick text.
        for _tag, raw_href in self._iter_onclick_hrefs():
            _collect(raw_href)
        return urls

    def get_items(self):
        """
        Return de-duplicated ``{'title': ..., 'href': ...}`` dicts, in discovery order.

        The title is the tag's text (or its parent's text when the tag has
        none) with all whitespace removed, truncated to 50 characters. The
        href is the joined URL; entries are kept only if ``filter`` accepts it.
        Hrefs that are missing or make ``urljoin`` raise ValueError are skipped.

        :return: list of dicts with the keys ``title`` and ``href``.
        """
        items = []
        seen = set()  # (title, href) pairs already emitted

        def _extract(tag, raw_href):
            if raw_href is None:
                return
            try:
                href = self.urljoin(raw_href)
            except ValueError:
                return
            title = "".join((tag.text or tag.parent.text).split())[:50]
            key = (title, href)
            if self.filter(href) and key not in seen:
                seen.add(key)
                items.append({'title': title, 'href': href})

        for tag in self.soup.find_all('a'):
            raw_href = tag.get('href')
            if self.judge(raw_href):
                _extract(tag, raw_href)

        # Use the href parsed out of the onclick text: the <input> tag itself
        # has no href attribute, so reading it from the tag would drop the link.
        for tag, raw_href in self._iter_onclick_hrefs():
            _extract(tag, raw_href)
        return items
|