|
@@ -14,10 +14,25 @@ class DomAnalysis(FilterUrl):
|
|
|
Comment
|
|
|
"""
|
|
|
|
|
|
def __init__(self, isogeny: bool, dom: str, host=None, request_url=None):
    """Parse ``dom`` and record the address required by the selected mode.

    Args:
        isogeny: Mode flag. When truthy, ``host`` is mandatory; when falsy,
            ``request_url`` is mandatory.
            NOTE(review): presumably "same-origin" mode -- confirm with the
            ``FilterUrl`` base class.
        dom: HTML markup, parsed by BeautifulSoup with the "lxml" parser.
        host: Host address of the site.
        request_url: URL of the current request.

    Raises:
        TypeError: If the argument required by the selected mode is None.
    """
    self.soup = BeautifulSoup(dom, "lxml")
    # Captures the value that follows ``href=`` inside onclick handlers.
    self.pattern = re.compile("href=([a-zA-Z0-9'\"+?=.%/_]*)")
    self.isogeny = isogeny

    # Exactly one of the two addresses is required, depending on the mode.
    if self.isogeny:
        required_name, required_value = 'host', host
    else:
        required_name, required_value = 'request_url', request_url
    if required_value is None:
        raise TypeError(
            '{} missing 1 required positional argument: {}'.format(
                self.__class__.__name__, required_name
            )
        )

    # Always define both attributes so that reading the one that is not
    # required in the current mode yields None instead of AttributeError.
    self.host = host  # host address of the site
    self.request_url = request_url  # URL of the current request
|
|
|
|
|
|
def show_html(self):
|
|
|
# Per https://www.crummy.com/software/BeautifulSoup/bs3/documentation.zh.html, prettify() turns out to produce a unicode (u'') string.
|
|
@@ -26,22 +41,22 @@ class DomAnalysis(FilterUrl):
|
|
|
def _is_input_with_onclick(self, tag):
    """Tell whether ``tag`` is an ``<input type="button">`` with an onclick attribute."""
    # Reject anything that is not a button-typed <input> first.
    if not (tag.name == 'input'):
        return False
    if not (tag.get('type') == 'button'):
        return False
    # Only the presence of the onclick attribute is left to decide.
    return tag.has_attr('onclick')
|
|
|
|
|
|
def get_urls(self):
    """Collect link targets from anchors and from button onclick handlers.

    Every candidate is checked with ``self.judge``; accepted candidates are
    passed through ``self.filter`` before being stored.

    Returns:
        list: Filtered URLs, anchors first and onclick-derived links after
        them, each group in document order. Duplicates are kept.
    """
    urls = []

    # Static page links (also covers javascript-style hrefs on <a> tags).
    for tag in self.soup.find_all('a'):
        href = tag.get('href')  # read once, reuse for judge and filter
        if self.judge(href):
            urls.append(self.filter(href))

    # Automatic interaction: links that a click would generate are
    # extracted statically from the onclick source text.
    for tag in self.soup.find_all():
        if not self._is_input_with_onclick(tag):
            continue
        for item in self.pattern.findall(tag.get('onclick')):
            # Normalise once instead of once for judge and again for filter.
            candidate = self.onclick_filter(item)
            if self.judge(candidate):
                urls.append(self.filter(candidate))

    return urls
|
|
|
|
|
|
- def get_items(self, **kwargs):
|
|
|
+ def get_items(self):
|
|
|
items = []
|
|
|
|
|
|
def _extract():
|
|
@@ -55,17 +70,17 @@ class DomAnalysis(FilterUrl):
|
|
|
href = self.filter(tag.get('href'))
|
|
|
except ValueError:
|
|
|
return
|
|
|
- data = {'name': name, 'host': href}
|
|
|
+ data = {'title': name, 'href': href}
|
|
|
if data not in items:
|
|
|
items.append(data)
|
|
|
|
|
|
for tag in self.soup.find_all('a'):
|
|
|
- if self.judge(tag.get('href'), **kwargs):
|
|
|
+ if self.judge(tag.get('href')):
|
|
|
_extract()
|
|
|
|
|
|
for tag in self.soup.find_all():
|
|
|
if self._is_input_with_onclick(tag):
|
|
|
for item in re.findall(self.pattern, tag.get('onclick')):
|
|
|
- if self.judge(self.onclick_filter(item), **kwargs):
|
|
|
+ if self.judge(self.onclick_filter(item)):
|
|
|
_extract()
|
|
|
return items
|