# DomAnalysis.py
  1. import re
  2. from bs4 import BeautifulSoup
  3. from crawler.analysis.FilterUrl import FilterUrl
  4. from crawler.utils import extract_fqdn
  5. FILTER_MODES = {
  6. 0: 'general_mode',
  7. 1: 'same_origin_mode',
  8. 2: 'non_origin_mode',
  9. }
  10. class DomAnalysis(FilterUrl):
  11. """
  12. Beautiful Soup将复杂HTML文档转换成一个复杂的树形结构,每个节点都是Python对象,所有对象可以归纳为4种:
  13. Tag
  14. NavigableString
  15. BeautifulSoup
  16. Comment
  17. """
  18. def __init__(self, dom: str, url: str, mode=0):
  19. """
  20. :param dom: 页面源码
  21. :param url: 当前访问网址
  22. :param mode: 过滤模式:同源模式=1;非同源模式=2;通用模式=0
  23. """
  24. self.pattern = re.compile("href=([a-zA-Z0-9'\"+?=.%/_]*)")
  25. self.soup = BeautifulSoup(dom, "lxml")
  26. self.mode = FILTER_MODES[mode]
  27. self.request_url = url
  28. self.domain = extract_fqdn(url)
  29. def show_html(self):
  30. #https://www.crummy.com/software/BeautifulSoup/bs3/documentation.zh.html 发现prettify是u字符
  31. print(self.soup.prettify().encode('utf-8', 'ignore'))
  32. def _is_input_with_onclick(self, tag):
  33. return (tag.name == 'input') and (tag.get('type')=='button') and tag.has_attr('onclick')
  34. def get_urls(self):
  35. urls = []
  36. # 静态页面链接解析 和 javascript动态解析
  37. for tag in self.soup.find_all('a'):
  38. if self.judge(tag.get('href')):
  39. href = self.urljoin(tag.get('href'))
  40. if self.filter(href) and href not in urls:
  41. urls.append(href)
  42. # 自动交互. 这里采用静态解析的思路提取交互式生成的链接
  43. for tag in self.soup.find_all():
  44. if self._is_input_with_onclick(tag):
  45. for item in re.findall(self.pattern, tag.get('onclick')):
  46. if self.judge(self.onclick_filter(item)):
  47. href = self.urljoin(self.onclick_filter(item))
  48. if self.filter(href) and href not in urls:
  49. urls.append(href)
  50. return urls
  51. def get_items(self):
  52. items = []
  53. def _extract():
  54. name = (tag.text if len(tag.text) != 0 else None or tag.parent.text)
  55. name = "".join(name.split())
  56. if len(name) > 50:
  57. name = "{:.50s}".format(name)
  58. if tag.get('href') is None:
  59. return
  60. try:
  61. href = self.urljoin(tag.get('href'))
  62. except ValueError:
  63. return
  64. data = {'title': name, 'href': href}
  65. if self.filter(href) and data not in items:
  66. items.append(data)
  67. for tag in self.soup.find_all('a'):
  68. if self.judge(tag.get('href')):
  69. _extract()
  70. for tag in self.soup.find_all():
  71. if self._is_input_with_onclick(tag):
  72. for item in re.findall(self.pattern, tag.get('onclick')):
  73. if self.judge(self.onclick_filter(item)):
  74. _extract()
  75. return items