DomAnalysis.py 2.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
  1. import re
  2. from bs4 import BeautifulSoup
  3. from crawler.analysis.FilterUrl import FilterUrl
  class DomAnalysis(FilterUrl):
      """Extract candidate links from an HTML document parsed by Beautiful Soup.

      Beautiful Soup converts a complex HTML document into a tree structure in
      which every node is a Python object. All of those objects fall into four
      kinds:

      * Tag
      * NavigableString
      * BeautifulSoup
      * Comment

      NOTE(review): ``judge``, ``filter`` and ``onclick_filter`` are called on
      ``self`` but are not defined in this class; they are presumably provided
      by ``FilterUrl`` -- confirm their contracts there.
      """

      def __init__(self, dom, base_url):
          """Parse *dom* with the ``lxml`` parser and remember *base_url*.

          Args:
              dom: HTML markup handed to ``BeautifulSoup``.
              base_url: Stored unchanged on ``self.url``.
          """
          self.soup = BeautifulSoup(dom, "lxml")
          self.url = base_url
          # Captures the value that follows ``href=`` inside an onclick handler.
          self.pattern = re.compile("href=([a-zA-Z0-9'\"+?=.%/_]*)")

      def show_html(self):
          """Print the prettified document, encoded as UTF-8 bytes."""
          # prettify() returns a unicode string, see
          # https://www.crummy.com/software/BeautifulSoup/bs3/documentation.zh.html
          print(self.soup.prettify().encode('utf-8', 'ignore'))

      def _is_input_with_onclick(self, tag):
          """Return True for ``<input type="button">`` tags that have ``onclick``."""
          return (
              tag.name == 'input'
              and tag.get('type') == 'button'
              and tag.has_attr('onclick')
          )

      def _iter_link_candidates(self):
          """Yield ``(tag, href)`` for every candidate accepted by ``self.judge``.

          Two sources are scanned, in this order:

          1. Static page links: the ``href`` attribute of every ``<a>`` tag.
          2. Automatic interaction: links that would be produced by clicking a
             button are recovered statically by matching ``self.pattern``
             against the ``onclick`` attribute and passing each match through
             ``self.onclick_filter``.
          """
          for tag in self.soup.find_all('a'):
              href = tag.get('href')
              if self.judge(href):
                  yield tag, href

          for tag in self.soup.find_all(self._is_input_with_onclick):
              for match in self.pattern.findall(tag.get('onclick')):
                  href = self.onclick_filter(match)
                  if self.judge(href):
                      yield tag, href

      def get_urls(self):
          """Return ``self.filter(href)`` for every accepted link candidate.

          Returns:
              list: One entry per candidate, in document order (anchors first,
              then onclick buttons). Duplicates are kept.
          """
          return [self.filter(href) for _, href in self._iter_link_candidates()]

      def get_items(self):
          """Return de-duplicated ``{'name': ..., 'host': ...}`` link records.

          ``name`` is the tag's text (or its parent's text when the tag has
          none) with all whitespace removed and truncated to 50 characters.
          ``host`` is ``self.filter(href)``.

          Links found in ``onclick`` handlers use the href extracted from the
          handler, not the ``href`` attribute of the ``<input>`` tag (which is
          normally absent and previously caused these links to be dropped).

          Returns:
              list: Records in first-seen order, without duplicates.
          """
          items = []
          for tag, href in self._iter_link_candidates():
              if href is None:
                  # Nothing to record without a target.
                  continue
              label = "".join((tag.text or tag.parent.text).split())[:50]
              record = {'name': label, 'host': self.filter(href)}
              if record not in items:
                  items.append(record)
          return items