DomAnalysis.py 2.7 KB

import re

from bs4 import BeautifulSoup

from crawler.analysis.FilterUrl import FilterUrl


class DomAnalysis(FilterUrl):
    """
    Beautiful Soup parses a complex HTML document into a tree structure in which
    every node is a Python object. All objects fall into one of four kinds:
        Tag
        NavigableString
        BeautifulSoup
        Comment
    """

    def __init__(self, isogeny: bool, dom: str, addr: str):
        self.pattern = re.compile("href=([a-zA-Z0-9'\"+?=.%/_]*)")
        self.isogeny = isogeny
        self.soup = BeautifulSoup(dom, "lxml")
        self.addr = addr  # address the page was requested from

    def show_html(self):
        # https://www.crummy.com/software/BeautifulSoup/bs3/documentation.zh.html
        # prettify() returns a unicode string, hence the explicit encode.
        print(self.soup.prettify().encode('utf-8', 'ignore'))

    def _is_input_with_onclick(self, tag):
        return (tag.name == 'input') and (tag.get('type') == 'button') and tag.has_attr('onclick')

    def get_urls(self):
        urls = []
        # Static link extraction from <a> tags.
        for tag in self.soup.find_all('a'):
            if self.judge(tag.get('href')):
                href = self.urljoin(tag.get('href'))
                if self.filter(href) and href not in urls:
                    urls.append(href)
        # Automatic interaction: links generated by onclick handlers are
        # extracted with the same static-parsing approach.
        for tag in self.soup.find_all():
            if self._is_input_with_onclick(tag):
                for item in re.findall(self.pattern, tag.get('onclick')):
                    if self.judge(self.onclick_filter(item)):
                        href = self.urljoin(self.onclick_filter(item))
                        if self.filter(href) and href not in urls:
                            urls.append(href)
        return urls

    def get_items(self):
        items = []

        def _extract():
            # Closes over `tag` from the loops below.
            name = tag.text if len(tag.text) != 0 else tag.parent.text
            name = "".join(name.split())
            if len(name) > 50:
                name = "{:.50s}".format(name)
            if tag.get('href') is None:
                return
            try:
                href = self.urljoin(tag.get('href'))
            except ValueError:
                return
            data = {'title': name, 'href': href}
            if self.filter(href) and data not in items:
                items.append(data)

        for tag in self.soup.find_all('a'):
            if self.judge(tag.get('href')):
                _extract()
        for tag in self.soup.find_all():
            if self._is_input_with_onclick(tag):
                for item in re.findall(self.pattern, tag.get('onclick')):
                    if self.judge(self.onclick_filter(item)):
                        _extract()
        return items
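
A minimal usage sketch, assuming the FilterUrl base class (defined in crawler/analysis/FilterUrl.py, not shown here) provides the judge, urljoin, filter, and onclick_filter helpers used above; the HTML snippet, URL, and expected output are made up for illustration, and the meaning of isogeny (presumably a same-origin restriction) is inferred from its name.

# Hypothetical usage; results depend on how FilterUrl filters and normalizes URLs.
html = """
<html><body>
  <a href="/about.html">About us</a>
  <input type="button" value="Detail" onclick="location.href='detail.php?id=1'"/>
</body></html>
"""

dom = DomAnalysis(isogeny=True, dom=html, addr="http://example.com/index.html")
print(dom.get_urls())   # e.g. ['http://example.com/about.html', 'http://example.com/detail.php?id=1']
print(dom.get_items())  # e.g. [{'title': 'Aboutus', 'href': 'http://example.com/about.html'}]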