DomAnalysis.py

import re
from bs4 import BeautifulSoup
from crawler.analysis.FilterUrl import FilterUrl


class DomAnalysis(FilterUrl):
    """
    Beautiful Soup parses a complex HTML document into a tree of Python objects.
    Every node is one of four kinds:
        Tag
        NavigableString
        BeautifulSoup
        Comment
    """
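
    # Minimal sketch of the four node kinds (illustration only, not used by this class):
    #   soup = BeautifulSoup('<a href="/x">link<!--note--></a>', 'lxml')
    #   soup.a                # Tag
    #   soup.a.contents[0]    # NavigableString ('link')
    #   soup                  # BeautifulSoup (the whole document)
    #   soup.a.contents[1]    # Comment ('note')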

    def __init__(self, isogeny: bool, dom: str, host=None, request_url=None):
        self.soup = BeautifulSoup(dom, "lxml")
        self.pattern = re.compile("href=([a-zA-Z0-9'\"+?=.%/_]*)")
        self.isogeny = isogeny
        if self.isogeny:
            if host is None:
                raise TypeError(
                    '{} missing 1 required positional argument: {}'.format(
                        self.__class__.__name__, 'host')
                )
            self.host = host  # host part of the site URL
        else:
            if request_url is None:
                raise TypeError(
                    '{} missing 1 required positional argument: {}'.format(
                        self.__class__.__name__, 'request_url'
                    )
                )
            self.request_url = request_url  # URL of the current request
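
    # `isogeny` appears to toggle same-origin analysis: True requires the site
    # `host`, False requires the full `request_url` (see the checks above).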

    def show_html(self):
        # https://www.crummy.com/software/BeautifulSoup/bs3/documentation.zh.html
        # notes that prettify() returns a unicode string.
        print(self.soup.prettify().encode('utf-8', 'ignore'))

    def _is_input_with_onclick(self, tag):
        # <input type="button" onclick="..."> elements may generate links on click.
        return (tag.name == 'input') and (tag.get('type') == 'button') and tag.has_attr('onclick')

    def get_urls(self):
        urls = []
        # Static page link parsing (JavaScript-generated links are handled below).
        for tag in self.soup.find_all('a'):
            if self.judge(tag.get('href')):
                urls.append(self.filter(tag.get('href')))
        # Automatic interaction: extract interactively generated links with a
        # static-parsing approach, i.e. by scanning onclick handlers for href=.
        for tag in self.soup.find_all():
            if self._is_input_with_onclick(tag):
                for item in re.findall(self.pattern, tag.get('onclick')):
                    if self.judge(self.onclick_filter(item)):
                        urls.append(self.filter(self.onclick_filter(item)))
        return urls
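
    # Illustration of the onclick pattern (a sketch; onclick_filter comes from
    # FilterUrl and is assumed to strip the surrounding quotes):
    #   onclick="window.location.href='/page?id=1'"
    #   re.findall(self.pattern, ...)  ->  ["'/page?id=1'"]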

    def get_items(self):
        items = []

        def _extract(raw_href):
            # Use the tag's own text if present, otherwise fall back to its parent's text.
            name = tag.text if tag.text else tag.parent.text
            name = "".join(name.split())
            if len(name) > 50:
                name = "{:.50s}".format(name)
            if raw_href is None:
                return
            try:
                href = self.filter(raw_href)
            except ValueError:
                return
            data = {'title': name, 'href': href}
            if data not in items:
                items.append(data)

        # Static <a href=...> links.
        for tag in self.soup.find_all('a'):
            if self.judge(tag.get('href')):
                _extract(tag.get('href'))
        # Links generated by onclick handlers; <input> tags carry no href, so the
        # candidate URL is passed in explicitly.
        for tag in self.soup.find_all():
            if self._is_input_with_onclick(tag):
                for item in re.findall(self.pattern, tag.get('onclick')):
                    if self.judge(self.onclick_filter(item)):
                        _extract(self.onclick_filter(item))
        return items
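

# Minimal usage sketch (assumes FilterUrl provides judge/filter/onclick_filter,
# which are not shown in this file):
if __name__ == '__main__':
    html = '''
    <html><body>
      <a href="/about.html">About</a>
      <input type="button" onclick="window.location.href='/list.html'" value="Go">
    </body></html>
    '''
    analysis = DomAnalysis(isogeny=True, dom=html, host='example.com')
    print(analysis.get_urls())   # filtered URLs from <a> tags and onclick handlers
    print(analysis.get_items())  # [{'title': ..., 'href': ...}, ...]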