# Source: htmltag.py (2.6 KB)
  1. # coding:utf-8
  2. import re
  3. br_reg = re.compile('<br[/]*>', re.I)
  4. table_reg = re.compile('<([/]*table[^>]*)>', re.I)
  5. tablebody_reg = re.compile('<([/]*tbody[^>]*)>', re.I)
  6. input_reg = re.compile(r'<[/]*input[^>].*?value="(.*?)"[/]>', re.I)
  7. tr_reg = re.compile('<([/]*tr[^>]*)>', re.I)
  8. th_reg = re.compile('<([/]*th[^>]*)>', re.I)
  9. td_reg = re.compile('<([/]*td[^>]*)>', re.I)
  10. p_reg = re.compile('<[/]?p>', re.I)
  11. othertag_reg = re.compile('<[^>]+>', re.I | re.M)
  12. other_symbol_reg = re.compile('[\t| ]*')
  13. seg_first_space_reg = re.compile('\n+\\s*', re.M)
  14. mul_crcf_reg = re.compile('\n+', re.M)
  15. brackets_reg = re.compile('\\s+')
  16. table_fk_reg = re.compile('(\\[table[^\\]]*\\])(.*?)(\\[/table\\])', re.M | re.S | re.I)
  17. zhushi_reg = re.compile('<!--.*?-->', re.I)
  18. ##html标签清理
  19. def Clean(html: str):
  20. html = br_reg.sub('\n', html)
  21. html = table_reg.sub('', html)
  22. html = tablebody_reg.sub('', html)
  23. html = tr_reg.sub('\n', html)
  24. html = td_reg.sub(' ', html)
  25. html = p_reg.sub('\n', html)
  26. html = othertag_reg.sub('', html)
  27. html = other_symbol_reg.sub('', html)
  28. html = seg_first_space_reg.sub('\n', html)
  29. html = mul_crcf_reg.sub('\n', html)
  30. return html
  31. def ClearSpace(txt: str):
  32. return brackets_reg.sub('', txt)
  33. ##html标签清理,但保留table表格
  34. def CleanKeepTable(html: str):
  35. html = zhushi_reg.sub('', html)
  36. html = br_reg.sub('\n', html)
  37. html = table_reg.sub(subFunc4Match, html)
  38. html = tablebody_reg.sub(subFunc4Match, html)
  39. html = tr_reg.sub(subFunc4Match, html)
  40. html = td_reg.sub(subFunc4Match, html)
  41. html = th_reg.sub(subFunc4Match, html)
  42. html = p_reg.sub('\n', html)
  43. html = othertag_reg.sub('', html)
  44. html = seg_first_space_reg.sub('\n', html)
  45. # print("-->", html)
  46. html = table_fk_reg.sub(lambda x: x.group(1) + mul_crcf_reg.sub(' ', x.group(2)) + x.group(3), html)
  47. html = mul_crcf_reg.sub('\n', html)
  48. # 清理table标签中的空格
  49. html = html.replace('[', '<').replace(']', '>')
  50. html = html.replace('<table', '\n<table').replace('</table>', '</table>\n')
  51. return html
  52. def subFunc4Match(strmatch):
  53. try:
  54. if strmatch:
  55. return '[%s]' % (strmatch.group(1))
  56. else:
  57. return ""
  58. except Exception as e:
  59. print(e)
  60. def extract_input_value(html):
  61. input_reg = re.compile(r'<[/]*input[^>].*?value="(.*?)"[/]>', re.I)
  62. input_r = re.compile(r'<[/]*input[^>].*?[/]>', re.I)
  63. result = input_r.findall(html)
  64. for input_detail in result:
  65. ret = input_reg.findall(input_detail)
  66. if ret:
  67. html = html.replace(input_detail, f"</td><td>{ret[0]}")
  68. return html