htmltag.py 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778
  1. # coding:utf-8
  2. import re
  3. br_reg = re.compile('<br[/]*>', re.I)
  4. table_reg = re.compile('<([/]*table[^>]*)>', re.I)
  5. tablebody_reg = re.compile('<([/]*tbody[^>]*)>', re.I)
  6. input_reg = re.compile(r'<[/]*input[^>].*?value="(.*?)"[/]>', re.I)
  7. tr_reg = re.compile('<([/]*tr[^>]*)>', re.I)
  8. th_reg = re.compile('<([/]*th[^>]*)>', re.I)
  9. td_reg = re.compile('<([/]*td[^>]*)>', re.I)
  10. p_reg = re.compile('<[/]?p>', re.I)
  11. othertag_reg = re.compile('<[^>]+>', re.I | re.M)
  12. other_symbol_reg = re.compile('[\t| ]*')
  13. seg_first_space_reg = re.compile('\n+\\s*', re.M)
  14. mul_crcf_reg = re.compile('\n+', re.M)
  15. brackets_reg = re.compile('\\s+')
  16. table_fk_reg = re.compile('(\\[table[^\\]]*\\])(.*?)(\\[/table\\])', re.M | re.S | re.I)
## Clean up HTML tags
  18. def Clean(html: str):
  19. html = br_reg.sub('\n', html)
  20. html = table_reg.sub('', html)
  21. html = tablebody_reg.sub('', html)
  22. html = tr_reg.sub('\n', html)
  23. html = td_reg.sub(' ', html)
  24. html = p_reg.sub('\n', html)
  25. html = othertag_reg.sub('', html)
  26. html = other_symbol_reg.sub('', html)
  27. html = seg_first_space_reg.sub('\n', html)
  28. html = mul_crcf_reg.sub('\n', html)
  29. return html
  30. def ClearSpace(txt: str):
  31. return brackets_reg.sub('', txt)
## Clean up HTML tags, but keep the table markup
  33. def CleanKeepTable(html: str):
  34. html = br_reg.sub('\n', html)
  35. html = table_reg.sub(subFunc4Match, html)
  36. html = tablebody_reg.sub(subFunc4Match, html)
  37. html = tr_reg.sub(subFunc4Match, html)
  38. html = td_reg.sub(subFunc4Match, html)
  39. html = th_reg.sub(subFunc4Match, html)
  40. html = p_reg.sub('\n', html)
  41. html = othertag_reg.sub('', html)
  42. # html = other_symbol_reg.sub('',html)
  43. html = seg_first_space_reg.sub('\n', html)
  44. # print("-->", html)
  45. html = table_fk_reg.sub(lambda x: x.group(1) + mul_crcf_reg.sub(' ', x.group(2)) + x.group(3), html)
  46. html = mul_crcf_reg.sub('\n', html)
  47. # 清理table标签中的空格
  48. html = html.replace('[', '<').replace(']', '>')
  49. html = html.replace('<table', '\n<table').replace('</table>', '</table>\n')
  50. return html
  51. def subFunc4Match(strmatch):
  52. try:
  53. if strmatch:
  54. return '[%s]' % (strmatch.group(1))
  55. else:
  56. return ""
  57. except Exception as e:
  58. print(e)
  59. def extract_input_value(html):
  60. input_reg = re.compile(r'<[/]*input[^>].*?value="(.*?)"[/]>', re.I)
  61. input_r = re.compile(r'<[/]*input[^>].*?[/]>', re.I)
  62. result = input_r.findall(html)
  63. for input_detail in result:
  64. ret = input_reg.findall(input_detail)
  65. if ret:
  66. html = html.replace(input_detail, f"</td><td>{ret[0]}")
  67. return html