dongzhaorui 3 years ago
parent
commit
f8467b7d4b
1 changed file with 2 additions and 1 deletion
  1. find_source/common/tools.py (+2 -1)

+ 2 - 1
find_source/common/tools.py

@@ -11,9 +11,10 @@ def element2html(element: HtmlElement) -> str:
 
 
 def html2element(html_str: str) -> HtmlElement:
+    html_str = re.sub('\ufeff|\xa0|\u3000', '', html_str)
     html_str = re.sub('</?br.*?>', '', html_str)
     html_str = re.sub(r'<\?xml.*?>', '', html_str)
-    html_str = re.sub(r'<DOCTYPE.*?>', '', html_str)
+    html_str = re.sub(r'<[!]DOCTYPE.*?>', '', html_str)
     return fromstring(html_str)