Explorar o código

更新清除无效正文内容工具

lizongze hai 2 anos
pai
achega
aa01aed0fe
Modificouse 1 ficheiro con 14 adicións e 6 borrados
  1. 14 6
      FworkSpider/untils/tools.py

+ 14 - 6
FworkSpider/untils/tools.py

@@ -335,12 +335,11 @@ def get_construction_area(project_scale):
     return construction_area
 
 
-# 过滤详情页无效数据
 def remove_htmldata(remove_info_list:list, html:str, response):
     """
-
+        过滤详情页无效数据
     Args:
-        remove_info_list: 需删除内容的xpath或文本 -> list
+        remove_info_list: 需删除内容的xpath或文本 -> list [xpath,re,str]
         html: 待清洗文本
         response: 原文响应体
 
@@ -350,9 +349,18 @@ def remove_htmldata(remove_info_list:list, html:str, response):
     if html and remove_info_list:
         for extra_item in remove_info_list:
             if re.search('^//.*', extra_item):
-                extra_html = response.xpath(extra_item).extract_first()
+                extra_html_list = response.xpath(extra_item).extract()
+                for extra_html in extra_html_list:
+                    if extra_html:
+                        html = html.replace(extra_html, '')
+            elif re.search('^<re>.*', extra_item):
+                extra_item = extra_item.replace('<re>','')
+                extra_html_list = re.findall(f'{extra_item}',html,re.S|re.I|re.M)
+                if extra_html_list:
+                    for exhtml in extra_html_list:
+                        html = html.replace(exhtml, '')
             else:
                 extra_html = extra_item
-            if extra_html:
-                html = html.replace(extra_html, '')
+                if extra_html:
+                    html = html.replace(extra_html, '')
     return html