|
@@ -335,12 +335,11 @@ def get_construction_area(project_scale):
|
|
|
return construction_area
|
|
|
|
|
|
|
|
|
-# 过滤详情页无效数据
|
|
|
def remove_htmldata(remove_info_list: list, html: str, response):
    """Strip unwanted fragments from a detail page's HTML text.

    Each entry of ``remove_info_list`` is interpreted by its prefix:

    * starts with ``//``   -> XPath, evaluated via ``response.xpath(...)``;
      every extracted string is removed from ``html``.
    * starts with ``<re>`` -> regular expression (the ``<re>`` marker is
      dropped); it is searched in ``html`` with ``re.S | re.I | re.M`` and
      every hit is removed.  As with ``re.findall``, a pattern without
      groups yields whole matches, while a pattern with groups yields only
      the captured text.
    * anything else        -> literal text removed from ``html``.

    Removal is done with ``str.replace``, so every occurrence of a fragment
    is deleted, not only the one that was located.

    Args:
        remove_info_list: XPath / ``<re>``-prefixed regex / literal strings
            describing what to delete.  Empty or ``None`` entries are skipped.
        html: Text to clean.
        response: Original response object; only used for XPath entries and
            must then provide ``xpath(expr).extract()`` (presumably a Scrapy
            response -- confirm against callers).

    Returns:
        The cleaned text.  ``html`` is returned unchanged when it or
        ``remove_info_list`` is empty.

    Raises:
        re.error: If a ``<re>`` entry is not a valid regular expression.
    """
    if not html or not remove_info_list:
        return html

    regex_marker = '<re>'
    for extra_item in remove_info_list:
        if not extra_item:
            # Nothing to remove for None / empty entries.
            continue

        if extra_item.startswith('//'):
            fragments = response.xpath(extra_item).extract()
        elif extra_item.startswith(regex_marker):
            # Drop only the leading marker; a later literal "<re>" inside the
            # pattern belongs to the pattern itself.
            pattern = extra_item[len(regex_marker):]
            fragments = []
            for found in re.findall(pattern, html, re.S | re.I | re.M):
                # With several capture groups findall yields tuples, which
                # str.replace cannot take -- flatten them into plain strings.
                if isinstance(found, tuple):
                    fragments.extend(found)
                else:
                    fragments.append(found)
        else:
            fragments = [extra_item]

        for fragment in fragments:
            # Skip empty hits (e.g. groups that did not participate).
            if fragment:
                html = html.replace(fragment, '')

    return html
|