|
@@ -68,11 +68,12 @@ def tag_date_total(node: HtmlElement, tag=None):
|
|
|
|
|
|
|
|
|
def remove_ancestors_date_tag(node: HtmlElement):
|
|
|
- # tag = node.tag.lower() # TODO 是否一定需要后裔节点的类型
|
|
|
prev_node = node
|
|
|
# 情况1: 先辈节点下直接放置全部的时间节点,直接删除
|
|
|
remove_count = 0
|
|
|
for ancestor in node.iterancestors():
|
|
|
+ if ancestor.tag.lower() in ['body', 'html']:
|
|
|
+ continue
|
|
|
is_remove = False
|
|
|
total = tag_date_total(ancestor)
|
|
|
# print("ancestors_date_tag >>> ", ancestor.tag, ancestor.attrib, total)
|
|
@@ -162,24 +163,32 @@ def strip_node(element: HtmlElement):
|
|
|
:return:
|
|
|
"""
|
|
|
for node, _ in iter_node(element):
|
|
|
- # 节点文本(剔除空白、换行、回车符号)
|
|
|
- text = "".join("".join(node.xpath('./text()')).split())
|
|
|
# 删除掉没有文本、发布时间、发布标题、href属性的节点及父节点
|
|
|
if node.tag.lower() != 'a':
|
|
|
+ # 节点文本(剔除左右空白、换行、回车符号)
|
|
|
+ text = "".join("".join(node.xpath('./text()')).strip())
|
|
|
# 关键词文本
|
|
|
non_title = is_title(text) is False
|
|
|
# 后裔a节点数量
|
|
|
- total_tag_gt_0 = len(list(node.iterdescendants('a'))) == 0
|
|
|
+ sub_tag_gt_0 = len(list(node.iterdescendants('a'))) == 0
|
|
|
# 时间文本
|
|
|
publish_time = TimeExtractor().extractor(node)
|
|
|
# print('>>> ', node.tag, node.attrib, text)
|
|
|
- if non_title and total_tag_gt_0 and publish_time == '':
|
|
|
+ if non_title and sub_tag_gt_0 and publish_time == '':
|
|
|
# print('strip_node >>> ', node.tag, node.attrib, text)
|
|
|
parent = node.getparent()
|
|
|
- if parent is not None and parent.tag.lower() == 'a':
|
|
|
- etree.strip_tags(parent, node.tag)
|
|
|
- elif parent is not None and parent.tag.lower() == 'td':
|
|
|
- remove_node(parent)
|
|
|
+ if parent is not None:
|
|
|
+ if parent.tag.lower() == 'a':
|
|
|
+ etree.strip_tags(parent, node.tag)
|
|
|
+ elif parent.tag.lower() == 'td':
|
|
|
+ if not node.getchildren():
|
|
|
+ if len(text) == 0:
|
|
|
+ remove_node(parent)
|
|
|
+ else:
|
|
|
+ etree.strip_tags(parent, node.tag)
|
|
|
+ else:
|
|
|
+ name = [child.tag for child in node.getchildren()]
|
|
|
+ etree.strip_tags(parent, *name)
|
|
|
else:
|
|
|
remove_node(node)
|
|
|
|
|
@@ -262,8 +271,7 @@ def extract_data(source, base_url):
|
|
|
data = []
|
|
|
for node in child.iterdescendants('a'):
|
|
|
title = extract_text(node)
|
|
|
- href = node.attrib.get('href')
|
|
|
- href = urljoin(base_url, href)
|
|
|
+ href = urljoin(base_url, node.attrib.get('href'))
|
|
|
if is_title(title) and len(title) <= 15:
|
|
|
item = (title, href)
|
|
|
data.append(item)
|