3 ani în urmă · 39bbba5e29
--- a/find_source/crawler/services/channel.py
+++ b/find_source/crawler/services/channel.py
@@ -68,11 +68,12 @@ def tag_date_total(node: HtmlElement, tag=None):
 
				 
			
 
				 
			
 
				 def remove_ancestors_date_tag(node: HtmlElement):
			
 
				-    # tag = node.tag.lower()  # TODO 是否一定需要后裔节点的类型
			
 
				     prev_node = node
			
 
				     # 情况1: 先辈节点下直接放置全部的时间节点，直接删除
			
 
				     remove_count = 0
			
 
				     for ancestor in node.iterancestors():
			
 
				+        if ancestor.tag.lower() in ['body', 'html']:
			
 
				+            continue
			
 
				         is_remove = False
			
 
				         total = tag_date_total(ancestor)
			
 
				         # print("ancestors_date_tag >>> ", ancestor.tag, ancestor.attrib, total)
			
@@ -162,24 +163,32 @@ def strip_node(element: HtmlElement):
 
				     :return:
			
 
				     """
			
 
				     for node, _ in iter_node(element):
			
 
				-        # 节点文本（剔除空白、换行、回车符号）
			
 
				-        text = "".join("".join(node.xpath('./text()')).split())
			
 
				         # 删除掉没有文本、发布时间、发布标题、href属性的节点及父节点
			
 
				         if node.tag.lower() != 'a':
			
 
				+            # 节点文本（剔除左右空白、换行、回车符号）
			
 
				+            text = "".join("".join(node.xpath('./text()')).strip())
			
 
				             # 关键词文本
			
 
				             non_title = is_title(text) is False
			
 
				             # 后裔a节点数量
			
 
				-            total_tag_gt_0 = len(list(node.iterdescendants('a'))) == 0
			
 
				+            sub_tag_gt_0 = len(list(node.iterdescendants('a'))) == 0
			
 
				             # 时间文本
			
 
				             publish_time = TimeExtractor().extractor(node)
			
 
				             # print('>>> ', node.tag, node.attrib, text)
			
 
				-            if non_title and total_tag_gt_0 and publish_time == '':
			
 
				+            if non_title and sub_tag_gt_0 and publish_time == '':
			
 
				                 # print('strip_node >>> ', node.tag, node.attrib, text)
			
 
				                 parent = node.getparent()
			
 
				-                if parent is not None and parent.tag.lower() == 'a':
			
 
				-                    etree.strip_tags(parent, node.tag)
			
 
				-                elif parent is not None and parent.tag.lower() == 'td':
			
 
				-                    remove_node(parent)
			
 
				+                if parent is not None:
			
 
				+                    if parent.tag.lower() == 'a':
			
 
				+                        etree.strip_tags(parent, node.tag)
			
 
				+                    elif parent.tag.lower() == 'td':
			
 
				+                        if not node.getchildren():
			
 
				+                            if len(text) == 0:
			
 
				+                                remove_node(parent)
			
 
				+                            else:
			
 
				+                                etree.strip_tags(parent, node.tag)
			
 
				+                        else:
			
 
				+                            name = [child.tag for child in node.getchildren()]
			
 
				+                            etree.strip_tags(parent, *name)
			
 
				                 else:
			
 
				                     remove_node(node)
			
 
				 
			
@@ -262,8 +271,7 @@ def extract_data(source, base_url):
 
				         data = []
			
 
				         for node in child.iterdescendants('a'):
			
 
				             title = extract_text(node)
			
 
				-            href = node.attrib.get('href')
			
 
				-            href = urljoin(base_url, href)
			
 
				+            href = urljoin(base_url, node.attrib.get('href'))
			
 
				             if is_title(title) and len(title) <= 15:
			
 
				                 item = (title, href)
			
 
				                 data.append(item)