dongzhaorui 3 ani în urmă
părinte
comite
39bbba5e29
1 fișier modificat, cu 19 adăugiri și 11 ștergeri
  1. 19 11
      find_source/crawler/services/channel.py

+ 19 - 11
find_source/crawler/services/channel.py

@@ -68,11 +68,12 @@ def tag_date_total(node: HtmlElement, tag=None):
 
 
 def remove_ancestors_date_tag(node: HtmlElement):
-    # tag = node.tag.lower()  # TODO 是否一定需要后裔节点的类型
     prev_node = node
     # 情况1: 先辈节点下直接放置全部的时间节点,直接删除
     remove_count = 0
     for ancestor in node.iterancestors():
+        if ancestor.tag.lower() in ['body', 'html']:
+            continue
         is_remove = False
         total = tag_date_total(ancestor)
         # print("ancestors_date_tag >>> ", ancestor.tag, ancestor.attrib, total)
@@ -162,24 +163,32 @@ def strip_node(element: HtmlElement):
     :return:
     """
     for node, _ in iter_node(element):
-        # 节点文本(剔除空白、换行、回车符号)
-        text = "".join("".join(node.xpath('./text()')).split())
         # 删除掉没有文本、发布时间、发布标题、href属性的节点及父节点
         if node.tag.lower() != 'a':
+            # 节点文本(剔除左右空白、换行、回车符号)
+            text = "".join("".join(node.xpath('./text()')).strip())
             # 关键词文本
             non_title = is_title(text) is False
             # 后裔a节点数量
-            total_tag_gt_0 = len(list(node.iterdescendants('a'))) == 0
+            sub_tag_gt_0 = len(list(node.iterdescendants('a'))) == 0
             # 时间文本
             publish_time = TimeExtractor().extractor(node)
             # print('>>> ', node.tag, node.attrib, text)
-            if non_title and total_tag_gt_0 and publish_time == '':
+            if non_title and sub_tag_gt_0 and publish_time == '':
                 # print('strip_node >>> ', node.tag, node.attrib, text)
                 parent = node.getparent()
-                if parent is not None and parent.tag.lower() == 'a':
-                    etree.strip_tags(parent, node.tag)
-                elif parent is not None and parent.tag.lower() == 'td':
-                    remove_node(parent)
+                if parent is not None:
+                    if parent.tag.lower() == 'a':
+                        etree.strip_tags(parent, node.tag)
+                    elif parent.tag.lower() == 'td':
+                        if not node.getchildren():
+                            if len(text) == 0:
+                                remove_node(parent)
+                            else:
+                                etree.strip_tags(parent, node.tag)
+                        else:
+                            name = [child.tag for child in node.getchildren()]
+                            etree.strip_tags(parent, *name)
                 else:
                     remove_node(node)
 
@@ -262,8 +271,7 @@ def extract_data(source, base_url):
         data = []
         for node in child.iterdescendants('a'):
             title = extract_text(node)
-            href = node.attrib.get('href')
-            href = urljoin(base_url, href)
+            href = urljoin(base_url, node.attrib.get('href'))
             if is_title(title) and len(title) <= 15:
                 item = (title, href)
                 data.append(item)