Browse Source

正文、标题末尾标点符号清洗,es配置修改

lizongze 3 năm trước
mục cha
commit
c6790f398a

+ 2 - 2
zbytb/config/conf.yaml

@@ -24,10 +24,10 @@ ali_oss:
 
 
 es:
-  host: 172.17.4.184
+  host: 172.17.145.178
 #  host: 127.0.0.1
 #  host: 192.168.3.206
-  port: !!int 19800
+  port: !!int 9800
   db: biddingall # es库别名
 
 

+ 3 - 0
zbytb/crawler/clean_html.py

@@ -129,6 +129,9 @@ def cleaner(html, special=None, completely=False):
         html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # 内框架
         html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
 
+    if html:
+        html = re.sub(r'([,|.|。|,|;|;|?|&|$|#|@|!|!|%|*|\'|"|‘|’|“|¥|?| ]*?)$', "", html.strip())  # 清除文本末尾符号
+
     html = _escape_character(html)
     return html
 

+ 5 - 1
zbytb/crawler/spiders/ListPageSpider.py

@@ -1,4 +1,5 @@
 import time
+import re
 from concurrent.futures import ThreadPoolExecutor, wait
 from urllib.parse import urlencode
 
@@ -56,8 +57,11 @@ class CrawlListPageSpider:
         for node in nodes:
             publish_time = str(node.xpath("./td[4]/text()")[0])
             l_np_publishtime = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))
+            title = node.xpath("./td[2]/a/text()")[0]
+            if title:
+                title = re.sub(r'([,|.|。|,|;|;|?|&|$|#|@|!|!|%|*|\'|"|‘|’|“|¥|?| ]*?)$', "", title.strip())
             info = {
-                "title": node.xpath("./td[2]/a/text()")[0],
+                "title": title,
                 "competehref": node.xpath("./td[2]/a/@href")[0],
                 "area": node.xpath("./td[1]/a/text()")[0],
                 "publishtime": publish_time,