3 năm trước · c6790f398a
--- a/zbytb/config/conf.yaml
+++ b/zbytb/config/conf.yaml
@@ -24,10 +24,10 @@ ali_oss:
 
				 
			
 
				 
			
 
				 es:
			
 
				-  host: 172.17.4.184
			
 
				+  host: 172.17.145.178
			
 
				 #  host: 127.0.0.1
			
 
				 #  host: 192.168.3.206
			
 
				-  port: !!int 19800
			
 
				+  port: !!int 9800
			
 
				   db: biddingall # es库别名
			
 
				 
			
 
				 
			
--- a/zbytb/crawler/clean_html.py
+++ b/zbytb/crawler/clean_html.py
@@ -129,6 +129,9 @@ def cleaner(html, special=None, completely=False):
 
				         html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # 内框架
			
 
				         html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
			
 
				 
			
 
				+    if html:
			
 
				+        html = re.sub(r'([,|.|。|，|；|;|?|&|$|#|@|!|！|%|*|\'|"|‘|’|“|￥|？| ]*?)$', "", html.strip())  # 清除文本末尾符号
			
 
				+
			
 
				     html = _escape_character(html)
			
 
				     return html
			
 
				 
			
--- a/zbytb/crawler/spiders/ListPageSpider.py
+++ b/zbytb/crawler/spiders/ListPageSpider.py
@@ -1,4 +1,5 @@
 
				 import time
			
 
				+import re
			
 
				 from concurrent.futures import ThreadPoolExecutor, wait
			
 
				 from urllib.parse import urlencode
			
 
				 
			
@@ -56,8 +57,11 @@ class CrawlListPageSpider:
 
				         for node in nodes:
			
 
				             publish_time = str(node.xpath("./td[4]/text()")[0])
			
 
				             l_np_publishtime = int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))
			
 
				+            title = node.xpath("./td[2]/a/text()")[0]
			
 
				+            if title:
			
 
				+                title = re.sub(r'([,|.|。|，|；|;|?|&|$|#|@|!|！|%|*|\'|"|‘|’|“|￥|？| ]*?)$', "", title.strip())
			
 
				             info = {
			
 
				-                "title": node.xpath("./td[2]/a/text()")[0],
			
 
				+                "title": title,
			
 
				                 "competehref": node.xpath("./td[2]/a/@href")[0],
			
 
				                 "area": node.xpath("./td[1]/a/text()")[0],
			
 
				                 "publishtime": publish_time,