
Clean titles and optimize ES retrieval

lizongze · 1 year ago · commit bc6eaee086
4 files changed, 67 insertions(+), 12 deletions(-)
  1. ybw/list_spider.py (+2 -2)
  2. ybw/utils/databases.py (+7 -10)
  3. ybw/utils/title_participle.py (+44 -0)
  4. ybw/utils/tools.py (+14 -0)

+ 2 - 2
ybw/list_spider.py

@@ -14,7 +14,7 @@ from utils.databases import mongo_table, int2long, es_query, redis_client
 from utils.execptions import CrawlError, YbwCrawlError
 from utils.log import logger
 from utils.socks5 import Proxy
-from utils.tools import sha1
+from utils.tools import sha1, clean_title
 
 CrawlMenu = namedtuple('CrawlMenu', ['channel', 'spidercode', 'table_type'])
 
@@ -115,7 +115,7 @@ class ListSpider:
             if '-' not in publish_time:
                 publish_time = "".join(node.xpath('./td[6]/text()')).strip()
             area = "".join("".join(node.xpath('./td[5]/text()')).split())
-            title = "".join("".join(node.xpath('./td[2]/a/text()')).split())
+            title = clean_title("".join("".join(node.xpath('./td[2]/a/text()')).split()))
             competehref = 'https://www.chinabidding.cn{}'.format("".join(node.xpath('./td[2]/a/@href')))
             item = {
                 "site": "元博网(采购与招标网)",

+ 7 - 10
ybw/utils/databases.py

@@ -2,6 +2,7 @@ import bson
 import pymongo
 import redis
 from elasticsearch import Elasticsearch
+from utils.title_participle import get_should
 
 from config.load import mongo_conf, redis_conf, es_conf
 
@@ -65,20 +66,16 @@ def es_query(title: str, publish_time: int):
     client = es_client()
 stime = publish_time - 432000  # 5 days earlier
     etime = publish_time + 432000
+
+    time_limit = {"range": {'publishtime': {"from": stime, "to": etime}}}
+    should_list = get_should(title)   # tokenize the title to build the should clauses
+    should_list.append(time_limit)
     # query by publish title and publish-time range
     query = {
         "query": {
             "bool": {
-                "must": [
-                    {
-                        "multi_match": {
-                            "query": title,
-                            "type": "phrase",
-                            "fields": ["title","detail"]
-                        }
-                    },
-                    {"range": {'publishtime': {"from": stime, "to": etime}}}
-                ]
+                "should": should_list,
+                "minimum_should_match": "10<90%",
             }
         }
     }
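
The reworked query replaces the single phrase match on the full title with one phrase clause per token plus the time range, all under "should". With "minimum_should_match": "10<90%", every clause is required when there are 10 or fewer, and 90% of them when there are more; note the range clause is itself one of the should clauses, so it counts toward that percentage. A sketch of the body es_query builds, assuming ik_smart splits the title into the four tokens shown (tokens and timestamps are illustrative):

    publish_time = 1696000000  # example epoch seconds
    stime, etime = publish_time - 432000, publish_time + 432000

    query = {
        "query": {
            "bool": {
                "should": [
                    {"multi_match": {"query": "某某学校", "type": "phrase", "fields": ["title"]}},
                    {"multi_match": {"query": "综合楼", "type": "phrase", "fields": ["title"]}},
                    {"multi_match": {"query": "施工", "type": "phrase", "fields": ["title"]}},
                    {"multi_match": {"query": "招标公告", "type": "phrase", "fields": ["title"]}},
                    {"range": {"publishtime": {"from": stime, "to": etime}}},
                ],
                "minimum_should_match": "10<90%",
            }
        }
    }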

+ 44 - 0
ybw/utils/title_participle.py

@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2023-10-10 
+---------
+@summary: tokenize the title and build the ES query clauses
+---------
+@author: Lzz
+"""
+import requests
+import json
+
+
+def get_should(title):
+
+    url = "http://172.17.145.178:9200/_analyze"
+
+    headers = {"Content-Type": "application/json"}
+
+    data = {
+        "analyzer": "ik_smart",
+        "text": title
+    }
+
+    res = requests.post(url, headers=headers, json=data, timeout=10)
+
+    try:
+        res_text = json.loads(res.text).get('tokens') or [{"token": title}]
+    except Exception:
+        res_text = [{"token": title}]  # analyzer unreachable: fall back to the whole title
+
+    should_list = []
+    for key in res_text:
+        single_dict = {
+            "multi_match": {
+                "query": f"{key.get('token')}",
+                "type": "phrase",
+                "fields": [
+                    "title"
+                ]
+            }
+        }
+        should_list.append(single_dict)
+
+    return should_list
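
A usage sketch for get_should; it needs network access to the internal ES node at 172.17.145.178, and the token split shown is only what ik_smart typically produces for a title like this:

    from utils.title_participle import get_should

    should_list = get_should("某某学校综合楼施工招标公告")
    # Roughly:
    # [{"multi_match": {"query": "某某学校", "type": "phrase", "fields": ["title"]}},
    #  {"multi_match": {"query": "综合楼",   "type": "phrase", "fields": ["title"]}},
    #  ...]
    # If the _analyze call fails, the whole title is kept as a single phrase clause.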

+ 14 - 0
ybw/utils/tools.py

@@ -1,6 +1,20 @@
 import socket
 import hashlib
 import time
+import re
+
+
+def clean_title(title):
+    if title:
+        rule_list = [
+            r'\(\d{1,20}\)',              # parenthesized digit runs, e.g. "(12345)"
+            r'\[[\u4e00-\u9fa5]{1,9}\]',  # bracketed Chinese tags, e.g. "[河南]"
+            r'【[\u4e00-\u9fa5]{1,9}】',  # full-width bracketed tags, e.g. "【中标公告】"
+        ]
+        for rule in rule_list:
+            title = re.sub(rule, '', title)
+
+    return title
 
 
 def get_host_ip():
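
To round out the tools.py change, what each of the three rules in clean_title strips (the titles below are made up):

    from utils.tools import clean_title

    print(clean_title("(12345)某某医院医疗设备采购"))      # drops parenthesized digits
    print(clean_title("[河南]某某道路改造工程招标公告"))    # drops a bracketed Chinese tag
    print(clean_title("【中标公告】某某大学食堂改造项目"))  # drops a full-width bracketed tag
    # -> 某某医院医疗设备采购 / 某某道路改造工程招标公告 / 某某大学食堂改造项目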