
Clean titles and optimize ES retrieval

lizongze · 1 year ago · commit bc6eaee086
4 files changed, 67 insertions(+), 12 deletions(-)
  1. ybw/list_spider.py (+2 -2)
  2. ybw/utils/databases.py (+7 -10)
  3. ybw/utils/title_participle.py (+44 -0)
  4. ybw/utils/tools.py (+14 -0)

+ 2 - 2
ybw/list_spider.py

@@ -14,7 +14,7 @@ from utils.databases import mongo_table, int2long, es_query, redis_client
 from utils.execptions import CrawlError, YbwCrawlError
 from utils.log import logger
 from utils.socks5 import Proxy
-from utils.tools import sha1
+from utils.tools import sha1, clean_title
 
 CrawlMenu = namedtuple('CrawlMenu', ['channel', 'spidercode', 'table_type'])
 
@@ -115,7 +115,7 @@ class ListSpider:
             if '-' not in publish_time:
                 publish_time = "".join(node.xpath('./td[6]/text()')).strip()
             area = "".join("".join(node.xpath('./td[5]/text()')).split())
-            title = "".join("".join(node.xpath('./td[2]/a/text()')).split())
+            title = clean_title("".join("".join(node.xpath('./td[2]/a/text()')).split()))
             competehref = 'https://www.chinabidding.cn{}'.format("".join(node.xpath('./td[2]/a/@href')))
             item = {
                 "site": "元博网(采购与招标网)",

+ 7 - 10
ybw/utils/databases.py

@@ -2,6 +2,7 @@ import bson
 import pymongo
 import redis
 from elasticsearch import Elasticsearch
+from utils.title_participle import get_should
 
 from config.load import mongo_conf, redis_conf, es_conf
 
@@ -65,20 +66,16 @@ def es_query(title: str, publish_time: int):
     client = es_client()
 stime = publish_time - 432000  # 5 days earlier
     etime = publish_time + 432000
+
+    time_limit = {"range": {'publishtime': {"from": stime, "to": etime}}}
+    should_list = get_should(title)   # tokenize the title to build the should clauses
+    should_list.append(time_limit)
     # query by publish title and publish-time range
     query = {
         "query": {
             "bool": {
-                "must": [
-                    {
-                        "multi_match": {
-                            "query": title,
-                            "type": "phrase",
-                            "fields": ["title","detail"]
-                        }
-                    },
-                    {"range": {'publishtime': {"from": stime, "to": etime}}}
-                ]
+                "should": should_list,
+                "minimum_should_match": "10<90%",
             }
         }
     }
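
The reworked query replaces the single phrase match on the full title with one phrase clause per token plus the time range, all under "should". With "minimum_should_match": "10<90%", every clause is required when there are 10 or fewer, and 90% of them when there are more; note the range clause is itself one of the should clauses, so it counts toward that percentage. A sketch of the body es_query builds, assuming ik_smart splits the title into the four tokens shown (tokens and timestamps are illustrative):

    publish_time = 1696000000  # example epoch seconds
    stime, etime = publish_time - 432000, publish_time + 432000

    query = {
        "query": {
            "bool": {
                "should": [
                    {"multi_match": {"query": "某某学校", "type": "phrase", "fields": ["title"]}},
                    {"multi_match": {"query": "综合楼", "type": "phrase", "fields": ["title"]}},
                    {"multi_match": {"query": "施工", "type": "phrase", "fields": ["title"]}},
                    {"multi_match": {"query": "招标公告", "type": "phrase", "fields": ["title"]}},
                    {"range": {"publishtime": {"from": stime, "to": etime}}},
                ],
                "minimum_should_match": "10<90%",
            }
        }
    }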

+ 44 - 0
ybw/utils/title_participle.py

@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2023-10-10 
+---------
+@summary: tokenize the title and build the ES query clauses
+---------
+@author: Lzz
+"""
+import requests
+import json
+
+
+def get_should(title):
+
+    url = "http://172.17.145.178:9200/_analyze"
+
+    headers = {"Content-Type": "application/json"}
+
+    data = {
+        "analyzer": "ik_smart",
+        "text": title
+    }
+
+    res = requests.post(url, headers=headers, json=data, timeout=10)
+
+    try:
+        res_text = json.loads(res.text).get('tokens') or [{"token": title}]
+    except Exception:
+        res_text = [{"token": title}]  # analyzer unreachable: fall back to the whole title
+
+    should_list = []
+    for key in res_text:
+        single_dict = {
+            "multi_match": {
+                "query": f"{key.get('token')}",
+                "type": "phrase",
+                "fields": [
+                    "title"
+                ]
+            }
+        }
+        should_list.append(single_dict)
+
+    return should_list
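
A usage sketch for get_should; it needs network access to the internal ES node at 172.17.145.178, and the token split shown is only what ik_smart typically produces for a title like this:

    from utils.title_participle import get_should

    should_list = get_should("某某学校综合楼施工招标公告")
    # Roughly:
    # [{"multi_match": {"query": "某某学校", "type": "phrase", "fields": ["title"]}},
    #  {"multi_match": {"query": "综合楼",   "type": "phrase", "fields": ["title"]}},
    #  ...]
    # If the _analyze call fails, the whole title is kept as a single phrase clause.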

+ 14 - 0
ybw/utils/tools.py

@@ -1,6 +1,20 @@
 import socket
 import hashlib
 import time
+import re
+
+
+def clean_title(title):
+    if title:
+        rule_list = [
+            r'\(\d{1,20}\)',              # parenthesized digit runs, e.g. "(12345)"
+            r'\[[\u4e00-\u9fa5]{1,9}\]',  # bracketed Chinese tags, e.g. "[河南]"
+            r'【[\u4e00-\u9fa5]{1,9}】',  # full-width bracketed tags, e.g. "【中标公告】"
+        ]
+        for rule in rule_list:
+            title = re.sub(rule, '', title)
+
+    return title
 
 
 def get_host_ip():
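
To round out the tools.py change, what each of the three rules in clean_title strips (the titles below are made up):

    from utils.tools import clean_title

    print(clean_title("(12345)某某医院医疗设备采购"))      # drops parenthesized digits
    print(clean_title("[河南]某某道路改造工程招标公告"))    # drops a bracketed Chinese tag
    print(clean_title("【中标公告】某某大学食堂改造项目"))  # drops a full-width bracketed tag
    # -> 某某医院医疗设备采购 / 某某道路改造工程招标公告 / 某某大学食堂改造项目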