浏览代码

竞品更新es 配置

lizongze 1 年之前
父节点
当前提交
66c07f283b

+ 4 - 3
ybw/config/conf.yaml

@@ -15,10 +15,11 @@ redis:
 
 
 es:
-  host: 172.17.145.178
+  host: 172.17.4.184
 #  host: 127.0.0.1
-#  host: 192.168.3.206
-  port: !!int 9200
+  usename: "jybid"  # NOTE(review): key is misspelled ('usename') but cfg['usename'] in ybw/utils/databases.py reads it as-is — rename both together or not at all
+  pwd: "Top2023_JEB01i@31"  # SECURITY(review): plaintext credential committed to VCS — move to env/secrets store and rotate
+  port: !!int 19905
   db: biddingall # es库别名
 
 

+ 2 - 4
ybw/crawler/account.py

@@ -33,8 +33,7 @@ def read_account():
 
 
 def get_account(site, crawl_type):
-    # url = "http://cc.spdata.jianyu360.com/competing_goods/account/fetch"
-    url = "http://172.17.4.232:1405/competing_goods/account/fetch"
+    url = "http://cc.spdata.jianyu360.com/competing_goods/account/fetch"
     params = {
         "site": site,
         "crawl_type": crawl_type
@@ -54,8 +53,7 @@ def get_account(site, crawl_type):
 
 
 def release_account(uid, crawl_type, disable_log=False):
-    # url = "http://cc.spdata.jianyu360.com/competing_goods/account/release"
-    url = 'http://172.17.4.232:1405/competing_goods/account/release'
+    url = "http://cc.spdata.jianyu360.com/competing_goods/account/release"
     if uid is not None:
         params = {
             "uid": uid,

+ 6 - 5
ybw/detail_spider.py

@@ -138,11 +138,12 @@ class DetailSpider:
                     if code == 200:
                         retries += 1
                     else:
-                        if proxy is None:
-                            proxy = Proxy(True)
-                        else:
-                            proxy.switch()
-                        proxies = proxy.proxies
+                        # if proxy is None:
+                        #     proxy = Proxy(True)
+                        # else:
+                        #     proxy.switch()
+                        # proxies = proxy.proxies
+                        time.sleep(1800)
                         retries += 1
                     continue
                 element = fromstring(r.text)

+ 6 - 5
ybw/list_spider.py

@@ -81,11 +81,12 @@ class ListSpider:
                         self.session, code = login(*self.user, proxies=proxies)
                         if code != 200:
                             '''1小时内登录频繁会限制ip,此时添加代理登录账号'''
-                            if proxy is None:
-                                proxy = Proxy(True)
-                            else:
-                                proxy.switch()
-                            proxies = proxy.proxies
+                            # if proxy is None:
+                            #     proxy = Proxy(True)
+                            # else:
+                            #     proxy.switch()
+                            # proxies = proxy.proxies
+                            time.sleep(1800)
                             retries += 1
                     login_cookies = load_login_cookies(self.user.phone)
                     request_params.update({'cookies': login_cookies})

+ 1 - 1
ybw/utils/databases.py

@@ -52,7 +52,7 @@ def object_id(_id: str):
 def es_client(cfg=None):
     if cfg is None:
         cfg = es_conf
-    return Elasticsearch([{"host": cfg['host'], "port": cfg['port']}])
+    return Elasticsearch([{"host": cfg['host'], "port": cfg['port']}], http_auth=(cfg['usename'], cfg['pwd']))
 
 
 def es_query(title: str, publish_time: int):

+ 8 - 5
ybw/utils/title_participle.py

@@ -6,23 +6,25 @@ Created on 2023-10-10
 ---------
 @author: Lzz
 """
+from requests.auth import HTTPBasicAuth
 import requests
 import json
 
 
 def get_should(title):
 
-    url = "http://172.17.145.178:9200/_analyze"  # 线上
-    # url = "http://192.168.3.241:9200/_analyze"    # 本地
+    url = "http://172.17.4.184:19905/_analyze"  # 线上
+    username = "jybid"
+    password = "Top2023_JEB01i@31"  # SECURITY(review): hardcoded credential in source — load from config/env and rotate
 
     headers = {"Content-Type": "application/json"}
-
+    auth = HTTPBasicAuth(username, password)
     data = {
         "analyzer": "ik_smart",
         "text": title
     }
 
-    res = requests.post(url, headers=headers, json=data, timeout=10)
+    res = requests.post(url, headers=headers, auth=auth, json=data, timeout=10)
 
     try:
         res_text = json.loads(res.text).get('tokens') or [{"token":title}]
@@ -42,4 +44,5 @@ def get_should(title):
         }
         should_list.append(single_dict)
 
-    return should_list
+    return should_list
+

+ 4 - 3
zbytb/config/conf.yaml

@@ -24,10 +24,11 @@ ali_oss:
 
 
 es:
-  host: 172.17.145.178
+  host: 172.17.4.184
 #  host: 127.0.0.1
-#  host: 192.168.3.206
-  port: !!int 9200
+  usename: "jybid"  # NOTE(review): key is misspelled ('usename') but code reads cfg['usename'] verbatim — rename both together or not at all
+  pwd: "Top2023_JEB01i@31"  # SECURITY(review): plaintext credential committed to VCS — move to env/secrets store and rotate
+  port: !!int 19905
   db: biddingall # es库别名
 
 

+ 2 - 4
zbytb/crawler/account.py

@@ -33,8 +33,7 @@ def read_account():
 
 
 def get_account(site, crawl_type):
-    url = "http://172.17.4.232:1405/competing_goods/account/fetch"
-    # url = "http://cc.spdata.jianyu360.com/competing_goods/account/fetch"
+    url = "http://cc.spdata.jianyu360.com/competing_goods/account/fetch"
     params = {
         "site": site,
         "crawl_type": crawl_type
@@ -55,8 +54,7 @@ def get_account(site, crawl_type):
 
 def release_account(uid, crawl_type, disable_log=False):
 
-    # url = "http://cc.spdata.jianyu360.com/competing_goods/account/release"
-    url = 'http://172.17.4.232:1405/competing_goods/account/release'
+    url = "http://cc.spdata.jianyu360.com/competing_goods/account/release"
     if uid is not None:
         params = {
             "uid": uid,

+ 7 - 11
zbytb/utils/databases.py

@@ -2,7 +2,7 @@ import bson
 import pymongo
 import redis
 from elasticsearch import Elasticsearch
-
+from utils.title_participle import get_should
 from config.load import mongo_conf, redis_conf, es_conf
 
 # ---------------------------------- mongo ----------------------------------
@@ -65,20 +65,16 @@ def es_query(title: str, publish_time: int):
     client = es_client()
     stime = publish_time - 432000  # 往前推5天
     etime = publish_time + 432000
+
+    time_limit = {"range": {'publishtime': {"from": stime, "to": etime}}}
+    should_list = get_should(title)   # 对标题进行分词组合query语句
     # 通过发布标题和发布时间范围查询
     query = {
         "query": {
             "bool": {
-                "must": [
-                    {
-                        "multi_match": {
-                            "query": title,
-                            "type": "phrase",
-                            "fields": ["title"]
-                        }
-                    },
-                    {"range": {'publishtime': {"from": stime, "to": etime}}}
-                ]
+                "must": [time_limit],
+                "should": should_list,
+                "minimum_should_match": "10<90%",
             }
         }
     }

+ 48 - 0
zbytb/utils/title_participle.py

@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2023-10-10 
+---------
+@summary: 标题分词,组合es查询语句
+---------
+@author: Lzz
+"""
+from requests.auth import HTTPBasicAuth
+import requests
+import json
+
+
+def get_should(title):
+
+    url = "http://172.17.4.184:19905/_analyze"  # 线上
+    username = "jybid"
+    password = "Top2023_JEB01i@31"  # SECURITY(review): hardcoded credential in source — load from config/env and rotate
+
+    headers = {"Content-Type": "application/json"}
+    auth = HTTPBasicAuth(username, password)
+    data = {
+        "analyzer": "ik_smart",
+        "text": title
+    }
+
+    res = requests.post(url, headers=headers, auth=auth, json=data, timeout=10)
+
+    try:
+        res_text = json.loads(res.text).get('tokens') or [{"token":title}]
+    except:
+        res_text = [{"token":title}]
+
+    should_list = []
+    for key in res_text:
+        single_dict = {
+            "multi_match": {
+                "query": f"{key.get('token')}",
+                "type": "phrase",
+                "fields": [
+                    "title"
+                ]
+            }
+        }
+        should_list.append(single_dict)
+
+    return should_list
+