浏览代码

竞品更新es 配置

lizongze 1 年之前
父节点
当前提交
66c07f283b

+ 4 - 3
ybw/config/conf.yaml

@@ -15,10 +15,11 @@ redis:
 
 
 es:
-  host: 172.17.145.178
+  host: 172.17.4.184
 #  host: 127.0.0.1
-#  host: 192.168.3.206
-  port: !!int 9200
+  usename: "jybid"  # NOTE(review): key is misspelled ('usename') but cfg['usename'] in ybw/utils/databases.py reads it as-is — rename both together or not at all
+  pwd: "Top2023_JEB01i@31"  # SECURITY(review): plaintext credential committed to VCS — move to env/secrets store and rotate
+  port: !!int 19905
   db: biddingall # es库别名
 
 

+ 2 - 4
ybw/crawler/account.py

@@ -33,8 +33,7 @@ def read_account():
 
 
 def get_account(site, crawl_type):
-    # url = "http://cc.spdata.jianyu360.com/competing_goods/account/fetch"
-    url = "http://172.17.4.232:1405/competing_goods/account/fetch"
+    url = "http://cc.spdata.jianyu360.com/competing_goods/account/fetch"
     params = {
         "site": site,
         "crawl_type": crawl_type
@@ -54,8 +53,7 @@ def get_account(site, crawl_type):
 
 
 def release_account(uid, crawl_type, disable_log=False):
-    # url = "http://cc.spdata.jianyu360.com/competing_goods/account/release"
-    url = 'http://172.17.4.232:1405/competing_goods/account/release'
+    url = "http://cc.spdata.jianyu360.com/competing_goods/account/release"
     if uid is not None:
         params = {
             "uid": uid,

+ 6 - 5
ybw/detail_spider.py

@@ -138,11 +138,12 @@ class DetailSpider:
                     if code == 200:
                         retries += 1
                     else:
-                        if proxy is None:
-                            proxy = Proxy(True)
-                        else:
-                            proxy.switch()
-                        proxies = proxy.proxies
+                        # if proxy is None:
+                        #     proxy = Proxy(True)
+                        # else:
+                        #     proxy.switch()
+                        # proxies = proxy.proxies
+                        time.sleep(1800)
                         retries += 1
                     continue
                 element = fromstring(r.text)

+ 6 - 5
ybw/list_spider.py

@@ -81,11 +81,12 @@ class ListSpider:
                         self.session, code = login(*self.user, proxies=proxies)
                         if code != 200:
                             '''1小时内登录频繁会限制ip,此时添加代理登录账号'''
-                            if proxy is None:
-                                proxy = Proxy(True)
-                            else:
-                                proxy.switch()
-                            proxies = proxy.proxies
+                            # if proxy is None:
+                            #     proxy = Proxy(True)
+                            # else:
+                            #     proxy.switch()
+                            # proxies = proxy.proxies
+                            time.sleep(1800)
                             retries += 1
                     login_cookies = load_login_cookies(self.user.phone)
                     request_params.update({'cookies': login_cookies})

+ 1 - 1
ybw/utils/databases.py

@@ -52,7 +52,7 @@ def object_id(_id: str):
 def es_client(cfg=None):
     if cfg is None:
         cfg = es_conf
-    return Elasticsearch([{"host": cfg['host'], "port": cfg['port']}])
+    return Elasticsearch([{"host": cfg['host'], "port": cfg['port']}], http_auth=(cfg['usename'], cfg['pwd']))
 
 
 def es_query(title: str, publish_time: int):

+ 8 - 5
ybw/utils/title_participle.py

@@ -6,23 +6,25 @@ Created on 2023-10-10
 ---------
 @author: Lzz
 """
+from requests.auth import HTTPBasicAuth
 import requests
 import json
 
 
 def get_should(title):
 
-    url = "http://172.17.145.178:9200/_analyze"  # 线上
-    # url = "http://192.168.3.241:9200/_analyze"    # 本地
+    url = "http://172.17.4.184:19905/_analyze"  # 线上
+    username = "jybid"
+    password = "Top2023_JEB01i@31"  # SECURITY(review): hardcoded credential in source — load from config/env and rotate
 
     headers = {"Content-Type": "application/json"}
-
+    auth = HTTPBasicAuth(username, password)
     data = {
         "analyzer": "ik_smart",
         "text": title
     }
 
-    res = requests.post(url, headers=headers, json=data, timeout=10)
+    res = requests.post(url, headers=headers, auth=auth, json=data, timeout=10)
 
     try:
         res_text = json.loads(res.text).get('tokens') or [{"token":title}]
@@ -42,4 +44,5 @@ def get_should(title):
         }
         should_list.append(single_dict)
 
-    return should_list
+    return should_list
+

+ 4 - 3
zbytb/config/conf.yaml

@@ -24,10 +24,11 @@ ali_oss:
 
 
 es:
-  host: 172.17.145.178
+  host: 172.17.4.184
 #  host: 127.0.0.1
-#  host: 192.168.3.206
-  port: !!int 9200
+  usename: "jybid"  # NOTE(review): key is misspelled ('usename') but code reads cfg['usename'] verbatim — rename both together or not at all
+  pwd: "Top2023_JEB01i@31"  # SECURITY(review): plaintext credential committed to VCS — move to env/secrets store and rotate
+  port: !!int 19905
   db: biddingall # es库别名
 
 

+ 2 - 4
zbytb/crawler/account.py

@@ -33,8 +33,7 @@ def read_account():
 
 
 def get_account(site, crawl_type):
-    url = "http://172.17.4.232:1405/competing_goods/account/fetch"
-    # url = "http://cc.spdata.jianyu360.com/competing_goods/account/fetch"
+    url = "http://cc.spdata.jianyu360.com/competing_goods/account/fetch"
     params = {
         "site": site,
         "crawl_type": crawl_type
@@ -55,8 +54,7 @@ def get_account(site, crawl_type):
 
 def release_account(uid, crawl_type, disable_log=False):
 
-    # url = "http://cc.spdata.jianyu360.com/competing_goods/account/release"
-    url = 'http://172.17.4.232:1405/competing_goods/account/release'
+    url = "http://cc.spdata.jianyu360.com/competing_goods/account/release"
     if uid is not None:
         params = {
             "uid": uid,

+ 7 - 11
zbytb/utils/databases.py

@@ -2,7 +2,7 @@ import bson
 import pymongo
 import redis
 from elasticsearch import Elasticsearch
-
+from utils.title_participle import get_should
 from config.load import mongo_conf, redis_conf, es_conf
 
 # ---------------------------------- mongo ----------------------------------
@@ -65,20 +65,16 @@ def es_query(title: str, publish_time: int):
     client = es_client()
     stime = publish_time - 432000  # 往前推5天
     etime = publish_time + 432000
+
+    time_limit = {"range": {'publishtime': {"from": stime, "to": etime}}}
+    should_list = get_should(title)   # 对标题进行分词组合query语句
     # 通过发布标题和发布时间范围查询
     query = {
         "query": {
             "bool": {
-                "must": [
-                    {
-                        "multi_match": {
-                            "query": title,
-                            "type": "phrase",
-                            "fields": ["title"]
-                        }
-                    },
-                    {"range": {'publishtime': {"from": stime, "to": etime}}}
-                ]
+                "must": [time_limit],
+                "should": should_list,
+                "minimum_should_match": "10<90%",
             }
         }
     }

+ 48 - 0
zbytb/utils/title_participle.py

@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2023-10-10 
+---------
+@summary: 标题分词,组合es查询语句
+---------
+@author: Lzz
+"""
+from requests.auth import HTTPBasicAuth
+import requests
+import json
+
+
+def get_should(title):
+
+    url = "http://172.17.4.184:19905/_analyze"  # 线上
+    username = "jybid"
+    password = "Top2023_JEB01i@31"  # SECURITY(review): hardcoded credential in source — load from config/env and rotate
+
+    headers = {"Content-Type": "application/json"}
+    auth = HTTPBasicAuth(username, password)
+    data = {
+        "analyzer": "ik_smart",
+        "text": title
+    }
+
+    res = requests.post(url, headers=headers, auth=auth, json=data, timeout=10)
+
+    try:
+        res_text = json.loads(res.text).get('tokens') or [{"token":title}]
+    except:
+        res_text = [{"token":title}]
+
+    should_list = []
+    for key in res_text:
+        single_dict = {
+            "multi_match": {
+                "query": f"{key.get('token')}",
+                "type": "phrase",
+                "fields": [
+                    "title"
+                ]
+            }
+        }
+        should_list.append(single_dict)
+
+    return should_list
+