
Extract the database configuration and fold it into the global configuration

dongzhaorui 1 year ago
parent commit c95f3f51c4

+ 0 - 25
ybw/config/conf.yaml

@@ -1,25 +0,0 @@
-mongo:
-  host: 172.17.4.87
-  port: !!int 27080
-
-
-redis:
-  host: 172.17.162.28
-  port: !!int 7361
-  pwd: "k5ZJR5KV4q7DRZ92DQ"
-  db: !!int 1
-
-
-es:
-  host: 172.17.4.184
-  usename: "jybid"
-  pwd: "Top2023_JEB01i@31"
-  port: !!int 19905
-  db: biddingall # ES index alias
-
-
-proxy:
-  socks5:
-    url: http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch
-    auth:
-      Authorization: Basic amlhbnl1MDAxOjEyM3F3ZSFB

+ 0 - 5
ybw/config/constants.yaml

@@ -1,8 +1,3 @@
-headers:
-  User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36
-  Accept: '*/*'
-
-
 crawler_url:
   home_page: https://www.chinabidding.cn/search/searchgj/zbcg?areaid={}&keywords=&time_start={}&time_end={}&search_type=CONTEXT&categoryid=&rp={}&table_type={}&b_date=custom
   list_url: https://www.chinabidding.cn/search/searchgj/zbcg?areaid={}&keywords=&time_start={}&time_end={}&page={}&search_type=TITLE&categoryid=&rp={}&table_type={}&b_date=custom

+ 1 - 21
ybw/config/load.py

@@ -2,35 +2,15 @@ from pathlib import Path
 
 import yaml
 
-__all__ = [
-    'mongo_conf',
-    'redis_conf',
-    'es_conf',
-    'constants',
-    'headers',
-    'jy_proxy',
-    'crawler_url',
-    'region',
-    'node_module_path'
-]
+__all__ = ['constants', 'crawler_url', 'region', 'node_module_path']
 
 _base_path = Path(__file__).parent
-_yaml_conf = (_base_path / 'conf.yaml').resolve()
 _yaml_constants = (_base_path / 'constants.yaml').resolve()
 _yaml_areas = (_base_path / 'areas.yaml').resolve()
 _node_modules = (_base_path.parent / 'node_modules').resolve()
 
-with open(_yaml_conf, encoding="utf-8") as f:
-    conf = yaml.safe_load(f)
-    mongo_conf = conf['mongo']
-    redis_conf = conf['redis']
-    es_conf: dict = conf['es']
-    jy_proxy: dict = conf['proxy']
-
-
 with open(_yaml_constants, encoding="utf-8") as fp:
     constants = yaml.safe_load(fp)
-    headers: dict = constants['headers']
     crawler_url: dict = constants['crawler_url']
 
 with open(_yaml_areas, encoding="utf-8") as fr:
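With the database block gone, config.load only exposes the YAML-driven crawler constants. A minimal usage sketch of what remains (the format arguments are illustrative placeholders for the five template slots in constants.yaml: areaid, time_start, time_end, rp, table_type):

    from config.load import crawler_url, region, node_module_path

    # illustrative values; the real crawler supplies its own
    url = crawler_url['home_page'].format(18, '2024-01-01', '2024-01-31', 30, 1)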

+ 25 - 32
ybw/crawler/account.py

@@ -1,31 +1,26 @@
 import json
-from pathlib import Path
 
 import requests
+from bson.objectid import ObjectId
 
+import setting
+from utils.databases import mongo_table
 from utils.log import logger
 from utils.tools import wait
-from utils.databases import mongo_table
-from bson.objectid import ObjectId
-
-ROOT_PATH = Path(__file__).parent.parent
 
-_headers = {"Authorization": "Basic amlhbnl1MDAxOjEyM3F3ZSFB"}
-JSON_ACCOUNT_RECORD = (ROOT_PATH / 'config/account_record.json').resolve()
+HEADERS = {"Authorization": setting.JY_AUTH_TOKEN}
+ACCOUNT_DATA = (setting.ROOT_PATH / 'config/account_record.json').resolve()
 
 
 def account_record(uid, crawl_type):
-    with open(JSON_ACCOUNT_RECORD, 'w+', encoding='utf-8') as wp:
-        item = {
-            "uid": uid,
-            "crawl_type": crawl_type
-        }
+    with open(ACCOUNT_DATA, 'w+', encoding='utf-8') as wp:
+        item = {"uid": uid, "crawl_type": crawl_type}
         wp.write(json.dumps(item, indent=4))
 
 
 def read_account():
     try:
-        with open(JSON_ACCOUNT_RECORD, encoding='utf-8') as rp:
+        with open(ACCOUNT_DATA, encoding='utf-8') as rp:
             cookies: dict = json.load(rp)
             return cookies
     except (json.decoder.JSONDecodeError, FileNotFoundError):
@@ -33,16 +28,14 @@ def read_account():
 
 
 def get_account(site, crawl_type):
-    url = "http://cc.spdata.jianyu360.com/competing_goods/account/fetch"
-    params = {
-        "site": site,
-        "crawl_type": crawl_type
-    }
+    params = {"site": site, "crawl_type": crawl_type}
     try:
-        response = requests.get(url,
-                                headers=_headers,
-                                params=params,
-                                timeout=60)
+        response = requests.get(
+            setting.ACCOUNT_FETCH_API,
+            headers=HEADERS,
+            params=params,
+            timeout=60
+        )
         data = response.json()['data']
         logger.info("当前账号状态:{}".format(data['crawl_detail']))
     except requests.RequestException:
@@ -53,20 +46,20 @@ def get_account(site, crawl_type):
 
 
 def release_account(uid, crawl_type, disable_log=False):
-    url = "http://cc.spdata.jianyu360.com/competing_goods/account/release"
     if uid is not None:
-        params = {
-            "uid": uid,
-            "crawl_type": crawl_type
-        }
+        params = {"uid": uid, "crawl_type": crawl_type}
         while True:
             try:
-                response = requests.get(url,
-                                        headers=_headers,
-                                        params=params,
-                                        timeout=60)
+                response = requests.get(
+                    setting.ACCOUNT_RELEASE_API,
+                    headers=HEADERS,
+                    params=params,
+                    timeout=60
+                )
                 if response.status_code == 200:
-                    acc_status = mongo_table('py_spider', 'match_account').find_one({'_id': ObjectId(uid)})['crawl_detail']
+                    collection = mongo_table('py_spider', 'match_account')
+                    ret = collection.find_one({'_id': ObjectId(uid)})
+                    acc_status = ret['crawl_detail']
                     if not disable_log:
                         logger.info(f"release_account >>> {response.json()}, status : {acc_status}")
                     break
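Taken together, a hedged sketch of the account lifecycle these helpers implement. It assumes get_account returns the data dict it logs, and the "uid" key name is an assumption based on release_account's parameter:

    from crawler.account import account_record, get_account, release_account

    account = get_account("元博网", crawl_type="list")  # site/crawl_type values are illustrative
    if account is not None:
        account_record(account["uid"], crawl_type="list")  # "uid" key is assumed
        try:
            pass  # crawl with this account
        finally:
            release_account(account["uid"], crawl_type="list")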

+ 5 - 7
ybw/crawler/login.py

@@ -4,22 +4,20 @@ import threading
 import time
 import uuid
 from collections import namedtuple
-from pathlib import Path
 
 import execjs
 import requests
 from requests import Session
 from requests.utils import dict_from_cookiejar
 
+import setting
 from config.load import node_module_path
 from utils.execptions import CrawlError
 from utils.log import logger
 
-LOCK = threading.RLock()
-
-ROOT_PATH = Path(__file__).parent.parent
-JSON_LOGIN_COOKIE = (ROOT_PATH / 'config/login_cookie.json').resolve()
+RLOCK = threading.RLock()
 
+JSON_LOGIN_COOKIE = (setting.ROOT_PATH / 'config/login_cookie.json').resolve()
 User = namedtuple('User', ['phone', 'passwd'])
 
 
@@ -42,7 +40,7 @@ def load_login_cookies(user_name: str):
 
 
 def save_login_cookies(user_name: str, login_cookie: dict):
-    with LOCK:
+    with RLOCK:
         # read the file if it exists, create it otherwise
         fp = _open_file()
         # load existing content into memory, else start from an empty dict
@@ -73,7 +71,7 @@ def update_login_cookies(user_name: str, update_val: dict):
         update_val: the cookie content to update
 
     """
-    with LOCK:
+    with RLOCK:
         fp = open(JSON_LOGIN_COOKIE, encoding='utf-8')
         user_maps: dict = json.load(fp)
         login_cookies: dict = user_maps.get(user_name)
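The rename from LOCK to RLOCK is cosmetic, but the type matters: threading.RLock is re-entrant, so a thread that already holds the lock can acquire it again without deadlocking, which a plain threading.Lock would not allow. A minimal illustration:

    import threading

    RLOCK = threading.RLock()

    def save():
        with RLOCK:
            update()  # safe: the same thread re-acquires the RLock

    def update():
        with RLOCK:  # a non-reentrant Lock would deadlock here
            pass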

+ 32 - 0
ybw/setting.py

@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-02-28 
+---------
+@summary: Global configuration
+---------
+@author: Dzr
+"""
+import pathlib
+
+ROOT_PATH = pathlib.Path(__file__).absolute().parent
+
+MONGO_IP = "172.17.4.87"
+MONGO_PORT = 27080
+MONGO_DB = "py_spider"
+
+REDIS_URL = "redis://:k5ZJR5KV4q7DRZ92DQ@172.17.4.240:8361/0"
+# REDIS_URL = "redis://default:top@123@192.168.3.165:8165/5"
+
+ES_IP = "172.17.4.184"
+ES_PORT = 19905
+ES_USERNAME = "jybid"
+ES_PASSWORD = "Top2023_JEB01i@31"
+ES_INDEX = "biddingall"  # ES index alias
+WORD_SEGMENTATION_API = "http://172.17.4.184:19905/_analyze"  # word-segmentation (analyzer) endpoint
+# WORD_SEGMENTATION_API = "http://192.168.3.149:9201/_analyze"
+# WORD_SEGMENTATION_API = "http://192.168.3.182:1990/_analyze"
+
+JY_AUTH_TOKEN = "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
+PROXY_API = "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch"
+ACCOUNT_FETCH_API = "http://cc.spdata.jianyu360.com/competing_goods/account/fetch"
+ACCOUNT_RELEASE_API = "http://cc.spdata.jianyu360.com/competing_goods/account/release"
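With setting.py at the package root, every module reads its connection details from one place instead of parsing conf.yaml. A sketch of typical consumption, mirroring the imports this commit adds elsewhere:

    import setting

    mongo_uri = f"mongodb://{setting.MONGO_IP}:{setting.MONGO_PORT}"
    cookie_file = setting.ROOT_PATH / 'config' / 'login_cookie.json'
    headers = {"Authorization": setting.JY_AUTH_TOKEN}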

+ 28 - 45
ybw/utils/RedisDB.py

@@ -10,14 +10,15 @@ import hashlib
 
 import redis
 
+import setting
 
-class RedisFilter:
-    redis_db = None
-
-    def __init__(self, redis_url=None, expire_time=None):
 
-        self.__class__.redis_db = redis.StrictRedis.from_url(redis_url)  # standalone instance
+class RedisFilter:
 
+    def __init__(self, url=None, expire_time=None):
+        if not url:
+            url = setting.REDIS_URL
+        self.redis_db = redis.StrictRedis.from_url(url)
         self._ex = expire_time or 86400 * 365 * 1  # one year = 86400 * 365 * 1
 
     def __repr__(self):
@@ -29,19 +30,21 @@ class RedisFilter:
             return True
         return False
 
-    def add(self, keys, *args, **kwargs):
+    def add(self, keys):
         """
-        Add data.  To delete: redis_db.delete("pylist_" + key)
+        Add data
+
         @param keys: keyword(s) to check against redis; lists are handled in bulk
-        @return: list / single value (False if the data already exists, True otherwise, i.e. whether the add succeeded)
+        @return: list / single value (False if the add failed, True if it succeeded)
         """
         is_list = isinstance(keys, list)
         keys = keys if is_list else [keys]
 
         is_added = []
         for key in keys:
-            if not self.exists(key):
-                is_added.append(self.redis_db.set(key, 1, ex=self._ex))
+            pkey = "pylist_" + self.fingerprint(key)
+            if not self.exists(pkey):
+                is_added.append(self.redis_db.set(pkey, 1, ex=self._ex))
             else:
                 is_added.append(False)
 
@@ -58,7 +61,8 @@ class RedisFilter:
 
         is_exist = []
         for key in keys:
-            is_exist.append(self.exists(key))
+            pkey = "pylist_" + self.fingerprint(key)
+            is_exist.append(self.exists(pkey))
 
         # check whether the input data itself contains duplicates
         temp_set = set()
@@ -70,37 +74,16 @@ class RedisFilter:
 
         return is_exist if is_list else is_exist[0]
 
-
-def get_sha256(*args):
-    """
-    @summary: get a unique 64-character value, used as a unique id
-    ---------
-    @param *args: the values combined for deduplication
-    ---------
-    @result: 5580c91ea29bf5bd963f4c08dfcacd983566e44ecea1735102bc380576fd6f30
-    """
-
-    sha256 = hashlib.sha256()
-    for arg in args:
-        sha256.update(str(arg).encode())
-    return sha256.hexdigest()  # 64 characters
-
-
-def rexists(dedup, data):
-    data = [data] if not isinstance(data, list) else data
-    args = sorted(data)
-    pykey = "pylist_" + get_sha256(*args)
-    if dedup.get(pykey):
-        ''' exists '''
-        return True
-    else:
-        ''' does not exist '''
-        return False
-
-
-def radd(dedup, data):
-    data = [data] if not isinstance(data, list) else data
-    args = sorted(data)
-    pykey = "pylist_" + get_sha256(*args)
-    state = dedup.add(pykey)
-    return state
+    def fingerprint(self, *args):
+        """
+        @summary: get a unique 64-character value, the data fingerprint
+        ---------
+        @param args: the set of values to deduplicate on
+        ---------
+        @result: 5580c91ea29bf5bd963f4c08dfcacd983566e44ecea1735102bc380576fd6f30
+        """
+        args = sorted(args)
+        sha256 = hashlib.sha256()
+        for arg in args:
+            sha256.update(str(arg).encode())
+        return sha256.hexdigest()
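A hedged usage sketch of the reworked filter. The connection falls back to setting.REDIS_URL when no URL is passed; the lookup method name get is inferred from the removed rexists helper, which called dedup.get:

    from utils.RedisDB import RedisFilter

    dedup = RedisFilter(expire_time=86400 * 30)  # keys expire after 30 days

    key = "https://www.chinabidding.cn/detail/12345.html"  # illustrative dedup key
    if not dedup.get(key):  # not seen before
        dedup.add(key)      # stored as "pylist_" + sha256 fingerprint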

+ 19 - 24
ybw/utils/databases.py

@@ -1,10 +1,9 @@
 import bson
 import pymongo
-import redis
 from elasticsearch import Elasticsearch
-from utils.title_participle import get_should
 
-from config.load import mongo_conf, redis_conf, es_conf
+import setting
+from utils.title_participle import get_should
 
 # ---------------------------------- mongo ----------------------------------
 MONGO_URI_CLIENTS = {}    # holds all clients, keyed by connection URI
@@ -14,11 +13,12 @@ def mongo_client(cfg=None, host=None, port=None, fork=False, **kwargs):
     if host is not None and port is not None:
         uri = f'mongodb://{host}:{port}'
     else:
-        _cfg = (cfg or mongo_conf)
+        _cfg = (cfg or {'host': setting.MONGO_IP, 'port': setting.MONGO_PORT})
         uri = f'mongodb://{_cfg["host"]}:{_cfg["port"]}'
 
     if fork:
         return pymongo.MongoClient(uri, **kwargs)
+
     global MONGO_URI_CLIENTS
     matched_client = MONGO_URI_CLIENTS.get(uri)
     if matched_client is None:
@@ -29,13 +29,13 @@ def mongo_client(cfg=None, host=None, port=None, fork=False, **kwargs):
     return matched_client
 
 
-def mongo_database(name: str, **kw):
-    client = mongo_client(**kw)
+def mongo_database(name: str, **kwargs):
+    client = mongo_client(**kwargs)
     return client.get_database(name)
 
 
-def mongo_table(db: str, name: str, **kw):
-    database = mongo_database(db, **kw)
+def mongo_table(db: str, name: str, **kwargs):
+    database = mongo_database(db, **kwargs)
     return database.get_collection(name)
 
 
@@ -51,8 +51,16 @@ def object_id(_id: str):
 # ---------------------------------- es ----------------------------------
 def es_client(cfg=None):
     if cfg is None:
-        cfg = es_conf
-    return Elasticsearch([{"host": cfg['host'], "port": cfg['port']}],http_auth=(cfg['usename'], cfg['pwd']))
+        cfg = {
+            'host': setting.ES_IP,
+            'username': setting.ES_USERNAME,
+            'pwd': setting.ES_PASSWORD,
+            'port': setting.ES_PORT,
+        }
+
+    hosts = [{'host': cfg['host'], 'port': cfg['port']}]
+    auth = (cfg['username'], cfg['pwd'])
+    return Elasticsearch(hosts, http_auth=auth)
 
 
 def es_query(title: str, publish_time: int):
@@ -79,19 +87,6 @@ def es_query(title: str, publish_time: int):
             }
         }
     }
-    result = client.search(index=es_conf['db'], body=query, request_timeout=100)
+    result = client.search(index=setting.ES_INDEX, body=query, request_timeout=100)
     total = int(result['hits']['total']['value'])
     return total
-
-
-# ---------------------------------- redis ----------------------------------
-def redis_client(cfg=None):
-    if cfg is None:
-        cfg = redis_conf
-    pool = redis.ConnectionPool(
-        host=cfg['host'],
-        port=cfg['port'],
-        password=cfg['pwd'],
-        db=cfg['db']
-    )
-    return redis.Redis(connection_pool=pool, decode_responses=True)
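The redis_client helper is gone; utils.RedisDB.RedisFilter now builds its own connection from setting.REDIS_URL. A brief sketch of the surviving call paths, with illustrative arguments:

    import setting
    from utils.databases import mongo_table, es_query

    coll = mongo_table(setting.MONGO_DB, 'match_account')  # clients are cached per URI
    total = es_query('市政工程招标公告', publish_time=1709078400)  # count of similar titles in ES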

+ 17 - 17
ybw/utils/socks5.py

@@ -8,29 +8,29 @@ Created on 2024-02-27
 """
 import requests
 
-from config.load import jy_proxy
+import setting
+from utils.log import logger
 
 
 def get_proxy(scheme=None, default=None, socks5h=False):
-    headers = jy_proxy['socks5']['auth']
-    url = jy_proxy['socks5']['url']
+    headers = {'Authorization': setting.JY_AUTH_TOKEN}
     try:
-        proxy_res = requests.get(url, headers=headers).json()
-        proxies = proxy_res.get('data')
-        if proxy_res and proxies:
+        resp = requests.get(setting.PROXY_API, headers=headers, timeout=5).json()
+        proxies = resp.get("data")
+        if resp and proxies:
             if socks5h:
-                proxyh = {}
                 proxy_items = proxies.get("http")
-                proxyh["http"] = proxy_items.replace("socks5", "socks5h")
-                proxyh["https"] = proxy_items.replace("socks5", "socks5h")
-                proxies = proxyh
-            # print(f"切换代理:{proxies}")
-            if not scheme:
-                return proxies
-            else:
-                return proxies.get(scheme, default)
+                # socks5h: let the proxy VPS perform DNS resolution
+                proxies = dict(
+                    http=proxy_items.replace("socks5", "socks5h"),
+                    https=proxy_items.replace("socks5", "socks5h")
+                )
+
+            logger.debug(f"切换代理:{proxies}")
+            return proxies if not scheme else proxies.get(scheme, default)
         else:
-            print("暂无代理...")
+            logger.info("暂无代理...")
     except Exception:
         pass
-    return None
+
+    return default
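Call sites keep working unchanged; a sketch of the two modes (note the function now returns default rather than a bare None on failure):

    import requests
    from utils.socks5 import get_proxy

    proxies = get_proxy(socks5h=True)  # e.g. {'http': 'socks5h://host:port', 'https': ...}
    if proxies:
        requests.get('https://www.chinabidding.cn', proxies=proxies, timeout=30)

    socks5_http = get_proxy(scheme='http', default=None)  # a single scheme, or None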

+ 9 - 16
ybw/utils/title_participle.py

@@ -6,31 +6,25 @@ Created on 2023-10-10
 ---------
 @author: Lzz
 """
-from requests.auth import HTTPBasicAuth
-import requests
 import json
 
+import requests
+from requests.auth import HTTPBasicAuth
 
-def get_should(title):
+import setting
 
-    # url = "http://192.168.3.149:9201/_analyze"  # 测试
-    url = "http://172.17.4.184:19905/_analyze"  # 线上
-    username = "jybid"
-    password = "Top2023_JEB01i@31"
 
+def get_should(title):
+    url = setting.WORD_SEGMENTATION_API
+    auth = HTTPBasicAuth(setting.ES_USERNAME, setting.ES_PASSWORD)
     headers = {"Content-Type": "application/json"}
-    auth = HTTPBasicAuth(username, password)
-    data = {
-        "analyzer": "ik_smart",
-        "text": title
-    }
-
+    data = {"analyzer": "ik_smart", "text": title}
     res = requests.post(url, headers=headers, auth=auth, json=data, timeout=10)
 
     try:
-        res_text = json.loads(res.text).get('tokens') or [{"token":title}]
+        res_text = json.loads(res.text).get('tokens') or [{"token": title}]
     except:
-        res_text = [{"token":title}]
+        res_text = [{"token": title}]
 
     should_list = []
     for key in res_text:
@@ -46,4 +40,3 @@ def get_should(title):
         should_list.append(single_dict)
 
     return should_list
-
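get_should tokenizes the title with the ik_smart analyzer and returns one clause per token, falling back to the whole title when the analyzer call fails; databases.es_query embeds the result in its bool query. An illustrative call:

    from utils.title_participle import get_should

    should_list = get_should('市政工程招标公告')
    # one entry per ik_smart token; the exact clause shape is built in the loop above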