
Add logging module

dzr 5 months ago
parent commit 67ced21d8d
11 changed files with 123 additions and 71 deletions
  1. docker-compose.yml (+1 -1)
  2. install-crontab.sh (+6 -0)
  3. log.py (+33 -0)
  4. start.sh (+1 -1)
  5. utils/login.py (+2 -3)
  6. utils/tools.py (+2 -1)
  7. ybw_crontab.txt (+2 -2)
  8. ybw_details.py (+67 -59)
  9. ybw_esquery.py (+1 -1)
  10. ybw_query_list.py (+1 -1)
  11. ybw_release_account.py (+7 -2)

+ 1 - 1
docker-compose.yml

@@ -6,7 +6,6 @@ services:
      - /mnt/ybw:/mnt
    restart: always
    privileged: true
-    tty: true
    logging:
      driver: "json-file"
      options:
@@ -18,3 +17,4 @@ services:
          memory: 4G
        reservations:
          memory: 10M
+    command: /sbin/init

+ 6 - 0
install-crontab.sh

@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+# Fix the "bash: service: command not found" error
+yum list | grep initscripts && yum install initscripts -y
+# Install and start crond
+yum install crontabs -y && service crond start

+ 33 - 0
log.py

@@ -0,0 +1,33 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-08-22
+---------
+@summary: logging module
+---------
+@author: Dzr
+"""
+import sys
+from pathlib import Path
+
+from loguru import logger
+
+logger.remove()  # remove the default loguru handler
+
+_absolute = Path(__file__).absolute().parent
+_log_path = (_absolute / 'logs/log_{time:YYYYMMDD}.log').resolve()
+loguru_format = (
+    "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
+    "<level>{level: <4}</level> | "
+    "<cyan>{thread.name}</cyan> | "
+    "<cyan>{file.name}</cyan>:<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>"
+)
+loguru_level = 'INFO'
+logger.add(
+    sink=_log_path,
+    format=loguru_format,
+    level=loguru_level,
+    rotation='00:00',
+    retention='1 week',
+    encoding='utf-8',
+)
+logger.add(sys.stdout, format=loguru_format, colorize=True, level=loguru_level)
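
For reference, a minimal usage sketch of the new module (a hypothetical caller; it assumes log.py sits next to the scripts on the import path, matching the "from log import logger" changes below):

    # hypothetical caller of the new log.py module
    from log import logger

    logger.info("spider started")    # goes to stdout and to logs/log_YYYYMMDD.log
    logger.warning("quota reached")  # files rotate at midnight, kept for one week

Since log.py registers its sinks at import time and loguru's logger is a process-wide singleton, every module that imports it shares the same file and stdout handlers.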

+ 1 - 1
start.sh

@@ -1,4 +1,4 @@
 #!/bin/bash

 ps -ef |grep python3 |grep "ybw_details.py" |grep -v grep |awk '{print $2}' |xargs kill -9
-nohup python3 ybw_details.py > ybw_details.out 2>&1 &
+nohup python3 ybw_details.py > /dev/null 2>&1 &

+ 2 - 3
utils/login.py

@@ -4,16 +4,15 @@ import threading
 import time
 import uuid
 from collections import namedtuple
-from pathlib import Path

 import execjs
 import requests
 from requests import Session
 from requests.utils import dict_from_cookiejar

-from utils.execptions import CrawlError
-from loguru import logger
 import setting
+from log import logger
+from utils.execptions import CrawlError

 LOCK = threading.RLock()

+ 2 - 1
utils/tools.py

@@ -12,8 +12,9 @@ import time

 import bson
 import requests
-from loguru import logger
+
 import setting
+from log import logger


 def clean_title(title):

+ 2 - 2
ybw_crontab.txt

@@ -1,5 +1,5 @@
 # 元博网 quota-limited crawling
-30 9 * * * cd /mnt && python3 ybw_query_list.py > ybw_list.out 2>&1
+30 9 * * * cd /mnt && python3 ybw_query_list.py > /dev/null 2>&1
 0 3 * * * python3 /mnt/ybw_release_account.py
 0 9-16/2 * * 1-5 cd /mnt && ./start.sh
-50 8-18/2 * * * cd /mnt && python3 ybw_esquery.py > esquery.out 2>&1
+50 8-18/2 * * * cd /mnt && python3 ybw_esquery.py > /dev/null 2>&1

+ 67 - 59
ybw_details.py

@@ -11,7 +11,6 @@ import re
 import time

 import requests.exceptions
-from loguru import logger
 from lxml.html import fromstring, HtmlElement, tostring
 from lxml.html.clean import Cleaner
 from pymongo import MongoClient
@@ -19,6 +18,7 @@ from pymongo import MongoClient
 import setting
 import utils.tools as tool
 from dbs.RedisDB import RedisFilter
+from log import logger
 from utils.check_utils import CheckText, CheckTask
 from utils.clean_html import cleaner
 from utils.login import User, load_login_cookies, login, login_check
@@ -102,17 +102,15 @@ class DetailSpider:
         params = {
             "fid": f"{fid}"
         }
-
-        res = requests.get(url, headers=headers, params=params, **request_params)
-        return res
+        return requests.get(url, headers=headers, params=params, **request_params)

     def crawl_request(self, item: dict):
-        url = item['competehref']
+        url = item["competehref"]
         headers = {
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
             'Host': 'www.chinabidding.cn',
             'Upgrade-Insecure-Requests': '1',
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
             'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
         }

@@ -136,8 +134,10 @@ class DetailSpider:
             if login_cookies is None:
                 login(*self.user, **proxy_params)
                 self.login_times += 1
+                self.update_account_login_times()
                 continue
-            elif 'cookies' not in request_params:
+
+            if 'cookies' not in request_params:
                 request_params.setdefault('cookies', login_cookies)
             else:
                 request_params.update({'cookies': login_cookies})
@@ -153,17 +153,18 @@ class DetailSpider:
                         logger.info(f"[重新登录]{self.user.phone}")
                         logger.info(f"[重新登录]{self.user.phone}")
                         _, code = login(*self.user, **proxy_params)
                         _, code = login(*self.user, **proxy_params)
                         self.login_times += 1
                         self.login_times += 1
-                        if code == 200:
-                            retries += 1
-                        else:
+                        retries += 1
+                        if code != 200:
                             time.sleep(600)
                             time.sleep(600)
-                            retries += 1
+
                         continue
                         continue
+
                     logger.info(f'[采集正文] fid_{fid}')
                     logger.info(f'[采集正文] fid_{fid}')
                     return r
                     return r
                 except:
                 except:
                     retries += 1
                     retries += 1
-                    continue
+                finally:
+                    self.update_account_login_times()
             else:
             else:
                 try:
                 try:
                     r = requests.get(url, **request_params)
                     r = requests.get(url, **request_params)
@@ -173,26 +174,25 @@ class DetailSpider:
                         logger.info(f"[重新登录]{self.user.phone}")
                         logger.info(f"[重新登录]{self.user.phone}")
                         _, code = login(*self.user, **proxy_params)
                         _, code = login(*self.user, **proxy_params)
                         self.login_times += 1
                         self.login_times += 1
-                        if code == 200:
-                            retries += 1
-                        else:
+                        retries += 1
+                        if code != 200:
                             time.sleep(1800)
                             time.sleep(1800)
-                            retries += 1
+
                         continue
                         continue
 
 
-                    element = fromstring(r.text)
+                    element = fromstring(r.content.decode())
                     nodes = element.xpath('//*[@id="main_dom"]/div[1]')
                     nodes = element.xpath('//*[@id="main_dom"]/div[1]')
                     if len(nodes) != 1:
                     if len(nodes) != 1:
                         retries_502 += 1
                         retries_502 += 1
                         logger.debug(f'"main_dom"属性匹配个数:{len(nodes)}, {r.status_code} - {url}')
                         logger.debug(f'"main_dom"属性匹配个数:{len(nodes)}, {r.status_code} - {url}')
-                        continue
                     else:
                     else:
-                        node = nodes[0]
+                        node = nodes[0]  # list index out of range
                         logger.info(f'[采集正文] id={node.attrib.get("id")}')
                         logger.info(f'[采集正文] id={node.attrib.get("id")}')
                         return r
                         return r
                 except requests.RequestException:
                 except requests.RequestException:
                     retries += 1
                     retries += 1
-                    continue
+                finally:
+                    self.update_account_login_times()
 
 
         return None
         return None
 
 
@@ -239,36 +239,36 @@ class DetailSpider:
         self.save_tab.insert_one(insert)
         logger.info('[采集成功]{}-{}'.format(item['title'], item['publishtime']))

-    def crawl_spider(self, schedule, item):
-        count = schedule['count']
-        self.login_times = schedule['login_times']
-        if count >= schedule['total'] or self.login_times >= 3:
-            ''' account restricted '''
-            logger.warning("账号限制")
-            return '账号限制'
+    def update_account_login_times(self):
+        self.ybw_info.update_one(
+            {"account": self.user.phone},
+            {"$set": {
+                "login_times": self.login_times,
+                "update_time": tool.get_current_date()
+            }}
+        )

+    def crawl_spider(self, account, item):
         _id = item["_id"]
-        err = "error"
-        for _ in range(3):
-            try:
-                CheckTask(item)  # validate the crawl task
-                response = self.crawl_request(item)
-                if response is not None:
-                    self.crawl_response(response, item)
-                    count += 1
-                    self.ybw_list.update_one({"_id": _id}, {"$set": {"crawl_status": "finished"}})
-                    self.ybw_info.update_one(
-                        {"account": self.user.phone},
-                        {"$set": {
-                            "count": count,
-                            "update_time": tool.get_current_date(),
-                            "login_times": self.login_times
-                        }}
-                    )
-                    return True
-            except Exception as e:
-                err = e
-                logger.error(f"请求错误:{err}")
+        err = "unknown error"
+
+        try:
+            CheckTask(item)  # validate the crawl task
+            response = self.crawl_request(item)
+            if response is not None:
+                self.crawl_response(response, item)
+                self.ybw_list.update_one({"_id": _id}, {"$set": {"crawl_status": "finished"}})
+                self.ybw_info.update_one(
+                    {"account": self.user.phone},
+                    {"$set": {
+                        "count": account["count"] + 1,
+                        "update_time": tool.get_current_date(),
+                    }}
+                )
+                return True
+        except Exception as e:
+            err = e
+            logger.error(f"请求错误:{err}")

         self.ybw_list.update_one({'_id': _id}, {'$set': {'crawl_status': f'{err}'}})
         return False
@@ -276,11 +276,6 @@ class DetailSpider:
     def start(self):
         logger.debug(" *** start ***")

-        schedule = self.ybw_info.find_one({"account": self.user.phone})
-        if schedule is None:
-            logger.error(f"数据库无此账号信息|{self.user.phone}")
-            return
-
         query = {"crawl_status": {"$exists": False}, "es_count": 0}
         sort = [('publishtime', -1)]
         limit = 100
@@ -288,23 +283,36 @@ class DetailSpider:
             tasks = [doc for doc in cursor]

         download_count = 0
-        rdm = random.randint(30, 50)
         for item in tasks:
-            publish_ts = tool.date_to_timestamp(item['publishtime'])
+            # check the account
+            account = self.ybw_info.find_one({"account": self.user.phone})
+            if account is None:
+                logger.error(f"数据库无此账号信息|{self.user.phone}")
+                return
+
+            # check the login-attempt count
+            self.login_times = account["login_times"]
+            if self.login_times >= 3:
+                logger.warning(f"账号限制|{self.user.phone}")
+                return
+
+            # delay collection until well after publish time
+            publish_ts = tool.date_to_timestamp(item["publishtime"])
             if publish_ts > int(time.time()) - 43200:
                 logger.warning("未到采集时间")
                 continue

-            fp = 'detail_' + item.get('competehref')
+            fp = "detail_" + item.get("competehref")
             if not self.dedup.get(fp):
                 self.dedup.add(fp)

                 download_count += 1
-                rst = self.crawl_spider(schedule, item)
-                if not rst or '账号限制' in str(rst):
+                rst = self.crawl_spider(account, item)
+                if not rst:
                     self.dedup.delete(fp)

-                if download_count >= rdm or '账号限制' in str(rst):
+                if download_count >= account["total"]:
+                    logger.warning("当日采集数量已达上限")
                     break

                 time.sleep(random.randint(80, 180))
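
The recurring change in this file is the finally: self.update_account_login_times() added to both retry loops, so the per-account login counter is persisted on every pass, including the continue paths. A self-contained sketch of that pattern (hypothetical names, not the project's API):

    import time

    def fetch_with_login_accounting(fetch, relogin, persist, max_retries=5):
        """Retry fetch(); re-login on auth failure, persisting the counter each pass."""
        login_times = 0
        retries = 0
        while retries < max_retries:
            try:
                return fetch()           # success: finally still runs before returning
            except PermissionError:      # stand-in for the "sign out" page check
                relogin()
                login_times += 1
                retries += 1
                time.sleep(1)            # the real loops sleep 600-1800s here
            except Exception:
                retries += 1             # network error: plain retry
            finally:
                persist(login_times)     # runs on every path, even continue/return

    # usage: succeeds immediately, persisting a counter of 0
    print(fetch_with_login_accounting(lambda: "ok", lambda: None, print))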

+ 1 - 1
ybw_esquery.py

@@ -11,11 +11,11 @@ import warnings
 from concurrent.futures import ThreadPoolExecutor, as_completed

 from elasticsearch import Elasticsearch
-from loguru import logger
 from pymongo import MongoClient

 import setting
 import utils.tools as tool
+from log import logger

 warnings.filterwarnings('ignore')

+ 1 - 1
ybw_query_list.py

@@ -13,12 +13,12 @@ import warnings
 from collections import namedtuple

 import requests
-from loguru import logger
 from pymongo import MongoClient

 import setting
 import utils.tools as tool
 from dbs.RedisDB import RedisFilter
+from log import logger

 warnings.filterwarnings('ignore')

+ 7 - 2
ybw_release_account.py

@@ -2,6 +2,7 @@ from pymongo import MongoClient

 import setting
 import utils.tools as tool
+from log import logger

 if __name__ == '__main__':
     to_mongo = MongoClient(setting.MONGO_IP, setting.MONGO_PORT)
@@ -13,6 +14,10 @@ if __name__ == '__main__':
             update_date = tool.get_current_date()
             account_coll.update_one(
                 {"_id": item["_id"]},
-                {"$set": {"count": 0, "login_times": 0, "update_time": update_date()}}
+                {"$set": {
+                    "count": 0,
+                    "login_times": 0,
+                    "update_time": update_date
+                }}
             )
-            print(f" {item['account']} 已更新 < {update_date} >")
+            logger.info(f" {item['account']} 已更新 < {update_date} >")
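
For context, a condensed, self-contained sketch of what this nightly reset job does (hypothetical host, database and collection names; the real values come from setting):

    from datetime import datetime
    from pymongo import MongoClient

    # hypothetical connection details standing in for setting.MONGO_IP etc.
    coll = MongoClient("localhost", 27017)["py_spider"]["ybw_account"]
    coll.update_many({}, {"$set": {
        "count": 0,        # cumulative per-account download counter
        "login_times": 0,  # daily login counter, capped at 3 in ybw_details.py
        "update_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }})

Resetting count and login_times at 03:00 (per ybw_crontab.txt) re-arms the per-account quota and the login cap that ybw_details.py checks before each task.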
+            logger.info(f" {item['account']} 已更新 < {update_date} >")