dzr 5 months ago
parent commit 67ced21d8d

+ 1 - 1
docker-compose.yml

@@ -6,7 +6,6 @@ services:
       - /mnt/ybw:/mnt
     restart: always
     privileged: true
-    tty: true
     logging:
       driver: "json-file"
       options:
@@ -18,3 +17,4 @@ services:
           memory: 4G
         reservations:
           memory: 10M
+    command: /sbin/init

+ 6 - 0
install-crontab.sh

@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+# Fix the "bash: service: command not found" error
+yum list | grep initscripts && yum install initscripts -y
+# Install and start the cron service
+yum install crontabs -y && service crond start

+ 33 - 0
log.py

@@ -0,0 +1,33 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-08-22
+---------
+@summary: logging module
+---------
+@author: Dzr
+"""
+import sys
+from pathlib import Path
+
+from loguru import logger
+
+logger.remove()  # remove the default loguru handler
+
+_absolute = Path(__file__).absolute().parent
+_log_path = (_absolute / 'logs/log_{time:YYYYMMDD}.log').resolve()
+loguru_format = (
+    "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
+    "<level>{level: <4}</level> | "
+    "<cyan>{thread.name}</cyan> | "
+    "<cyan>{file.name}</cyan>:<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>"
+)
+loguru_level = 'INFO'
+logger.add(
+    sink=_log_path,
+    format=loguru_format,
+    level=loguru_level,
+    rotation='00:00',
+    retention='1 week',
+    encoding='utf-8',
+)
+logger.add(sys.stdout, format=loguru_format, colorize=True, level=loguru_level)

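The other files in this commit switch their imports from loguru to this shared module. A minimal usage sketch (hypothetical caller, not part of the commit), assuming log.py is importable from the working directory:

    from log import logger

    logger.info("spider started")            # written to stdout and logs/log_YYYYMMDD.log
    logger.warning("daily quota reached")    # file sink rotates at 00:00 and keeps one week
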
+ 1 - 1
start.sh

@@ -1,4 +1,4 @@
 #!/bin/bash
 
 ps -ef |grep python3 |grep "ybw_details.py" |grep -v grep |awk '{print $2}' |xargs kill -9
-nohup python3 ybw_details.py > ybw_details.out 2>&1 &
+nohup python3 ybw_details.py > /dev/null 2>&1 &

+ 2 - 3
utils/login.py

@@ -4,16 +4,15 @@ import threading
 import time
 import uuid
 from collections import namedtuple
-from pathlib import Path
 
 import execjs
 import requests
 from requests import Session
 from requests.utils import dict_from_cookiejar
 
-from utils.execptions import CrawlError
-from loguru import logger
 import setting
+from log import logger
+from utils.execptions import CrawlError
 
 LOCK = threading.RLock()
 

+ 2 - 1
utils/tools.py

@@ -12,8 +12,9 @@ import time
 
 import bson
 import requests
-from loguru import logger
+
 import setting
+from log import logger
 
 
 def clean_title(title):

+ 2 - 2
ybw_crontab.txt

@@ -1,5 +1,5 @@
 # 元博网 (ybw) quota-limited crawling
-30 9 * * * cd /mnt && python3 ybw_query_list.py > ybw_list.out 2>&1
+30 9 * * * cd /mnt && python3 ybw_query_list.py > /dev/null 2>&1
 0 3 * * * python3 /mnt/ybw_release_account.py
 0 9-16/2 * * 1-5 cd /mnt && ./start.sh
-50 8-18/2 * * * cd /mnt && python3 ybw_esquery.py > esquery.out 2>&1
+50 8-18/2 * * * cd /mnt && python3 ybw_esquery.py > /dev/null 2>&1

+ 67 - 59
ybw_details.py

@@ -11,7 +11,6 @@ import re
 import time
 
 import requests.exceptions
-from loguru import logger
 from lxml.html import fromstring, HtmlElement, tostring
 from lxml.html.clean import Cleaner
 from pymongo import MongoClient
@@ -19,6 +18,7 @@ from pymongo import MongoClient
 import setting
 import utils.tools as tool
 from dbs.RedisDB import RedisFilter
+from log import logger
 from utils.check_utils import CheckText, CheckTask
 from utils.clean_html import cleaner
 from utils.login import User, load_login_cookies, login, login_check
@@ -102,17 +102,15 @@ class DetailSpider:
         params = {
             "fid": f"{fid}"
         }
-
-        res = requests.get(url, headers=headers, params=params, **request_params)
-        return res
+        return requests.get(url, headers=headers, params=params, **request_params)
 
     def crawl_request(self, item: dict):
-        url = item['competehref']
+        url = item["competehref"]
         headers = {
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
             'Host': 'www.chinabidding.cn',
             'Upgrade-Insecure-Requests': '1',
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
             'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
         }
 
@@ -136,8 +134,10 @@ class DetailSpider:
             if login_cookies is None:
                 login(*self.user, **proxy_params)
                 self.login_times += 1
+                self.update_account_login_times()
                 continue
-            elif 'cookies' not in request_params:
+
+            if 'cookies' not in request_params:
                 request_params.setdefault('cookies', login_cookies)
             else:
                 request_params.update({'cookies': login_cookies})
@@ -153,17 +153,18 @@ class DetailSpider:
                         logger.info(f"[重新登录]{self.user.phone}")
                         _, code = login(*self.user, **proxy_params)
                         self.login_times += 1
-                        if code == 200:
-                            retries += 1
-                        else:
+                        retries += 1
+                        if code != 200:
                             time.sleep(600)
-                            retries += 1
+
                         continue
+
                     logger.info(f'[fetch detail] fid_{fid}')
                     return r
                 except:
                     retries += 1
-                    continue
+                finally:
+                    self.update_account_login_times()
             else:
                 try:
                     r = requests.get(url, **request_params)
@@ -173,26 +174,25 @@ class DetailSpider:
                         logger.info(f"[重新登录]{self.user.phone}")
                         _, code = login(*self.user, **proxy_params)
                         self.login_times += 1
-                        if code == 200:
-                            retries += 1
-                        else:
+                        retries += 1
+                        if code != 200:
                             time.sleep(1800)
-                            retries += 1
+
                         continue
 
-                    element = fromstring(r.text)
+                    element = fromstring(r.content.decode())
                     nodes = element.xpath('//*[@id="main_dom"]/div[1]')
                     if len(nodes) != 1:
                         retries_502 += 1
                         logger.debug(f'"main_dom"属性匹配个数:{len(nodes)}, {r.status_code} - {url}')
-                        continue
                     else:
-                        node = nodes[0]
+                        node = nodes[0]  # nodes checked above, avoids "list index out of range"
                         logger.info(f'[fetch detail] id={node.attrib.get("id")}')
                         return r
                 except requests.RequestException:
                     retries += 1
-                    continue
+                finally:
+                    self.update_account_login_times()
 
         return None
 
@@ -239,36 +239,36 @@ class DetailSpider:
         self.save_tab.insert_one(insert)
         logger.info('[crawl success]{}-{}'.format(item['title'], item['publishtime']))
 
-    def crawl_spider(self, schedule, item):
-        count = schedule['count']
-        self.login_times = schedule['login_times']
-        if count >= schedule['total'] or self.login_times >= 3:
-            ''' account limit reached '''
-            logger.warning("account limit reached")
-            return 'account limit reached'
+    def update_account_login_times(self):
+        self.ybw_info.update_one(
+            {"account": self.user.phone},
+            {"$set": {
+                "login_times": self.login_times,
+                "update_time": tool.get_current_date()
+            }}
+        )
 
+    def crawl_spider(self, account, item):
         _id = item["_id"]
-        err = "error"
-        for _ in range(3):
-            try:
-                CheckTask(item)  # validate the crawl task
-                response = self.crawl_request(item)
-                if response is not None:
-                    self.crawl_response(response, item)
-                    count += 1
-                    self.ybw_list.update_one({"_id": _id}, {"$set": {"crawl_status": "finished"}})
-                    self.ybw_info.update_one(
-                        {"account": self.user.phone},
-                        {"$set": {
-                            "count": count,
-                            "update_time": tool.get_current_date(),
-                            "login_times": self.login_times
-                        }}
-                    )
-                    return True
-            except Exception as e:
-                err = e
-                logger.error(f"请求错误:{err}")
+        err = "unknown error"
+
+        try:
+            CheckTask(item)  # validate the crawl task
+            response = self.crawl_request(item)
+            if response is not None:
+                self.crawl_response(response, item)
+                self.ybw_list.update_one({"_id": _id}, {"$set": {"crawl_status": "finished"}})
+                self.ybw_info.update_one(
+                    {"account": self.user.phone},
+                    {"$set": {
+                        "count": account["count"] + 1,
+                        "update_time": tool.get_current_date(),
+                    }}
+                )
+                return True
+        except Exception as e:
+            err = e
+            logger.error(f"请求错误:{err}")
 
         self.ybw_list.update_one({'_id': _id}, {'$set': {'crawl_status': f'{err}'}})
         return False
@@ -276,11 +276,6 @@ class DetailSpider:
     def start(self):
         logger.debug(" *** start ***")
 
-        schedule = self.ybw_info.find_one({"account": self.user.phone})
-        if schedule is None:
-            logger.error(f"数据库无此账号信息|{self.user.phone}")
-            return
-
         query = {"crawl_status": {"$exists": False}, "es_count": 0}
         sort = [('publishtime', -1)]
         limit = 100
@@ -288,23 +283,36 @@ class DetailSpider:
             tasks = [doc for doc in cursor]
 
         download_count = 0
-        rdm = random.randint(30, 50)
         for item in tasks:
-            publish_ts = tool.date_to_timestamp(item['publishtime'])
+            # check the account record
+            account = self.ybw_info.find_one({"account": self.user.phone})
+            if account is None:
+                logger.error(f"数据库无此账号信息|{self.user.phone}")
+                return
+
+            # check the login-attempt count
+            self.login_times = account["login_times"]
+            if self.login_times >= 3:
+                logger.warning(f"账号限制|{self.user.phone}")
+                return
+
+            # delay crawling until well after the publish time
+            publish_ts = tool.date_to_timestamp(item["publishtime"])
             if publish_ts > int(time.time()) - 43200:
                 logger.warning("未到采集时间")
                 continue
 
-            fp = 'detail_' + item.get('competehref')
+            fp = "detail_" + item.get("competehref")
             if not self.dedup.get(fp):
                 self.dedup.add(fp)
 
                 download_count += 1
-                rst = self.crawl_spider(schedule, item)
-                if not rst or 'account limit reached' in str(rst):
+                rst = self.crawl_spider(account, item)
+                if not rst:
                     self.dedup.delete(fp)
 
-                if download_count >= rdm or '账号限制' in str(rst):
+                if download_count >= account["total"]:
+                    logger.warning("当日采集数量已达上限")
                     break
 
                 time.sleep(random.randint(80, 180))

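After this refactor, start() re-reads the account record from ybw_info before every task, and crawl_spider() / update_account_login_times() write count and login_times back after each attempt. A rough sketch of the account document these queries assume (field names come from this diff; the values are invented for illustration):

    # illustrative ybw_info document (values invented; fields match the queries above)
    account = {
        "account": "138xxxxxxxx",             # self.user.phone, also the lookup key
        "count": 12,                          # successful crawls; crawl_spider writes account["count"] + 1
        "total": 40,                          # daily quota: the loop stops once download_count reaches it
        "login_times": 1,                     # 3 or more blocks the account for the day
        "update_time": "2024-08-22 09:30:00", # tool.get_current_date()
    }

ybw_release_account.py (below) resets count and login_times to 0 on its nightly cron run, which is what re-opens both gates the next day.
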
+ 1 - 1
ybw_esquery.py

@@ -11,11 +11,11 @@ import warnings
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
 from elasticsearch import Elasticsearch
-from loguru import logger
 from pymongo import MongoClient
 
 import setting
 import utils.tools as tool
+from log import logger
 
 warnings.filterwarnings('ignore')
 

+ 1 - 1
ybw_query_list.py

@@ -13,12 +13,12 @@ import warnings
 from collections import namedtuple
 
 import requests
-from loguru import logger
 from pymongo import MongoClient
 
 import setting
 import utils.tools as tool
 from dbs.RedisDB import RedisFilter
+from log import logger
 
 warnings.filterwarnings('ignore')
 

+ 7 - 2
ybw_release_account.py

@@ -2,6 +2,7 @@ from pymongo import MongoClient
 
 import setting
 import utils.tools as tool
+from log import logger
 
 if __name__ == '__main__':
     to_mongo = MongoClient(setting.MONGO_IP, setting.MONGO_PORT)
@@ -13,6 +14,10 @@ if __name__ == '__main__':
             update_date = tool.get_current_date()
             account_coll.update_one(
                 {"_id": item["_id"]},
-                {"$set": {"count": 0, "login_times": 0, "update_time": update_date()}}
+                {"$set": {
+                    "count": 0,
+                    "login_times": 0,
+                    "update_time": update_date()
+                }}
             )
-            print(f" {item['account']} 已更新 < {update_date} >")
+            logger.info(f" {item['account']} 已更新 < {update_date} >")