@@ -11,7 +11,6 @@ import re
 import time

 import requests.exceptions
-from loguru import logger
 from lxml.html import fromstring, HtmlElement, tostring
 from lxml.html.clean import Cleaner
 from pymongo import MongoClient
@@ -19,6 +18,7 @@ from pymongo import MongoClient
 import setting
 import utils.tools as tool
 from dbs.RedisDB import RedisFilter
+from log import logger
 from utils.check_utils import CheckText, CheckTask
 from utils.clean_html import cleaner
 from utils.login import User, load_login_cookies, login, login_check
@@ -102,17 +102,15 @@ class DetailSpider:
         params = {
             "fid": f"{fid}"
         }
-
-        res = requests.get(url, headers=headers, params=params, **request_params)
-        return res
+        return requests.get(url, headers=headers, params=params, **request_params)

     def crawl_request(self, item: dict):
-        url = item['competehref']
+        url = item["competehref"]
         headers = {
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
             'Host': 'www.chinabidding.cn',
             'Upgrade-Insecure-Requests': '1',
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
             'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
         }

@@ -136,8 +134,10 @@ class DetailSpider:
             if login_cookies is None:
                 login(*self.user, **proxy_params)
                 self.login_times += 1
+                self.update_account_login_times()
                 continue
-            elif 'cookies' not in request_params:
+
+            if 'cookies' not in request_params:
                 request_params.setdefault('cookies', login_cookies)
             else:
                 request_params.update({'cookies': login_cookies})
@@ -153,17 +153,18 @@ class DetailSpider:
                         logger.info(f"[重新登录]{self.user.phone}")
                         _, code = login(*self.user, **proxy_params)
                         self.login_times += 1
-                        if code == 200:
-                            retries += 1
-                        else:
+                        retries += 1
+                        if code != 200:
                             time.sleep(600)
-                            retries += 1
+
                         continue
+
                     logger.info(f'[采集正文] fid_{fid}')
                     return r
                 except:
                     retries += 1
-                    continue
+                finally:
+                    self.update_account_login_times()
             else:
                 try:
                     r = requests.get(url, **request_params)
@@ -173,26 +174,25 @@ class DetailSpider:
                         logger.info(f"[重新登录]{self.user.phone}")
                         _, code = login(*self.user, **proxy_params)
                         self.login_times += 1
-                        if code == 200:
-                            retries += 1
-                        else:
+                        retries += 1
+                        if code != 200:
                             time.sleep(1800)
-                            retries += 1
+
                         continue

-                    element = fromstring(r.text)
+                    element = fromstring(r.content.decode())
                     nodes = element.xpath('//*[@id="main_dom"]/div[1]')
                     if len(nodes) != 1:
                         retries_502 += 1
                         logger.debug(f'"main_dom"属性匹配个数:{len(nodes)}, {r.status_code} - {url}')
-                        continue
                     else:
-                        node = nodes[0]
+                        node = nodes[0]  # list index out of range
                         logger.info(f'[采集正文] id={node.attrib.get("id")}')
                         return r
                 except requests.RequestException:
                     retries += 1
-                    continue
+                finally:
+                    self.update_account_login_times()

         return None

@@ -239,36 +239,36 @@ class DetailSpider:
         self.save_tab.insert_one(insert)
         logger.info('[采集成功]{}-{}'.format(item['title'], item['publishtime']))

-    def crawl_spider(self, schedule, item):
-        count = schedule['count']
-        self.login_times = schedule['login_times']
-        if count >= schedule['total'] or self.login_times >= 3:
-            ''' 账号限制 '''
-            logger.warning("账号限制")
-            return '账号限制'
+    def update_account_login_times(self):
+        self.ybw_info.update_one(
+            {"account": self.user.phone},
+            {"$set": {
+                "login_times": self.login_times,
+                "update_time": tool.get_current_date()
+            }}
+        )

+    def crawl_spider(self, account, item):
         _id = item["_id"]
-        err = "error"
-        for _ in range(3):
-            try:
-                CheckTask(item)  # 检查请求采集任务
-                response = self.crawl_request(item)
-                if response is not None:
-                    self.crawl_response(response, item)
-                    count += 1
-                    self.ybw_list.update_one({"_id": _id}, {"$set": {"crawl_status": "finished"}})
-                    self.ybw_info.update_one(
-                        {"account": self.user.phone},
-                        {"$set": {
-                            "count": count,
-                            "update_time": tool.get_current_date(),
-                            "login_times": self.login_times
-                        }}
-                    )
-                    return True
-            except Exception as e:
-                err = e
-                logger.error(f"请求错误:{err}")
+        err = "unknown error"
+
+        try:
+            CheckTask(item)  # 检查请求采集任务
+            response = self.crawl_request(item)
+            if response is not None:
+                self.crawl_response(response, item)
+                self.ybw_list.update_one({"_id": _id}, {"$set": {"crawl_status": "finished"}})
+                self.ybw_info.update_one(
+                    {"account": self.user.phone},
+                    {"$set": {
+                        "count": account["count"] + 1,
+                        "update_time": tool.get_current_date(),
+                    }}
+                )
+                return True
+        except Exception as e:
+            err = e
+            logger.error(f"请求错误:{err}")

         self.ybw_list.update_one({'_id': _id}, {'$set': {'crawl_status': f'{err}'}})
         return False
@@ -276,11 +276,6 @@ class DetailSpider:
     def start(self):
         logger.debug(" *** start ***")

-        schedule = self.ybw_info.find_one({"account": self.user.phone})
-        if schedule is None:
-            logger.error(f"数据库无此账号信息|{self.user.phone}")
-            return
-
         query = {"crawl_status": {"$exists": False}, "es_count": 0}
         sort = [('publishtime', -1)]
         limit = 100
@@ -288,23 +283,36 @@ class DetailSpider:
         tasks = [doc for doc in cursor]

         download_count = 0
-        rdm = random.randint(30, 50)
         for item in tasks:
-            publish_ts = tool.date_to_timestamp(item['publishtime'])
+            # 检查账号
+            account = self.ybw_info.find_one({"account": self.user.phone})
+            if account is None:
+                logger.error(f"数据库无此账号信息|{self.user.phone}")
+                return
+
+            # 登录次数检查
+            self.login_times = account["login_times"]
+            if self.login_times >= 3:
+                logger.warning(f"账号限制|{self.user.phone}")
+                return
+
+            # 数据发布时间延迟采集
+            publish_ts = tool.date_to_timestamp(item["publishtime"])
             if publish_ts > int(time.time()) - 43200:
                 logger.warning("未到采集时间")
                 continue

-            fp = 'detail_' + item.get('competehref')
+            fp = "detail_" + item.get("competehref")
             if not self.dedup.get(fp):
                 self.dedup.add(fp)

                 download_count += 1
-                rst = self.crawl_spider(schedule, item)
-                if not rst or '账号限制' in str(rst):
+                rst = self.crawl_spider(account, item)
+                if not rst:
                     self.dedup.delete(fp)

-                if download_count >= rdm or '账号限制' in str(rst):
+                if download_count >= account["total"]:
+                    logger.warning("当日采集数量已达上限")
                     break

                 time.sleep(random.randint(80, 180))
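For orientation, a minimal sketch of the retry-and-bookkeeping flow that crawl_request settles on after this patch: every re-login attempt bumps the retry counter, a non-200 login backs off before the next pass, and a finally clause persists login_times no matter how the attempt ended. The helpers are injected as callables so the sketch runs standalone; fetch, needs_login, relogin, and persist_login_times are placeholders standing in for requests.get, login_check, login, and update_account_login_times, not the module's actual signatures.

# Hedged sketch, not part of the patch: it condenses the pattern used above.
import time
from typing import Callable, Optional


def fetch_with_relogin(
    fetch: Callable[[], object],            # e.g. lambda: requests.get(url, **request_params)
    needs_login: Callable[[object], bool],  # plays the role of login_check(...)
    relogin: Callable[[], int],             # performs login(...) and returns its status code
    persist_login_times: Callable[[], None],
    max_retries: int = 3,
    backoff: int = 600,
) -> Optional[object]:
    retries = 0
    while retries < max_retries:
        try:
            response = fetch()
            if needs_login(response):
                code = relogin()
                retries += 1             # count the attempt whether or not the re-login worked
                if code != 200:
                    time.sleep(backoff)  # back off before the next pass
                continue
            return response
        except Exception:
            retries += 1
        finally:
            # mirrors the patch: the login counter is persisted on every pass,
            # whether the attempt returned, looped, or raised
            persist_login_times()
    return None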