
Add logging module

dzr 5 months ago
parent commit 67ced21d8d
11 changed files with 123 additions and 71 deletions
  1. docker-compose.yml (+1 -1)
  2. install-crontab.sh (+6 -0)
  3. log.py (+33 -0)
  4. start.sh (+1 -1)
  5. utils/login.py (+2 -3)
  6. utils/tools.py (+2 -1)
  7. ybw_crontab.txt (+2 -2)
  8. ybw_details.py (+67 -59)
  9. ybw_esquery.py (+1 -1)
  10. ybw_query_list.py (+1 -1)
  11. ybw_release_account.py (+7 -2)

+ 1 - 1
docker-compose.yml

@@ -6,7 +6,6 @@ services:
      - /mnt/ybw:/mnt
    restart: always
    privileged: true
-    tty: true
    logging:
      driver: "json-file"
      options:
@@ -18,3 +17,4 @@ services:
          memory: 4G
        reservations:
          memory: 10M
+    command: /sbin/init

+ 6 - 0
install-crontab.sh

@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+# Fix the "bash: service: command not found" error
+yum list | grep initscripts && yum install initscripts -y
+# Install and start crond
+yum install crontabs -y && service crond start

+ 33 - 0
log.py

@@ -0,0 +1,33 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-08-22
+---------
+@summary: logging module
+---------
+@author: Dzr
+"""
+import sys
+from pathlib import Path
+
+from loguru import logger
+
+logger.remove()  # remove the default loguru handler
+
+_absolute = Path(__file__).absolute().parent
+_log_path = (_absolute / 'logs/log_{time:YYYYMMDD}.log').resolve()
+loguru_format = (
+    "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
+    "<level>{level: <4}</level> | "
+    "<cyan>{thread.name}</cyan> | "
+    "<cyan>{file.name}</cyan>:<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>"
+)
+loguru_level = 'INFO'
+logger.add(
+    sink=_log_path,
+    format=loguru_format,
+    level=loguru_level,
+    rotation='00:00',
+    retention='1 week',
+    encoding='utf-8',
+)
+logger.add(sys.stdout, format=loguru_format, colorize=True, level=loguru_level)
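
For reference, a minimal usage sketch of the new module (a hypothetical caller; it assumes log.py sits next to the scripts on the import path, matching the "from log import logger" changes below):

    # hypothetical caller of the new log.py module
    from log import logger

    logger.info("spider started")    # goes to stdout and to logs/log_YYYYMMDD.log
    logger.warning("quota reached")  # files rotate at midnight, kept for one week

Since log.py registers its sinks at import time and loguru's logger is a process-wide singleton, every module that imports it shares the same file and stdout handlers.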

+ 1 - 1
start.sh

@@ -1,4 +1,4 @@
 #!/bin/bash

 ps -ef |grep python3 |grep "ybw_details.py" |grep -v grep |awk '{print $2}' |xargs kill -9
-nohup python3 ybw_details.py > ybw_details.out 2>&1 &
+nohup python3 ybw_details.py > /dev/null 2>&1 &

+ 2 - 3
utils/login.py

@@ -4,16 +4,15 @@ import threading
 import time
 import uuid
 from collections import namedtuple
-from pathlib import Path

 import execjs
 import requests
 from requests import Session
 from requests.utils import dict_from_cookiejar

-from utils.execptions import CrawlError
-from loguru import logger
 import setting
+from log import logger
+from utils.execptions import CrawlError

 LOCK = threading.RLock()

+ 2 - 1
utils/tools.py

@@ -12,8 +12,9 @@ import time

 import bson
 import requests
-from loguru import logger
+
 import setting
+from log import logger


 def clean_title(title):

+ 2 - 2
ybw_crontab.txt

@@ -1,5 +1,5 @@
 # 元博网 quota-limited crawling
-30 9 * * * cd /mnt && python3 ybw_query_list.py > ybw_list.out 2>&1
+30 9 * * * cd /mnt && python3 ybw_query_list.py > /dev/null 2>&1
 0 3 * * * python3 /mnt/ybw_release_account.py
 0 9-16/2 * * 1-5 cd /mnt && ./start.sh
-50 8-18/2 * * * cd /mnt && python3 ybw_esquery.py > esquery.out 2>&1
+50 8-18/2 * * * cd /mnt && python3 ybw_esquery.py > /dev/null 2>&1

+ 67 - 59
ybw_details.py

@@ -11,7 +11,6 @@ import re
 import time

 import requests.exceptions
-from loguru import logger
 from lxml.html import fromstring, HtmlElement, tostring
 from lxml.html.clean import Cleaner
 from pymongo import MongoClient
@@ -19,6 +18,7 @@ from pymongo import MongoClient
 import setting
 import utils.tools as tool
 from dbs.RedisDB import RedisFilter
+from log import logger
 from utils.check_utils import CheckText, CheckTask
 from utils.clean_html import cleaner
 from utils.login import User, load_login_cookies, login, login_check
@@ -102,17 +102,15 @@ class DetailSpider:
         params = {
             "fid": f"{fid}"
         }
-
-        res = requests.get(url, headers=headers, params=params, **request_params)
-        return res
+        return requests.get(url, headers=headers, params=params, **request_params)

     def crawl_request(self, item: dict):
-        url = item['competehref']
+        url = item["competehref"]
         headers = {
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
             'Host': 'www.chinabidding.cn',
             'Upgrade-Insecure-Requests': '1',
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
             'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
         }

@@ -136,8 +134,10 @@ class DetailSpider:
             if login_cookies is None:
                 login(*self.user, **proxy_params)
                 self.login_times += 1
+                self.update_account_login_times()
                 continue
-            elif 'cookies' not in request_params:
+
+            if 'cookies' not in request_params:
                 request_params.setdefault('cookies', login_cookies)
             else:
                 request_params.update({'cookies': login_cookies})
@@ -153,17 +153,18 @@ class DetailSpider:
                         logger.info(f"[重新登录]{self.user.phone}")
                         logger.info(f"[重新登录]{self.user.phone}")
                         _, code = login(*self.user, **proxy_params)
                         _, code = login(*self.user, **proxy_params)
                         self.login_times += 1
                         self.login_times += 1
-                        if code == 200:
-                            retries += 1
-                        else:
+                        retries += 1
+                        if code != 200:
                             time.sleep(600)
                             time.sleep(600)
-                            retries += 1
+
                         continue
                         continue
+
                     logger.info(f'[采集正文] fid_{fid}')
                     logger.info(f'[采集正文] fid_{fid}')
                     return r
                     return r
                 except:
                 except:
                     retries += 1
                     retries += 1
-                    continue
+                finally:
+                    self.update_account_login_times()
             else:
             else:
                 try:
                 try:
                     r = requests.get(url, **request_params)
                     r = requests.get(url, **request_params)
@@ -173,26 +174,25 @@ class DetailSpider:
                         logger.info(f"[重新登录]{self.user.phone}")
                         logger.info(f"[重新登录]{self.user.phone}")
                         _, code = login(*self.user, **proxy_params)
                         _, code = login(*self.user, **proxy_params)
                         self.login_times += 1
                         self.login_times += 1
-                        if code == 200:
-                            retries += 1
-                        else:
+                        retries += 1
+                        if code != 200:
                             time.sleep(1800)
                             time.sleep(1800)
-                            retries += 1
+
                         continue
                         continue
 
 
-                    element = fromstring(r.text)
+                    element = fromstring(r.content.decode())
                     nodes = element.xpath('//*[@id="main_dom"]/div[1]')
                     nodes = element.xpath('//*[@id="main_dom"]/div[1]')
                     if len(nodes) != 1:
                     if len(nodes) != 1:
                         retries_502 += 1
                         retries_502 += 1
                         logger.debug(f'"main_dom"属性匹配个数:{len(nodes)}, {r.status_code} - {url}')
                         logger.debug(f'"main_dom"属性匹配个数:{len(nodes)}, {r.status_code} - {url}')
-                        continue
                     else:
                     else:
-                        node = nodes[0]
+                        node = nodes[0]  # list index out of range
                         logger.info(f'[采集正文] id={node.attrib.get("id")}')
                         logger.info(f'[采集正文] id={node.attrib.get("id")}')
                         return r
                         return r
                 except requests.RequestException:
                 except requests.RequestException:
                     retries += 1
                     retries += 1
-                    continue
+                finally:
+                    self.update_account_login_times()
 
 
         return None
         return None
 
 
@@ -239,36 +239,36 @@ class DetailSpider:
         self.save_tab.insert_one(insert)
         logger.info('[采集成功]{}-{}'.format(item['title'], item['publishtime']))

-    def crawl_spider(self, schedule, item):
-        count = schedule['count']
-        self.login_times = schedule['login_times']
-        if count >= schedule['total'] or self.login_times >= 3:
-            ''' account restricted '''
-            logger.warning("账号限制")
-            return '账号限制'
+    def update_account_login_times(self):
+        self.ybw_info.update_one(
+            {"account": self.user.phone},
+            {"$set": {
+                "login_times": self.login_times,
+                "update_time": tool.get_current_date()
+            }}
+        )

+    def crawl_spider(self, account, item):
         _id = item["_id"]
-        err = "error"
-        for _ in range(3):
-            try:
-                CheckTask(item)  # validate the crawl task
-                response = self.crawl_request(item)
-                if response is not None:
-                    self.crawl_response(response, item)
-                    count += 1
-                    self.ybw_list.update_one({"_id": _id}, {"$set": {"crawl_status": "finished"}})
-                    self.ybw_info.update_one(
-                        {"account": self.user.phone},
-                        {"$set": {
-                            "count": count,
-                            "update_time": tool.get_current_date(),
-                            "login_times": self.login_times
-                        }}
-                    )
-                    return True
-            except Exception as e:
-                err = e
-                logger.error(f"请求错误:{err}")
+        err = "unknown error"
+
+        try:
+            CheckTask(item)  # validate the crawl task
+            response = self.crawl_request(item)
+            if response is not None:
+                self.crawl_response(response, item)
+                self.ybw_list.update_one({"_id": _id}, {"$set": {"crawl_status": "finished"}})
+                self.ybw_info.update_one(
+                    {"account": self.user.phone},
+                    {"$set": {
+                        "count": account["count"] + 1,
+                        "update_time": tool.get_current_date(),
+                    }}
+                )
+                return True
+        except Exception as e:
+            err = e
+            logger.error(f"请求错误:{err}")

         self.ybw_list.update_one({'_id': _id}, {'$set': {'crawl_status': f'{err}'}})
         return False
@@ -276,11 +276,6 @@ class DetailSpider:
     def start(self):
         logger.debug(" *** start ***")

-        schedule = self.ybw_info.find_one({"account": self.user.phone})
-        if schedule is None:
-            logger.error(f"数据库无此账号信息|{self.user.phone}")
-            return
-
         query = {"crawl_status": {"$exists": False}, "es_count": 0}
         sort = [('publishtime', -1)]
         limit = 100
@@ -288,23 +283,36 @@ class DetailSpider:
             tasks = [doc for doc in cursor]

         download_count = 0
-        rdm = random.randint(30, 50)
         for item in tasks:
-            publish_ts = tool.date_to_timestamp(item['publishtime'])
+            # check the account
+            account = self.ybw_info.find_one({"account": self.user.phone})
+            if account is None:
+                logger.error(f"数据库无此账号信息|{self.user.phone}")
+                return
+
+            # check the login-attempt count
+            self.login_times = account["login_times"]
+            if self.login_times >= 3:
+                logger.warning(f"账号限制|{self.user.phone}")
+                return
+
+            # delay collection until well after publish time
+            publish_ts = tool.date_to_timestamp(item["publishtime"])
             if publish_ts > int(time.time()) - 43200:
                 logger.warning("未到采集时间")
                 continue

-            fp = 'detail_' + item.get('competehref')
+            fp = "detail_" + item.get("competehref")
             if not self.dedup.get(fp):
                 self.dedup.add(fp)

                 download_count += 1
-                rst = self.crawl_spider(schedule, item)
-                if not rst or '账号限制' in str(rst):
+                rst = self.crawl_spider(account, item)
+                if not rst:
                     self.dedup.delete(fp)

-                if download_count >= rdm or '账号限制' in str(rst):
+                if download_count >= account["total"]:
+                    logger.warning("当日采集数量已达上限")
                     break

                 time.sleep(random.randint(80, 180))
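
The recurring change in this file is the finally: self.update_account_login_times() added to both retry loops, so the per-account login counter is persisted on every pass, including the continue paths. A self-contained sketch of that pattern (hypothetical names, not the project's API):

    import time

    def fetch_with_login_accounting(fetch, relogin, persist, max_retries=5):
        """Retry fetch(); re-login on auth failure, persisting the counter each pass."""
        login_times = 0
        retries = 0
        while retries < max_retries:
            try:
                return fetch()           # success: finally still runs before returning
            except PermissionError:      # stand-in for the "sign out" page check
                relogin()
                login_times += 1
                retries += 1
                time.sleep(1)            # the real loops sleep 600-1800s here
            except Exception:
                retries += 1             # network error: plain retry
            finally:
                persist(login_times)     # runs on every path, even continue/return

    # usage: succeeds immediately, persisting a counter of 0
    print(fetch_with_login_accounting(lambda: "ok", lambda: None, print))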

+ 1 - 1
ybw_esquery.py

@@ -11,11 +11,11 @@ import warnings
 from concurrent.futures import ThreadPoolExecutor, as_completed

 from elasticsearch import Elasticsearch
-from loguru import logger
 from pymongo import MongoClient

 import setting
 import utils.tools as tool
+from log import logger

 warnings.filterwarnings('ignore')

+ 1 - 1
ybw_query_list.py

@@ -13,12 +13,12 @@ import warnings
 from collections import namedtuple

 import requests
-from loguru import logger
 from pymongo import MongoClient

 import setting
 import utils.tools as tool
 from dbs.RedisDB import RedisFilter
+from log import logger

 warnings.filterwarnings('ignore')

+ 7 - 2
ybw_release_account.py

@@ -2,6 +2,7 @@ from pymongo import MongoClient

 import setting
 import utils.tools as tool
+from log import logger

 if __name__ == '__main__':
     to_mongo = MongoClient(setting.MONGO_IP, setting.MONGO_PORT)
@@ -13,6 +14,10 @@ if __name__ == '__main__':
             update_date = tool.get_current_date()
             account_coll.update_one(
                 {"_id": item["_id"]},
-                {"$set": {"count": 0, "login_times": 0, "update_time": update_date()}}
+                {"$set": {
+                    "count": 0,
+                    "login_times": 0,
+                    "update_time": update_date
+                }}
             )
-            print(f" {item['account']} 已更新 < {update_date} >")
+            logger.info(f" {item['account']} 已更新 < {update_date} >")
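
For context, a condensed, self-contained sketch of what this nightly reset job does (hypothetical host, database and collection names; the real values come from setting):

    from datetime import datetime
    from pymongo import MongoClient

    # hypothetical connection details standing in for setting.MONGO_IP etc.
    coll = MongoClient("localhost", 27017)["py_spider"]["ybw_account"]
    coll.update_many({}, {"$set": {
        "count": 0,        # cumulative per-account download counter
        "login_times": 0,  # daily login counter, capped at 3 in ybw_details.py
        "update_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }})

Resetting count and login_times at 03:00 (per ybw_crontab.txt) re-arms the per-account quota and the login cap that ybw_details.py checks before each task.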
+            logger.info(f" {item['account']} 已更新 < {update_date} >")