@@ -4,7 +4,6 @@ import time
 from urllib.parse import urlencode, urlparse
 
 from bs4 import BeautifulSoup
-from pymongo.errors import DuplicateKeyError
 
 from crawler.check_utils import CheckText, CheckTask
 from crawler.clean_html import cleaner, clean_js
@@ -18,9 +17,10 @@ from utils.attachment import (
 )
 from utils.databases import mongo_table, int2long
 from utils.execptions import (
-    CustomAccountPrivilegeError,
-    AttachmentNullError,
-    CustomCheckError, JyBasicException
+    AccountError,
+    AttachmentError,
+    CheckError,
+    ZbYTbCrawlError
 )
 from utils.log import logger
 
@@ -75,7 +75,7 @@ class CrawlDetailPageSpider:
 
         :param tid: 采集条目ObjectId
         """
-        # 需要高级会员才能查询的招标信息,切换高级账号
+        # 需要高级会员才能查询的招标信息,指定使用高级账号
         self._update_crawl_task(tid, account=self.senior_account)
 
     def crawl_error(
@@ -110,11 +110,12 @@ class CrawlDetailPageSpider:
         self.crawl_error_tab.insert_one(items)
         logger.error(err_msg)
 
-    def get_attachment(self, content: str, rows: dict):
+    def download_attachment(self, content: str, rows: dict):
+        logger.info('>>> 下载附件')
+        index = 0
+        attachments = {}
         soup = BeautifulSoup(content, "lxml")
         all_node = soup.findAll("a") or soup.findAll("iframe")
-        attachments = {}
-        index = 0
         for node in all_node:
             file_name, file_type = (node.string or node.text), None
             file_path = node.attrs.get("href", "") or node.attrs.get("src", "")
@@ -128,29 +129,33 @@ class CrawlDetailPageSpider:
                     or extract_file_type(file_path))
 
             # 抽取文件名称
-            parser = urlparse(file_path)
-            if parser.scheme in ['https', 'http'] and file_type is not None:
-                if not file_name:
-                    name = extract_file_name_by_href(file_path, file_type)
-                    if name is not None:
-                        file_name = name
-                    else:
-                        file_name = f"{rows['title']}_{index}"
+            try:
+                parser = urlparse(file_path)
+            except ValueError:
+                pass
+            else:
+                if parser.scheme in ['https', 'http'] and file_type is not None:
+                    if not file_name:
+                        name = extract_file_name_by_href(file_path, file_type)
+                        if name is not None:
+                            file_name = name
+                        else:
+                            file_name = f"{rows['title']}_{index}"
 
-                attachment = self.attachment_downloader.download(
-                    file_name=file_name,
-                    file_type=file_type,
-                    download_url=file_path,
-                )
-                if len(attachment) > 0:
-                    attachments[str(index + 1)] = attachment
-                index += 1
+                    attachment = self.attachment_downloader.download(
+                        file_name=file_name,
+                        file_type=file_type,
+                        download_url=file_path,
+                    )
+                    if len(attachment) > 0:
+                        attachments[str(index + 1)] = attachment
+                    index += 1
 
-        if attachments:
+        if len(attachments) > 0:
             rows["projectinfo"] = {"attachments": attachments}
 
     def save_data(self, content, rows: dict):
-        self.get_attachment(content, rows)
+        logger.info('>>> 保存数据')
         rows["contenthtml"] = clean_js(content)
         special = {
             '<iframe[^<>]*>[\s\S]*?</iframe>': ''
@@ -158,7 +163,7 @@ class CrawlDetailPageSpider:
         rows["detail"] = cleaner(content, special=special)
         try:
             CheckText(rows["detail"])
-        except CustomCheckError:
+        except CheckError:
             # 页面是一个pdf阅读器, eg: https://www.zbytb.com/s-zhongbiao-10119392.html
             rows["detail"] = "<br/>详细内容请访问原网页!"
         rows["comeintime"] = int2long(int(time.time()))
@@ -170,7 +175,7 @@ class CrawlDetailPageSpider:
         logger.info("[采集成功]{}-{}".format(rows['title'], rows['publishtime']))
 
     def crawl_response(self, response, rows: dict):
-        # print(rows['competehref'])
+        logger.info('>>> 采集响应')
         source = re.findall(r'Inner(.*?);Inner', response.text)
         if len(source) > 0:
             content = source[0][13:-1]
@@ -180,10 +185,11 @@ class CrawlDetailPageSpider:
         counter = 0
         try:
             CheckText(content)
+            self.download_attachment(content, rows)
             self.save_data(content, rows)
             self._update_crawl_task(rows['_id'], crawl_status='finished')
             counter = 1
-        except (AttachmentNullError, CustomCheckError) as e:
+        except (AttachmentError, CheckError) as e:
             if e.code == 10104 and self.account != self.senior_account:
                 self.switch_senior_user(rows)
             else:
@@ -197,11 +203,12 @@ class CrawlDetailPageSpider:
                 account=self.account,
                 err_msg=err_msg
             )
-        except CustomAccountPrivilegeError:
+        except AccountError:
             self.switch_senior_user(rows)
         return counter
 
     def crawl_request(self, url: str, referer: str, user: User):
+        logger.info('>>> 采集请求')
         headers = {
             'Host': 'www.zbytb.com',
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
@@ -248,12 +255,14 @@ class CrawlDetailPageSpider:
         self._lock_task(item)
         sc.spider_code = self.spider_code = item['spidercode']
         sc.crawl_url = item['competehref']
-        # 获取闲置用户账号
+
+        # 分配账号和账号cookie
         self.account = item.get('account', sc.user.username)
         self.cookies = load_login_cookies(self.account)
         user = sc.query_user(self.account)
         if user is None:
             return False
+
         try:
             CheckTask(item)
             url = self.prepare_url(item)
@@ -263,7 +272,7 @@ class CrawlDetailPageSpider:
             num = self.crawl_response(response, item)
             sc.crawl_counter(num)
             next_task_interval = 10
-        except JyBasicException as e:
+        except (ZbYTbCrawlError, Exception) as e:
             if e.code == 10105:
                 # 抛出异常时,将es查询统计结果进行更新
                 self._update_crawl_task(item["_id"], count=item['count'])
@@ -280,8 +289,5 @@ class CrawlDetailPageSpider:
         while True:
             with Scheduler(site='中国招标与采购网', crawl_type='detail') as scheduler:
                 if scheduler.crawl_start:
-                    finished = self.crawl_spider(scheduler)
-                    if not finished:
-                        scheduler.wait_for_next_task(2)
-                    else:
-                        scheduler.wait_for_next_task(60)
+                    self.crawl_spider(scheduler)
+                    scheduler.finished(10)
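
Note: below is a minimal, hypothetical sketch of the renamed exception classes this diff imports from utils.execptions (AccountError, AttachmentError, CheckError, ZbYTbCrawlError), assuming, as the handlers above imply, that each carries a numeric code attribute; the project's real definitions may differ.

# Hypothetical sketch only; inferred from this diff, not the project's actual module.
class ZbYTbCrawlError(Exception):
    """Base crawl error carrying the numeric code the handlers compare via e.code."""

    def __init__(self, code: int = 10000, reason: str = ''):
        self.code = code      # e.g. 10104 or 10105, as checked in crawl_response/crawl_spider
        self.reason = reason
        super().__init__(f'{code}: {reason}')


class AccountError(ZbYTbCrawlError):
    """Account lacks the privilege for the page; caller switches to the senior account."""


class AttachmentError(ZbYTbCrawlError):
    """Attachment download failed or returned nothing."""


class CheckError(ZbYTbCrawlError):
    """CheckText / CheckTask validation failed."""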
|