@@ -0,0 +1,219 @@
+import random
+import time
+import re
+from collections import namedtuple
+import requests
+from lxml.html import fromstring, HtmlElement
+from config.load import region
+from crawler.crawl_scheduler import Scheduler
+from utils.databases import mongo_table, int2long, redis_client, es_query
+from utils.log import logger
+from utils.socks5 import Proxy
+from utils.tools import sha1, check_crawl_title
+from utils.execptions import JyBasicException, CustomCheckError
+
+
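+# Each CrawlMenu pairs a channel label with its spider code and the list-page endpoint for that channel.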
+CrawlMenu = namedtuple('CrawlMenu', ['channel', 'spidercode', 'table_type'])
+
+
+class ListSpider:
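+    """List-page spider for bidizhaobiao.com (比地招标网)."""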
+
+    def __init__(self, db: str, crawl_tab: str, crawl_max_page=None, enable_proxy=False, allow_show_exception=False):
+        self.crawl_menus = [
+            CrawlMenu('招标预告', 'a_bdzbw_zbyg', 'retrieval_list.do?single=true&ChannelIds=102'),
+            CrawlMenu('招标公告', 'a_bdzbw_zbgg', 'retrieval_list.do?single=true&ChannelIds=52'),
+            CrawlMenu('公告变更', 'a_bdzbw_ggbg', 'retrieval_list.do?single=true&ChannelIds=51'),
+            CrawlMenu('招标答疑', 'a_bdzbw_zbdy', 'retrieval_list.do?single=true&ChannelIds=103'),
+            CrawlMenu('资审结果', 'a_bdzbw_zsjg', 'retrieval_list.do?single=true&ChannelIds=105'),
+            CrawlMenu('招标文件', 'a_bdzbw_zbwj', 'retrieval_list.do?single=true&ChannelIds=104'),
+            CrawlMenu('中标公告', 'a_bdzbw_zhbgg', 'retrieval_list.do?single=true&ChannelIds=101'),
+            CrawlMenu('采购意向', 'a_bdzbw_cgyx', 'retrieval_list.do?single=true&ChannelIds=114'),
+            CrawlMenu('审批项目', 'a_bdzbw_spxm', 'spxm_list.do'),
+            CrawlMenu('拍卖出让', 'a_bdzbw_pmcr', 'retrieval_list.do?single=true&ChannelIds=115'),
+            CrawlMenu('土地矿产', 'a_bdzbw_tdkc', 'retrieval_list.do?single=true&ChannelIds=116'),
+            CrawlMenu('产权交易', 'a_bdzbw_cqjy', 'retrieval_list.do?single=true&ChannelIds=117'),
+        ]
+        self.total = 0
+        self.crawl_max_page = crawl_max_page or 1
+        self.crawl_tab = mongo_table(db, crawl_tab)
+        self.r = redis_client()
+        self.session = requests.Session()
+        self.proxy = Proxy(enable_proxy)
+        self.redis_key = 'bdzbw_2022'  # Redis hash holding sha1 signatures of already-crawled detail URLs
+        self.allow_show_exception = allow_show_exception
+
+    def crawl_request(self, url: str, data):
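+        """POST the advanced-search form; rotate the proxy and retry (up to 10 attempts) on errors or non-200 responses."""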
+
+        headers = {
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+            "Accept-Language": "zh-CN,zh;q=0.9",
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "Content-Type": "application/x-www-form-urlencoded",
+            "Origin": "http://www.bidizhaobiao.com",
+            "Pragma": "no-cache",
+            "Upgrade-Insecure-Requests": "1",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36"
+        }
+        request_params = {}
+        request_params.setdefault('headers', headers)
+        request_params.setdefault('timeout', 60)
+
+        retries = 0
+        while retries < 10:
+            try:
+                # logger.debug(self.proxy.proxies)
+                response = self.session.post(url, data=data, proxies=self.proxy.proxies, **request_params)
+            except Exception:
+                self.proxy.switch()
+                retries += 1
+                time.sleep(20)
+                continue
+            if response.status_code != 200:
+                self.proxy.switch()
+                retries += 1
+            else:
+                element = fromstring(response.text)
+                if element.xpath('//*[@id="searchResultList"]') or element.xpath('//*[@id="ulList"]'):
+                    return response
+                else:
+                    # no result list found on the page
+                    return None
+
+        return None
+
+    def crawl_response(self, response, menu: CrawlMenu, pro_area):
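+        """Parse one list page and store new items; returns (stored, published_since_yesterday, not_seen_before) counts."""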
+        results = []
+        last_page = []
+        increase = []
+        element: HtmlElement = fromstring(response.text)
+        feature = '//div[@id="searchResultList"]/div[2]/div|//div[@id="ulList"]/table[@class="tableList"]/tbody/tr'
+        for node in element.xpath(feature):
+            try:
+                if node.xpath('./div[1]/div[1]/p/a[1]'):
+                    competehref = "".join(node.xpath('./div[1]/div[1]/p/a[1]/@href')).strip()
+                    title = "".join("".join(node.xpath('./div[1]/div[1]/p/a[1]//text()')).split())
+                    area = "".join("".join(node.xpath('./div[1]/div[2]/div[2]/a/span/text()')).split())
+                    publish_time = "".join("".join(node.xpath('./div[1]/div[2]/div[4]/p/text()')).split())
+                else:
+                    href_info = "".join(node.xpath('./td[@class="projectName"]/a/@onclick')).strip()
+                    href_params = "".join(re.findall(r"spxmInfo\('(.*?)'", href_info, re.S)).strip()
+                    competehref = f"http://www.bidizhaobiao.com/spxm-{href_params}.html"
+                    title = "".join(node.xpath('./td[@class="projectName"]/a/text()')).strip()
+                    area = "".join(node.xpath('./td[@class="address"]/span/text()')).strip()
+                    publish_time = "".join(node.xpath('./td[@class="time"]/span/text()')).strip()
+            except Exception:
+                # malformed row, skip it
+                continue
+
+            if not title or not publish_time:
+                raise CustomCheckError(code=10107, reason='empty title or publish time')
+
+            item = {
+                "site": "比地招标网",
+                "channel": menu.channel,
+                "area": pro_area,
+                "_d": "comeintime",
+                "comeintime": int2long(int(time.time())),
+                "T": "bidding",
+                "sendflag": "false",
+                "spidercode": menu.spidercode,
+                "city": area,
+                "type": "",
+                "publishdept": "",
+                "title": title,
+                "competehref": competehref,
+                "href": "#",
+                "publishtime": publish_time,
+                "l_np_publishtime": int2long(int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))),
+            }
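+            # Count items published since 00:00 yesterday; fewer than a full page of them marks the last page of fresh data.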
+            present_time = time.strftime("%Y-%m-%d 00:00:00", time.localtime(int(round(time.time()))))
+            time_array = time.strptime(present_time, "%Y-%m-%d %H:%M:%S")
+            start_date = round(time.mktime(time_array)) - 86400
+            if item.get('l_np_publishtime') >= start_date:
+                last_page.append(item)
+            # logger.debug(item)
+            item['crawl'] = False
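+            # Deduplicate on the sha1 of the detail URL: only unseen links are queried against ES and stored.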
+            sign = sha1(item['competehref'])
+            if not self.r.hexists(self.redis_key, sign):
+                increase.append(item)
+                if check_crawl_title(title):
+                    # item["count"] = 0
+                    item["count"] = es_query(item["title"], item["l_np_publishtime"])
+                    results.append(item)
+                self.r.hset(self.redis_key, sign, '')
+
+        if len(results) > 0:
+            self.crawl_tab.insert_many(results)
+        return len(results), len(last_page), len(increase)
+
+    def crawl_spider(self, sc: Scheduler, menu: CrawlMenu):
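+        """Crawl one menu across every region, paging until fresh data runs out or no unseen items remain."""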
+        for region_id, region_name in region.items():
+            page_size = 22
+            for page in range(1, self.crawl_max_page + 1):
+                url = f'http://www.bidizhaobiao.com/advsearch/{menu.table_type}'
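+                # Advanced-search form payload: page number and province filter; the remaining options are fixed values.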
+                data = {
+                    "pageNum": f"{page}",
+                    "province_id": f"{region_id}",
+                    "provinceCityJson": '{' + f'{region_name}' + ":[]}",
+                    "searchCondition.dtype": "50",
+                    "searchCondition.SearchType": "any",
+                    "searchCondition.infosources": "",
+                    "searchCondition.regionId": "",
+                    "provinceState": f"{region_name}",
+                    "searchCondition.Pattern": "30",
+                    "searchCondition.isOr": "false",
+                    "isSelectDtype": "0",
+                    "isSelectPattern": "0",
+                }
+                sc.crawl_url = url
+                sc.spider_code = menu.spidercode
+                try:
+                    response = self.crawl_request(url, data)
+                    if response is None:
+                        logger.info(f'[no list results]{menu.channel}-{region_name}-page {page}-0 items')
+                        break
+                    inserted, recent, unseen = self.crawl_response(response, menu, region_name)
+                    self.total += inserted
+                    logger.info(f'[crawl success]{menu.channel}-{region_name}-page {page}-{inserted} items')
+
+                    # full (first-time) crawl: fewer fresh items than a full page means later pages hold nothing newer
+                    if recent < page_size:
+                        break
+                    # incremental crawl: no unseen items on this page, so later pages were already collected
+                    if unseen == 0:
+                        break
+
+                except (JyBasicException, Exception) as e:
+                    logger.error('[crawl failed]{}-{}-page {}, error type: {}'.format(
+                        menu.channel,
+                        region_name,
+                        page,
+                        e.__class__.__name__,
+                    ))
+                finally:
+                    sc.wait_for_next_task(random.choice(range(2, 6)))
+            logger.debug(f'[{menu.channel}]-[region]-{region_name}-{self.total} items collected so far')
+        self.session.close()
+
+    def start(self):
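+        """Run every menu under a list-crawl Scheduler session and log the grand total."""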
+
+        with Scheduler(site='比地招标网', crawl_type='list') as scheduler:
+            for menu in self.crawl_menus:
+                if scheduler.crawl_start:
+                    self.crawl_spider(scheduler, menu)
+                    scheduler.finished(5)
+        logger.info(f'Collected {self.total} items in this run')
+
+
+if __name__ == '__main__':
+    ListSpider(
+        db='py_spider',
+        crawl_tab='bdzbw_list',
+        crawl_max_page=50,
+        enable_proxy=True,
+    ).start()