преди 3 години · c699d2a6d0
--- a/jzsc/chaojiying.py
+++ b/jzsc/chaojiying.py
@@ -0,0 +1,55 @@
 
				+#!/usr/bin/env python
			
 
				+# coding:utf-8
			
 
				+
			
 
				+import requests
			
 
				+from hashlib import md5
			
 
				+
			
 
				+
			
 
				+class Chaojiying_Client(object):
			
 
				+
			
 
				+    def __init__(self, username, password, soft_id):
			
 
				+        self.username = username
			
 
				+        password = password.encode('utf8')
			
 
				+        self.password = md5(password).hexdigest()
			
 
				+        self.soft_id = soft_id
			
 
				+        self.base_params = {
			
 
				+            'user': self.username,
			
 
				+            'pass2': self.password,
			
 
				+            'softid': self.soft_id,
			
 
				+        }
			
 
				+        self.headers = {
			
 
				+            'Connection': 'Keep-Alive',
			
 
				+            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
			
 
				+        }
			
 
				+
			
 
				+    def PostPic(self, im, codetype):
			
 
				+        """
			
 
				+        im: 图片字节
			
 
				+        codetype: 题目类型 参考 http://www.chaojiying.com/price.html
			
 
				+        """
			
 
				+        params = {
			
 
				+            'codetype': codetype,
			
 
				+        }
			
 
				+        params.update(self.base_params)
			
 
				+        files = {'userfile': ('ccc.jpg', im)}
			
 
				+        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
			
 
				+        return r.json()
			
 
				+
			
 
				+    def ReportError(self, im_id):
			
 
				+        """
			
 
				+        im_id:报错题目的图片ID
			
 
				+        """
			
 
				+        params = {
			
 
				+            'id': im_id,
			
 
				+        }
			
 
				+        params.update(self.base_params)
			
 
				+        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
			
 
				+        return r.json()
			
 
				+
			
 
				+
			
 
				+# if __name__ == '__main__':
			
 
				+#     chaojiying = Chaojiying_Client('ddddjy', 'ddddjy2021', '929622')  # 用户中心>>软件ID 生成一个替换 96001
			
 
				+    # im = open('img.png', 'rb').read()  # 本地图片文件路径 来替换 a.jpg 有时WIN系统须要//
			
 
				+    # print(chaojiying.PostPic(im, 9008))  #1902 验证码类型  官方网站>>价格体系 3.4+版 print 后要加()
			
 
				+    # result = chaojiying.ReportError('1170412067373400583')
			
 
				+    # print(result)
			
--- a/jzsc/config/__init__.py
+++ b/jzsc/config/__init__.py
--- a/jzsc/config/conf.yaml
+++ b/jzsc/config/conf.yaml
@@ -0,0 +1,32 @@
 
				+# mongo
			
 
				+mongo:
			
 
				+#  host: 172.17.4.87
			
 
				+#  port: !!int 27080
			
 
				+  host: 127.0.0.1
			
 
				+  port: !!int 27017
			
 
				+
			
 
				+
			
 
				+# redis
			
 
				+redis:
			
 
				+  host: 127.0.0.1
			
 
				+  port: !!int 6379
			
 
				+  pwd: ""
			
 
				+  db: !!int 10
			
 
				+
			
 
				+
			
 
				+# es
			
 
				+es:
			
 
				+  host: 172.17.145.170
			
 
				+#  host: 127.0.0.1
			
 
				+#  host: 192.168.3.206
			
 
				+  port: !!int 9800
			
 
				+  db: biddingall
			
 
				+
			
 
				+
			
 
				+# 阿里oss
			
 
				+ali_oss:
			
 
				+  key_id: LTAI4G5x9aoZx8dDamQ7vfZi
			
 
				+  key_secret: Bk98FsbPYXcJe72n1bG3Ssf73acuNh
			
 
				+#  endpoint: oss-cn-beijing.aliyuncs.com    # 公网使用
			
 
				+  endpoint: oss-cn-beijing-internal.aliyuncs.com    # 内网使用
			
 
				+  bucket_name: jy-datafile
			
--- a/jzsc/config/constants.yaml
+++ b/jzsc/config/constants.yaml
@@ -0,0 +1,13 @@
 
				+headers:
			
 
				+  User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36
			
 
				+  Accept: '*/*'
			
 
				+
			
 
				+proxy:
			
 
				+  socks5:
			
 
				+    url: http://socks.spdata.jianyu360.com/socks/getips?limit=10
			
 
				+    decrypt: ABNOPqrceQRSTklmUDEFGXYZabnopfghHVWdijstuvwCIJKLMxyz0123456789+/
			
 
				+
			
 
				+
			
 
				+node_module:
			
 
				+  windows: C:\Users\dell\AppData\Roaming\npm\node_modules
			
 
				+  linux: /usr/lib/node_modules
			
--- a/jzsc/config/load.py
+++ b/jzsc/config/load.py
@@ -0,0 +1,35 @@
 
				+import sys
			
 
				+from pathlib import Path
			
 
				+
			
 
				+import yaml
			
 
				+
			
 
				+__all__ = [
			
 
				+    'mongo_conf', 'redis_conf', 'oss_conf', 'es_conf',
			
 
				+    'constants',
			
 
				+    'headers', 'jy_proxy', 'node_module',
			
 
				+    'analyze_url', 'node_module_path'
			
 
				+]
			
 
				+
			
 
				+base_path = Path(__file__).parent
			
 
				+yaml_conf = (base_path / 'conf.yaml').resolve()
			
 
				+yaml_constants = (base_path / 'constants.yaml').resolve()
			
 
				+
			
 
				+with open(yaml_conf, encoding="utf-8") as f:
			
 
				+    conf = yaml.safe_load(f)
			
 
				+    mongo_conf = conf['mongo']
			
 
				+    redis_conf = conf['redis']
			
 
				+    es_conf: dict = conf['es']
			
 
				+    oss_conf: dict = conf['ali_oss']
			
 
				+
			
 
				+with open(yaml_constants, encoding="utf-8") as fp:
			
 
				+    constants = yaml.safe_load(fp)
			
 
				+    headers: dict = constants['headers']
			
 
				+    jy_proxy: dict = constants['proxy']
			
 
				+    node_module: dict = constants['node_module']
			
 
				+    analyze_url = f'http://{es_conf["host"]}:{es_conf["port"]}/{es_conf["db"]}/_analyze'
			
 
				+
			
 
				+
			
 
				+if sys.platform == 'linux':
			
 
				+    node_module_path = node_module['linux']
			
 
				+else:
			
 
				+    node_module_path = node_module['windows']
			
--- a/jzsc/spider.py
+++ b/jzsc/spider.py
@@ -0,0 +1,416 @@
 
				+import io
			
 
				+import time
			
 
				+
			
 
				+import pandas as pd
			
 
				+from PIL import Image
			
 
				+from lxml.html import fromstring, tostring
			
 
				+from selenium import webdriver
			
 
				+from selenium.webdriver import ActionChains
			
 
				+from selenium.webdriver import Chrome
			
 
				+from selenium.webdriver.common.by import By
			
 
				+from selenium.webdriver.support import expected_conditions as EC
			
 
				+from selenium.webdriver.support.wait import WebDriverWait
			
 
				+
			
 
				+from chaojiying import Chaojiying_Client
			
 
				+from utils.databases import mongo_table
			
 
				+from utils.log import logger
			
 
				+
			
 
				+'''MongoDB'''
			
 
				+company_tab = mongo_table('national', 'company')
			
 
				+
			
 
				+'''验证码服务'''
			
 
				+chaojiying = Chaojiying_Client('ddddjy', 'ddddjy2021', '929622')
			
 
				+
			
 
				+'''企业资质'''
			
 
				+COMPANY_QUALITY_MAPS = {
			
 
				+    '资质类别': 'quality_type',
			
 
				+    '资质证书号': 'quality_no',
			
 
				+    '资质名称': 'quality_name',
			
 
				+    '发证日期': 'fzrq',
			
 
				+    '发证有效期': 'fzyxq',
			
 
				+    '发证机关': 'fzjg',
			
 
				+}
			
 
				+'''不良行为'''
			
 
				+BAD_BEHAVIOR_MAPS = {
			
 
				+    '诚信记录主体及编号': 'integrity_no',
			
 
				+    '决定内容': 'decide_content',
			
 
				+    '实施部门': 'ssbm',
			
 
				+    '决定日期与有效期': 'execution_date',
			
 
				+}
			
 
				+'''黑名单记录'''
			
 
				+BLACK_LIST_MAPS = {
			
 
				+    '黑名单记录主体及编号': 'black_list_no',
			
 
				+    '黑名单认定依据': 'black_list_rdyj',
			
 
				+    '认定部门': 'rdbm',
			
 
				+    '决定日期与有效期': 'execution_date',
			
 
				+}
			
 
				+'''失信联合惩戒记录'''
			
 
				+PUNISH_MAPS = {
			
 
				+    '失信记录编号': 'punish_no',
			
 
				+    '失信联合惩戒记录主体': 'punish_subject',
			
 
				+    '法人姓名': 'legal_person',
			
 
				+    '列入名单事由': 'reason',
			
 
				+    '认定部门': 'rdbm',
			
 
				+    '列入日期': 'join_date',
			
 
				+}
			
 
				+
			
 
				+CRAWL_SITE = 'http://jzsc.mohurd.gov.cn/data/company'
			
 
				+
			
 
				+
			
 
				+def html2element(html):
			
 
				+    return fromstring(html)
			
 
				+
			
 
				+
			
 
				+def element2html(lxml_element):
			
 
				+    return tostring(lxml_element, encoding='utf-8').decode()
			
 
				+
			
 
				+
			
 
				+def display_prompt_popup(html):
			
 
				+    _element = html2element(html)
			
 
				+    node = _element.xpath('//div[@class="el-dialog__wrapper"]')[0]
			
 
				+    _popup_style = node.attrib.get('style')
			
 
				+    if _popup_style is not None:
			
 
				+        _styles = str(_popup_style).split(';')
			
 
				+        res = list(filter(lambda x: len(x) > 0, _styles))[-1].strip().lower()
			
 
				+        if res == 'display: none':
			
 
				+            '''无提示弹框'''
			
 
				+            return False
			
 
				+    '''有提示弹框'''
			
 
				+    return True
			
 
				+
			
 
				+
			
 
				+def display_geetest_panel(html):
			
 
				+    _element = html2element(html)
			
 
				+    node = _element.xpath('//div[@class="geetest_panel_next"]')
			
 
				+    if len(node) == 0:
			
 
				+        '''无验证码'''
			
 
				+        return False
			
 
				+    _geetest_panel = node[0]
			
 
				+    geetest_style = _geetest_panel.attrib.get('style')
			
 
				+    if geetest_style is not None and geetest_style == 'display: block;':
			
 
				+        '''有验证码'''
			
 
				+        return True
			
 
				+    else:
			
 
				+        '''无验证码'''
			
 
				+        return False
			
 
				+
			
 
				+
			
 
				+def prompt_popup(driver: Chrome):
			
 
				+    while True:
			
 
				+        if not display_prompt_popup(driver.page_source):
			
 
				+            break
			
 
				+        driver.find_element_by_xpath('//div[@class="el-dialog captchaDilaog"]/div[3]/div/button[1]').click()
			
 
				+        time.sleep(2)
			
 
				+
			
 
				+
			
 
def geetest_panel(driver: Chrome, save_img_to_local=False):
    """Solve the Geetest click captcha, retrying until the panel disappears.

    Each round screenshots the captcha panel, sends the resized image to the
    Chaojiying service, clicks the returned coordinates and submits.

    :param driver: browser session showing (or possibly showing) the captcha.
    :param save_img_to_local: when truthy, write the panel screenshots to
        ``captcha.png``, ``ele.png`` and ``touclick_img.png`` in the current
        working directory.
    """
    # ID of the image submitted in the previous round, if any.
    pic_id = None
    while True:
        # Stop as soon as the captcha panel is no longer displayed.
        if not display_geetest_panel(driver.page_source):
            break

        # The panel is still visible after a submission: the previous
        # recognition failed, so report it to the recognition service.
        if pic_id is not None:
            '''打码平台失败'''
            captcha_result = chaojiying.ReportError(pic_id)
            pic_id = None
            logger.info(captcha_result)

        # Locate the captcha panel element (waits up to 60 s, polling every 0.5 s).
        '''获取验证图片对象'''
        wait = WebDriverWait(driver, 60, 0.5)
        locator = (By.CLASS_NAME, 'geetest_panel_next')
        touclick_element = wait.until(EC.presence_of_element_located(locator))

        # Screenshot the panel element.
        '''获取网页截图'''
        element_png = touclick_element.screenshot_as_png
        screenshot = Image.open(io.BytesIO(element_png))

        # Resize the screenshot; Chaojiying recommends a width of at most
        # 460 px and a height of at most 310 px.
        '''修改截图尺寸;超级鹰:推荐宽不超过460px,高不超过310px'''
        # reim = screenshot.resize((306, 310))
        # reim = screenshot.resize((307, 300))
        reim = screenshot.resize((310, 300))

        # Encode the resized image as PNG bytes.
        '''获取验证码图片'''
        bytes_array = io.BytesIO()
        reim.save(bytes_array, format='PNG')

        # Optionally keep local copies of the raw and resized captcha.
        '''保存验证码到本地'''
        if save_img_to_local:
            touclick_element.screenshot('captcha.png')
            with open('ele.png', 'wb') as wp:
                wp.write(bytes_array.getvalue())

        # Send the image for recognition.
        # NOTE(review): 9004 is the Chaojiying captcha type code; presumably
        # a coordinate-click type - confirm against the service price list.
        '''识别验证码'''
        captcha_result = chaojiying.PostPic(bytes_array.getvalue(), 9004)
        logger.info(captcha_result)
        pic_id = captcha_result['pic_id']

        # Parse the result: "x1,y1|x2,y2|..." -> [[x1, y1], [x2, y2], ...].
        # NOTE(review): an empty 'pic_str' makes int('') raise ValueError here.
        '''解析识别结果'''
        groups = captcha_result.get('pic_str').split('|')
        locations = [[int(number) for number in group.split(',')] for group in groups]

        # Click every returned point on the captcha panel.
        # NOTE(review): the +10 / +47 offsets look like a hand-tuned
        # correction between the resized image and the element - confirm.
        '''点击验证图片'''
        for location in locations:
            # logger.info(location)
            ActionChains(driver).move_to_element_with_offset(
                touclick_element,
                location[0] + 10,
                location[1] + 47
            ).click().perform()
            time.sleep(1)

        # Optionally save the panel as it looks after clicking.
        '''保存点击之后的图片'''
        if save_img_to_local:
            touclick_element.screenshot('touclick_img.png')

        # Submit the captcha, then give the page time to react.
        '''提交验证码'''
        locator = (By.CLASS_NAME, 'geetest_commit')
        commit_element = wait.until(EC.presence_of_element_located(locator))
        ActionChains(driver).click(commit_element).perform()
        time.sleep(5)
			
 
				+
			
 
				+
			
 
				+def check_page(driver: Chrome, **kwargs):
			
 
				+    """检查页面"""
			
 
				+    prompt_popup(driver)
			
 
				+    geetest_panel(driver, save_img_to_local=kwargs.get('save_img_to_local'))
			
 
				+
			
 
				+
			
 
				+def click(driver: Chrome, button, allow_check_page=False, wait_time=1):
			
 
				+    driver.execute_script("arguments[0].click();", button)
			
 
				+    time.sleep(wait_time)
			
 
				+    if allow_check_page:
			
 
				+        check_page(driver)
			
 
				+
			
 
				+
			
 
				+def next_page(driver: Chrome):
			
 
				+    element = html2element(driver.page_source)
			
 
				+    node = element.xpath('//button[@class="btn-next"]')[0]
			
 
				+    attrib = node.attrib.get('disabled')
			
 
				+    if attrib is not None and attrib == 'disabled':
			
 
				+        '''最大页码'''
			
 
				+        return False
			
 
				+    else:
			
 
				+        '''继续翻页'''
			
 
				+        button = driver.find_element_by_class_name('btn-next')
			
 
				+        click(driver, button)
			
 
				+        return True
			
 
				+
			
 
				+
			
 
				+def current_page(html):
			
 
				+    element = html2element(html)
			
 
				+    nodes = element.xpath('//ul[@class="el-pager"]/li')
			
 
				+    for node in nodes:
			
 
				+        if node.attrib.get('class') == 'number active':
			
 
				+            return node.text
			
 
				+
			
 
				+
			
 
				+def extract_content(html):
			
 
				+    """抽取页面结构化数据"""
			
 
				+    results = []
			
 
				+    '''字段映射表'''
			
 
				+    _maps = {
			
 
				+        **COMPANY_QUALITY_MAPS,
			
 
				+        **BAD_BEHAVIOR_MAPS,
			
 
				+        **BLACK_LIST_MAPS,
			
 
				+        **PUNISH_MAPS,
			
 
				+    }
			
 
				+    '''转化成dataframe'''
			
 
				+    dfs = pd.read_html(html)
			
 
				+    if len(dfs) == 2:
			
 
				+        columns = list(dfs[0].columns.array)
			
 
				+        values = dfs[1].values
			
 
				+        '''合并内容'''
			
 
				+        panel_container = [dict(zip(columns, val)) for val in values]
			
 
				+        '''转换字段'''
			
 
				+        for item in panel_container:
			
 
				+            _item = {}
			
 
				+            for key, val in item.items():
			
 
				+                if key in _maps:
			
 
				+                    _item[_maps[key]] = val
			
 
				+            results.append(_item)
			
 
				+    return results
			
 
				+
			
 
				+
			
 
				+def click_query(driver: Chrome):
			
 
				+    """查询按钮"""
			
 
				+    button = driver.find_element_by_class_name("ssButton")
			
 
				+    click(driver, button)
			
 
				+    time.sleep(1)
			
 
				+
			
 
				+
			
 
				+def select_qualify_category(driver: Chrome, records):
			
 
				+    span_elements = driver.find_elements(by=By.XPATH, value='//div[@class="labelInPut labelInPutRadio"]/span')
			
 
				+    for span_element in span_elements:
			
 
				+        qualify_category = span_element.text
			
 
				+        if qualify_category not in records:
			
 
				+            logger.info(f'>>资质类别:{qualify_category} <<')
			
 
				+            click(driver, span_element)
			
 
				+            click_query(driver)
			
 
				+            records.append(span_element.text)
			
 
				+            return False
			
 
				+    else:
			
 
				+        return True
			
 
				+
			
 
				+
			
 
				+def crawl_spider(driver: Chrome, handler):
			
 
				+    """采集爬虫"""
			
 
				+    exception_count = 0
			
 
				+    td_elements = driver.find_elements(By.XPATH, value='//table[@class="el-table__body"]//tr/td[3]')
			
 
				+    for td_element in td_elements:
			
 
				+        if exception_count > 3:
			
 
				+            '''数据异常,停止采集'''
			
 
				+            return False
			
 
				+        button = td_element.find_element_by_class_name("link")
			
 
				+        click(driver, button, wait_time=2)
			
 
				+        title = td_element.text
			
 
				+        for current_handler in driver.window_handles:
			
 
				+            if current_handler == handler:
			
 
				+                continue
			
 
				+            '''切换到弹出页面'''
			
 
				+            driver.switch_to.window(current_handler)
			
 
				+            current_url = driver.current_url
			
 
				+            '''首次进入详情页,检查页面弹框和验证码面板'''
			
 
				+            check_page(driver)
			
 
				+            company = {}
			
 
				+
			
 
				+            '''企业基础数据'''
			
 
				+            element = html2element(driver.page_source)
			
 
				+            nodes = element.xpath('//div[@class="detaile-header__info--table"]')
			
 
				+            for node in nodes:
			
 
				+                credit_no = "".join(node.xpath('./div[1]/div[1]/div[2]/text()')).strip()
			
 
				+                legal_person = "".join(node.xpath('./div[1]/div[2]/div[2]/text()')).strip()
			
 
				+                company_type = "".join(node.xpath('./div[2]/div[1]/div[2]/text()')).strip()
			
 
				+                address = "".join(node.xpath('./div[2]/div[2]/div[2]/text()')).strip()
			
 
				+                business_address = "".join(node.xpath('./div[3]/div[1]/div[2]/text()')).strip()
			
 
				+                company = {
			
 
				+                    'credit_no': credit_no,  # 统一社会信用代码
			
 
				+                    'legal_person': legal_person,  # 企业法定代表人
			
 
				+                    'company_type': company_type,  # 企业登记注册类型
			
 
				+                    'address': address,  # 企业注册属地
			
 
				+                    'business_address': business_address,  # 企业经营地址
			
 
				+                }
			
 
				+                # logger.info(item)
			
 
				+
			
 
				+            '''企业资质'''
			
 
				+            element = html2element(driver.page_source)
			
 
				+            node = element.xpath('//div[@class="panel-container"]')[0]
			
 
				+            company_quality_html = element2html(node)
			
 
				+            company_quality = extract_content(company_quality_html)
			
 
				+            company['company_quality'] = company_quality
			
 
				+            company['company_quality_html'] = {'html': company_quality_html}
			
 
				+
			
 
				+            '''注册人员'''
			
 
				+            company_staff = driver.find_element_by_id("tab-companyStaff")
			
 
				+            click(driver, company_staff, allow_check_page=True)
			
 
				+            registrar = []
			
 
				+            reg_buttons = driver.find_elements(by=By.XPATH, value='//div[contains(@id, "tab-")]/span')
			
 
				+            for btn in reg_buttons:
			
 
				+                '''点击分类'''
			
 
				+                driver.execute_script("arguments[0].click();", btn)
			
 
				+                element = html2element(driver.page_source)
			
 
				+                nodes = element.xpath('//div[@class="el-table__body-wrapper is-scrolling-none"]/table//tr')
			
 
				+                for node in nodes:
			
 
				+                    name = "".join(node.xpath('./td[2]//span/text()')).strip()
			
 
				+                    id_no = "".join(node.xpath('./td[3]/div/text()')).strip()
			
 
				+                    reg_type = "".join(node.xpath('./td[4]/div/text()')).strip()
			
 
				+                    reg_no = "".join(node.xpath('./td[5]/div/text()')).strip()
			
 
				+                    reg_major = "".join(node.xpath('./td[6]/div/text()')).strip()
			
 
				+                    registrar.append({
			
 
				+                        'name': name,  # 姓名
			
 
				+                        'id_no': id_no,  # 身份证号
			
 
				+                        'reg_type': reg_type,  # 注册类别
			
 
				+                        'reg_no': reg_no,  # 注册号(执业印章号)
			
 
				+                        'reg_major': reg_major,  # 注册专业
			
 
				+                    })
			
 
				+            company['company_staff'] = registrar
			
 
				+
			
 
				+            '''不良行为'''
			
 
				+            bad_behavior = driver.find_element_by_id('tab-badBehavior')
			
 
				+            click(driver, bad_behavior, allow_check_page=True)
			
 
				+            element = html2element(driver.page_source)
			
 
				+            node = element.xpath('//div[@aria-labelledby="tab-badBehavior"]/div')[0]
			
 
				+            bad_behavior_html = element2html(node)
			
 
				+            bad_behaviors = extract_content(company_quality_html)
			
 
				+            company['bad_behavior'] = bad_behaviors
			
 
				+            company['bad_behavior_html'] = {'html': bad_behavior_html}
			
 
				+
			
 
				+            '''黑名单记录'''
			
 
				+            black_list = driver.find_element_by_id('tab-blackList')
			
 
				+            click(driver, black_list, allow_check_page=True)
			
 
				+            element = html2element(driver.page_source)
			
 
				+            node = element.xpath('//div[@id="pane-blackList"]/div')[0]
			
 
				+            black_list_html = element2html(node)
			
 
				+            black_list_array = extract_content(company_quality_html)
			
 
				+            company['black_list'] = black_list_array
			
 
				+            company['black_list_html'] = {'html': black_list_html}
			
 
				+
			
 
				+            '''失信联合惩戒记录'''
			
 
				+            punish = driver.find_element_by_id('tab-punishLog')
			
 
				+            click(driver, punish, allow_check_page=True)
			
 
				+            element = html2element(driver.page_source)
			
 
				+            node = element.xpath('//div[@id="pane-punishLog"]/div')[0]
			
 
				+            punish_html = element2html(node)
			
 
				+            punish_array = extract_content(company_quality_html)
			
 
				+            company['punish'] = punish_array
			
 
				+            company['punish_html'] = {'html': punish_html}
			
 
				+
			
 
				+            '''保存企业数据'''
			
 
				+            if len(company['credit_no']) > 0:
			
 
				+                company_tab.insert_one(company)
			
 
				+                logger.info(f'>>> {title} - {current_url} - 采集成功 - 保存入库')
			
 
				+            else:
			
 
				+                exception_count += 1  # 页面无企业数据
			
 
				+                logger.info(f'>>> {title} - {current_url} - 采集失败 - 无社会信用代码')
			
 
				+
			
 
				+            '''关闭详情页标签'''
			
 
				+            driver.close()
			
 
				+            '''返回列表页'''
			
 
				+            driver.switch_to.window(handler)
			
 
				+    else:
			
 
				+        return True
			
 
				+
			
 
				+
			
 
				+def downloader(driver: Chrome, handler):
			
 
				+    while True:
			
 
				+        logger.info(f">>> 第{current_page(driver.page_source)}页<<<")
			
 
				+        allow_crawl = crawl_spider(driver, handler)
			
 
				+        '''是否继续采集'''
			
 
				+        if not allow_crawl:
			
 
				+            logger.info("网站数据异常,终止采集")
			
 
				+            return False
			
 
				+        '''翻页'''
			
 
				+        if not next_page(driver):
			
 
				+            logger.info('采集结束')
			
 
				+            break
			
 
				+    return True
			
 
				+
			
 
				+
			
 
				+def start(enable_remote_driver=False):
			
 
				+    options = webdriver.ChromeOptions()
			
 
				+    if enable_remote_driver:
			
 
				+        options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
			
 
				+    options.add_argument("--disable-gpu")
			
 
				+    chrome_driver = webdriver.Chrome(options=options)
			
 
				+    main_handler = chrome_driver.current_window_handle  # 获取操作句柄
			
 
				+    chrome_driver.get(CRAWL_SITE)
			
 
				+    time.sleep(3)
			
 
				+    '''采集记录'''
			
 
				+    records = ['全部', '勘察企业', '监理企业', '设计与施工一体化企业', '建筑业企业']
			
 
				+    # records = ['全部']
			
 
				+    while True:
			
 
				+        '''选择资质类别'''
			
 
				+        crawl_finished = select_qualify_category(chrome_driver, records)
			
 
				+        if crawl_finished:
			
 
				+            logger.info('任务结束')
			
 
				+            break
			
 
				+        '''下载数据'''
			
 
				+        _continue = downloader(chrome_driver, main_handler)
			
 
				+        if not _continue:
			
 
				+            break
			
 
				+
			
 
				+    if not enable_remote_driver:
			
 
				+        chrome_driver.quit()
			
 
				+
			
 
				+
			
 
# Script entry point: run the crawl attached to an already running Chrome
# (debugger address 127.0.0.1:9222), which is left open when the crawl ends.
if __name__ == '__main__':
    start(enable_remote_driver=True)
			
--- a/jzsc/utils/__init__.py
+++ b/jzsc/utils/__init__.py
--- a/jzsc/utils/databases.py
+++ b/jzsc/utils/databases.py
@@ -0,0 +1,109 @@
 
				+import bson
			
 
				+import pymongo
			
 
				+import redis
			
 
				+import requests
			
 
				+from elasticsearch import Elasticsearch
			
 
				+
			
 
				+from config.load import mongo_conf, redis_conf, es_conf, analyze_url
			
 
				+
			
 
				+
			
 
				+# ---------------------------------- mongo ----------------------------------
			
 
				+def mongo_client(cfg=None):
			
 
				+    if cfg is None:
			
 
				+        cfg = mongo_conf
			
 
				+    return pymongo.MongoClient(host=cfg['host'], port=cfg['port'])
			
 
				+
			
 
				+
			
 
				+def mongo_database(db: str):
			
 
				+    client = mongo_client()
			
 
				+    return client[db]
			
 
				+
			
 
				+
			
 
				+def mongo_table(db: str, coll: str):
			
 
				+    client = mongo_client()
			
 
				+    return client[db][coll]
			
 
				+
			
 
				+
			
 
				+def int2long(param: int):
			
 
				+    """int 转换成 long """
			
 
				+    return bson.int64.Int64(param)
			
 
				+
			
 
				+
			
 
				+def object_id(_id: str):
			
 
				+    return bson.objectid.ObjectId(_id)
			
 
				+
			
 
				+
			
 
				+# ---------------------------------- es ----------------------------------
			
 
				+def es_client(cfg=None):
			
 
				+    if cfg is None:
			
 
				+        cfg = es_conf
			
 
				+    return Elasticsearch([{"host": cfg['host'], "port": cfg['port']}])
			
 
				+
			
 
				+
			
 
				+def es_participles_service(text: str):
			
 
				+    """
			
 
				+    获取文本的分词列表
			
 
				+
			
 
				+    :param text: 需要分词的文本
			
 
				+    :return: 分词列表
			
 
				+    """
			
 
				+    result = []
			
 
				+    params = {"text": text, "analyzer": "ik_smart"}
			
 
				+    res = requests.get(analyze_url, params=params, timeout=60)
			
 
				+    if res.status_code == 200:
			
 
				+        tokens = res.json().get('tokens', [])
			
 
				+        for x in tokens:
			
 
				+            if x["token"].encode('utf-8').isalpha():
			
 
				+                continue
			
 
				+            result.append(x["token"])
			
 
				+    return result
			
 
				+
			
 
				+
			
 
				+def es_query(title: str, publish_time: int):
			
 
				+    """
			
 
				+    查询es
			
 
				+
			
 
				+    :param title: 标题
			
 
				+    :param publish_time: 发布时间
			
 
				+    :return:
			
 
				+    """
			
 
				+    client = es_client()
			
 
				+    stime = publish_time - 432000  # 往前推5天
			
 
				+    etime = publish_time + 432000
			
 
				+    conditions = []
			
 
				+    participles = es_participles_service(title)
			
 
				+    for word in participles:
			
 
				+        conditions.append({
			
 
				+            "multi_match": {
			
 
				+                "query": word,
			
 
				+                "type": "phrase",
			
 
				+                "fields": ["title"]
			
 
				+            }
			
 
				+        })
			
 
				+    conditions.append({
			
 
				+        "range": {"publishtime": {"from": stime, "to": etime}}
			
 
				+    })
			
 
				+    query = {
			
 
				+        "query": {
			
 
				+            "bool": {
			
 
				+                "must": conditions,
			
 
				+                "minimum_should_match": 1
			
 
				+            }
			
 
				+        }
			
 
				+    }
			
 
				+    result = client.search(index='bidding', body=query, request_timeout=100)
			
 
				+    count = len(result['hits']['hits'])
			
 
				+    return count
			
 
				+
			
 
				+
			
 
				+# ---------------------------------- redis ----------------------------------
			
 
				+def redis_client(cfg=None):
			
 
				+    if cfg is None:
			
 
				+        cfg = redis_conf
			
 
				+    pool = redis.ConnectionPool(
			
 
				+        host=cfg['host'],
			
 
				+        port=cfg['port'],
			
 
				+        password=cfg['pwd'],
			
 
				+        db=cfg['db']
			
 
				+    )
			
 
				+    return redis.Redis(connection_pool=pool, decode_responses=True)
			
--- a/jzsc/utils/execptions.py
+++ b/jzsc/utils/execptions.py
@@ -0,0 +1,49 @@
 
				+
			
 
				+class JyBasicException(Exception):
			
 
				+
			
 
				+    def __init__(self, code: int, reason: str, **kwargs):
			
 
				+        self.code = code
			
 
				+        self.reason = reason
			
 
				+        self.err_details = kwargs
			
 
				+        for key, val in kwargs.items():
			
 
				+            setattr(self, key, val)
			
 
				+
			
 
				+
			
 
				+class CustomAccountPrivilegeError(JyBasicException):
			
 
				+
			
 
				+    def __init__(self, code: int = 10001, reason: str = '账号权限登录异常', **kwargs):
			
 
				+        self.code = code
			
 
				+        self.reason = reason
			
 
				+        self.err_details = kwargs
			
 
				+        for key, val in kwargs.items():
			
 
				+            setattr(self, key, val)
			
 
				+
			
 
				+
			
 
				+class CustomCheckError(JyBasicException):
			
 
				+
			
 
				+    def __init__(self, code: int = 10002, reason: str = '特征条件检查异常', **kwargs):
			
 
				+        self.code = code
			
 
				+        self.reason = reason
			
 
				+        self.err_details = kwargs
			
 
				+        for key, val in kwargs.items():
			
 
				+            setattr(self, key, val)
			
 
				+
			
 
				+
			
 
				+class VoidCrawlError(JyBasicException):
			
 
				+
			
 
				+    def __init__(self, code: int = 10003, reason: str = '空页面采集错误', **kwargs):
			
 
				+        self.code = code
			
 
				+        self.reason = reason
			
 
				+        self.err_details = kwargs
			
 
				+        for key, val in kwargs.items():
			
 
				+            setattr(self, key, val)
			
 
				+
			
 
				+
			
 
				+class AttachmentNullError(JyBasicException):
			
 
				+
			
 
				+    def __init__(self, code: int = 10004, reason: str = '附件下载异常', **kwargs):
			
 
				+        self.code = code
			
 
				+        self.reason = reason
			
 
				+        self.err_details = kwargs
			
 
				+        for key, val in kwargs.items():
			
 
				+            setattr(self, key, val)
			
--- a/jzsc/utils/log.py
+++ b/jzsc/utils/log.py
@@ -0,0 +1,14 @@
 
				+from pathlib import Path
			
 
				+
			
 
				+from loguru import logger
			
 
				+
			
 
				+_absolute = Path(__file__).absolute().parent.parent
			
 
				+_log_path = (_absolute / 'logs/crawl-{time:YYYY-MM-DD}.log').resolve()
			
 
				+logger.add(
			
 
				+    _log_path,
			
 
				+    format='{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}',
			
 
				+    level='INFO',
			
 
				+    rotation='00:00',
			
 
				+    retention='1 week',
			
 
				+    encoding='utf-8',
			
 
				+)
			
--- a/jzsc/utils/socks5.py
+++ b/jzsc/utils/socks5.py
@@ -0,0 +1,153 @@
 
				+import threading
			
 
				+import time
			
 
				+from collections import deque
			
 
				+from urllib.parse import urlparse
			
 
				+
			
 
				+import requests
			
 
				+
			
 
				+from config.load import jy_proxy, headers
			
 
				+from utils.log import logger
			
 
				+
			
 
				+__all__ = ['Proxy']
			
 
				+
			
 
				+
			
 
				+def decrypt(input_str: str) -> str:
			
 
				+    """
			
 
				+    定义base64解密函数
			
 
				+
			
 
				+    :param input_str:
			
 
				+    :return:
			
 
				+    """
			
 
				+    # 对前面不是“=”的字节取索引，然后转换为2进制
			
 
				+    key = jy_proxy['socks5']['decrypt']
			
 
				+    ascii_list = ['{:0>6}'.format(str(bin(key.index(i))).replace('0b', '')) for i in input_str if i != '=']
			
 
				+    output_str = ''
			
 
				+    # 补齐“=”的个数
			
 
				+    equal_num = input_str.count('=')
			
 
				+    while ascii_list:
			
 
				+        temp_list = ascii_list[:4]
			
 
				+        # 转换成2进制字符串
			
 
				+        temp_str = ''.join(temp_list)
			
 
				+        # 对没有8位2进制的字符串补够8位2进制
			
 
				+        if len(temp_str) % 8 != 0:
			
 
				+            temp_str = temp_str[0:-1 * equal_num * 2]
			
 
				+        # 4个6字节的二进制  转换  为三个8字节的二进制
			
 
				+        temp_str_list = [temp_str[x:x + 8] for x in [0, 8, 16]]
			
 
				+        # 二进制转为10进制
			
 
				+        temp_str_list = [int(x, 2) for x in temp_str_list if x]
			
 
				+        # 连接成字符串
			
 
				+        output_str += ''.join([chr(x) for x in temp_str_list])
			
 
				+        ascii_list = ascii_list[4:]
			
 
				+    return output_str
			
 
				+
			
 
				+
			
 
				+class Socks5Proxy:
			
 
				+
			
 
				+    __instance = None
			
 
				+
			
 
				+    def __new__(cls, *args, **kwargs):
			
 
				+        if cls.__instance is None:
			
 
				+            cls.__instance = super().__new__(cls)
			
 
				+        return cls.__instance
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        self.seconds = 60
			
 
				+        self._lock = threading.RLock()
			
 
				+        self._url = jy_proxy['socks5']['url']
			
 
				+        self._dq = deque([])
			
 
				+        self._proxies = {}
			
 
				+        self._pool = []
			
 
				+        self._counter = {}
			
 
				+
			
 
				+    def _init(self):
			
 
				+        while not self._proxies:
			
 
				+            if len(self._dq) > 0:
			
 
				+                '''队列左边取值'''
			
 
				+                self._proxies = self._dq.popleft()
			
 
				+                '''添加到队尾'''
			
 
				+                self._dq.append(self._proxies)
			
 
				+            else:
			
 
				+                self.__request_service()
			
 
				+                self.__check_proxies()
			
 
				+
			
 
				+    @property
			
 
				+    def proxies(self):
			
 
				+        with self._lock:
			
 
				+            return self._proxies if len(self._proxies) > 0 else None
			
 
				+
			
 
				+    def switch(self, reset=False):
			
 
				+        with self._lock:
			
 
				+            if reset is True:
			
 
				+                self.__flush_proxy_pool()
			
 
				+            elif len(self._counter) > 0:
			
 
				+                end_time = self._counter[self.get_netloc(self._proxies)]
			
 
				+                current_time = int(time.time())
			
 
				+                if end_time - current_time < self.seconds:
			
 
				+                    logger.info(f"[移除socks5代理]{self.get_netloc(self._proxies)}")
			
 
				+                    self._dq.remove(self._proxies)
			
 
				+                    del self._counter[self.get_netloc(self._proxies)]
			
 
				+                    logger.info(f"[socks5代理]剩余 {len(self._dq)} 个")
			
 
				+
			
 
				+            self._proxies = {}  # 重置代理
			
 
				+            while len(self._proxies) == 0:
			
 
				+                if len(self._dq) > 0:
			
 
				+                    self._proxies = self._dq.popleft()
			
 
				+                    self._dq.append(self._proxies)
			
 
				+                else:
			
 
				+                    self.__flush_proxy_pool()
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def get_netloc(item: dict):
			
 
				+        parser = urlparse(item.get('http'))
			
 
				+        return parser.netloc
			
 
				+
			
 
				+    def __request_service(self):
			
 
				+        try:
			
 
				+            response = requests.get(self._url, timeout=10)
			
 
				+            self.__extract_ip(response)
			
 
				+        except requests.RequestException:
			
 
				+            pass
			
 
				+
			
 
				+    def __extract_ip(self, response):
			
 
				+        for proxy in response.json():
			
 
				+            host = decrypt(proxy['host'])
			
 
				+            port = int(proxy['port'])
			
 
				+            end_time = proxy['EndTime']
			
 
				+            items = {
			
 
				+                'http': 'socks5://{}:{}'.format(host, port),
			
 
				+                'https': 'socks5://{}:{}'.format(host, port)
			
 
				+            }
			
 
				+            self._pool.append(items)
			
 
				+            self._counter.setdefault(self.get_netloc(items), end_time)
			
 
				+
			
 
				+    def __check_proxies(self):
			
 
				+        check_ip = 'https://myip.ipip.net'
			
 
				+        logger.info(f"[socks5代理检验]访问地址-{check_ip}")
			
 
				+        for proxies in self._pool:
			
 
				+            try:
			
 
				+                requests_param = {
			
 
				+                    "headers": headers,
			
 
				+                    "proxies": proxies,
			
 
				+                    "timeout": 2
			
 
				+                }
			
 
				+                requests.get(check_ip, **requests_param)
			
 
				+                self._dq.append(proxies)
			
 
				+            except requests.RequestException:
			
 
				+                del self._counter[self.get_netloc(proxies)]
			
 
				+
			
 
				+    def __flush_proxy_pool(self):
			
 
				+        logger.info(f"[socks5代理]刷新代理池")
			
 
				+        self._pool.clear()
			
 
				+        self._dq.clear()
			
 
				+        self._counter.clear()
			
 
				+        self.__request_service()
			
 
				+        self.__check_proxies()
			
 
				+
			
 
				+    def __call__(self, enable_proxy: bool = False, *args, **kwargs):
			
 
				+        if enable_proxy:
			
 
				+            logger.info("[加载socks5代理]")
			
 
				+            self._init()
			
 
				+        return self
			
 
				+
			
 
				+
			
 
# Shared module-level instance; this is the name listed in __all__.
Proxy = Socks5Proxy()
			
--- a/jzsc/utils/tools.py
+++ b/jzsc/utils/tools.py
@@ -0,0 +1,11 @@
 
				+import socket
			
 
				+
			
 
				+
			
 
				+def get_host_ip():
			
 
				+    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
			
 
				+    try:
			
 
				+        s.connect(('8.8.8.8', 80))
			
 
				+        ip = s.getsockname()[0]
			
 
				+    finally:
			
 
				+        s.close()
			
 
				+    return ip