dongzhaorui@topnet.net.cn 3 سال پیش
والد
کامیت
5dffcfcd4b

+ 0 - 0
geocode/config/__init__.py


+ 32 - 0
geocode/config/conf.yaml

@@ -0,0 +1,32 @@
+# mongo
+mongo:
+#  host: 172.17.4.87
+#  port: !!int 27080
+  host: 127.0.0.1
+  port: !!int 27017
+
+
+# redis
+redis:
+  host: 127.0.0.1
+  port: !!int 6379
+  pwd: ""
+  db: !!int 10
+
+
+# es
+es:
+  host: 172.17.145.170
+#  host: 127.0.0.1
+#  host: 192.168.3.206
+  port: !!int 9800
+  db: bidding_all
+
+
+# 阿里oss
+ali_oss:
+  key_id: LTAI4G5x9aoZx8dDamQ7vfZi
+  key_secret: Bk98FsbPYXcJe72n1bG3Ssf73acuNh
+#  endpoint: oss-cn-beijing.aliyuncs.com    # 公网使用
+  endpoint: oss-cn-beijing-internal.aliyuncs.com    # 内网使用
+  bucket_name: jy-datafile

+ 13 - 0
geocode/config/constants.yaml

@@ -0,0 +1,13 @@
+headers:
+  User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36
+  Accept: '*/*'
+
+proxy:
+  socks5:
+    url: http://socks.spdata.jianyu360.com/socks/getips?limit=10
+    decrypt: ABNOPqrceQRSTklmUDEFGXYZabnopfghHVWdijstuvwCIJKLMxyz0123456789+/
+
+
+node_module:
+  windows: C:\Users\dell\AppData\Roaming\npm\node_modules
+  linux: /usr/lib/node_modules

+ 35 - 0
geocode/config/load.py

@@ -0,0 +1,35 @@
+import sys
+from pathlib import Path
+
+import yaml
+
+__all__ = [
+    'mongo_conf', 'redis_conf', 'oss_conf', 'es_conf',
+    'constants',
+    'headers', 'jy_proxy', 'node_module',
+    'analyze_url', 'node_module_path'
+]
+
+# Both YAML files are looked up in the directory that contains this module.
+base_path = Path(__file__).parent
+yaml_conf = base_path.joinpath('conf.yaml').resolve()
+yaml_constants = base_path.joinpath('constants.yaml').resolve()
+
+# conf.yaml: one top-level section per external service.
+with open(yaml_conf, encoding="utf-8") as f:
+    conf = yaml.safe_load(f)
+
+mongo_conf = conf['mongo']
+redis_conf = conf['redis']
+es_conf: dict = conf['es']
+oss_conf: dict = conf['ali_oss']
+
+# constants.yaml: request headers, proxy service settings, node module paths.
+with open(yaml_constants, encoding="utf-8") as fp:
+    constants = yaml.safe_load(fp)
+
+headers: dict = constants['headers']
+jy_proxy: dict = constants['proxy']
+node_module: dict = constants['node_module']
+
+# URL of the Elasticsearch ``_analyze`` endpoint built from the ``es`` section.
+analyze_url = f'http://{es_conf["host"]}:{es_conf["port"]}/{es_conf["db"]}/_analyze'
+
+
+# Any platform other than Linux falls back to the Windows path.
+node_module_path = node_module['linux' if sys.platform == 'linux' else 'windows']

+ 393 - 0
geocode/crawl_spider.py

@@ -0,0 +1,393 @@
+import re
+import time
+from urllib.parse import urljoin
+
+import lxml.etree
+import requests
+from lxml.html import fromstring
+
+from utils.databases import mongo_table, int2long
+
+# Collection handles in the ``address`` database, created at import time.
+# One collection per administrative level, plus ``new_address_2021`` which
+# address() fills with the merged records of every level.
+province_tab = mongo_table('address', 'province')
+city_tab = mongo_table('address', 'city')
+district_tab = mongo_table('address', 'district')
+town_tab = mongo_table('address', 'town')
+village_tab = mongo_table('address', 'village')
+address_tab = mongo_table('address', 'new_address_2021')
+
+
+def page_source(url, headers=None, cookies=None, **kwargs):
+    """Fetch ``url`` with ``requests.get`` and return the response.
+
+    :param url: address to download
+    :param headers: request headers; a browser-like default set is used when None
+    :param cookies: request cookies; a fixed default cookie is used when None
+    :param kwargs: extra keyword arguments for ``requests.get``; they override
+        the defaults below (``timeout=60``, ``allow_redirects=False``,
+        ``proxies=None``)
+    :return: the ``requests.Response`` with ``encoding`` set to the detected
+        (apparent) encoding, so ``response.text`` decodes with it
+    """
+    if headers is None:
+        headers = {
+            "Connection": "keep-alive",
+            "Pragma": "no-cache",
+            "Cache-Control": "no-cache",
+            "Upgrade-Insecure-Requests": "1",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"
+        }
+    if cookies is None:
+        cookies = {
+            "SF_cookie_1": "37059734"
+        }
+    request_params = {
+        'headers': headers,
+        'cookies': cookies,
+        'timeout': 60,
+        'allow_redirects': False,
+        'proxies': None,
+    }
+    # Previously every keyword except ``proxies`` was silently dropped; let the
+    # caller's values take precedence over the defaults instead.
+    request_params.update(kwargs)
+    response = requests.get(url, **request_params)
+    response.encoding = response.apparent_encoding
+    return response
+
+
+def html2element(html):
+    """Parse an HTML string with ``lxml.html.fromstring`` and return the result."""
+    return fromstring(html)
+
+
+def province():
+    """Download the province index page and store one document per province.
+
+    Each stored document also carries ``province_url``, which city() follows.
+    Example of a stored document::
+
+        {
+            "_id" : ObjectId("6098cafbb9b8e6b1903a83f4"),
+            "province_code" : NumberInt(11),
+            "province" : "北京市"
+        }
+    """
+    url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/index.html"
+    response = page_source(url)
+
+    element = html2element(response.text)
+    node = element.xpath('//table[@width="100%"]//tr[position()>3]/td')
+    item = []
+    for td in node:
+        name = ''.join(td.xpath('./a/text()')).strip()
+        href = ''.join(td.xpath('./a/@href')).strip()
+        # The province code is the leading digits of the link target.
+        matched = re.match(r'\d+', href)
+        if matched is None:
+            # Cell without a province link (no href): nothing to record.
+            continue
+        province_code = matched.group()
+        province_url = urljoin(url, href)
+        print(name, province_code, province_url)
+        item.append({
+            'province_code': int(province_code),
+            'province': name,
+            'province_url': province_url
+        })
+    province_tab.insert_many(item)
+    print('[省级]下载完成')
+
+
+def city():
+    """Download each province page and store its cities in ``city_tab``.
+
+    Each stored document also carries ``city_url``, which district() follows.
+    Example of a stored document::
+
+        {
+            "_id" : ObjectId("6098cb97b9b8e6b1903a841a"),
+            "province_code" : NumberInt(11),
+            "province" : "北京市",
+            "city" : "市辖区",
+            "city_code" : NumberInt(1101)
+        }
+    """
+    with province_tab.find() as cursor:
+        for item in cursor:
+            url = item['province_url']
+            response = page_source(url)
+            element = html2element(response.text)
+            node = element.xpath('//table[@class="citytable"]//tr[position()>1]')
+            city_item = []
+            for tr in node:
+                # The first four digits of the listed code are the city code.
+                city_code = ''.join(tr.xpath('./td[1]/a/text()')).strip()[0:4]
+                name = ''.join(tr.xpath('./td[2]/a/text()')).strip()
+                href = ''.join(tr.xpath('./td[2]/a/@href')).strip()
+                city_url = urljoin(url, href)
+                city_item.append({
+                    'province_code': item['province_code'],
+                    'province': item['province'],
+                    'city': name,
+                    'city_code': int(city_code),
+                    'city_url': city_url
+                })
+            if not city_item:
+                # insert_many() raises TypeError on an empty list; skip the
+                # province instead of aborting the whole crawl.
+                print(f'[市级]{item["province"]}未解析到数据,跳过')
+                continue
+            city_tab.insert_many(city_item)
+            print(f'[市级]{item["province"]}下载完成')
+
+
+def district():
+    """Download each city page and store its districts/counties in ``district_tab``.
+
+    Rows of class ``countytr`` become district documents (with ``district_url``
+    when the row has a link).  Rows of class ``towntr`` are stored with the
+    city name as the district plus ``town``/``town_code``/``town_url``.
+    Example of a stored document::
+
+        {
+            "_id" : ObjectId("6098cbb8b9b8e6b1903a8593"),
+            "province_code" : NumberInt(12),
+            "province" : "天津市",
+            "city" : "市辖区",
+            "city_code" : NumberInt(1201),
+            "district" : "宝坻区",
+            "district_code" : NumberInt(120115)
+        }
+
+    :raises RuntimeError: when a table row has an unexpected ``class``
+    """
+    with city_tab.find() as cursor:
+        for item in cursor:
+            url = item['city_url']
+            # Fields inherited from the parent city by every stored document.
+            parent = {
+                'province_code': item['province_code'],
+                'province': item['province'],
+                'city': item['city'],
+                'city_code': item['city_code'],
+            }
+            while True:
+                response = page_source(url)
+                try:
+                    element = html2element(response.text)
+                    node = element.xpath('//table[@class="countytable"]//tr[position()>1]')
+                    district_item = []
+                    district_level_item = []
+                    for tr in node:
+                        attrib = tr.attrib.get('class')
+                        href = ''.join(tr.xpath('./td[2]/a/@href')).strip()
+                        # Author's notes on the page layout:
+                        # 1. county-level cities list their sub-districts directly
+                        # 2. a "municipal district" row has no sub-district link
+                        if attrib == 'countytr':
+                            if href:
+                                district_code = ''.join(tr.xpath('./td[1]/a/text()')).strip()[0:6]
+                                name = ''.join(tr.xpath('./td[2]/a/text()')).strip()
+                                district_item.append({
+                                    **parent,
+                                    'district': name,
+                                    'district_code': int(district_code),
+                                    'district_url': urljoin(url, href)
+                                })
+                            else:
+                                # Row without a link: code and name are plain cell text.
+                                district_code = ''.join(tr.xpath('./td[1]/text()')).strip()[0:6]
+                                name = ''.join(tr.xpath('./td[2]/text()')).strip()
+                                district_item.append({
+                                    **parent,
+                                    'district': name,
+                                    'district_code': int(district_code),
+                                })
+                        elif attrib == 'towntr':
+                            # A towntr row on a district/county page means the
+                            # unit is a county-level city; the city name is
+                            # reused as the district name.
+                            code_text = ''.join(tr.xpath('./td[1]/a/text()')).strip()
+                            town_name = ''.join(tr.xpath('./td[2]/a/text()')).strip()
+                            district_level_item.append({
+                                **parent,
+                                'district': item['city'],
+                                'district_code': int(code_text[0:6]),
+                                'town': town_name,
+                                'town_code': int(code_text[0:9]),
+                                'town_url': urljoin(url, href)
+                            })
+                        else:
+                            # The original bare ``raise`` had no active exception
+                            # and produced an uninformative RuntimeError; keep
+                            # the type but say what went wrong.
+                            raise RuntimeError(
+                                f'unexpected row class {attrib!r} on {url}'
+                            )
+                    break
+                except lxml.etree.ParserError:
+                    print(f'[县级]{item["province"]}{item["city"]}下载超时,重新获取')
+                    time.sleep(1)
+            if district_item:
+                district_tab.insert_many(district_item)
+                print(f'[县级]{item["province"]}{item["city"]}下载完成')
+            if district_level_item:
+                district_tab.insert_many(district_level_item)
+                print(f'[县级市]{item["province"]}{item["city"]}下载完成')
+            time.sleep(0.5)
+
+
+def town():
+    """Fill ``town_tab`` from ``district_tab`` and the district pages.
+
+    District documents that already carry a ``town`` field are copied as-is;
+    for the remaining documents that have a ``district_url`` the page is
+    downloaded and one document per table row is stored.
+    Example of a stored document::
+
+        {
+            "_id" : ObjectId("6098cbceb9b8e6b1903a91b4"),
+            "province_code" : NumberInt(11),
+            "province" : "北京市",
+            "city" : "市辖区",
+            "city_code" : NumberInt(1101),
+            "district" : "海淀区",
+            "district_code" : NumberInt(110108),
+            "town" : "上庄地区",
+            "town_code" : NumberInt(110108030)
+        }
+    """
+    query = {"town": {"$exists": True}}
+    with district_tab.find(query) as cursor:
+        for item in cursor:
+            town_tab.insert_one(item)
+            print(f'[街道]{item["province"]}{item["city"]}{item["district"]}下载完成')
+
+    query = {"town": {"$exists": False}, "district_url": {"$exists": True}}
+    with district_tab.find(query) as cursor:
+        for item in cursor:
+            url = item['district_url']
+            while True:
+                response = page_source(url)
+                try:
+                    element = html2element(response.text)
+                    node = element.xpath('//table[@class="towntable"]//tr[position()>1]')
+                    town_item = []
+                    for tr in node:
+                        href = ''.join(tr.xpath('./td[2]/a/@href')).strip()
+                        # The first nine digits of the listed code are the town code.
+                        town_code = ''.join(tr.xpath('./td[1]/a/text()')).strip()[0:9]
+                        name = ''.join(tr.xpath('./td[2]/a/text()')).strip()
+                        town_url = urljoin(url, href)
+                        town_item.append({
+                            'province_code': item['province_code'],
+                            'province': item['province'],
+                            'city': item['city'],
+                            'city_code': item['city_code'],
+                            'district': item['district'],
+                            'district_code': item['district_code'],
+                            'town': name,
+                            'town_code': int(town_code),
+                            'town_url': town_url,
+                        })
+                    break
+                except lxml.etree.ParserError:
+                    print(f'[街道]{item["province"]}{item["city"]}{item["district"]}下载超时,重新获取')
+                    time.sleep(1)
+            if town_item:
+                town_tab.insert_many(town_item)
+                print(f'[街道]{item["province"]}{item["city"]}{item["district"]}下载完成')
+            else:
+                # insert_many() raises TypeError on an empty list; skip the
+                # district instead of aborting the whole crawl.
+                print(f'[街道]{item["province"]}{item["city"]}{item["district"]}未解析到数据,跳过')
+            time.sleep(0.5)
+
+
+def village():
+    """Download each town page and store its villages in ``village_tab``.
+
+    Network errors and empty (unparseable) pages are retried after one second.
+    Example of a stored document::
+
+        {
+            "_id" : ObjectId("6098cc2bb9b8e6b1903b3a38"),
+            "province_code" : NumberInt(11),
+            "province" : "北京市",
+            "city" : "市辖区",
+            "city_code" : NumberInt(1101),
+            "district" : "海淀区",
+            "district_code" : NumberInt(110108),
+            "town" : "万寿路街道",
+            "town_code" : NumberInt(110108001),
+            "village" : "复兴路26号社区居委会",
+            "village_code" : NumberLong(110108001020)
+        }
+    """
+    with town_tab.find() as cursor:
+        for item in cursor:
+            url = item['town_url']
+            while True:
+                try:
+                    response = page_source(url)
+                except requests.RequestException:
+                    print(f'[行政区话代码]{item["province"]}{item["city"]}{item["district"]}{item["town"]}访问超时,重新获取')
+                    time.sleep(1)
+                    continue
+                try:
+                    element = html2element(response.text)
+                    node = element.xpath('//table[@class="villagetable"]//tr[position()>1]')
+                    village_item = []
+                    for tr in node:
+                        village_code = ''.join(tr.xpath('./td[1]/text()')).strip()
+                        name = ''.join(tr.xpath('./td[3]/text()')).strip()
+                        village_item.append({
+                            'province_code': item['province_code'],
+                            'province': item['province'],
+                            'city': item['city'],
+                            'city_code': item['city_code'],
+                            'district': item['district'],
+                            'district_code': item['district_code'],
+                            'town': item['town'],
+                            'town_code': item['town_code'],
+                            'village': name,
+                            # Stored as a 64-bit BSON integer.
+                            'village_code': int2long(int(village_code))
+                        })
+                    break
+                except lxml.etree.ParserError:
+                    print(f'[行政区话代码]{item["province"]}{item["city"]}{item["district"]}{item["town"]}下载超时,重新获取')
+                    time.sleep(1)
+            if not village_item:
+                # insert_many() raises TypeError on an empty list.  Report the
+                # page and move on; the original dropped into breakpoint()
+                # here, which stalls an unattended crawl.
+                print(url)
+                continue
+            village_tab.insert_many(village_item)
+            print(f'[行政区划代码]{item["province"]}{item["city"]}{item["district"]}{item["town"]}下载完成')
+            time.sleep(0.5)
+
+
+def address():
+    """Copy the code/name fields of every level's documents into ``address_tab``.
+
+    Levels are processed from province down to village.  Each level projects
+    its own two fields plus those of all its ancestors; the source ``_id`` is
+    removed before the document is inserted.
+    """
+    # (label, source collection, fields this level adds to its ancestors').
+    levels = (
+        ('province', province_tab, ('province_code', 'province')),
+        ('city', city_tab, ('city', 'city_code')),
+        ('district', district_tab, ('district', 'district_code')),
+        ('town', town_tab, ('town', 'town_code')),
+        ('village', village_tab, ('village', 'village_code')),
+    )
+    inherited = []
+    for key, tab, own_fields in levels:
+        inherited.extend(own_fields)
+        projection = dict.fromkeys(inherited, 1)
+        with tab.find({}, projection) as cursor:
+            for item in cursor:
+                item.pop('_id')
+                address_tab.insert_one(item)
+                print(f'{key} >> {item}')
+
+
+def main():
+    """Run every crawl stage in order, from provinces down to the merged table."""
+    # Each stage reads what the previous one stored, so the order matters.
+    for stage in (province, city, district, town, village, address):
+        stage()
+
+
+if __name__ == '__main__':
+    main()

+ 0 - 0
geocode/utils/__init__.py


+ 109 - 0
geocode/utils/databases.py

@@ -0,0 +1,109 @@
+import bson
+import pymongo
+import redis
+import requests
+from elasticsearch import Elasticsearch
+
+from config.load import mongo_conf, redis_conf, es_conf, analyze_url
+
+
+# ---------------------------------- mongo ----------------------------------
+def mongo_client(cfg=None):
+    """Create a ``pymongo.MongoClient`` from ``cfg`` (``host``/``port`` keys).
+
+    Falls back to the ``mongo`` section of the loaded configuration when
+    ``cfg`` is None.
+    """
+    settings = mongo_conf if cfg is None else cfg
+    return pymongo.MongoClient(host=settings['host'], port=settings['port'])
+
+
+def mongo_database(db: str):
+    """Return database ``db`` from a newly created default client."""
+    return mongo_client()[db]
+
+
+def mongo_table(db: str, coll: str):
+    """Return collection ``coll`` of database ``db`` from a newly created default client."""
+    database = mongo_client()[db]
+    return database[coll]
+
+
+def int2long(param: int):
+    """Wrap ``param`` in ``bson.int64.Int64`` (a BSON 64-bit integer)."""
+    long_value = bson.int64.Int64(param)
+    return long_value
+
+
+def object_id(_id: str):
+    """Build a ``bson.objectid.ObjectId`` from its string form."""
+    oid = bson.objectid.ObjectId(_id)
+    return oid
+
+
+# ---------------------------------- es ----------------------------------
+def es_client(cfg=None):
+    """Create an ``Elasticsearch`` client for one node from ``cfg`` (``host``/``port`` keys).
+
+    Falls back to the ``es`` section of the loaded configuration when ``cfg``
+    is None.
+    """
+    settings = es_conf if cfg is None else cfg
+    node = {"host": settings['host'], "port": settings['port']}
+    return Elasticsearch([node])
+
+
+def es_participles_service(text: str):
+    """
+    Tokenize ``text`` through the Elasticsearch ``_analyze`` endpoint.
+
+    :param text: text to split into tokens
+    :return: list of tokens, without the tokens made only of ASCII letters;
+        an empty list when the service does not answer with HTTP 200
+    """
+    res = requests.get(
+        analyze_url,
+        params={"text": text, "analyzer": "ik_smart"},
+        timeout=60,
+    )
+    if res.status_code != 200:
+        return []
+    tokens = (entry["token"] for entry in res.json().get('tokens', []))
+    # bytes.isalpha() is true only for ASCII letters, so this drops pure
+    # Latin-alphabet tokens and keeps everything else.
+    return [token for token in tokens if not token.encode('utf-8').isalpha()]
+
+
+def es_query(title: str, publish_time: int):
+    """
+    Count the hits returned for documents whose title matches ``title``.
+
+    Every token of ``title`` must phrase-match the ``title`` field, and
+    ``publishtime`` must lie within five days before or after ``publish_time``.
+
+    :param title: title to search for
+    :param publish_time: publish time; presumably a Unix timestamp in seconds,
+        since the window is added as 432000 - TODO confirm
+    :return: number of hits contained in the search response
+    """
+    client = es_client()
+    window = 432000  # five days, in seconds
+    conditions = [
+        {
+            "multi_match": {
+                "query": word,
+                "type": "phrase",
+                "fields": ["title"]
+            }
+        }
+        for word in es_participles_service(title)
+    ]
+    conditions.append({
+        "range": {
+            "publishtime": {
+                "from": publish_time - window,
+                "to": publish_time + window
+            }
+        }
+    })
+    query = {
+        "query": {
+            "bool": {
+                "must": conditions,
+                "minimum_should_match": 1
+            }
+        }
+    }
+    result = client.search(index='bidding', body=query, request_timeout=100)
+    return len(result['hits']['hits'])
+
+
+# ---------------------------------- redis ----------------------------------
+def redis_client(cfg=None):
+    """Create a Redis client backed by its own connection pool.
+
+    :param cfg: mapping with ``host``, ``port``, ``pwd`` and ``db`` keys; the
+        ``redis`` section of the loaded configuration is used when None
+    :return: ``redis.Redis`` client whose responses are decoded to ``str``
+    """
+    if cfg is None:
+        cfg = redis_conf
+    # decode_responses is a connection option: redis.Redis ignores it when an
+    # explicit connection_pool is supplied, so it has to be set on the pool.
+    pool = redis.ConnectionPool(
+        host=cfg['host'],
+        port=cfg['port'],
+        password=cfg['pwd'],
+        db=cfg['db'],
+        decode_responses=True
+    )
+    return redis.Redis(connection_pool=pool)

+ 49 - 0
geocode/utils/execptions.py

@@ -0,0 +1,49 @@
+
+class JyBasicException(Exception):
+    """Base exception carrying a numeric ``code`` and a textual ``reason``.
+
+    Extra keyword arguments are kept in ``err_details`` and also exposed as
+    attributes of the instance.
+    """
+
+    def __init__(self, code: int, reason: str, **kwargs):
+        # Without this call ``args`` stays empty whenever code/reason are
+        # passed by keyword, so str(exc) and tracebacks show no message.
+        super().__init__(code, reason)
+        self.code = code
+        self.reason = reason
+        self.err_details = kwargs
+        for key, val in kwargs.items():
+            setattr(self, key, val)
+
+
+class CustomAccountPrivilegeError(JyBasicException):
+    """Account privilege/login error; defaults to code 10001."""
+
+    def __init__(self, code: int = 10001, reason: str = '账号权限登录异常', **kwargs):
+        # Only the defaults differ from the base class; let it set the fields.
+        super().__init__(code, reason, **kwargs)
+
+
+class CustomCheckError(JyBasicException):
+    """Feature/condition check error; defaults to code 10002."""
+
+    def __init__(self, code: int = 10002, reason: str = '特征条件检查异常', **kwargs):
+        # Only the defaults differ from the base class; let it set the fields.
+        super().__init__(code, reason, **kwargs)
+
+
+class VoidCrawlError(JyBasicException):
+    """Empty-page crawl error; defaults to code 10003."""
+
+    def __init__(self, code: int = 10003, reason: str = '空页面采集错误', **kwargs):
+        # Only the defaults differ from the base class; let it set the fields.
+        super().__init__(code, reason, **kwargs)
+
+
+class AttachmentNullError(JyBasicException):
+    """Attachment download error; defaults to code 10004."""
+
+    def __init__(self, code: int = 10004, reason: str = '附件下载异常', **kwargs):
+        # Only the defaults differ from the base class; let it set the fields.
+        super().__init__(code, reason, **kwargs)

+ 14 - 0
geocode/utils/log.py

@@ -0,0 +1,14 @@
+from pathlib import Path
+
+from loguru import logger
+
+# Directory two levels above this file; log files go to its ``logs`` folder.
+_absolute = Path(__file__).absolute().parents[1]
+_log_path = (_absolute / 'logs' / 'crawl-{time:YYYY-MM-DD}.log').resolve()
+
+# File sink: INFO and above, rotated at midnight, kept for one week.
+logger.add(
+    _log_path,
+    level='INFO',
+    format='{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}',
+    encoding='utf-8',
+    rotation='00:00',
+    retention='1 week',
+)

+ 153 - 0
geocode/utils/socks5.py

@@ -0,0 +1,153 @@
+import threading
+import time
+from collections import deque
+from urllib.parse import urlparse
+
+import requests
+
+from config.load import jy_proxy, headers
+from utils.log import logger
+
+# Only the module-level ``Proxy`` instance is part of the public interface.
+__all__ = ['Proxy']
+
+
+def decrypt(input_str: str, key: str = None) -> str:
+    """
+    Decode base64 text that was encoded with a custom 64-character alphabet.
+
+    :param input_str: encoded text; trailing ``=`` padding may be missing
+    :param key: the 64-character alphabet; the configured
+        ``proxy.socks5.decrypt`` value is used when None
+    :return: decoded text, one character per decoded byte
+    :raises ValueError: when ``input_str`` contains characters outside the
+        alphabet or is not valid base64
+    """
+    import base64
+
+    if key is None:
+        key = jy_proxy['socks5']['decrypt']
+    standard = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'
+    # Map the custom alphabet onto the standard one, then use the stdlib decoder.
+    translated = input_str.translate(str.maketrans(key, standard))
+    # The hand-written decoder returned garbage for unpadded input; restore
+    # the padding so such input decodes correctly.
+    translated += '=' * (-len(translated) % 4)
+    # latin-1 maps each byte to the code point of the same value, matching
+    # the chr()-per-byte output of the original implementation.
+    return base64.b64decode(translated, validate=True).decode('latin-1')
+
+
+class Socks5Proxy:
+    """Singleton that fetches, validates and rotates SOCKS5 proxies.
+
+    Proxies come from the configured service URL, are checked with a test
+    request, and are then handed out round-robin through ``proxies`` and
+    ``switch()``.
+    """
+
+    __instance = None
+
+    def __new__(cls, *args, **kwargs):
+        if cls.__instance is None:
+            cls.__instance = super().__new__(cls)
+        return cls.__instance
+
+    def __init__(self):
+        # __init__ runs on every Socks5Proxy() call even though __new__ hands
+        # back the shared instance; set the state up only once so a second
+        # call does not wipe the pool.
+        if getattr(self, '_ready', False):
+            return
+        self.seconds = 60  # proxies expiring within this many seconds are dropped
+        self._lock = threading.RLock()
+        self._url = jy_proxy['socks5']['url']
+        self._dq = deque([])  # validated proxies, rotated round-robin
+        self._proxies = {}  # proxy currently in use
+        self._pool = []  # proxies as returned by the service
+        self._counter = {}  # netloc -> expiry time reported by the service
+        self._ready = True
+
+    def _init(self):
+        """Pick a first proxy unless one is already selected."""
+        if not self._proxies:
+            self._rotate()
+
+    def _rotate(self):
+        """Select the head of the queue and move it to the tail.
+
+        The pool is refreshed for as long as the queue is empty.
+        """
+        self._proxies = {}
+        while not self._proxies:
+            if self._dq:
+                self._proxies = self._dq.popleft()
+                self._dq.append(self._proxies)
+            else:
+                self.__flush_proxy_pool()
+                if not self._dq:
+                    # Nothing usable came back: wait before asking again
+                    # instead of hammering the service in a tight loop.
+                    time.sleep(1)
+
+    @property
+    def proxies(self):
+        """Current proxy mapping for ``requests``, or None when none is selected."""
+        with self._lock:
+            return self._proxies if len(self._proxies) > 0 else None
+
+    def switch(self, reset=False):
+        """Move on to the next proxy.
+
+        :param reset: when True, discard the whole pool and fetch a new one;
+            otherwise the current proxy is evicted only if it expires within
+            ``self.seconds`` seconds
+        """
+        with self._lock:
+            if reset is True:
+                self.__flush_proxy_pool()
+            elif self._proxies:
+                netloc = self.get_netloc(self._proxies)
+                # The entry may already be gone (failed check, earlier
+                # eviction); a plain lookup raised KeyError in that case.
+                end_time = self._counter.get(netloc)
+                if end_time is not None and end_time - int(time.time()) < self.seconds:
+                    logger.info(f"[移除socks5代理]{netloc}")
+                    try:
+                        self._dq.remove(self._proxies)
+                    except ValueError:
+                        pass  # already removed from the queue
+                    del self._counter[netloc]
+                    logger.info(f"[socks5代理]剩余 {len(self._dq)} 个")
+
+            self._rotate()
+
+    @staticmethod
+    def get_netloc(item: dict):
+        """Return the ``host:port`` part of the ``http`` entry of a proxy mapping."""
+        parser = urlparse(item.get('http'))
+        return parser.netloc
+
+    def __request_service(self):
+        """Ask the proxy service for addresses and add them to the pool."""
+        try:
+            response = requests.get(self._url, timeout=10)
+            self.__extract_ip(response)
+        except (requests.RequestException, ValueError, KeyError) as exc:
+            # Best effort: the caller retries.  Log instead of hiding the
+            # failure (network error, invalid JSON, malformed entry).
+            logger.warning(f"[socks5代理]获取代理失败: {exc!r}")
+
+    def __extract_ip(self, response):
+        """Turn each entry of the service's JSON reply into a proxy mapping."""
+        for proxy in response.json():
+            host = decrypt(proxy['host'])
+            port = int(proxy['port'])
+            end_time = proxy['EndTime']
+            items = {
+                'http': 'socks5://{}:{}'.format(host, port),
+                'https': 'socks5://{}:{}'.format(host, port)
+            }
+            self._pool.append(items)
+            self._counter.setdefault(self.get_netloc(items), end_time)
+
+    def __check_proxies(self):
+        """Queue the pool entries that can complete a test request."""
+        check_ip = 'https://myip.ipip.net'
+        logger.info(f"[socks5代理检验]访问地址-{check_ip}")
+        for proxies in self._pool:
+            try:
+                requests_param = {
+                    "headers": headers,
+                    "proxies": proxies,
+                    "timeout": 2
+                }
+                requests.get(check_ip, **requests_param)
+                self._dq.append(proxies)
+            except requests.RequestException:
+                # pop() instead of del: two pool entries can share a netloc.
+                self._counter.pop(self.get_netloc(proxies), None)
+
+    def __flush_proxy_pool(self):
+        """Drop all known proxies, then fetch and validate a new batch."""
+        logger.info(f"[socks5代理]刷新代理池")
+        self._pool.clear()
+        self._dq.clear()
+        self._counter.clear()
+        self.__request_service()
+        self.__check_proxies()
+
+    def __call__(self, enable_proxy: bool = False, *args, **kwargs):
+        """Return this instance, loading a proxy first when ``enable_proxy`` is true."""
+        if enable_proxy:
+            logger.info("[加载socks5代理]")
+            self._init()
+        return self
+
+
+# Shared instance created at import time; this is the name the module exports.
+Proxy = Socks5Proxy()

+ 11 - 0
geocode/utils/tools.py

@@ -0,0 +1,11 @@
+import socket
+
+
+def get_host_ip():
+    """Return the local address of a UDP socket connected to ``8.8.8.8:80``.
+
+    The socket is closed before returning, including when ``connect`` fails.
+    """
+    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as probe:
+        probe.connect(('8.8.8.8', 80))
+        return probe.getsockname()[0]