|
@@ -0,0 +1,393 @@
|
|
|
+import re
|
|
|
+import time
|
|
|
+from urllib.parse import urljoin
|
|
|
+
|
|
|
+import lxml.etree
|
|
|
+import requests
|
|
|
+from lxml.html import fromstring
|
|
|
+
|
|
|
+from utils.databases import mongo_table, int2long
|
|
|
+
|
|
|
# Collection handles, one per administrative level plus the merged output
# table. NOTE(review): the arguments are presumably (database, collection);
# confirm against utils.databases.mongo_table.
(
    province_tab,
    city_tab,
    district_tab,
    town_tab,
    village_tab,
    address_tab,
) = (
    mongo_table('address', collection_name)
    for collection_name in (
        'province',
        'city',
        'district',
        'town',
        'village',
        'new_address_2021',
    )
)
|
|
|
+
|
|
|
+
|
|
|
def page_source(url, headers=None, cookies=None, **kwargs):
    """Fetch ``url`` with ``requests.get`` and return the response.

    Args:
        url: Address to request.
        headers: Request headers; a browser-like default set is used when
            None.
        cookies: Request cookies; a fixed default cookie is used when None.
        **kwargs: Extra keyword arguments forwarded to ``requests.get``
            (for example ``proxies`` or ``timeout``). Unless overridden,
            ``timeout`` is 60, ``allow_redirects`` is False and ``proxies``
            is None.

    Returns:
        The ``requests.Response`` with ``encoding`` set to its
        ``apparent_encoding``.
    """
    if headers is None:
        headers = {
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"
        }
    if cookies is None:
        cookies = {
            "SF_cookie_1": "37059734"
        }
    # Start from the caller's keyword arguments so they are honoured instead
    # of being dropped; the defaults below only fill in what is missing.
    request_params = dict(kwargs)
    request_params['headers'] = headers
    request_params['cookies'] = cookies
    request_params.setdefault('timeout', 60)
    request_params.setdefault('allow_redirects', False)
    request_params.setdefault('proxies', None)
    response = requests.get(url, **request_params)
    # Use the encoding detected from the body rather than the declared one.
    response.encoding = response.apparent_encoding
    return response
|
|
|
+
|
|
|
+
|
|
|
def html2element(html):
    """Parse an HTML string with ``lxml.html.fromstring`` and return the result."""
    return fromstring(html)
|
|
|
+
|
|
|
+
|
|
|
def province():
    """Download the 2021 province index page and store one document per province.

    Each document inserted into ``province_tab`` has the form::

        {
            "province_code": 11,
            "province": "<province name>",
            "province_url": "<absolute URL of the province page>"
        }

    ``province_code`` is the leading run of digits of the link target.
    """
    url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/index.html"
    response = page_source(url)

    element = html2element(response.text)
    cells = element.xpath('//table[@width="100%"]//tr[position()>3]/td')
    items = []
    for td in cells:
        name = ''.join(td.xpath('./a/text()')).strip()
        href = ''.join(td.xpath('./a/@href')).strip()
        matched = re.match(r'\d+', href)
        if matched is None:
            # Cell without a province link (no href, or href not starting
            # with the numeric code): nothing to record.
            continue
        province_code = matched.group()
        province_url = urljoin(url, href)
        print(name, province_code, province_url)
        items.append({
            'province_code': int(province_code),
            'province': name,
            'province_url': province_url
        })
    # insert_many is only called with a non-empty batch (presumably a pymongo
    # collection, which rejects an empty document list).
    if items:
        province_tab.insert_many(items)
        print('[省级]下载完成')
|
|
|
+
|
|
|
+
|
|
|
def city():
    """Download every province page and store one document per city.

    Reads all documents from ``province_tab`` and, for each, parses the
    ``citytable`` rows of the page at ``province_url``. Each document
    inserted into ``city_tab`` has the form::

        {
            "province_code": 11,
            "province": "<province name>",
            "city": "<city name>",
            "city_code": 1101,
            "city_url": "<absolute URL of the city page>"
        }

    ``city_code`` is the first four characters of the code shown in the
    first column.
    """
    with province_tab.find() as cursor:
        for item in cursor:
            url = item['province_url']
            response = page_source(url)
            element = html2element(response.text)
            rows = element.xpath('//table[@class="citytable"]//tr[position()>1]')
            city_item = []
            for tr in rows:
                city_code = ''.join(tr.xpath('./td[1]/a/text()')).strip()[0:4]
                name = ''.join(tr.xpath('./td[2]/a/text()')).strip()
                href = ''.join(tr.xpath('./td[2]/a/@href')).strip()
                city_url = urljoin(url, href)
                city_item.append({
                    'province_code': item['province_code'],
                    'province': item['province'],
                    'city': name,
                    'city_code': int(city_code),
                    'city_url': city_url
                })
            # Same guard as district(): only insert and report a non-empty
            # batch, so a page without city rows does not abort the crawl.
            if city_item:
                city_tab.insert_many(city_item)
                print(f'[市级]{item["province"]}下载完成')
|
|
|
+
|
|
|
+
|
|
|
def district():
    """Download every city page and store its county-level rows.

    Reads all documents from ``city_tab`` and parses the ``countytable``
    rows of the page at ``city_url``. Two row kinds are handled:

    * ``countytr`` rows become district documents::

          {
              "province_code": 12,
              "province": "<province name>",
              "city": "<city name>",
              "city_code": 1201,
              "district": "<district name>",
              "district_code": 120115,
              "district_url": "<absolute URL>"   # only when the row is linked
          }

    * ``towntr`` rows (per the original author's note, these appear when the
      city is itself a county-level city) become documents that also carry
      ``town``, ``town_code`` and ``town_url``; their ``district`` is the
      city name and ``district_code`` is the first six characters of the
      town code.

    Both kinds are inserted into ``district_tab``. A page that fails to
    parse (``lxml.etree.ParserError``) is re-fetched after one second.

    Raises:
        RuntimeError: if a row has a class other than ``countytr`` or
            ``towntr``.
    """
    with city_tab.find() as cursor:
        for item in cursor:
            url = item['city_url']
            # Fields every row inherits from the parent city document.
            parent = {
                'province_code': item['province_code'],
                'province': item['province'],
                'city': item['city'],
                'city_code': item['city_code'],
            }
            while True:
                response = page_source(url)
                try:
                    element = html2element(response.text)
                    rows = element.xpath('//table[@class="countytable"]//tr[position()>1]')
                    district_item = []
                    district_level_item = []
                    for tr in rows:
                        attrib = tr.attrib.get('class')
                        href = ''.join(tr.xpath('./td[2]/a/@href')).strip()
                        if attrib == 'countytr':
                            # Linked rows keep their text inside <a>; rows
                            # without a link keep it directly in the <td>.
                            text_path = 'a/text()' if href else 'text()'
                            district_code = ''.join(tr.xpath(f'./td[1]/{text_path}')).strip()[0:6]
                            name = ''.join(tr.xpath(f'./td[2]/{text_path}')).strip()
                            record = {
                                **parent,
                                'district': name,
                                'district_code': int(district_code),
                            }
                            if href:
                                record['district_url'] = urljoin(url, href)
                            district_item.append(record)
                        elif attrib == 'towntr':
                            # The city page lists towns directly, so the city
                            # stands in for the district level.
                            town_code = ''.join(tr.xpath('./td[1]/a/text()')).strip()[0:9]
                            town_name = ''.join(tr.xpath('./td[2]/a/text()')).strip()
                            district_level_item.append({
                                **parent,
                                'district': item['city'],
                                'district_code': int(town_code[0:6]),
                                'town': town_name,
                                'town_code': int(town_code),
                                'town_url': urljoin(url, href)
                            })
                        else:
                            # A bare ``raise`` here had no active exception
                            # and produced an uninformative RuntimeError;
                            # keep the type but say what went wrong.
                            raise RuntimeError(
                                f'unexpected row class {attrib!r} in {url}'
                            )
                    break
                except lxml.etree.ParserError:
                    print(f'[县级]{item["province"]}{item["city"]}下载超时,重新获取')
                    time.sleep(1)
            if district_item:
                district_tab.insert_many(district_item)
                print(f'[县级]{item["province"]}{item["city"]}下载完成')
            if district_level_item:
                district_tab.insert_many(district_level_item)
                print(f'[县级市]{item["province"]}{item["city"]}下载完成')
            # Pause between city pages.
            time.sleep(0.5)
|
|
|
+
|
|
|
+
|
|
|
def town():
    """Fill ``town_tab`` from the documents in ``district_tab``.

    Two passes are made:

    1. District documents that already carry a ``town`` field are copied
       into ``town_tab`` unchanged.
    2. For district documents without ``town`` but with ``district_url``,
       the ``towntable`` rows of that page are parsed and inserted as::

           {
               "province_code": 11,
               "province": "<province name>",
               "city": "<city name>",
               "city_code": 1101,
               "district": "<district name>",
               "district_code": 110108,
               "town": "<town name>",
               "town_code": 110108030,
               "town_url": "<absolute URL of the town page>"
           }

    A page that fails to parse (``lxml.etree.ParserError``) is re-fetched
    after one second.
    """
    query = {"town": {"$exists": True}}
    with district_tab.find(query) as cursor:
        for item in cursor:
            town_tab.insert_one(item)
            print(f'[街道]{item["province"]}{item["city"]}{item["district"]}下载完成')

    query = {"town": {"$exists": False}, "district_url": {"$exists": True}}
    with district_tab.find(query) as cursor:
        for item in cursor:
            url = item['district_url']
            while True:
                response = page_source(url)
                try:
                    element = html2element(response.text)
                    rows = element.xpath('//table[@class="towntable"]//tr[position()>1]')
                    town_item = []
                    for tr in rows:
                        href = ''.join(tr.xpath('./td[2]/a/@href')).strip()
                        town_code = ''.join(tr.xpath('./td[1]/a/text()')).strip()[0:9]
                        name = ''.join(tr.xpath('./td[2]/a/text()')).strip()
                        town_url = urljoin(url, href)
                        town_item.append({
                            'province_code': item['province_code'],
                            'province': item['province'],
                            'city': item['city'],
                            'city_code': item['city_code'],
                            'district': item['district'],
                            'district_code': item['district_code'],
                            'town': name,
                            'town_code': int(town_code),
                            'town_url': town_url,
                        })
                    break
                except lxml.etree.ParserError:
                    print(f'[街道]{item["province"]}{item["city"]}{item["district"]}下载超时,重新获取')
                    time.sleep(1)
            # Same guard as district(): only insert and report a non-empty
            # batch, so a page without town rows does not abort the crawl.
            if town_item:
                town_tab.insert_many(town_item)
                print(f'[街道]{item["province"]}{item["city"]}{item["district"]}下载完成')
            # Pause between district pages.
            time.sleep(0.5)
|
|
|
+
|
|
|
+
|
|
|
def village():
    """Download every town page and store one document per village.

    Reads all documents from ``town_tab`` and parses the ``villagetable``
    rows of the page at ``town_url``. Each document inserted into
    ``village_tab`` has the form::

        {
            "province_code": 11,
            "province": "<province name>",
            "city": "<city name>",
            "city_code": 1101,
            "district": "<district name>",
            "district_code": 110108,
            "town": "<town name>",
            "town_code": 110108001,
            "village": "<village name>",
            "village_code": <int2long of the code in the first column>
        }

    The village name is read from the third column. A request that raises
    ``requests.RequestException`` or a page that fails to parse
    (``lxml.etree.ParserError``) is retried after one second.
    """
    with town_tab.find() as cursor:
        for item in cursor:
            url = item['town_url']
            while True:
                try:
                    response = page_source(url)
                except requests.RequestException:
                    print(f'[行政区话代码]{item["province"]}{item["city"]}{item["district"]}{item["town"]}访问超时,重新获取')
                    time.sleep(1)
                    continue
                try:
                    element = html2element(response.text)
                    rows = element.xpath('//table[@class="villagetable"]//tr[position()>1]')
                    village_item = []
                    for tr in rows:
                        village_code = ''.join(tr.xpath('./td[1]/text()')).strip()
                        name = ''.join(tr.xpath('./td[3]/text()')).strip()
                        village_item.append({
                            'province_code': item['province_code'],
                            'province': item['province'],
                            'city': item['city'],
                            'city_code': item['city_code'],
                            'district': item['district'],
                            'district_code': item['district_code'],
                            'town': item['town'],
                            'town_code': item['town_code'],
                            'village': name,
                            'village_code': int2long(int(village_code))
                        })
                    break
                except lxml.etree.ParserError:
                    print(f'[行政区话代码]{item["province"]}{item["city"]}{item["district"]}{item["town"]}下载超时,重新获取')
                    time.sleep(1)
            if village_item:
                village_tab.insert_many(village_item)
                print(f'[行政区划代码]{item["province"]}{item["city"]}{item["district"]}{item["town"]}下载完成')
            else:
                # No rows were parsed. Report the URL for follow-up and move
                # on; the previous code dropped into breakpoint() here, which
                # would hang an unattended crawl.
                print(url)
            # Pause between town pages.
            time.sleep(0.5)
|
|
|
+
|
|
|
+
|
|
|
def address():
    """Copy every level's documents into ``address_tab``.

    The level collections are read in order (province, city, district,
    town, village). Each level projects its own name/code fields plus all
    fields of the levels above it. The source ``_id`` is removed before the
    document is inserted, and each inserted document is printed.
    """
    # (label, source collection, fields this level adds to the projection)
    levels = (
        ('province', province_tab, ('province_code', 'province')),
        ('city', city_tab, ('city', 'city_code')),
        ('district', district_tab, ('district', 'district_code')),
        ('town', town_tab, ('town', 'town_code')),
        ('village', village_tab, ('village', 'village_code')),
    )
    selected_fields = []
    for key, tab, level_fields in levels:
        # The projection grows level by level: ancestors first, then own fields.
        selected_fields.extend(level_fields)
        projection = dict.fromkeys(selected_fields, 1)
        with tab.find({}, projection) as cursor:
            for item in cursor:
                item.pop('_id')
                address_tab.insert_one(item)
                print(f'{key} >> {item}')
|
|
|
+
|
|
|
+
|
|
|
def main():
    """Run every crawl stage in order, then merge the results.

    Order matters: each stage reads the collection written by the one
    before it, and ``address`` reads all of them.
    """
    stages = (province, city, district, town, village, address)
    for stage in stages:
        stage()


if __name__ == '__main__':
    main()
|