"""Scrape the 2021 administrative-division code tables from stats.gov.cn.

Each level (province, city, district, town, village) is downloaded into its
own MongoDB table; ``address()`` then copies every level into one combined
table.  ``main()`` runs the levels in dependency order.
"""
import re
import time
from urllib.parse import urljoin

import lxml.etree
import requests
from lxml.html import fromstring

from utils.databases import mongo_table, int2long

province_tab = mongo_table('address', 'province')
city_tab = mongo_table('address', 'city')
district_tab = mongo_table('address', 'district')
town_tab = mongo_table('address', 'town')
village_tab = mongo_table('address', 'village')
address_tab = mongo_table('address', 'new_address_2021')

# Fields inherited from the parent level, in the order they are stored.
_PROVINCE_FIELDS = ('province_code', 'province')
_CITY_FIELDS = _PROVINCE_FIELDS + ('city', 'city_code')
_DISTRICT_FIELDS = _CITY_FIELDS + ('district', 'district_code')
_TOWN_FIELDS = _DISTRICT_FIELDS + ('town', 'town_code')
_VILLAGE_FIELDS = _TOWN_FIELDS + ('village', 'village_code')


def page_source(url, headers=None, cookies=None, **kwargs):
    """Issue a GET request for *url* and return the ``requests`` response.

    Args:
        url: Address to download.
        headers: Request headers; a browser-like default set is used when None.
        cookies: Request cookies; a fixed default cookie is used when None.
        **kwargs: Only ``proxies`` is read; any other keyword is ignored.

    Returns:
        The response, with ``encoding`` set to the detected
        ``apparent_encoding`` so that ``response.text`` decodes accordingly.

    Raises:
        requests.RequestException: Propagated from ``requests.get``.
    """
    if headers is None:
        headers = {
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"
        }
    if cookies is None:
        cookies = {
            "SF_cookie_1": "37059734"
        }
    request_params = {
        'headers': headers,
        'cookies': cookies,
        'timeout': 60,
        'allow_redirects': False,
        'proxies': kwargs.get('proxies'),
    }
    response = requests.get(url, **request_params)
    response.encoding = response.apparent_encoding
    return response


def html2element(html):
    """Parse an HTML string into an lxml element.

    Raises:
        lxml.etree.ParserError: Raised by ``fromstring`` when the document
            cannot be parsed (the callers treat this as a failed download).
    """
    element = fromstring(html)
    return element


def _text(node, path):
    """Return the joined, stripped string results of XPath *path* on *node*."""
    return ''.join(node.xpath(path)).strip()


def _pick(item, fields):
    """Copy the given *fields* from *item* into a new dict, keeping order."""
    return {field: item[field] for field in fields}


def _insert_all(table, documents):
    """Insert *documents* into *table*; return False (no-op) when empty.

    NOTE(review): assumes ``mongo_table`` returns a PyMongo collection, whose
    ``insert_many`` rejects an empty list with ``TypeError`` -- confirm.
    """
    if not documents:
        return False
    table.insert_many(documents)
    return True


def _fetch_element(url, label):
    """Download and parse *url*, retrying until both steps succeed.

    *label* prefixes the retry messages.  The loop is intentionally unbounded
    (best effort), matching the retry loops this helper replaces.
    """
    while True:
        try:
            response = page_source(url)
        except requests.RequestException:
            print(f'{label}访问超时,重新获取')
            time.sleep(1)
            continue
        try:
            return html2element(response.text)
        except lxml.etree.ParserError:
            print(f'{label}下载超时,重新获取')
            time.sleep(1)


def province():
    """Download the province index page and store one document per province.

    Stored document example::

        {"province_code": 11, "province": "北京市", "province_url": "..."}

    Table cells without a link whose href starts with digits are skipped.
    """
    url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/index.html"
    response = page_source(url)
    element = html2element(response.text)
    cells = element.xpath('//table[@width="100%"]//tr[position()>3]/td')
    documents = []
    for td in cells:
        name = _text(td, './a/text()')
        href = _text(td, './a/@href')
        matched = re.match(r'\d+', href)
        if matched is None:
            # Cell has no province link (e.g. padding); nothing to record.
            continue
        province_code = matched.group()
        province_url = urljoin(url, href)
        print(name, province_code, province_url)
        documents.append({
            'province_code': int(province_code),
            'province': name,
            'province_url': province_url
        })
    if _insert_all(province_tab, documents):
        print('[省级]下载完成')
    else:
        print('[省级]无数据,跳过')


def city():
    """Download the city table of every stored province.

    Stored document example::

        {"province_code": 11, "province": "北京市",
         "city": "市辖区", "city_code": 1101, "city_url": "..."}
    """
    with province_tab.find() as cursor:
        for item in cursor:
            url = item['province_url']
            response = page_source(url)
            element = html2element(response.text)
            rows = element.xpath('//table[@class="citytable"]//tr[position()>1]')
            documents = []
            for tr in rows:
                # The page shows a longer code; the first 4 digits are kept.
                city_code = _text(tr, './td[1]/a/text()')[0:4]
                name = _text(tr, './td[2]/a/text()')
                href = _text(tr, './td[2]/a/@href')
                documents.append({
                    **_pick(item, _PROVINCE_FIELDS),
                    'city': name,
                    'city_code': int(city_code),
                    'city_url': urljoin(url, href)
                })
            if _insert_all(city_tab, documents):
                print(f'[市级]{item["province"]}下载完成')
            else:
                print(f'[市级]{item["province"]}无数据,跳过')


def district():
    """Download the district (county) table of every stored city.

    Stored document example::

        {"province_code": 12, "province": "天津市", "city": "市辖区",
         "city_code": 1201, "district": "宝坻区", "district_code": 120115}

    Rows of class ``countytr`` become district documents (with a
    ``district_url`` only when the row has a link).  Rows of class ``towntr``
    are stored in the same table as town-level documents whose ``district``
    is the city name.

    Raises:
        RuntimeError: A table row has a class other than ``countytr`` or
            ``towntr``.
    """
    with city_tab.find() as cursor:
        for item in cursor:
            url = item['city_url']
            label = f'[县级]{item["province"]}{item["city"]}'
            element = _fetch_element(url, label)
            rows = element.xpath('//table[@class="countytable"]//tr[position()>1]')
            district_item = []
            district_level_item = []
            for tr in rows:
                row_class = tr.attrib.get('class')
                href = _text(tr, './td[2]/a/@href')
                # 1. The subdivisions of a county-level city are sub-districts.
                # 2. A municipal-district row has no sub-district link.
                if row_class == 'countytr':
                    if href:
                        district_code = _text(tr, './td[1]/a/text()')[0:6]
                        name = _text(tr, './td[2]/a/text()')
                        district_item.append({
                            **_pick(item, _CITY_FIELDS),
                            'district': name,
                            'district_code': int(district_code),
                            'district_url': urljoin(url, href)
                        })
                    else:
                        # No link: code and name are plain cell text.
                        district_code = _text(tr, './td[1]/text()')[0:6]
                        name = _text(tr, './td[2]/text()')
                        district_item.append({
                            **_pick(item, _CITY_FIELDS),
                            'district': name,
                            'district_code': int(district_code),
                        })
                elif row_class == 'towntr':
                    # A towntr row on a district/county page means this entry
                    # is a county-level city.
                    code_text = _text(tr, './td[1]/a/text()')
                    town_name = _text(tr, './td[2]/a/text()')
                    district_level_item.append({
                        **_pick(item, _CITY_FIELDS),
                        'district': item['city'],
                        'district_code': int(code_text[0:6]),
                        'town': town_name,
                        'town_code': int(code_text[0:9]),
                        'town_url': urljoin(url, href)
                    })
                else:
                    # The original bare ``raise`` had no active exception and
                    # surfaced as an opaque RuntimeError; keep the type but
                    # say what went wrong.
                    raise RuntimeError(
                        f'unexpected row class {row_class!r} at {url}'
                    )
            if _insert_all(district_tab, district_item):
                print(f'[县级]{item["province"]}{item["city"]}下载完成')
            if _insert_all(district_tab, district_level_item):
                print(f'[县级市]{item["province"]}{item["city"]}下载完成')
            time.sleep(0.5)


def town():
    """Populate the town table from the stored districts.

    Stored document example::

        {"province_code": 11, "province": "北京市", "city": "市辖区",
         "city_code": 1101, "district": "海淀区", "district_code": 110108,
         "town": "上庄地区", "town_code": 110108030}

    District-table documents that already carry a ``town`` field are copied
    as they are; for the others that have a ``district_url`` the town table
    of that page is downloaded.
    """
    query = {"town": {"$exists": True}}
    with district_tab.find(query) as cursor:
        for item in cursor:
            town_tab.insert_one(item)
            print(f'[街道]{item["province"]}{item["city"]}{item["district"]}下载完成')

    query = {"town": {"$exists": False}, "district_url": {"$exists": True}}
    with district_tab.find(query) as cursor:
        for item in cursor:
            url = item['district_url']
            label = f'[街道]{item["province"]}{item["city"]}{item["district"]}'
            element = _fetch_element(url, label)
            rows = element.xpath('//table[@class="towntable"]//tr[position()>1]')
            documents = []
            for tr in rows:
                href = _text(tr, './td[2]/a/@href')
                town_code = _text(tr, './td[1]/a/text()')[0:9]
                name = _text(tr, './td[2]/a/text()')
                documents.append({
                    **_pick(item, _DISTRICT_FIELDS),
                    'town': name,
                    'town_code': int(town_code),
                    'town_url': urljoin(url, href),
                })
            if _insert_all(town_tab, documents):
                print(f'{label}下载完成')
            else:
                print(f'{label}无数据,跳过 {url}')
            time.sleep(0.5)


def village():
    """Download the village table of every stored town.

    Stored document example::

        {"province_code": 11, "province": "北京市", "city": "市辖区",
         "city_code": 1101, "district": "海淀区", "district_code": 110108,
         "town": "万寿路街道", "town_code": 110108001,
         "village": "复兴路26号社区居委会", "village_code": 110108001020}

    ``village_code`` is passed through ``int2long`` before storage.  A page
    without village rows is reported and skipped instead of dropping into the
    debugger.
    """
    with town_tab.find() as cursor:
        for item in cursor:
            url = item['town_url']
            place = f'{item["province"]}{item["city"]}{item["district"]}{item["town"]}'
            element = _fetch_element(url, f'[行政区话代码]{place}')
            rows = element.xpath('//table[@class="villagetable"]//tr[position()>1]')
            documents = []
            for tr in rows:
                village_code = _text(tr, './td[1]/text()')
                name = _text(tr, './td[3]/text()')
                documents.append({
                    **_pick(item, _TOWN_FIELDS),
                    'village': name,
                    'village_code': int2long(int(village_code))
                })
            if _insert_all(village_tab, documents):
                print(f'[行政区划代码]{place}下载完成')
            else:
                print(f'[行政区划代码]{place}无数据,跳过 {url}')
            time.sleep(0.5)


def address():
    """Copy every level's documents into the combined address table.

    Levels are processed in the order province, city, district, town,
    village.  Only the name/code fields of each level are projected, and the
    source ``_id`` is dropped before insertion.
    """
    levels = (
        ('province', province_tab, _PROVINCE_FIELDS),
        ('city', city_tab, _CITY_FIELDS),
        ('district', district_tab, _DISTRICT_FIELDS),
        ('town', town_tab, _TOWN_FIELDS),
        ('village', village_tab, _VILLAGE_FIELDS),
    )
    for key, tab, fields in levels:
        projection = dict.fromkeys(fields, 1)
        with tab.find({}, projection) as cursor:
            for item in cursor:
                del item['_id']
                address_tab.insert_one(item)
                print(f'{key} >> {item}')


def main():
    """Run every scraping stage in dependency order, then merge the levels."""
    province()
    city()
    district()
    town()
    village()
    address()


if __name__ == '__main__':
    main()