123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393 |
- import re
- import time
- from urllib.parse import urljoin
- import lxml.etree
- import requests
- from lxml.html import fromstring
- from utils.databases import mongo_table, int2long
# Storage handles, one per administrative level, plus the merged table that
# ``address()`` fills. All are obtained from ``mongo_table('address', <name>)``
# (presumably database name, then collection name - confirm in utils.databases).
(
    province_tab,
    city_tab,
    district_tab,
    town_tab,
    village_tab,
    address_tab,
) = (
    mongo_table('address', collection)
    for collection in (
        'province',
        'city',
        'district',
        'town',
        'village',
        'new_address_2021',
    )
)
def page_source(url, headers=None, cookies=None, **kwargs):
    """Send a GET request to *url* and return the response object.

    Args:
        url: Address to request.
        headers: Request headers. When None, a browser-like default set is sent.
        cookies: Request cookies. When None, a single default cookie is sent.
        **kwargs: Only the ``proxies`` key is read; other keys are ignored.

    Returns:
        The response from ``requests.get``, with its ``encoding`` overwritten
        by ``apparent_encoding`` before it is returned.
    """
    if headers is None:
        headers = {
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"
        }
    if cookies is None:
        cookies = {"SF_cookie_1": "37059734"}

    # Fixed request policy: 60 second timeout, redirects are not followed.
    response = requests.get(
        url,
        headers=headers,
        cookies=cookies,
        timeout=60,
        allow_redirects=False,
        proxies=kwargs.get('proxies'),
    )
    response.encoding = response.apparent_encoding
    return response
def html2element(html):
    """Parse the HTML text *html* with ``lxml.html.fromstring`` and return the result."""
    return fromstring(html)
def province():
    """Download the province index page and store its entries in ``province_tab``.

    Every linked table cell becomes one document holding ``province_code``,
    ``province`` and ``province_url`` (the absolute link to the province
    page), for example::

        {
            "_id" : ObjectId("6098cafbb9b8e6b1903a83f4"),
            "province_code" : NumberInt(11),
            "province" : "北京市"
        }
    """
    url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/index.html"
    response = page_source(url)
    element = html2element(response.text)
    node = element.xpath('//table[@width="100%"]//tr[position()>3]/td')
    item = []
    for td in node:
        name = ''.join(td.xpath('./a/text()')).strip()
        href = ''.join(td.xpath('./a/@href')).strip()
        # The code is the leading digits of the link target. A cell without
        # such a link has nothing to record, so skip it instead of failing
        # on a missing match.
        code_match = re.match(r'\d+', href)
        if code_match is None:
            continue
        province_code = code_match.group()
        province_url = urljoin(url, href)
        print(name, province_code, province_url)
        item.append({
            'province_code': int(province_code),
            'province': name,
            'province_url': province_url
        })
    province_tab.insert_many(item)
    print('[省级]下载完成')
def city():
    """Download the city list of every stored province into ``city_tab``.

    For each document in ``province_tab`` the page at ``province_url`` is
    fetched and every table row becomes one document, for example::

        {
            "_id" : ObjectId("6098cb97b9b8e6b1903a841a"),
            "province_code" : NumberInt(11),
            "province" : "北京市",
            "city" : "市辖区",
            "city_code" : NumberInt(1101)
        }

    A ``city_url`` field with the absolute link to the city page is stored
    as well.
    """
    with province_tab.find() as cursor:
        for item in cursor:
            url = item['province_url']
            # Retry while the page cannot be parsed, the same way the
            # district/town/village stages treat an unparseable response.
            while True:
                response = page_source(url)
                try:
                    element = html2element(response.text)
                except lxml.etree.ParserError:
                    print(f'[市级]{item["province"]}下载超时,重新获取')
                    time.sleep(1)
                    continue
                break

            city_item = []
            for tr in element.xpath('//table[@class="citytable"]//tr[position()>1]'):
                # Only the first four characters of the displayed code are kept.
                city_code = ''.join(tr.xpath('./td[1]/a/text()')).strip()[0:4]
                name = ''.join(tr.xpath('./td[2]/a/text()')).strip()
                href = ''.join(tr.xpath('./td[2]/a/@href')).strip()
                city_item.append({
                    'province_code': item['province_code'],
                    'province': item['province'],
                    'city': name,
                    'city_code': int(city_code),
                    'city_url': urljoin(url, href)
                })

            # Insert only when rows were found, as district() does.
            if city_item:
                city_tab.insert_many(city_item)
                print(f'[市级]{item["province"]}下载完成')
def district():
    """Download the district/county list of every stored city into ``district_tab``.

    For each document in ``city_tab`` the page at ``city_url`` is fetched
    (retrying while it cannot be parsed) and its rows are stored, for example::

        {
            "_id" : ObjectId("6098cbb8b9b8e6b1903a8593"),
            "province_code" : NumberInt(12),
            "province" : "天津市",
            "city" : "市辖区",
            "city_code" : NumberInt(1201),
            "district" : "宝坻区",
            "district_code" : NumberInt(120115)
        }

    Row handling depends on the row's ``class`` attribute:

    * ``countytr`` with a link: a district with a child page; ``district_url``
      is stored too.
    * ``countytr`` without a link: a district with no child page; no URL is
      stored.
    * ``towntr``: the row is a town listed directly on the city page. It is
      stored with the city name as ``district`` plus ``town``, ``town_code``
      and ``town_url``.

    Raises:
        RuntimeError: If a row has any other ``class`` value.
    """
    with city_tab.find() as cursor:
        for item in cursor:
            url = item['city_url']
            # Fields every row on this page inherits from the city record.
            parent = {
                'province_code': item['province_code'],
                'province': item['province'],
                'city': item['city'],
                'city_code': item['city_code'],
            }

            # Retry until the response parses as HTML.
            while True:
                response = page_source(url)
                try:
                    element = html2element(response.text)
                except lxml.etree.ParserError:
                    print(f'[县级]{item["province"]}{item["city"]}下载超时,重新获取')
                    time.sleep(1)
                    continue
                break

            district_item = []
            district_level_item = []
            # NOTE(review): rows are selected only from tables with class
            # "countytable"; confirm that towntr rows really appear inside
            # such a table, otherwise the towntr branch never runs.
            for tr in element.xpath('//table[@class="countytable"]//tr[position()>1]'):
                row_class = tr.attrib.get('class')
                href = ''.join(tr.xpath('./td[2]/a/@href')).strip()
                if row_class == 'countytr':
                    if href:
                        # Linked row: code and name are inside <a> elements.
                        code_text = ''.join(tr.xpath('./td[1]/a/text()')).strip()
                        name = ''.join(tr.xpath('./td[2]/a/text()')).strip()
                        district_item.append({
                            **parent,
                            'district': name,
                            'district_code': int(code_text[0:6]),
                            'district_url': urljoin(url, href)
                        })
                    else:
                        # Unlinked row: code and name are plain cell text.
                        code_text = ''.join(tr.xpath('./td[1]/text()')).strip()
                        name = ''.join(tr.xpath('./td[2]/text()')).strip()
                        district_item.append({
                            **parent,
                            'district': name,
                            'district_code': int(code_text[0:6]),
                        })
                elif row_class == 'towntr':
                    # A towntr row on this page means the city is treated as
                    # a county-level city: the city itself is recorded as the
                    # district and the row supplies the town.
                    code_text = ''.join(tr.xpath('./td[1]/a/text()')).strip()
                    town_name = ''.join(tr.xpath('./td[2]/a/text()')).strip()
                    district_level_item.append({
                        **parent,
                        'district': item['city'],
                        'district_code': int(code_text[0:6]),
                        'town': town_name,
                        'town_code': int(code_text[0:9]),
                        'town_url': urljoin(url, href)
                    })
                else:
                    # The original bare ``raise`` had no active exception and
                    # produced an uninformative RuntimeError; keep the type
                    # but say what went wrong.
                    raise RuntimeError(
                        f'unexpected row class {row_class!r} on {url}'
                    )

            if district_item:
                district_tab.insert_many(district_item)
                print(f'[县级]{item["province"]}{item["city"]}下载完成')
            if district_level_item:
                district_tab.insert_many(district_level_item)
                print(f'[县级市]{item["province"]}{item["city"]}下载完成')
            time.sleep(0.5)
def town():
    """Fill ``town_tab`` from the stored districts.

    Two passes over ``district_tab``:

    1. Documents that already carry a ``town`` field are copied to
       ``town_tab`` unchanged.
    2. For documents without ``town`` but with ``district_url``, that page is
       fetched (retrying while it cannot be parsed) and every table row is
       stored as a town.

    Example document::

        {
            "_id" : ObjectId("6098cbceb9b8e6b1903a91b4"),
            "province_code" : NumberInt(11),
            "province" : "北京市",
            "city" : "市辖区",
            "city_code" : NumberInt(1101),
            "district" : "海淀区",
            "district_code" : NumberInt(110108),
            "town" : "上庄地区",
            "town_code" : NumberInt(110108030)
        }

    Towns parsed in the second pass also get a ``town_url`` field.
    """
    # Pass 1: rows that were already recorded at town level.
    query = {"town": {"$exists": True}}
    with district_tab.find(query) as cursor:
        for item in cursor:
            town_tab.insert_one(item)
            print(f'[街道]{item["province"]}{item["city"]}{item["district"]}下载完成')

    # Pass 2: districts that have a child page to crawl.
    query = {"town": {"$exists": False}, "district_url": {"$exists": True}}
    with district_tab.find(query) as cursor:
        for item in cursor:
            url = item['district_url']
            while True:
                response = page_source(url)
                try:
                    element = html2element(response.text)
                except lxml.etree.ParserError:
                    print(f'[街道]{item["province"]}{item["city"]}{item["district"]}下载超时,重新获取')
                    time.sleep(1)
                    continue
                break

            town_item = []
            for tr in element.xpath('//table[@class="towntable"]//tr[position()>1]'):
                href = ''.join(tr.xpath('./td[2]/a/@href')).strip()
                # Only the first nine characters of the displayed code are kept.
                town_code = ''.join(tr.xpath('./td[1]/a/text()')).strip()[0:9]
                name = ''.join(tr.xpath('./td[2]/a/text()')).strip()
                town_item.append({
                    'province_code': item['province_code'],
                    'province': item['province'],
                    'city': item['city'],
                    'city_code': item['city_code'],
                    'district': item['district'],
                    'district_code': item['district_code'],
                    'town': name,
                    'town_code': int(town_code),
                    'town_url': urljoin(url, href),
                })

            # Insert only when rows were found, as district() does.
            if town_item:
                town_tab.insert_many(town_item)
                print(f'[街道]{item["province"]}{item["city"]}{item["district"]}下载完成')
            time.sleep(0.5)
def village():
    """Download the village list of every stored town into ``village_tab``.

    For each document in ``town_tab`` the page at ``town_url`` is fetched,
    retrying after one second whenever the request raises
    ``requests.RequestException`` or the response cannot be parsed. Every
    table row becomes one document, for example::

        {
            "_id" : ObjectId("6098cc2bb9b8e6b1903b3a38"),
            "province_code" : NumberInt(11),
            "province" : "北京市",
            "city" : "市辖区",
            "city_code" : NumberInt(1101),
            "district" : "海淀区",
            "district_code" : NumberInt(110108),
            "town" : "万寿路街道",
            "town_code" : NumberInt(110108001),
            "village" : "复兴路26号社区居委会",
            "village_code" : NumberLong(110108001020)
        }

    A page that yields no rows is reported by printing its URL and skipped.
    """
    with town_tab.find() as cursor:
        for item in cursor:
            url = item['town_url']
            while True:
                try:
                    response = page_source(url)
                    element = html2element(response.text)
                except requests.RequestException:
                    print(f'[行政区话代码]{item["province"]}{item["city"]}{item["district"]}{item["town"]}访问超时,重新获取')
                    time.sleep(1)
                except lxml.etree.ParserError:
                    print(f'[行政区话代码]{item["province"]}{item["city"]}{item["district"]}{item["town"]}下载超时,重新获取')
                    time.sleep(1)
                else:
                    break

            village_item = []
            for tr in element.xpath('//table[@class="villagetable"]//tr[position()>1]'):
                # The code is in the first column, the name in the third.
                village_code = ''.join(tr.xpath('./td[1]/text()')).strip()
                name = ''.join(tr.xpath('./td[3]/text()')).strip()
                village_item.append({
                    'province_code': item['province_code'],
                    'province': item['province'],
                    'city': item['city'],
                    'city_code': item['city_code'],
                    'district': item['district'],
                    'district_code': item['district_code'],
                    'town': item['town'],
                    'town_code': item['town_code'],
                    'village': name,
                    'village_code': int2long(int(village_code))
                })

            if village_item:
                village_tab.insert_many(village_item)
                print(f'[行政区划代码]{item["province"]}{item["city"]}{item["district"]}{item["town"]}下载完成')
            else:
                # Previously this case was caught as a TypeError from the
                # insert and dropped into breakpoint(), which stalls an
                # unattended run. Report the page and carry on.
                print(url)
            time.sleep(0.5)
def address():
    """Copy the code/name fields of every level into ``address_tab``.

    The tables are read in the order province, city, district, town, village.
    Each level's projection holds its own two fields plus all fields of the
    levels above it. Every document is inserted one by one without its
    original ``_id`` and echoed as ``<level> >> <document>``.
    """
    levels = (
        ('province', province_tab, ('province_code', 'province')),
        ('city', city_tab, ('city', 'city_code')),
        ('district', district_tab, ('district', 'district_code')),
        ('town', town_tab, ('town', 'town_code')),
        ('village', village_tab, ('village', 'village_code')),
    )

    wanted = {}
    for key, tab, own_fields in levels:
        # Projections are cumulative: add this level's fields to the parents'.
        for field in own_fields:
            wanted[field] = 1
        with tab.find({}, dict(wanted)) as cursor:
            for item in cursor:
                item.pop('_id')
                address_tab.insert_one(item)
                print(f'{key} >> {item}')
def main():
    """Run all stages in order: province, city, district, town, village, address."""
    for stage in (province, city, district, town, village, address):
        stage()


if __name__ == '__main__':
    main()
|