123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051 |
- import re
- from urllib.parse import urljoin
- import requests
- from default import (
- html2element,
- headers,
- query_address,
- query_region,
- area_tab
- )
# Result of query_address(), evaluated once when the module is loaded and
# passed unchanged to query_region() for every link handled in start().
Address = query_address()
def start():
    """Scrape the nationwide hospital index page and store region list links.

    Downloads the index page, selects the ``<p>`` elements under
    ``#mw-content-text`` (ignoring the last five), and for each paragraph that
    contains more than one ``<a>`` child:

    * skips anchors without a usable ``href``/``title`` and anchors whose
      title marks a non-existent page;
    * extracts the text in front of the "医院列表" suffix of the title, splits
      it on "省" / "自治区" and keeps the last piece as the region name;
    * looks the region up with ``query_region(region, Address)`` and, when a
      record comes back, queues ``{'name', 'url', **record}``;
    * bulk-inserts the queued records into ``area_tab`` and prints how many
      were added.

    Raises:
        requests.HTTPError: if the index page answers with an error status.
    """
    url = "https://www.yixue.com/%E5%85%A8%E5%9B%BD%E5%8C%BB%E9%99%A2%E5%88%97%E8%A1%A8"
    response = requests.get(url, headers=headers, timeout=60)
    # Fail loudly instead of parsing an error page as if it were the index.
    response.raise_for_status()
    element = html2element(response.text)
    nodes = element.xpath('//div[@id="mw-content-text"]/div/p')

    # Compile once; both patterns are loop-invariant.
    title_pattern = re.compile(r'(.*)医院列表')
    region_splitter = re.compile(r'省|自治区')

    # Drop the last five paragraphs. ``nodes[:-5]`` (unlike
    # ``nodes[:len(nodes) - 5]``) yields nothing when fewer than five exist.
    for node in nodes[:-5]:
        a_nodes = node.xpath('./a')
        if len(a_nodes) <= 1:
            continue

        items = []
        for a in a_nodes:
            href = a.attrib.get('href')
            title = a.attrib.get('title')
            # Anchors may lack either attribute; a missing title used to
            # raise TypeError on the membership test below.
            if not href or not title:
                continue
            if '页面不存在' in title:
                continue
            link = urljoin(url, href)
            print(title, link)

            matched = title_pattern.search(title)
            if matched is None:
                # Title does not follow the "<region>医院列表" pattern.
                continue
            # NOTE(review): a title such as "X省医院列表" leaves an empty last
            # piece, so ``region`` is ''; confirm how query_region treats it.
            region = region_splitter.split(matched.group(1))[-1]

            item = query_region(region, Address)
            if item is not None:
                items.append({'name': title, 'url': link, **item})

        # insert_many rejects an empty document list, so skip empty batches.
        if not items:
            continue
        result = area_tab.insert_many(items)
        print(f"医院区域列表 新增 {len(result.inserted_ids)} 条")
# Script entry point: run the scrape only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    start()
|