import time
from urllib.parse import urljoin

import requests

from default import (
    area_tab,
    crawl_tab,
    headers,
    html2element,
    unknown_element,
)
def start():
    """Crawl each area page stored in ``area_tab`` and persist hospital links.

    For every area document (expects keys ``'url'`` and ``'name'``, and
    optionally ``'province'``/``'city'``/``'district'`` — TODO confirm the
    schema against whatever writes ``area_tab``), fetch the page, parse the
    second-to-last ``<ul>`` under the main content div, and insert one record
    per hospital anchor into ``crawl_tab``.
    """
    with area_tab.find() as cursor:
        for hospital in cursor:
            url = hospital['url']
            response = requests.get(url, headers=headers, timeout=60)
            element = html2element(response.text)
            nodes = element.xpath('//div[@id="mw-content-text"]/div/ul')
            # BUGFIX: the original guard was `len(nodes) > 0` while indexing
            # nodes[-2], which raises IndexError when exactly one <ul> matches.
            # Require at least two. (nodes[-2] itself looks deliberate — the
            # last <ul> is presumably navigation/footer — kept as-is.)
            if len(nodes) < 2:
                continue
            ul = nodes[-2]
            items = []
            for li in ul:
                try:
                    # Entries are expected as <li><b><a ...></a></b>; anything
                    # that does not match is recorded for manual inspection.
                    a = li.xpath('./b/a')[-1]
                except IndexError:
                    unknown_element(li, hospital)
                    continue
                items.append({
                    'name': a.attrib.get('title'),
                    'url': urljoin(url, a.attrib.get('href')),
                    'refer': url,
                    'province': hospital.get('province', ''),
                    'city': hospital.get('city', ''),
                    'district': hospital.get('district', ''),
                })
            # BUGFIX: insert_many raises InvalidOperation when given an empty
            # list; skip areas that yielded no recognizable entries.
            if items:
                result = crawl_tab.insert_many(items)
                print(f"{hospital['name']} 共有医院 {len(result.inserted_ids)} 家")
if __name__ == '__main__':
    # Script entry point: run the crawler only when executed directly,
    # not when this module is imported.
    start()