crawl_list_page.py

from urllib.parse import urljoin

import requests

from default import (
    crawl_tab,
    headers,
    html2element,
    area_tab,
    unknown_element
)


def start():
    with area_tab.find() as cursor:
        for hospital in cursor:
            url = hospital['url']
            response = requests.get(url, headers=headers, timeout=60)
            element = html2element(response.text)
            nodes = element.xpath('//div[@id="mw-content-text"]/div/ul')
            # The hospital list lives in the second-to-last <ul>, so at
            # least two nodes are required (the original `len(nodes) > 0`
            # guard would crash on nodes[-2] when only one <ul> exists).
            if len(nodes) >= 2:
                ul = nodes[-2]
                items = []
                for li in ul:
                    try:
                        a = li.xpath('./b/a')[-1]
                    except IndexError:
                        # Record entries that don't match the expected
                        # <li><b><a> structure for later inspection.
                        unknown_element(li, hospital)
                        continue
                    title = a.attrib.get('title')
                    href = a.attrib.get('href')
                    link = urljoin(url, href)
                    items.append({
                        'name': title,
                        'url': link,
                        'refer': url,
                        'province': hospital.get('province', ''),
                        'city': hospital.get('city', ''),
                        'district': hospital.get('district', '')
                    })
                if items:  # insert_many raises on an empty list
                    result = crawl_tab.insert_many(items)
                    print(f"{hospital['name']}: {len(result.inserted_ids)} hospitals in total")


if __name__ == '__main__':
    start()
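
The listing imports its helpers from a local `default` module that isn't shown here. Below is a minimal sketch of what that module presumably provides, assuming lxml for HTML parsing and pymongo for storage; the connection string, database and collection names, and the User-Agent are placeholders for illustration, not the author's actual configuration.

# default.py -- hypothetical sketch of the helper module imported above.
from lxml import etree
from pymongo import MongoClient

# Placeholder request headers; a real crawler would set a proper User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
}

# Hypothetical MongoDB setup: all names here are illustrative only.
client = MongoClient('mongodb://localhost:27017')
db = client['hospital']
area_tab = db['area']        # area pages queued for crawling
crawl_tab = db['hospital']   # hospital entries extracted from list pages


def html2element(text):
    """Parse an HTML string into an lxml element tree."""
    return etree.HTML(text)


def unknown_element(li, hospital):
    """Log a <li> that doesn't match the expected <b><a> structure."""
    snippet = etree.tostring(li, encoding='unicode')[:200]
    print('unknown element under', hospital.get('name', '?'), ':', snippet)

With a module like this in place, the crawler runs directly with `python crawl_list_page.py`, reading area pages from `area_tab` and writing the extracted hospital records to `crawl_tab`.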