crawl_detail_page.py 6.5 KB

import requests

from default import (
    crawl_tab,
    html2element,
    save_tab,
    hospital_name,
    hospital_alias,
    hospital_main_department,
)

headers = {
    "authority": "www.yixue.com",
    "cache-control": "max-age=0",
    "sec-ch-ua": "\" Not;A Brand\";v=\"99\", \"Google Chrome\";v=\"97\", \"Chromium\";v=\"97\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "sec-fetch-site": "same-origin",
    "sec-fetch-mode": "navigate",
    "sec-fetch-user": "?1",
    "sec-fetch-dest": "document",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
    "referer": "https://www.yixue.com/%E6%B5%B7%E8%A5%BF%E8%92%99%E5%8F%A4%E6%97%8F%E8%97%8F%E6%97%8F%E8%87%AA%E6%B2%BB%E5%B7%9E%E5%8C%BB%E9%99%A2%E5%88%97%E8%A1%A8",
}


def start():
    count = 0
    q = {'finished': False}
    with crawl_tab.find(q, no_cursor_timeout=True, batch_size=5) as cursor:
        for item in cursor:
            count += 1
            url = item['url']
            headers.update({'referer': item['refer']})
            try:
                response = requests.get(url, headers=headers, timeout=60)
                element = html2element(response.text)
                table = element.xpath('//div[@id="mw-content-text"]/div/table[1]/@class')
                # Detail data laid out in an infobox table
                if len(table) > 0 and 'navbox' not in table:
                    node_table = element.xpath('//div[@id="mw-content-text"]/div/table[1]')[0]
                    name = "".join(node_table.xpath('.//tr[1]/th/span/text()')).strip()
                    hospital = {
                        'origin_name': name,
                        'origin_url': url,
                        'name': hospital_name(name),  # hospital name
                        'level': '',  # hospital grade
                        'type': '',  # hospital type
                        'address': "".join(node_table.xpath('.//tr[2]/td/text()')).strip(),  # address
                        'main_depart': '',  # key departments
                        'business_type': '',  # ownership type
                        'tel': "".join(node_table.xpath('.//tr[3]/td/text()')).strip(),  # telephone
                        'fax_number': "".join(node_table.xpath('.//tr[4]/td/text()')).strip().replace(' ', ''),  # fax number
                        'e_mail': "".join(node_table.xpath('.//tr[5]/td/text()')).strip().replace(' ', ''),  # e-mail
                        'postcode': "".join(node_table.xpath('.//tr[6]/td/text()')).strip().replace(' ', ''),  # postal code
                        'website': "".join(node_table.xpath('.//tr[7]/td/a/@href')).strip(),  # website
                        'alias': hospital_alias(name),  # other names
                        'area': item.get('province', ''),  # province
                        'city': item.get('city', ''),  # city
                        'district': item.get('district', ''),  # district / county
                    }
                else:
                    # Key departments (detail data laid out as a bullet list)
                    line1 = "、".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[5]/text()')).strip()
                    line2 = "、".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[5]/a/text()')).strip()
                    line = "{}、{}".format(line1, line2)
                    main_department = hospital_main_department(line)
                    # Published hospital name: prefer the bolded lead paragraph, fall back to the page heading
                    name_xpath = [
                        '//div[@id="mw-content-text"]/div/p[1]/b/text()',
                        '//*[@id="firstHeading"]/text()',
                    ]
                    for _xpath in name_xpath:
                        name = "".join(element.xpath(_xpath)).strip()
                        if len(name) > 0:
                            break
                    else:
                        name = ''
                    hospital = {
                        'origin_name': name,
                        'origin_url': url,
                        'name': hospital_name(name),  # hospital name
                        'level': "".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[3]/a/text()')).strip(),  # hospital grade
                        'type': "".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[4]/a/text()')).strip(),  # hospital type
                        'address': "".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[1]/text()')).strip().replace(":", ""),  # address
                        'main_depart': main_department,  # key departments
                        'business_type': "".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[6]/a/text()')).strip(),  # ownership type
                        'tel': "".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[2]/text()')).strip().replace(":", ""),  # telephone
                        'fax_number': "".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[7]/text()')).strip().replace(":", ""),  # fax number
                        'e_mail': "".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[9]/text()')).strip().replace(":", ""),  # e-mail
                        'postcode': "".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[8]/text()')).strip().replace(":", ""),  # postal code
                        'website': "".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[10]/a/text()')).strip(),  # website
                        'alias': hospital_alias(name),  # other names
                        'area': item.get('province', ''),  # province
                        'city': item.get('city', ''),  # city
                        'district': item.get('district', ''),  # district / county
                    }
                save_tab.insert_one(hospital)
                crawl_tab.update_one(
                    {'_id': item['_id']},
                    {'$set': {'finished': True}}
                )
                print(f"[crawl ok] {item['name']}")
            except Exception as exc:
                # Leave the item unfinished so it is picked up again on the next run
                crawl_tab.update_one(
                    {'_id': item['_id']},
                    {'$set': {'finished': False}}
                )
                print(f"[crawl failed] {item['name']}: {exc}")
            if count % 500 == 0:
                print(f"{count} items processed")


if __name__ == '__main__':
    start()
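
The script relies on a companion `default` module for its MongoDB collections and parsing helpers, which is not included here. Below is a minimal sketch of what that module might provide, reconstructed only from how the names are used above; the connection string, database and collection names, and the normalization logic inside the helper functions are assumptions, not the original implementation.

# default.py — hypothetical sketch of the helpers imported by crawl_detail_page.py
import re

from lxml import etree
from pymongo import MongoClient

# Assumed connection string and collection names; adjust to the real deployment.
_client = MongoClient('mongodb://localhost:27017')
_db = _client['yixue']
crawl_tab = _db['hospital_list']    # queue of detail-page URLs with a `finished` flag
save_tab = _db['hospital_detail']   # parsed hospital records


def html2element(html: str):
    """Parse an HTML string into an lxml element for XPath queries."""
    return etree.HTML(html)


def hospital_name(origin_name: str) -> str:
    """Normalize the published name by dropping any bracketed alias (assumed rule)."""
    return re.sub(r'[（(].*?[)）]', '', origin_name).strip()


def hospital_alias(origin_name: str) -> str:
    """Return the bracketed alias from the published name, if present (assumed rule)."""
    match = re.search(r'[（(](.*?)[)）]', origin_name)
    return match.group(1).strip() if match else ''


def hospital_main_department(line: str) -> str:
    """Clean the key-department line: drop the label and stray separators (assumed rule)."""
    return line.replace('重点科室', '').strip(' :：、')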