# Crawler for hospital detail pages on www.yixue.com: reads pending URLs from
# crawl_tab, parses each page and stores the extracted record in save_tab.
- import requests
- from default import (
- crawl_tab,
- html2element,
- save_tab,
- hospital_name,
- hospital_alias, hospital_main_department
- )
# Browser-like request headers sent with every page fetch. The values imitate
# a desktop Chrome 97 session on www.yixue.com; the 'referer' entry below is
# only a default and is replaced per request by the crawl loop.
headers = {
    'authority': 'www.yixue.com',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'upgrade-insecure-requests': '1',
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/97.0.4692.99 Safari/537.36'
    ),
    'accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.9'
    ),
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'referer': (
        'https://www.yixue.com/'
        '%E6%B5%B7%E8%A5%BF%E8%92%99%E5%8F%A4%E6%97%8F%E8%97%8F%E6%97%8F%E8%87%AA%E6%B2%BB%E5%B7%9E%E5%8C%BB%E9%99%A2%E5%88%97%E8%A1%A8'
    ),
}
# XPath prefix shared by every content lookup on a detail page.
_CONTENT_ROOT = '//div[@id="mw-content-text"]/div'


def _joined_text(node, xpath, sep=""):
    """Join all strings matched by ``xpath`` under ``node`` with ``sep``, stripped."""
    return sep.join(node.xpath(xpath)).strip()


def _has_navbox_class(class_values):
    """Return True if any ``class`` attribute value contains the token ``navbox``.

    A ``class`` attribute is a space-separated token list, so each value is
    split before comparing; an exact ``"navbox"`` value still matches.
    """
    return any('navbox' in value.split() for value in class_values)


def _parse_table_page(element, url, item):
    """Build the hospital record for a page whose data sits in the first table."""
    info = element.xpath(_CONTENT_ROOT + '/table[1]')[0]
    name = _joined_text(info, './/tr[1]/th/span/text()')

    def cell(row):
        # Text of the <td> cell(s) in the given 1-based table row.
        return _joined_text(info, './/tr[{}]/td/text()'.format(row))

    return {
        'origin_name': name,
        'origin_url': url,
        'name': hospital_name(name),  # hospital name
        'level': '',  # hospital level (not extracted from this layout)
        'type': '',  # hospital type (not extracted from this layout)
        'address': cell(2),  # address
        'main_depart': '',  # key departments (not extracted from this layout)
        'business_type': '',  # business type (not extracted from this layout)
        'tel': cell(3),  # phone
        'fax_number': cell(4).replace(' ', ''),  # fax
        'e_mail': cell(5).replace(' ', ''),  # e-mail
        'postcode': cell(6).replace(' ', ''),  # postal code
        'website': _joined_text(info, './/tr[7]/td/a/@href'),  # website
        'alias': hospital_alias(name),  # other names
        'area': item.get('province', ''),  # province
        'city': item.get('city', ''),  # city
        'district': item.get('district', ''),  # district / county
    }


def _parse_list_page(element, url, item):
    """Build the hospital record for a page whose data sits in a bullet list."""
    list_items = _CONTENT_ROOT + '/ul/li'

    def entry(index, tail='text()', sep=""):
        # Joined text of the ``index``-th <li>; ``tail`` selects own text or links.
        return _joined_text(element, '{}[{}]/{}'.format(list_items, index, tail), sep)

    def labelled(index):
        # Own text of the <li> with the label's ":" separator removed.
        return entry(index).replace(":", "")

    # Key departments: plain text and linked text of the 5th item, combined.
    departments = "{}、{}".format(entry(5, sep="、"), entry(5, 'a/text()', sep="、"))
    main_department = hospital_main_department(departments)

    # Published name: first non-empty candidate, otherwise an empty string.
    name = ''
    for name_xpath in (
        _CONTENT_ROOT + '/p[1]/b/text()',
        '//*[@id="firstHeading"]/text()',
    ):
        name = _joined_text(element, name_xpath)
        if name:
            break

    return {
        'origin_name': name,
        'origin_url': url,
        'name': hospital_name(name),  # hospital name
        'level': entry(3, 'a/text()'),  # hospital level
        'type': entry(4, 'a/text()'),  # hospital type
        'address': labelled(1),  # address
        'main_depart': main_department,  # key departments
        'business_type': entry(6, 'a/text()'),  # business type
        'tel': labelled(2),  # phone
        'fax_number': labelled(7),  # fax
        'e_mail': labelled(9),  # e-mail
        'postcode': labelled(8),  # postal code
        # NOTE(review): uses the link text here, while the table layout stores
        # the link's href -- confirm which one is intended.
        'website': entry(10, 'a/text()'),  # website
        'alias': hospital_alias(name),  # other names
        'area': item.get('province', ''),  # province
        'city': item.get('city', ''),  # city
        'district': item.get('district', ''),  # district / county
    }


def _fetch_hospital(item):
    """Download the page for one crawl task and return its hospital record.

    Raises whatever the HTTP request or the parsing raises, including
    ``requests.HTTPError`` for a 4xx/5xx response.
    """
    url = item['url']
    # Per-request copy so the module-level ``headers`` dict is never mutated.
    request_headers = {**headers, 'referer': item['refer']}
    response = requests.get(url, headers=request_headers, timeout=60)
    # Do not parse (and then store) an error page as if it were a hospital.
    response.raise_for_status()
    element = html2element(response.text)

    table_classes = element.xpath(_CONTENT_ROOT + '/table[1]/@class')
    # Data laid out in a table, unless that first table is a navigation box.
    if table_classes and not _has_navbox_class(table_classes):
        return _parse_table_page(element, url, item)
    return _parse_list_page(element, url, item)


def start():
    """Crawl every unfinished task in ``crawl_tab`` and store the results.

    For each document with ``finished == False`` the page at ``item['url']``
    is fetched and parsed, the record is inserted into ``save_tab`` and the
    task is marked finished. If anything fails for a task, it is left (re)set
    to ``finished = False`` and the error is printed; the loop then continues
    with the next task. Progress is printed every 500 tasks.
    """
    count = 0
    pending = {'finished': False}
    with crawl_tab.find(pending, no_cursor_timeout=True, batch_size=5) as cursor:
        for item in cursor:
            count += 1
            try:
                hospital = _fetch_hospital(item)
                save_tab.insert_one(hospital)
                crawl_tab.update_one(
                    {'_id': item['_id']},
                    {'$set': {'finished': True}},
                )
                print(f"[采集成功] {item['name']}")
            except Exception as exc:
                # ``Exception`` (not a bare except) so Ctrl-C / SystemExit still
                # stop the crawl; the error is reported instead of being hidden.
                crawl_tab.update_one(
                    {'_id': item['_id']},
                    {'$set': {'finished': False}},
                )
                print(f"[采集失败] {item['name']}: {exc!r}")
            if count % 500 == 0:
                print(f"已采集 {count} 条")
# Script entry point: run the crawl only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    start()
|