crawl_detail_page.py 6.5 KB

import requests

from default import (
    crawl_tab,
    html2element,
    save_tab,
    hospital_name,
    hospital_alias,
    hospital_main_department,
)

headers = {
    "authority": "www.yixue.com",
    "cache-control": "max-age=0",
    "sec-ch-ua": "\" Not;A Brand\";v=\"99\", \"Google Chrome\";v=\"97\", \"Chromium\";v=\"97\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "sec-fetch-site": "same-origin",
    "sec-fetch-mode": "navigate",
    "sec-fetch-user": "?1",
    "sec-fetch-dest": "document",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
    "referer": "https://www.yixue.com/%E6%B5%B7%E8%A5%BF%E8%92%99%E5%8F%A4%E6%97%8F%E8%97%8F%E6%97%8F%E8%87%AA%E6%B2%BB%E5%B7%9E%E5%8C%BB%E9%99%A2%E5%88%97%E8%A1%A8",
}


def start():
    count = 0
    q = {'finished': False}
    with crawl_tab.find(q, no_cursor_timeout=True, batch_size=5) as cursor:
        for item in cursor:
            count += 1
            url = item['url']
            headers.update({'referer': item['refer']})
            try:
                response = requests.get(url, headers=headers, timeout=60)
                element = html2element(response.text)
                table = element.xpath('//div[@id="mw-content-text"]/div/table[1]/@class')
                # Detail data laid out in an infobox table
                if len(table) > 0 and 'navbox' not in table:
                    node_table = element.xpath('//div[@id="mw-content-text"]/div/table[1]')[0]
                    name = "".join(node_table.xpath('.//tr[1]/th/span/text()')).strip()
                    hospital = {
                        'origin_name': name,
                        'origin_url': url,
                        'name': hospital_name(name),  # hospital name
                        'level': '',  # hospital grade
                        'type': '',  # hospital type
                        'address': "".join(node_table.xpath('.//tr[2]/td/text()')).strip(),  # address
                        'main_depart': '',  # key departments
                        'business_type': '',  # ownership type
                        'tel': "".join(node_table.xpath('.//tr[3]/td/text()')).strip(),  # telephone
                        'fax_number': "".join(node_table.xpath('.//tr[4]/td/text()')).strip().replace(' ', ''),  # fax number
                        'e_mail': "".join(node_table.xpath('.//tr[5]/td/text()')).strip().replace(' ', ''),  # e-mail
                        'postcode': "".join(node_table.xpath('.//tr[6]/td/text()')).strip().replace(' ', ''),  # postal code
                        'website': "".join(node_table.xpath('.//tr[7]/td/a/@href')).strip(),  # website
                        'alias': hospital_alias(name),  # other names
                        'area': item.get('province', ''),  # province
                        'city': item.get('city', ''),  # city
                        'district': item.get('district', ''),  # district / county
                    }
                else:
                    # Key departments (detail data laid out as a bullet list)
                    line1 = "、".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[5]/text()')).strip()
                    line2 = "、".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[5]/a/text()')).strip()
                    line = "{}、{}".format(line1, line2)
                    main_department = hospital_main_department(line)
                    # Published hospital name: prefer the bolded lead paragraph, fall back to the page heading
                    name_xpath = [
                        '//div[@id="mw-content-text"]/div/p[1]/b/text()',
                        '//*[@id="firstHeading"]/text()',
                    ]
                    for _xpath in name_xpath:
                        name = "".join(element.xpath(_xpath)).strip()
                        if len(name) > 0:
                            break
                    else:
                        name = ''
                    hospital = {
                        'origin_name': name,
                        'origin_url': url,
                        'name': hospital_name(name),  # hospital name
                        'level': "".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[3]/a/text()')).strip(),  # hospital grade
                        'type': "".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[4]/a/text()')).strip(),  # hospital type
                        'address': "".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[1]/text()')).strip().replace(":", ""),  # address
                        'main_depart': main_department,  # key departments
                        'business_type': "".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[6]/a/text()')).strip(),  # ownership type
                        'tel': "".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[2]/text()')).strip().replace(":", ""),  # telephone
                        'fax_number': "".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[7]/text()')).strip().replace(":", ""),  # fax number
                        'e_mail': "".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[9]/text()')).strip().replace(":", ""),  # e-mail
                        'postcode': "".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[8]/text()')).strip().replace(":", ""),  # postal code
                        'website': "".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[10]/a/text()')).strip(),  # website
                        'alias': hospital_alias(name),  # other names
                        'area': item.get('province', ''),  # province
                        'city': item.get('city', ''),  # city
                        'district': item.get('district', ''),  # district / county
                    }
                save_tab.insert_one(hospital)
                crawl_tab.update_one(
                    {'_id': item['_id']},
                    {'$set': {'finished': True}}
                )
                print(f"[crawl ok] {item['name']}")
            except Exception as exc:
                # Leave the item unfinished so it is picked up again on the next run
                crawl_tab.update_one(
                    {'_id': item['_id']},
                    {'$set': {'finished': False}}
                )
                print(f"[crawl failed] {item['name']}: {exc}")
            if count % 500 == 0:
                print(f"{count} items processed")


if __name__ == '__main__':
    start()
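
The script relies on a companion `default` module for its MongoDB collections and parsing helpers, which is not included here. Below is a minimal sketch of what that module might provide, reconstructed only from how the names are used above; the connection string, database and collection names, and the normalization logic inside the helper functions are assumptions, not the original implementation.

# default.py — hypothetical sketch of the helpers imported by crawl_detail_page.py
import re

from lxml import etree
from pymongo import MongoClient

# Assumed connection string and collection names; adjust to the real deployment.
_client = MongoClient('mongodb://localhost:27017')
_db = _client['yixue']
crawl_tab = _db['hospital_list']    # queue of detail-page URLs with a `finished` flag
save_tab = _db['hospital_detail']   # parsed hospital records


def html2element(html: str):
    """Parse an HTML string into an lxml element for XPath queries."""
    return etree.HTML(html)


def hospital_name(origin_name: str) -> str:
    """Normalize the published name by dropping any bracketed alias (assumed rule)."""
    return re.sub(r'[（(].*?[)）]', '', origin_name).strip()


def hospital_alias(origin_name: str) -> str:
    """Return the bracketed alias from the published name, if present (assumed rule)."""
    match = re.search(r'[（(](.*?)[)）]', origin_name)
    return match.group(1).strip() if match else ''


def hospital_main_department(line: str) -> str:
    """Clean the key-department line: drop the label and stray separators (assumed rule)."""
    return line.replace('重点科室', '').strip(' :：、')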