default.py 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185
  1. import io
  2. import re
  3. from lxml.html import fromstring
  4. from pymongo import MongoClient
  5. from pymongo.collection import Collection
  6. from lxml.html import tostring
  7. hospital = MongoClient('127.0.0.1:27017').hospital
  8. area_tab: Collection = hospital.area
  9. crawl_tab: Collection = hospital.list_item
  10. save_tab: Collection = hospital.data_info
  11. err_tab: Collection = hospital.crawl_error
  12. region = MongoClient('127.0.0.1:27017').region
  13. address_tab: Collection = region.address
  14. headers = {
  15. "authority": "www.yixue.com",
  16. "cache-control": "max-age=0",
  17. "sec-ch-ua": "^\\^",
  18. "sec-ch-ua-mobile": "?0",
  19. "sec-ch-ua-platform": "^\\^Windows^^",
  20. "upgrade-insecure-requests": "1",
  21. "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36",
  22. "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
  23. "sec-fetch-site": "none",
  24. "sec-fetch-mode": "navigate",
  25. "sec-fetch-user": "?1",
  26. "sec-fetch-dest": "document",
  27. "accept-language": "zh-CN,zh;q=0.9,en;q=0.8"
  28. }
  29. def html2element(html):
  30. return fromstring(html)
  31. def element2html(element):
  32. return tostring(element, encoding='utf-8').decode('utf-8')
  33. def unknown_element(element, item):
  34. page_source = element2html(element)
  35. data = {
  36. **item,
  37. 'page_source': page_source
  38. }
  39. err_tab.insert_one(data)
  40. def query_address(query=None, projection=None):
  41. results = []
  42. if query is None:
  43. query = {}
  44. if projection is None:
  45. projection = {'province': 1, 'city': 1, 'district': 1}
  46. with address_tab.find(query, projection=projection) as cursor:
  47. for item in cursor:
  48. del item['_id']
  49. results.append(item)
  50. return results
  51. def remove_suffix(text):
  52. if text is None:
  53. return None
  54. return re.sub('省|自治区|地区', '', text)
  55. def _query_region(text, items):
  56. _find_result = []
  57. for k in ['district', 'city', 'province']:
  58. for item in items:
  59. _val = item.get(k)
  60. if text is not None and _val is not None and text == _val:
  61. if k == 'province':
  62. _find_result.append({'province': _val})
  63. elif k == 'city':
  64. _find_result.append({'province': item.get('province'), 'city': _val})
  65. else:
  66. _find_result.append(item)
  67. if len(_find_result) > 0:
  68. return _find_result
  69. return None
  70. def query_region(text, address):
  71. result = re.match('(.*(?:省|自治区|市)){0,1}(.*(?:市|区|县|州|盟)){0,1}', text)
  72. prov, city = result.groups() # 抽取省 市信息
  73. prov = remove_suffix(prov)
  74. city = remove_suffix(city)
  75. if prov is None and city is None:
  76. return {}
  77. '''查询省市信息'''
  78. item = (_query_region(city, address) or _query_region(prov, address))
  79. # print(item)
  80. if item is None:
  81. return None
  82. elif len(item) > 1 and prov is not None:
  83. '''查询结果大于1,通过精准对比市级或者省级名称取出数据'''
  84. for _item in item:
  85. if prov == _item.get('city') or prov == _item['province']:
  86. return _item
  87. elif len(item) > 1 and prov is None:
  88. '''查询结果大于1,直接给出一个省市信息'''
  89. _item = item[0]
  90. return {'province': _item.get('province'), 'city': _item.get('city')}
  91. else:
  92. return item[0]
  93. res_name = re.compile('(.*(?:院|区|部|所|馆|科|局|诊|病|场|康|站|点|社|字|室|会|瘤|大|矿|腔|堂|岗|合|〗|校|办|)|号|坊|医|房|贵|光|吾|门诊|体检|中心|公司|机构|集团|美容|整形|部队|保健|基地|服务)){0,1}((.*)){0,1}$')
  94. def hospital_alias(text: str):
  95. """医院别名"""
  96. res = res_name.match(text)
  97. _, _other = res.groups()
  98. if _other is not None:
  99. _other = _other[1:-1]
  100. _other = ",".join(_other.split('、'))
  101. # print(_other)
  102. return _other if _other is not None else ''
  103. def hospital_name(text: str):
  104. res = res_name.match(text)
  105. _name, _ = res.groups()
  106. return _name
  107. def hospital_main_department(text: str):
  108. res = re.match(':(.*){0,1}(、){0,1}$', text)
  109. if res is None:
  110. return ''
  111. _department, _ = res.groups()
  112. if _department is not None:
  113. # print(_department)
  114. _departments = _department.split('、')
  115. _stream = io.StringIO()
  116. for val in _departments:
  117. if len(val) == 0:
  118. continue
  119. else:
  120. _stream.write(val + '、')
  121. _department = _stream.getvalue()
  122. # print(_department[:-1])
  123. return _department[:-1] if _department is None else ''
  124. if __name__ == '__main__':
  125. # ma = ':特需门诊、银屑病、白癜风科、、痤疮门诊、灰指甲专科'
  126. # ma = ':、、、、、、、、消化内科、心血管内科、眼科、产科'
  127. # ma = ':、心脏科、神经外科'
  128. # hospital_main_department(ma)
  129. # name = '北京鸿慈童康'
  130. # name = '上海精神卫生康复医院二部'
  131. # name = '海湾镇燎原卫生院'
  132. # name = '张家港时代港口医院有限公司'
  133. # name = '北京玉之光医疗整形美容国际连锁机构(玉之光(北京)国际医疗美容整形机构)'
  134. # name = '中国人民解放军第306医院(三零六医院、三0六医院)'
  135. # name = '上海浦东新区迎博社区卫生服务站'
  136. # name = '上海徐剑炜整形美容'
  137. # print(hospital_name(name))
  138. # print(hospital_alias(name))
  139. # name = '上海市宝山区医院列表'
  140. # name = '北京市宣武区医院列表'
  141. # name = '北京市延庆县医院列表'
  142. # name = '甘孜藏族自治州医院列表'
  143. # name = '湖北省神农架林区医院列表'
  144. # name = '永州市医院列表'
  145. # name = '德宏傣族景颇族自治州医院列表'
  146. # name = '云南省丽江地区医院列表' # 1
  147. # name = '延边朝鲜族自治州医院列表'
  148. # name = '兴安盟医院列表'
  149. # name = '新疆维吾尔自治区喀什地区医院列表'
  150. name = '石河子市医院列表'
  151. address = query_address()
  152. result = query_region(name, address)
  153. print(result)