crawl_region.py

import re
from urllib.parse import urljoin

import requests

from default import (
    html2element,
    headers,
    query_address,
    query_region,
    area_tab,
)

# Address lookup table, loaded once and reused for every region query.
Address = query_address()


def start():
    # Index page: "national hospital list" on yixue.com.
    url = "https://www.yixue.com/%E5%85%A8%E5%9B%BD%E5%8C%BB%E9%99%A2%E5%88%97%E8%A1%A8"
    response = requests.get(url, headers=headers, timeout=60)
    element = html2element(response.text)

    # Each <p> in the main content area lists one province/region's hospital pages;
    # the last five paragraphs are trailing boilerplate, so drop them.
    nodes = element.xpath('//div[@id="mw-content-text"]/div/p')
    p_nodes = nodes[: len(nodes) - 5]

    for node in p_nodes:
        a_nodes = node.xpath('./a')
        if len(a_nodes) <= 1:
            continue

        items = []
        for a in a_nodes:
            href = a.attrib.get('href')
            title = a.attrib.get('title')
            link = urljoin(url, href)

            # Skip anchors without a title and red links to pages that do not exist.
            if not title or '页面不存在' in title:
                continue
            print(title, link)

            # Titles look like "XX省医院列表"; strip the "医院列表" suffix, then
            # split off "省"/"自治区" to get the bare region name.
            match = re.search('(.*)医院列表', title)
            if match is None:
                continue
            region = re.split('省|自治区', match.group(1))[-1]

            item = query_region(region, Address)
            if item is not None:
                items.append({'name': title, 'url': link, **item})

        # insert_many() raises on an empty list, so only write when rows were collected.
        if items:
            result = area_tab.insert_many(items)
            # "Hospital region list: inserted N rows"
            print(f"医院区域列表 新增 {len(result.inserted_ids)} 条")


if __name__ == '__main__':
    start()
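
The script imports helpers from a local default module that is not shown here. Below is a minimal sketch of what that module might provide, assuming lxml for HTML parsing and pymongo for storage; the database/collection names and the query_address / query_region signatures are guesses inferred from how they are called above, not the author's actual code.

# default.py (hypothetical sketch, not the original module)
from lxml import etree
from pymongo import MongoClient

# Request headers used by the crawler; the User-Agent value is a placeholder.
headers = {
    "User-Agent": "Mozilla/5.0",
}

# MongoDB connection; database and collection names are assumptions.
client = MongoClient("mongodb://localhost:27017")
db = client["hospital"]
area_tab = db["area"]        # target collection for the crawled region list
address_tab = db["address"]  # assumed source of region -> address records


def html2element(text):
    """Parse an HTML string into an lxml element that supports .xpath()."""
    return etree.HTML(text)


def query_address():
    """Load address records once, keyed by region name (assumed schema)."""
    return {doc["region"]: doc for doc in address_tab.find({}, {"_id": 0})}


def query_region(region, address):
    """Look up a region in the preloaded address mapping; return None if unknown."""
    return address.get(region)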