crawl_list_page.py

from urllib.parse import urljoin

import requests

from default import (
    crawl_tab,
    headers,
    html2element,
    area_tab,
    unknown_element
)


def start():
    with area_tab.find() as cursor:
        for hospital in cursor:
            url = hospital['url']
            response = requests.get(url, headers=headers, timeout=60)
            element = html2element(response.text)
            nodes = element.xpath('//div[@id="mw-content-text"]/div/ul')
            # The hospital list lives in the second-to-last <ul>, so at
            # least two nodes are required (the original `len(nodes) > 0`
            # guard would crash on nodes[-2] when only one <ul> exists).
            if len(nodes) >= 2:
                ul = nodes[-2]
                items = []
                for li in ul:
                    try:
                        a = li.xpath('./b/a')[-1]
                    except IndexError:
                        # Record entries that don't match the expected
                        # <li><b><a> structure for later inspection.
                        unknown_element(li, hospital)
                        continue
                    title = a.attrib.get('title')
                    href = a.attrib.get('href')
                    link = urljoin(url, href)
                    items.append({
                        'name': title,
                        'url': link,
                        'refer': url,
                        'province': hospital.get('province', ''),
                        'city': hospital.get('city', ''),
                        'district': hospital.get('district', '')
                    })
                if items:  # insert_many raises on an empty list
                    result = crawl_tab.insert_many(items)
                    print(f"{hospital['name']}: {len(result.inserted_ids)} hospitals in total")


if __name__ == '__main__':
    start()
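
The listing imports its helpers from a local `default` module that isn't shown here. Below is a minimal sketch of what that module presumably provides, assuming lxml for HTML parsing and pymongo for storage; the connection string, database and collection names, and the User-Agent are placeholders for illustration, not the author's actual configuration.

# default.py -- hypothetical sketch of the helper module imported above.
from lxml import etree
from pymongo import MongoClient

# Placeholder request headers; a real crawler would set a proper User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
}

# Hypothetical MongoDB setup: all names here are illustrative only.
client = MongoClient('mongodb://localhost:27017')
db = client['hospital']
area_tab = db['area']        # area pages queued for crawling
crawl_tab = db['hospital']   # hospital entries extracted from list pages


def html2element(text):
    """Parse an HTML string into an lxml element tree."""
    return etree.HTML(text)


def unknown_element(li, hospital):
    """Log a <li> that doesn't match the expected <b><a> structure."""
    snippet = etree.tostring(li, encoding='unicode')[:200]
    print('unknown element under', hospital.get('name', '?'), ':', snippet)

With a module like this in place, the crawler runs directly with `python crawl_list_page.py`, reading area pages from `area_tab` and writing the extracted hospital records to `crawl_tab`.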