crawl_spider.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393
  1. import re
  2. import time
  3. from urllib.parse import urljoin
  4. import lxml.etree
  5. import requests
  6. from lxml.html import fromstring
  7. from utils.databases import mongo_table, int2long
  8. province_tab = mongo_table('address', 'province')
  9. city_tab = mongo_table('address', 'city')
  10. district_tab = mongo_table('address', 'district')
  11. town_tab = mongo_table('address', 'town')
  12. village_tab = mongo_table('address', 'village')
  13. address_tab = mongo_table('address', 'new_address_2021')
  14. def page_source(url, headers=None, cookies=None, **kwargs):
  15. request_params = {}
  16. if headers is None:
  17. headers = {
  18. "Connection": "keep-alive",
  19. "Pragma": "no-cache",
  20. "Cache-Control": "no-cache",
  21. "Upgrade-Insecure-Requests": "1",
  22. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
  23. "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
  24. "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"
  25. }
  26. if cookies is None:
  27. cookies = {
  28. "SF_cookie_1": "37059734"
  29. }
  30. request_params.setdefault('headers', headers)
  31. request_params.setdefault('cookies', cookies)
  32. request_params.setdefault('timeout', 60)
  33. request_params.setdefault('allow_redirects', False)
  34. request_params.setdefault('proxies', kwargs.get('proxies'))
  35. response = requests.get(url, **request_params)
  36. response.encoding = response.apparent_encoding
  37. return response
  38. def html2element(html):
  39. element = fromstring(html)
  40. return element
  41. def province():
  42. """
  43. {
  44. "_id" : ObjectId("6098cafbb9b8e6b1903a83f4"),
  45. "province_code" : NumberInt(11),
  46. "province" : "北京市"
  47. }
  48. """
  49. url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/index.html"
  50. response = page_source(url)
  51. element = html2element(response.text)
  52. node = element.xpath('//table[@width="100%"]//tr[position()>3]/td')
  53. item = []
  54. for td in node:
  55. name = ''.join(td.xpath('./a/text()')).strip()
  56. href = ''.join(td.xpath('./a/@href')).strip()
  57. province_code = re.match('\d+', href).group()
  58. province_url = urljoin(url, href)
  59. print(name, province_code, province_url)
  60. item.append({
  61. 'province_code': int(province_code),
  62. 'province': name,
  63. 'province_url': province_url
  64. })
  65. province_tab.insert_many(item)
  66. print('[省级]下载完成')
  67. def city():
  68. """
  69. {
  70. "_id" : ObjectId("6098cb97b9b8e6b1903a841a"),
  71. "province_code" : NumberInt(11),
  72. "province" : "北京市",
  73. "city" : "市辖区",
  74. "city_code" : NumberInt(1101)
  75. }
  76. """
  77. with province_tab.find() as cursor:
  78. for item in cursor:
  79. url = item['province_url']
  80. response = page_source(url)
  81. element = html2element(response.text)
  82. node = element.xpath('//table[@class="citytable"]//tr[position()>1]')
  83. city_item = []
  84. for tr in node:
  85. city_code = ''.join(tr.xpath('./td[1]/a/text()')).strip()[0:4]
  86. name = ''.join(tr.xpath('./td[2]/a/text()')).strip()
  87. href = ''.join(tr.xpath('./td[2]/a/@href')).strip()
  88. city_url = urljoin(url, href)
  89. city_item.append({
  90. 'province_code': item['province_code'],
  91. 'province': item['province'],
  92. 'city': name,
  93. 'city_code': int(city_code),
  94. 'city_url': city_url
  95. })
  96. city_tab.insert_many(city_item)
  97. print(f'[市级]{item["province"]}下载完成')
  98. def district():
  99. """
  100. {
  101. "_id" : ObjectId("6098cbb8b9b8e6b1903a8593"),
  102. "province_code" : NumberInt(12),
  103. "province" : "天津市",
  104. "city" : "市辖区",
  105. "city_code" : NumberInt(1201),
  106. "district" : "宝坻区",
  107. "district_code" : NumberInt(120115)
  108. }
  109. """
  110. with city_tab.find() as cursor:
  111. for item in cursor:
  112. url = item['city_url']
  113. while True:
  114. response = page_source(url)
  115. try:
  116. element = html2element(response.text)
  117. node = element.xpath('//table[@class="countytable"]//tr[position()>1]')
  118. district_item = []
  119. district_level_item = []
  120. for tr in node:
  121. attrib = tr.attrib.get('class')
  122. href = ''.join(tr.xpath('./td[2]/a/@href')).strip()
  123. '''
  124. 1、县级市辖区为街道
  125. 2、市辖区无街道
  126. '''
  127. if attrib == 'countytr':
  128. if len(href) > 0:
  129. district_code = ''.join(tr.xpath('./td[1]/a/text()')).strip()[0:6]
  130. name = ''.join(tr.xpath('./td[2]/a/text()')).strip()
  131. district_url = urljoin(url, href)
  132. district_item.append({
  133. 'province_code': item['province_code'],
  134. 'province': item['province'],
  135. 'city': item['city'],
  136. 'city_code': item['city_code'],
  137. 'district': name,
  138. 'district_code': int(district_code),
  139. 'district_url': district_url
  140. })
  141. else:
  142. district_code = ''.join(tr.xpath('./td[1]/text()')).strip()[0:6]
  143. name = ''.join(tr.xpath('./td[2]/text()')).strip()
  144. district_item.append({
  145. 'province_code': item['province_code'],
  146. 'province': item['province'],
  147. 'city': item['city'],
  148. 'city_code': item['city_code'],
  149. 'district': name,
  150. 'district_code': int(district_code),
  151. })
  152. elif attrib == 'towntr':
  153. '''区、县页面出现 towntr 表示该区、县为县级市'''
  154. district_code = ''.join(tr.xpath('./td[1]/a/text()')).strip()[0:6]
  155. name = item['city']
  156. town_code = ''.join(tr.xpath('./td[1]/a/text()')).strip()[0:9]
  157. town_name = ''.join(tr.xpath('./td[2]/a/text()')).strip()
  158. town_url = urljoin(url, href)
  159. district_level_item.append({
  160. 'province_code': item['province_code'],
  161. 'province': item['province'],
  162. 'city': item['city'],
  163. 'city_code': item['city_code'],
  164. 'district': name,
  165. 'district_code': int(district_code),
  166. 'town': town_name,
  167. 'town_code': int(town_code),
  168. 'town_url': town_url
  169. })
  170. else:
  171. raise
  172. break
  173. except lxml.etree.ParserError:
  174. print(f'[县级]{item["province"]}{item["city"]}下载超时,重新获取')
  175. time.sleep(1)
  176. if len(district_item) > 0:
  177. district_tab.insert_many(district_item)
  178. print(f'[县级]{item["province"]}{item["city"]}下载完成')
  179. if len(district_level_item) > 0:
  180. district_tab.insert_many(district_level_item)
  181. print(f'[县级市]{item["province"]}{item["city"]}下载完成')
  182. time.sleep(0.5)
  183. def town():
  184. """
  185. {
  186. "_id" : ObjectId("6098cbceb9b8e6b1903a91b4"),
  187. "province_code" : NumberInt(11),
  188. "province" : "北京市",
  189. "city" : "市辖区",
  190. "city_code" : NumberInt(1101),
  191. "district" : "海淀区",
  192. "district_code" : NumberInt(110108),
  193. "town" : "上庄地区",
  194. "town_code" : NumberInt(110108030)
  195. }
  196. """
  197. query = {"town": {"$exists": True}}
  198. with district_tab.find(query) as cursor:
  199. for item in cursor:
  200. town_tab.insert_one(item)
  201. print(f'[街道]{item["province"]}{item["city"]}{item["district"]}下载完成')
  202. query = {"town": {"$exists": False}, "district_url": {"$exists": True}}
  203. with district_tab.find(query) as cursor:
  204. for item in cursor:
  205. url = item['district_url']
  206. while True:
  207. response = page_source(url)
  208. try:
  209. element = html2element(response.text)
  210. node = element.xpath('//table[@class="towntable"]//tr[position()>1]')
  211. town_item = []
  212. for tr in node:
  213. href = ''.join(tr.xpath('./td[2]/a/@href')).strip()
  214. town_code = ''.join(tr.xpath('./td[1]/a/text()')).strip()[0:9]
  215. name = ''.join(tr.xpath('./td[2]/a/text()')).strip()
  216. town_url = urljoin(url, href)
  217. town_item.append({
  218. 'province_code': item['province_code'],
  219. 'province': item['province'],
  220. 'city': item['city'],
  221. 'city_code': item['city_code'],
  222. 'district': item['district'],
  223. 'district_code': item['district_code'],
  224. 'town': name,
  225. 'town_code': int(town_code),
  226. 'town_url': town_url,
  227. })
  228. break
  229. except lxml.etree.ParserError:
  230. print(f'[街道]{item["province"]}{item["city"]}{item["district"]}下载超时,重新获取')
  231. time.sleep(1)
  232. town_tab.insert_many(town_item)
  233. print(f'[街道]{item["province"]}{item["city"]}{item["district"]}下载完成')
  234. time.sleep(0.5)
  235. def village():
  236. """
  237. {
  238. "_id" : ObjectId("6098cc2bb9b8e6b1903b3a38"),
  239. "province_code" : NumberInt(11),
  240. "province" : "北京市",
  241. "city" : "市辖区",
  242. "city_code" : NumberInt(1101),
  243. "district" : "海淀区",
  244. "district_code" : NumberInt(110108),
  245. "town" : "万寿路街道",
  246. "town_code" : NumberInt(110108001),
  247. "village" : "复兴路26号社区居委会",
  248. "village_code" : NumberLong(110108001020)
  249. }
  250. """
  251. with town_tab.find() as cursor:
  252. for item in cursor:
  253. url = item['town_url']
  254. while True:
  255. try:
  256. response = page_source(url)
  257. except requests.RequestException:
  258. print(f'[行政区话代码]{item["province"]}{item["city"]}{item["district"]}{item["town"]}访问超时,重新获取')
  259. time.sleep(1)
  260. continue
  261. try:
  262. element = html2element(response.text)
  263. node = element.xpath('//table[@class="villagetable"]//tr[position()>1]')
  264. village_item = []
  265. for tr in node:
  266. village_code = ''.join(tr.xpath('./td[1]/text()')).strip()
  267. name = ''.join(tr.xpath('./td[3]/text()')).strip()
  268. village_item.append({
  269. 'province_code': item['province_code'],
  270. 'province': item['province'],
  271. 'city': item['city'],
  272. 'city_code': item['city_code'],
  273. 'district': item['district'],
  274. 'district_code': item['district_code'],
  275. 'town': item['town'],
  276. 'town_code': item['town_code'],
  277. 'village': name,
  278. 'village_code': int2long(int(village_code))
  279. })
  280. break
  281. except lxml.etree.ParserError:
  282. print(f'[行政区话代码]{item["province"]}{item["city"]}{item["district"]}{item["town"]}下载超时,重新获取')
  283. time.sleep(1)
  284. try:
  285. village_tab.insert_many(village_item)
  286. print(f'[行政区划代码]{item["province"]}{item["city"]}{item["district"]}{item["town"]}下载完成')
  287. time.sleep(0.5)
  288. except TypeError:
  289. print(url)
  290. breakpoint()
  291. def address():
  292. mgo_maps = {
  293. 'province': {
  294. 'table': province_tab,
  295. 'query': {},
  296. 'projection': {'province_code': 1, 'province': 1}
  297. },
  298. 'city': {
  299. 'table': city_tab,
  300. 'query': {},
  301. 'projection': {
  302. 'province_code': 1,
  303. 'province': 1,
  304. 'city': 1,
  305. 'city_code': 1
  306. }
  307. },
  308. 'district': {
  309. 'table': district_tab,
  310. 'query': {},
  311. 'projection': {
  312. 'province_code': 1,
  313. 'province': 1,
  314. 'city': 1,
  315. 'city_code': 1,
  316. 'district': 1,
  317. 'district_code': 1
  318. }
  319. },
  320. 'town': {
  321. 'table': town_tab,
  322. 'query': {},
  323. 'projection': {
  324. 'province_code': 1,
  325. 'province': 1,
  326. 'city': 1,
  327. 'city_code': 1,
  328. 'district': 1,
  329. 'district_code': 1,
  330. 'town': 1,
  331. 'town_code': 1
  332. }
  333. },
  334. 'village': {
  335. 'table': village_tab,
  336. 'query': {},
  337. 'projection': {
  338. 'province_code': 1,
  339. 'province': 1,
  340. 'city': 1,
  341. 'city_code': 1,
  342. 'district': 1,
  343. 'district_code': 1,
  344. 'town': 1,
  345. 'town_code': 1,
  346. 'village': 1,
  347. 'village_code': 1
  348. }
  349. }
  350. }
  351. for key, maps in mgo_maps.items():
  352. tab = maps['table']
  353. query = maps['query']
  354. projection = maps['projection']
  355. with tab.find(query, projection) as cursor:
  356. for item in cursor:
  357. del item['_id']
  358. address_tab.insert_one(item)
  359. print(f'{key} >> {item}')
  360. def main():
  361. province()
  362. city()
  363. district()
  364. town()
  365. village()
  366. address()
  367. if __name__ == '__main__':
  368. main()