数据处理.py 2.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on 2024-10-19
  4. ---------
  5. @summary:
  6. ---------
  7. @author: Dzr
  8. """
  9. import time
  10. from datetime import datetime
  11. import bson
  12. from pymongo import MongoClient
  Int64 = bson.int64.Int64

  # Number of converted documents buffered before each bulk insert.
  BATCH_SIZE = 100

  client = MongoClient('192.168.3.182', 27017)
  qlm_coll = client['zjb_poc']['qlm_data_lst']
  jy_coll = client['zjb_poc']['jy_data_lst']


  def _split_area(area_name):
      """Split a '-'-separated area value into ``(area, city)``.

      The first segment is the area and the second (if any) is the city;
      further segments are ignored. ``None`` is treated as an empty value so
      the literal text ``'None'`` is never stored as an area name.
      """
      text = '' if area_name is None else str(area_name)
      parts = text.split('-')  # split() always returns at least one element
      city = parts[1] if len(parts) > 1 else ''
      return parts[0], city


  def _pick_toptype(item):
      """Choose the notice type for a source document.

      A ``progName`` containing '国土' wins outright; otherwise
      ``noticeSegmentTypeName`` is used, falling back to ``progName`` when it
      is missing or empty. Both keys are read tolerantly, so an absent or
      ``None`` value yields ``''`` instead of raising.
      """
      prog_name = item.get('progName') or ''
      if '国土' in prog_name:
          return prog_name
      return item.get('noticeSegmentTypeName') or prog_name


  def _none_to_empty(value):
      """Return ``''`` when *value* is ``None``, otherwise *value* unchanged."""
      return '' if value is None else value


  def _build_record(item):
      """Convert one ``qlm_data_lst`` document into a ``jy_data_lst`` record.

      Required source keys: ``url``, ``updateTime``, ``areaName``, ``channel``,
      ``tenderees`` and ``showTitle`` (the latter only when ``popTitle`` is
      absent). Optional keys: ``popTitle``, ``progName``,
      ``noticeSegmentTypeName``, ``bidder``, ``agent``.

      Raises:
          KeyError: a required source key is missing.
          ValueError: ``updateTime`` is not in ``%Y-%m-%d`` format.
      """
      title = item['popTitle'] if 'popTitle' in item else item['showTitle']
      publishtime = item['updateTime']
      # strptime() returns a naive datetime, so timestamp() interprets it in
      # the local timezone of the machine running this script.
      l_np_publishtime = datetime.strptime(publishtime, '%Y-%m-%d').timestamp()
      area, city = _split_area(item['areaName'])
      return {
          'site': '千里马',
          'channel': item['channel'],
          'spidercode': 'sdxzbiddingsjzypc',
          'area': area,
          'city': city,
          'district': '',
          'comeintime': Int64(int(time.time())),
          'isdownload': False,  # whether the detail has been downloaded
          'isfailed': False,  # whether processing failed
          'title': title,  # title
          'href': item['url'],  # link to the notice
          'publishtime': publishtime,  # publish time (string)
          'l_np_publishtime': Int64(l_np_publishtime),  # publish time (epoch seconds)
          'buyer': item['tenderees'],  # tendering organisation
          'toptype': _pick_toptype(item),  # notice type
          'winner': _none_to_empty(item.get('bidder')),  # winning bidder
          'agency': _none_to_empty(item.get('agent')),  # agency
      }


  def _flush_batch(batch, processed):
      """Bulk-insert *batch*, print progress, and return the new running total."""
      jy_coll.insert_many(batch, ordered=False)
      processed += len(batch)
      print('已处理{}条'.format(processed))
      return processed


  count = 0
  insert_lst = []
  with qlm_coll.find() as cursor:
      for item in cursor:
          insert_lst.append(_build_record(item))
          if len(insert_lst) >= BATCH_SIZE:
              count = _flush_batch(insert_lst, count)
              insert_lst = []

  # Write whatever is left over after the last full batch.
  if insert_lst:
      count = _flush_batch(insert_lst, count)
      insert_lst = []

  print('数据处理结束')