# DataExport_forTesting.py (8.3 KB)
  1. from pymongo import MongoClient
  2. from datetime import datetime, timedelta
  3. import pandas as pd
  4. import openpyxl
  5. # 数据入库量及数据监控时效 导出execl
  6. # MongoDB连接配置
  7. host = '192.168.3.149'
  8. port = 27180
  9. dbname = 'data_quality'
  10. collection_name = 'statistics'
  11. # 创建MongoDB连接
  12. client = MongoClient(host, port)
  13. db = client[dbname]
  14. collection = db[collection_name]
  15. # 获取当前时间和一周前的时间
  16. end_time = datetime.now()
  17. start_time = end_time - timedelta(weeks=1)
  18. # 将datetime转换为Unix时间戳(整数类型,去掉小数部分)
  19. start_timestamp = int(start_time.timestamp())
  20. end_timestamp = int(end_time.timestamp())
  21. # 输出调试信息:检查开始时间和结束时间
  22. print("Start time:", start_time)
  23. print("End time:", end_time)
  24. print("Start timestamp:", start_timestamp)
  25. print("End timestamp:", end_timestamp)
  26. # ----------------- 第一个Sheet: 断流监控_mongo库 -------------------
  27. # 查询过去一周的数据(断流监控_mongo库)
  28. pipeline_mongo = [
  29. {
  30. "$match": {
  31. "$or": [
  32. {"bidding.timestamp": {"$gte": start_timestamp, "$lt": end_timestamp}},
  33. {"bidding_ai.timestamp": {"$gte": start_timestamp, "$lt": end_timestamp}},
  34. {"connections.timestamp": {"$gte": start_timestamp, "$lt": end_timestamp}},
  35. {"nzj.timestamp": {"$gte": start_timestamp, "$lt": end_timestamp}},
  36. {"bidding_fragment.timestamp": {"$gte": start_timestamp, "$lt": end_timestamp}}
  37. ]
  38. }
  39. },
  40. {
  41. "$limit": 5 # 限制查询返回的结果为前5条数据,便于调试
  42. }
  43. ]
  44. # 获取符合条件的数据
  45. data_mongo = list(collection.aggregate(pipeline_mongo))
  46. # 初始化MongoDB字段统计数据
  47. bidding_count = 0
  48. bidding_ai_count = 0
  49. connections_count = 0
  50. nzj_count = 0
  51. bidding_fragment_data = {
  52. "情报_法务": 0,
  53. "情报_财务审计": 0,
  54. "情报_招标代理": 0,
  55. "情报_管理咨询": 0,
  56. "情报_保险": 0,
  57. "情报_工程设计咨询": 0,
  58. "情报_安防": 0,
  59. "情报_印务商机": 0,
  60. "情报_环境采购": 0,
  61. "情报_家具招投标": 0
  62. }
  63. # 统计MongoDB数据
  64. for doc in data_mongo:
  65. if 'bidding' in doc:
  66. bidding_count += doc['bidding'].get('count', 0)
  67. if 'bidding_ai' in doc:
  68. bidding_ai_count += doc['bidding_ai'].get('count', 0)
  69. if 'connections' in doc:
  70. connections_count += doc['connections'].get('count', 0)
  71. if 'nzj' in doc:
  72. nzj_count += doc['nzj'].get('count', 0)
  73. if 'bidding_fragment' in doc:
  74. for key, value in doc['bidding_fragment'].get('count', {}).items():
  75. if key in bidding_fragment_data:
  76. bidding_fragment_data[key] += value
  77. # ----------------- 第二个Sheet: 断流监控—es -------------------
  78. # 查询过去一周的数据(断流监控—es)
  79. pipeline_es = [
  80. {
  81. "$match": {
  82. "$or": [
  83. {"es_bidding.timestamp": {"$gte": start_timestamp, "$lt": end_timestamp}},
  84. {"es_bidding_ai.timestamp": {"$gte": start_timestamp, "$lt": end_timestamp}},
  85. {"es_nzj.timestamp": {"$gte": start_timestamp, "$lt": end_timestamp}},
  86. {"es_bidding_fragment.timestamp": {"$gte": start_timestamp, "$lt": end_timestamp}}
  87. ]
  88. }
  89. },
  90. {
  91. "$limit": 5 # 限制查询返回的结果为前5条数据,便于调试
  92. }
  93. ]
  94. # 获取符合条件的数据
  95. data_es = list(collection.aggregate(pipeline_es))
  96. # 初始化ES字段统计数据
  97. es_bidding_count = 0
  98. es_bidding_ai_count = 0
  99. es_nzj_count = 0
  100. es_bidding_fragment_data = {
  101. "情报_法务": 0,
  102. "情报_财务审计": 0,
  103. "情报_招标代理": 0,
  104. "情报_管理咨询": 0,
  105. "情报_保险": 0,
  106. "情报_工程设计咨询": 0,
  107. "情报_安防": 0,
  108. "情报_印务商机": 0,
  109. "情报_环境采购": 0,
  110. "情报_家具招投标": 0
  111. }
  112. # 统计ES数据
  113. for doc in data_es:
  114. if 'es_bidding' in doc:
  115. es_bidding_count += doc['es_bidding'].get('count', 0)
  116. if 'es_bidding_ai' in doc:
  117. es_bidding_ai_count += doc['es_bidding_ai'].get('count', 0)
  118. if 'es_nzj' in doc:
  119. es_nzj_count += doc['es_nzj'].get('count', 0)
  120. if 'es_bidding_fragment' in doc:
  121. for key, value in doc['es_bidding_fragment'].get('count', {}).items():
  122. if key in es_bidding_fragment_data:
  123. es_bidding_fragment_data[key] += value
  124. # ----------------- 第三个Sheet: 数据时效监控 -------------------
  125. # 查询过去一周的数据(数据时效监控)
  126. pipeline_timeliness = [
  127. {
  128. "$match": {
  129. "data_timeliness.timestamp": {
  130. "$gte": start_timestamp, # 使用整数Unix时间戳
  131. "$lt": end_timestamp # 使用整数Unix时间戳
  132. }
  133. }
  134. },
  135. {
  136. "$limit": 5 # 限制查询返回的结果为前5条数据,便于调试
  137. }
  138. ]
  139. # 获取符合条件的数据
  140. data_timeliness = list(collection.aggregate(pipeline_timeliness))
  141. # 初始化字段统计数据
  142. timeliness_data = {
  143. "[0,5)分钟": 0,
  144. "[5,15)分钟": 0,
  145. "[15,30)分钟": 0,
  146. "[30,60)分钟": 0,
  147. "[1,3)小时": 0,
  148. "[3,7)小时": 0,
  149. "[7,15)小时": 0,
  150. "[15,24)小时": 0,
  151. "[1,2)天": 0,
  152. "[2,3)天": 0,
  153. "3天+": 0
  154. }
  155. # 统计数据
  156. for doc in data_timeliness:
  157. if 'data_timeliness' in doc:
  158. count_data = doc['data_timeliness'].get('count', {})
  159. timeliness_data["[0,5)分钟"] += float(count_data.get("a1", "0%").replace('%', ''))
  160. timeliness_data["[5,15)分钟"] += float(count_data.get("a2", "0%").replace('%', ''))
  161. timeliness_data["[15,30)分钟"] += float(count_data.get("a3", "0%").replace('%', ''))
  162. timeliness_data["[30,60)分钟"] += float(count_data.get("a4", "0%").replace('%', ''))
  163. timeliness_data["[1,3)小时"] += float(count_data.get("a5", "0%").replace('%', ''))
  164. timeliness_data["[3,7)小时"] += float(count_data.get("a6", "0%").replace('%', ''))
  165. timeliness_data["[7,15)小时"] += float(count_data.get("a7", "0%").replace('%', ''))
  166. timeliness_data["[15,24)小时"] += float(count_data.get("a8", "0%").replace('%', ''))
  167. timeliness_data["[1,2)天"] += float(count_data.get("a9", "0%").replace('%', ''))
  168. timeliness_data["[2,3)天"] += float(count_data.get("a10", "0%").replace('%', ''))
  169. timeliness_data["3天+"] += float(count_data.get("a11", "0%").replace('%', ''))
  170. # 获取当前时间的一周时间范围字符串
  171. date_range = f"{start_time.strftime('%Y/%m/%d')}-{end_time.strftime('%Y/%m/%d')}"
  172. # 构建Excel数据
  173. columns = ['日期', '标讯每周入库数据量', '高质量库每周入库数据量', '人脉管理数据', '拟在建数据量(全国)'] + list(bidding_fragment_data.keys())
  174. data_row_mongo = [date_range, bidding_count, bidding_ai_count, connections_count, nzj_count] + list(bidding_fragment_data.values())
  175. columns_es = ['日期', '标讯每周入库数据量', '高质量库每周数据入库量', '拟在建数据量(全国)'] + list(es_bidding_fragment_data.keys())
  176. data_row_es = [date_range, es_bidding_count, es_bidding_ai_count, es_nzj_count] + list(es_bidding_fragment_data.values())
  177. columns_timeliness = ['日期'] + list(timeliness_data.keys())
  178. data_row_timeliness = [date_range] + list(timeliness_data.values())
  179. # 创建DataFrame并写入Excel
  180. excel_file = 'mongo_data_statistics_combined1.xlsx'
  181. with pd.ExcelWriter(excel_file, engine='openpyxl') as writer:
  182. # 写入第一个sheet(断流监控_mongo库)
  183. df_mongo = pd.DataFrame([data_row_mongo], columns=columns)
  184. df_mongo.to_excel(writer, sheet_name='入库数据量监控-mongo(每周)', index=False)
  185. # 写入第二个sheet(断流监控—es)
  186. df_es = pd.DataFrame([data_row_es], columns=columns_es)
  187. df_es.to_excel(writer, sheet_name='入库量数据量监控-es(每周)', index=False)
  188. # 将timeliness_data中的值转换为百分比字符串
  189. for key in timeliness_data:
  190. timeliness_data[key] = f"{timeliness_data[key]:.2f}%"
  191. # 构建数据行
  192. data_row_timeliness = [date_range] + list(timeliness_data.values())
  193. # 写入第三个sheet(数据时效监控)
  194. df_timeliness = pd.DataFrame([data_row_timeliness], columns=columns_timeliness)
  195. df_timeliness.to_excel(writer, sheet_name='数据时效监控(7天平均值)', index=False)
  196. print(f"统计结果已写入Excel文件: {excel_file}")