liumiaomiao, 6 months ago
Parent commit: 53fb09d286
1 file changed, 7 additions and 4 deletions

client_spider.py: +7 -4

@@ -86,8 +86,8 @@ def insert_batch_data(conn, params):
     """
     Perform a batch data insert
     """
-    query = """INSERT IGNORE INTO bid_analysis (mongoid, site, spidercode, comeintime, area, city, district, score, error_type, spider_modified_time) 
-               VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""
+    query = """INSERT IGNORE INTO bid_analysis (mongoid,toptype,subtype, site, spidercode, channel,comeintime, area, city, district, score, error_type, spider_modified_time) 
+               VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s,%s,%s,%s)"""
     MysqlUtil.insert_data(conn, query, params)
 
 def insert_dynamic_error_field(conn, cleaned_key, error_ids, mongoid):
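
The widened column list only works while the params tuple built in batch_load_data keeps exactly the same order and length (13 columns, 13 placeholders). A minimal sketch, not part of the commit, that derives the placeholders from a single column tuple so the two lists cannot drift apart; the names are taken from the diff:

    COLUMNS = ("mongoid", "toptype", "subtype", "site", "spidercode", "channel",
               "comeintime", "area", "city", "district", "score", "error_type",
               "spider_modified_time")

    def build_insert_query(table="bid_analysis"):
        # One %s per column: adding a field to COLUMNS updates both lists at once.
        placeholders = ", ".join(["%s"] * len(COLUMNS))
        return f"INSERT IGNORE INTO {table} ({', '.join(COLUMNS)}) VALUES ({placeholders})"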
@@ -199,8 +199,11 @@ def batch_load_data():
                 data = result.get("data", {})
 
                 # Insert the data into MySQL
+                toptype = item.get("toptype", "")
+                subtype = item.get("subtype", "")
                 site = item.get("site", "")
                 spidercode = item.get("spidercode", "")
+                channel = item.get("channel", "")
                 comeintime = item.get("comeintime", "")
                 comeintime = datetime.fromtimestamp(comeintime)
                 area = item.get("area", "")
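
Note that comeintime here, and spider_modified_time in the next hunk, fall back to "" when the field is missing, and datetime.fromtimestamp("") raises TypeError, which would abort the whole batch. A hypothetical guard, not in this commit, that both conversion sites could share:

    from datetime import datetime

    def to_datetime(ts):
        # Return None for the "" default (or any other non-numeric value) instead
        # of raising, so one bad record cannot abort the batch; the column stays NULL.
        try:
            return datetime.fromtimestamp(ts)
        except (TypeError, ValueError, OSError):
            return None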
@@ -215,7 +218,7 @@ def batch_load_data():
                     spider_modified_time = info.get("modifytime", "")
                     spider_modified_time = datetime.fromtimestamp(spider_modified_time)
 
-                params = (item["_id"], site, spidercode, comeintime, area, city, district, score, error_type_data,spider_modified_time)
+                params = (item["_id"], toptype, subtype, site, spidercode, channel, comeintime, area, city, district, score, error_type_data, spider_modified_time)
                 insert_batch_data(conn, params)
 
                 # Iterate over the error-reason dict and extract the values from non-empty sub-dicts
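
Because insert_batch_data binds positionally, the params tuple must mirror the INSERT column order exactly. Building the row as a dict keyed by column name, reusing the hypothetical COLUMNS tuple from the first sketch, is one way to make that correspondence explicit:

    row = {
        "mongoid": item["_id"], "toptype": toptype, "subtype": subtype,
        "site": site, "spidercode": spidercode, "channel": channel,
        "comeintime": comeintime, "area": area, "city": city,
        "district": district, "score": score, "error_type": error_type_data,
        "spider_modified_time": spider_modified_time,
    }
    # Ordering comes from COLUMNS, so the tuple cannot drift from the query.
    params = tuple(row[col] for col in COLUMNS)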
@@ -242,7 +245,7 @@ def batch_load_data():
             # Fetch the next batch of data
             search_after = hits[-1]["_id"]  # use the _id of the last record in this batch as the starting point for the next one
             es_query["search_after"] = [search_after]  # keep the _id type consistent
-            response = es_client.search(index="bidding", body=es_query, scroll="1m")
+            response = es_client.search(index="bidding", body=es_query, size="100")
             hits = response['hits']['hits']
 
             # If there is no more data, break out of the loop
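
The replaced call had mixed two pagination mechanisms: Elasticsearch rejects search_after inside a scroll context, so dropping scroll="1m" in favor of a plain search with an explicit page size is the standard search_after pattern. A sketch of that loop; the index name comes from the diff, while the connection URL, sort field, page size, and process() handler are assumptions:

    from elasticsearch import Elasticsearch

    es_client = Elasticsearch("http://localhost:9200")  # assumed address

    es_query = {
        "query": {"match_all": {}},
        "sort": [{"_id": "asc"}],  # search_after requires a deterministic sort key
    }
    response = es_client.search(index="bidding", body=es_query, size=100)
    hits = response["hits"]["hits"]
    while hits:
        for hit in hits:
            process(hit["_source"])  # process() is a hypothetical handler
        # Resume after the last _id seen; the value type must match the sort field.
        es_query["search_after"] = [hits[-1]["_id"]]
        response = es_client.search(index="bidding", body=es_query, size=100)
        hits = response["hits"]["hits"]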