vor 1 Monat · d4d4a5bfda
--- a/data_process.py
+++ b/data_process.py
@@ -188,31 +188,83 @@ def predict_data_simple(df_input, city_pair, output_dir, predict_dir=".", pred_t
 
				         df_sorted['hours_until_departure'].between(24, 360)
			
 
				     ].reset_index(drop=True)
			
 
				 
			
 
				-    # 每个 gid  baggage_weight 取 hours_until_departure 最小的一条
			
 
				+    # 每个 gid  baggage_weight 取 hours_until_departure 最小的一条 (当前小时)
			
 
				     df_min_hours = (
			
 
				         df_sorted.drop_duplicates(subset=['gid', 'baggage_weight'], keep='last')
			
 
				         .reset_index(drop=True)
			
 
				     )
			
 
				 
			
 
				-    # 读历史升价-降价
			
 
				+    # 读历史降价场景
			
 
				     drop_info_csv_path = os.path.join(output_dir, f'{city_pair}_drop_info.csv')
			
 
				     if os.path.exists(drop_info_csv_path):
			
 
				         df_drop_nodes = pd.read_csv(drop_info_csv_path)
			
 
				     else:
			
 
				         df_drop_nodes = pd.DataFrame()
			
 
				 
			
 
				-    # 读历史升价-升价
			
 
				+    # 读历史升价场景
			
 
				     rise_info_csv_path = os.path.join(output_dir, f'{city_pair}_rise_info.csv')
			
 
				     if os.path.exists(rise_info_csv_path):
			
 
				         df_rise_nodes = pd.read_csv(rise_info_csv_path)
			
 
				     else:
			
 
				         df_rise_nodes = pd.DataFrame()
			
 
				     
			
 
				-    # ==================== 跨航班日包络线 + 降价潜力 ====================
			
 
				-    print(">>> 构建跨航班日价格包络线")
			
 
				-    flight_key = ['citypair', 'flight_numbers', 'baggage_weight']
			
 
				-    day_key = flight_key + ['from_date']
			
 
				+    # 联合价格分布
			
 
				+    # 统一初始化
			
 
				+    df_min_hours['relative_position'] = np.nan
			
 
				+    if not df_drop_nodes.empty:
			
 
				+        df_drop_nodes['relative_position'] = np.nan
			
 
				+    if not df_rise_nodes.empty:
			
 
				+        df_rise_nodes['relative_position'] = np.nan
			
 
				     
			
 
				+    parts = []
			
 
				+
			
 
				+    # 当前待预测
			
 
				+    if not df_min_hours.empty and 'price_total' in df_min_hours.columns:
			
 
				+        cur = df_min_hours[['price_total']].copy()
			
 
				+        cur['price'] = pd.to_numeric(cur['price_total'], errors='coerce')
			
 
				+        cur['source'] = 'min'
			
 
				+        cur['row_id'] = cur.index
			
 
				+        parts.append(cur[['price', 'source', 'row_id']])
			
 
				+    
			
 
				+    # 历史降价
			
 
				+    if not df_drop_nodes.empty and 'high_price_amount' in df_drop_nodes.columns:
			
 
				+        drop = df_drop_nodes[['high_price_amount']].copy()
			
 
				+        drop['price'] = pd.to_numeric(drop['high_price_amount'], errors='coerce')
			
 
				+        drop['source'] = 'drop'
			
 
				+        drop['row_id'] = drop.index
			
 
				+        parts.append(drop[['price', 'source', 'row_id']])
			
 
				     
			
 
				+    # 历史升价
			
 
				+    if not df_rise_nodes.empty and 'prev_rise_amount' in df_rise_nodes.columns:
			
 
				+        rise = df_rise_nodes[['prev_rise_amount']].copy()
			
 
				+        rise['price'] = pd.to_numeric(rise['prev_rise_amount'], errors='coerce')
			
 
				+        rise['source'] = 'rise'
			
 
				+        rise['row_id'] = rise.index
			
 
				+        parts.append(rise[['price', 'source', 'row_id']])
			
 
				+    
			
 
				+    if parts:
			
 
				+        all_prices = pd.concat(parts, ignore_index=True)
			
 
				+        all_prices = all_prices.dropna(subset=['price']).reset_index(drop=True)
			
 
				+
			
 
				+        # 计算价格百分位
			
 
				+        dense_rank = all_prices['price'].rank(method='dense')
			
 
				+        max_rank = dense_rank.max()
			
 
				+        if pd.notna(max_rank) and max_rank > 1:
			
 
				+            all_prices['relative_position'] = (dense_rank - 1) / (max_rank - 1)
			
 
				+        else:
			
 
				+            all_prices['relative_position'] = 1.0
			
 
				+        all_prices['relative_position'] = all_prices['relative_position'].round(4)
			
 
				 
			
 
				+        # 回填到三个表
			
 
				+        m = all_prices['source'] == 'min'
			
 
				+        df_min_hours.loc[all_prices.loc[m, 'row_id'], 'relative_position'] = all_prices.loc[m, 'relative_position'].values
			
 
				+
			
 
				+        if not df_drop_nodes.empty:
			
 
				+            m = all_prices['source'] == 'drop'
			
 
				+            df_drop_nodes.loc[all_prices.loc[m, 'row_id'], 'relative_position'] = all_prices.loc[m, 'relative_position'].values
			
 
				+
			
 
				+        if not df_rise_nodes.empty:
			
 
				+            m = all_prices['source'] == 'rise'
			
 
				+            df_rise_nodes.loc[all_prices.loc[m, 'row_id'], 'relative_position'] = all_prices.loc[m, 'relative_position'].values
			
 
				+    
			
 
				     pass