Browse Source

加入联合价格百分位

node04 2 days ago
parent
commit
d4d4a5bfda
1 changed files with 59 additions and 7 deletions
  1. 59 7
      data_process.py

+ 59 - 7
data_process.py

@@ -188,31 +188,83 @@ def predict_data_simple(df_input, city_pair, output_dir, predict_dir=".", pred_t
         df_sorted['hours_until_departure'].between(24, 360)
     ].reset_index(drop=True)
 
-    # 每个 gid  baggage_weight 取 hours_until_departure 最小的一条
+    # 每个 gid  baggage_weight 取 hours_until_departure 最小的一条 (当前小时)
     df_min_hours = (
         df_sorted.drop_duplicates(subset=['gid', 'baggage_weight'], keep='last')
         .reset_index(drop=True)
     )
 
-    # 读历史升价-降价
+    # 读历史降价场景
     drop_info_csv_path = os.path.join(output_dir, f'{city_pair}_drop_info.csv')
     if os.path.exists(drop_info_csv_path):
         df_drop_nodes = pd.read_csv(drop_info_csv_path)
     else:
         df_drop_nodes = pd.DataFrame()
 
-    # 读历史升价-升价
+    # 读历史升价场景
     rise_info_csv_path = os.path.join(output_dir, f'{city_pair}_rise_info.csv')
     if os.path.exists(rise_info_csv_path):
         df_rise_nodes = pd.read_csv(rise_info_csv_path)
     else:
         df_rise_nodes = pd.DataFrame()
     
-    # ==================== 跨航班日包络线 + 降价潜力 ====================
-    print(">>> 构建跨航班日价格包络线")
-    flight_key = ['citypair', 'flight_numbers', 'baggage_weight']
-    day_key = flight_key + ['from_date']
+    # 联合价格分布
+    # 统一初始化
+    df_min_hours['relative_position'] = np.nan
+    if not df_drop_nodes.empty:
+        df_drop_nodes['relative_position'] = np.nan
+    if not df_rise_nodes.empty:
+        df_rise_nodes['relative_position'] = np.nan
     
+    parts = []
+
+    # 当前待预测
+    if not df_min_hours.empty and 'price_total' in df_min_hours.columns:
+        cur = df_min_hours[['price_total']].copy()
+        cur['price'] = pd.to_numeric(cur['price_total'], errors='coerce')
+        cur['source'] = 'min'
+        cur['row_id'] = cur.index
+        parts.append(cur[['price', 'source', 'row_id']])
+    
+    # 历史降价
+    if not df_drop_nodes.empty and 'high_price_amount' in df_drop_nodes.columns:
+        drop = df_drop_nodes[['high_price_amount']].copy()
+        drop['price'] = pd.to_numeric(drop['high_price_amount'], errors='coerce')
+        drop['source'] = 'drop'
+        drop['row_id'] = drop.index
+        parts.append(drop[['price', 'source', 'row_id']])
     
+    # 历史升价
+    if not df_rise_nodes.empty and 'prev_rise_amount' in df_rise_nodes.columns:
+        rise = df_rise_nodes[['prev_rise_amount']].copy()
+        rise['price'] = pd.to_numeric(rise['prev_rise_amount'], errors='coerce')
+        rise['source'] = 'rise'
+        rise['row_id'] = rise.index
+        parts.append(rise[['price', 'source', 'row_id']])
+    
+    if parts:
+        all_prices = pd.concat(parts, ignore_index=True)
+        all_prices = all_prices.dropna(subset=['price']).reset_index(drop=True)
+
+        # 计算价格百分位
+        dense_rank = all_prices['price'].rank(method='dense')
+        max_rank = dense_rank.max()
+        if pd.notna(max_rank) and max_rank > 1:
+            all_prices['relative_position'] = (dense_rank - 1) / (max_rank - 1)
+        else:
+            all_prices['relative_position'] = 1.0
+        all_prices['relative_position'] = all_prices['relative_position'].round(4)
 
+        # 回填到三个表
+        m = all_prices['source'] == 'min'
+        df_min_hours.loc[all_prices.loc[m, 'row_id'], 'relative_position'] = all_prices.loc[m, 'relative_position'].values
+
+        if not df_drop_nodes.empty:
+            m = all_prices['source'] == 'drop'
+            df_drop_nodes.loc[all_prices.loc[m, 'row_id'], 'relative_position'] = all_prices.loc[m, 'relative_position'].values
+
+        if not df_rise_nodes.empty:
+            m = all_prices['source'] == 'rise'
+            df_rise_nodes.loc[all_prices.loc[m, 'row_id'], 'relative_position'] = all_prices.loc[m, 'relative_position'].values
+    
     pass