Prechádzať zdrojové kódy

在计算价格分位时带上分组

node04 4 dní pred
rodič
commit
cd5b7f6bd0
1 zmenil súbory, kde vykonal 47 pridanie a 14 odobranie
  1. 47 14
      data_process.py

+ 47 - 14
data_process.py

@@ -240,7 +240,17 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
     else:
         df_rise_nodes = pd.DataFrame()
     
-    # 联合价格分布 ==========================================================
+    # 联合价格分布(按 flight_key 分组计算)==========================================================
+    flight_key = ['citypair', 'flight_numbers', 'baggage_weight']
+    group_cols = [
+        c for c in flight_key
+        if (
+            c in df_min_hours.columns
+            or (not df_drop_nodes.empty and c in df_drop_nodes.columns)
+            or (not df_rise_nodes.empty and c in df_rise_nodes.columns)
+        )
+    ]
+
     # 统一初始化
     df_min_hours['relative_position'] = np.nan
     if not df_drop_nodes.empty:
@@ -252,39 +262,62 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
 
     # 当前待预测
     if not df_min_hours.empty and 'price_total' in df_min_hours.columns:
-        cur = df_min_hours[['price_total']].copy()
+        min_group_cols = [c for c in group_cols if c in df_min_hours.columns]
+        cur = df_min_hours[min_group_cols + ['from_date', 'price_total']].copy()
+        for c in group_cols:
+            if c not in cur.columns:
+                cur[c] = np.nan
         cur['price'] = pd.to_numeric(cur['price_total'], errors='coerce')
         cur['source'] = 'min'
         cur['row_id'] = cur.index
-        parts.append(cur[['price', 'source', 'row_id']])
+        parts.append(cur[group_cols + ['from_date', 'price', 'source', 'row_id']])
     
     # 历史降价
     if not df_drop_nodes.empty and 'high_price_amount' in df_drop_nodes.columns:
-        drop = df_drop_nodes[['high_price_amount']].copy()
+        drop_group_cols = [c for c in group_cols if c in df_drop_nodes.columns]
+        drop = df_drop_nodes[drop_group_cols + ['from_date', 'high_price_amount']].copy()
+        for c in group_cols:
+            if c not in drop.columns:
+                drop[c] = np.nan
         drop['price'] = pd.to_numeric(drop['high_price_amount'], errors='coerce')
         drop['source'] = 'drop'
         drop['row_id'] = drop.index
-        parts.append(drop[['price', 'source', 'row_id']])
+        parts.append(drop[group_cols + ['from_date', 'price', 'source', 'row_id']])
     
     # 历史升价
     if not df_rise_nodes.empty and 'prev_rise_amount' in df_rise_nodes.columns:
-        rise = df_rise_nodes[['prev_rise_amount']].copy()
+        rise_group_cols = [c for c in group_cols if c in df_rise_nodes.columns]
+        rise = df_rise_nodes[rise_group_cols + ['from_date', 'prev_rise_amount']].copy()
+        for c in group_cols:
+            if c not in rise.columns:
+                rise[c] = np.nan
         rise['price'] = pd.to_numeric(rise['prev_rise_amount'], errors='coerce')
         rise['source'] = 'rise'
         rise['row_id'] = rise.index
-        parts.append(rise[['price', 'source', 'row_id']])
+        parts.append(rise[group_cols + ['from_date', 'price', 'source', 'row_id']])
     
     if parts:
         all_prices = pd.concat(parts, ignore_index=True)
         all_prices = all_prices.dropna(subset=['price']).reset_index(drop=True)
 
-        # 计算价格百分位
-        dense_rank = all_prices['price'].rank(method='dense')
-        max_rank = dense_rank.max()
-        if pd.notna(max_rank) and max_rank > 1:
-            all_prices['relative_position'] = (dense_rank - 1) / (max_rank - 1)
-        else:
-            all_prices['relative_position'] = 1.0
+        # 计算价格百分位(优先按分组计算,无法分组时回退全局)
+        if group_cols:
+            all_prices['dense_rank'] = all_prices.groupby(group_cols, dropna=False)['price'].rank(method='dense')
+            all_prices['max_rank'] = all_prices.groupby(group_cols, dropna=False)['dense_rank'].transform('max')
+            all_prices['relative_position'] = np.where(
+                all_prices['max_rank'] > 1,
+                (all_prices['dense_rank'] - 1) / (all_prices['max_rank'] - 1),
+                1.0,
+            )
+            all_prices = all_prices.drop(columns=['dense_rank', 'max_rank'])
+        else:    
+            dense_rank = all_prices['price'].rank(method='dense')
+            max_rank = dense_rank.max()
+            if pd.notna(max_rank) and max_rank > 1:
+                all_prices['relative_position'] = (dense_rank - 1) / (max_rank - 1)
+            else:
+                all_prices['relative_position'] = 1.0
+        
         all_prices['relative_position'] = all_prices['relative_position'].round(4)
 
         # 回填到三个表