|
|
@@ -188,31 +188,83 @@ def predict_data_simple(df_input, city_pair, output_dir, predict_dir=".", pred_t
|
|
|
df_sorted['hours_until_departure'].between(24, 360)
|
|
|
].reset_index(drop=True)
|
|
|
|
|
|
- # 每个 gid baggage_weight 取 hours_until_departure 最小的一条
|
|
|
+ # 每个 gid baggage_weight 取 hours_until_departure 最小的一条 (当前小时)
|
|
|
df_min_hours = (
|
|
|
df_sorted.drop_duplicates(subset=['gid', 'baggage_weight'], keep='last')
|
|
|
.reset_index(drop=True)
|
|
|
)
|
|
|
|
|
|
- # 读历史升价-降价
|
|
|
+ # 读历史降价场景
|
|
|
drop_info_csv_path = os.path.join(output_dir, f'{city_pair}_drop_info.csv')
|
|
|
if os.path.exists(drop_info_csv_path):
|
|
|
df_drop_nodes = pd.read_csv(drop_info_csv_path)
|
|
|
else:
|
|
|
df_drop_nodes = pd.DataFrame()
|
|
|
|
|
|
- # 读历史升价-升价
|
|
|
+ # 读历史升价场景
|
|
|
rise_info_csv_path = os.path.join(output_dir, f'{city_pair}_rise_info.csv')
|
|
|
if os.path.exists(rise_info_csv_path):
|
|
|
df_rise_nodes = pd.read_csv(rise_info_csv_path)
|
|
|
else:
|
|
|
df_rise_nodes = pd.DataFrame()
|
|
|
|
|
|
- # ==================== 跨航班日包络线 + 降价潜力 ====================
|
|
|
- print(">>> 构建跨航班日价格包络线")
|
|
|
- flight_key = ['citypair', 'flight_numbers', 'baggage_weight']
|
|
|
- day_key = flight_key + ['from_date']
|
|
|
+ # 联合价格分布
|
|
|
+ # 统一初始化
|
|
|
+ df_min_hours['relative_position'] = np.nan
|
|
|
+ if not df_drop_nodes.empty:
|
|
|
+ df_drop_nodes['relative_position'] = np.nan
|
|
|
+ if not df_rise_nodes.empty:
|
|
|
+ df_rise_nodes['relative_position'] = np.nan
|
|
|
|
|
|
+ parts = []
|
|
|
+
|
|
|
+ # 当前待预测
|
|
|
+ if not df_min_hours.empty and 'price_total' in df_min_hours.columns:
|
|
|
+ cur = df_min_hours[['price_total']].copy()
|
|
|
+ cur['price'] = pd.to_numeric(cur['price_total'], errors='coerce')
|
|
|
+ cur['source'] = 'min'
|
|
|
+ cur['row_id'] = cur.index
|
|
|
+ parts.append(cur[['price', 'source', 'row_id']])
|
|
|
+
|
|
|
+ # 历史降价
|
|
|
+ if not df_drop_nodes.empty and 'high_price_amount' in df_drop_nodes.columns:
|
|
|
+ drop = df_drop_nodes[['high_price_amount']].copy()
|
|
|
+ drop['price'] = pd.to_numeric(drop['high_price_amount'], errors='coerce')
|
|
|
+ drop['source'] = 'drop'
|
|
|
+ drop['row_id'] = drop.index
|
|
|
+ parts.append(drop[['price', 'source', 'row_id']])
|
|
|
|
|
|
+ # 历史升价
|
|
|
+ if not df_rise_nodes.empty and 'prev_rise_amount' in df_rise_nodes.columns:
|
|
|
+ rise = df_rise_nodes[['prev_rise_amount']].copy()
|
|
|
+ rise['price'] = pd.to_numeric(rise['prev_rise_amount'], errors='coerce')
|
|
|
+ rise['source'] = 'rise'
|
|
|
+ rise['row_id'] = rise.index
|
|
|
+ parts.append(rise[['price', 'source', 'row_id']])
|
|
|
+
|
|
|
+ if parts:
|
|
|
+ all_prices = pd.concat(parts, ignore_index=True)
|
|
|
+ all_prices = all_prices.dropna(subset=['price']).reset_index(drop=True)
|
|
|
+
|
|
|
+ # 计算价格百分位
|
|
|
+ dense_rank = all_prices['price'].rank(method='dense')
|
|
|
+ max_rank = dense_rank.max()
|
|
|
+ if pd.notna(max_rank) and max_rank > 1:
|
|
|
+ all_prices['relative_position'] = (dense_rank - 1) / (max_rank - 1)
|
|
|
+ else:
|
|
|
+ all_prices['relative_position'] = 1.0
|
|
|
+ all_prices['relative_position'] = all_prices['relative_position'].round(4)
|
|
|
|
|
|
+ # 回填到三个表
|
|
|
+ m = all_prices['source'] == 'min'
|
|
|
+ df_min_hours.loc[all_prices.loc[m, 'row_id'], 'relative_position'] = all_prices.loc[m, 'relative_position'].values
|
|
|
+
|
|
|
+ if not df_drop_nodes.empty:
|
|
|
+ m = all_prices['source'] == 'drop'
|
|
|
+ df_drop_nodes.loc[all_prices.loc[m, 'row_id'], 'relative_position'] = all_prices.loc[m, 'relative_position'].values
|
|
|
+
|
|
|
+ if not df_rise_nodes.empty:
|
|
|
+ m = all_prices['source'] == 'rise'
|
|
|
+ df_rise_nodes.loc[all_prices.loc[m, 'row_id'], 'relative_position'] = all_prices.loc[m, 'relative_position'].values
|
|
|
+
|
|
|
pass
|