|
|
@@ -4,7 +4,7 @@ import gc
|
|
|
import os
|
|
|
|
|
|
|
|
|
-def preprocess_data_simple(df_input, is_train=False):
|
|
|
+def preprocess_data_simple(df_input, is_train=False, hourly_time=None):
|
|
|
|
|
|
print(">>> 开始数据预处理")
|
|
|
# 城市码映射成数字(不用)
|
|
|
@@ -31,12 +31,13 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
df_input = df_input[df_input['hours_until_departure'] <= 480]
|
|
|
df_input = df_input[df_input['baggage_weight'] == 20] # 先保留20公斤行李的
|
|
|
|
|
|
- # 在hours_until_departure 的末尾 保留真实的而不是补齐的数据
|
|
|
+ # 在hours_until_departure 的末尾 保留到当前时刻的数据
|
|
|
if not is_train:
|
|
|
- _tail_filled = df_input.groupby(['gid', 'baggage_weight'])['is_filled'].transform(
|
|
|
- lambda s: s.iloc[::-1].cummin().iloc[::-1]
|
|
|
- )
|
|
|
- df_input = df_input[~((df_input['is_filled'] == 1) & (_tail_filled == 1))]
|
|
|
+ df_input = df_input[df_input['update_hour'] <= hourly_time].copy()
|
|
|
+ else:
|
|
|
+ df_input = df_input.copy() # 训练集也 copy 一下保持一致性
|
|
|
+
|
|
|
+ df_input = df_input.reset_index(drop=True)
|
|
|
|
|
|
# 价格变化最小量阈值
|
|
|
price_change_amount_threshold = 5
|
|
|
@@ -89,12 +90,15 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
ascending=[True, True, False]
|
|
|
).reset_index(drop=True)
|
|
|
|
|
|
- # 对于先升后降的分析
|
|
|
+ # 每条对应的前一条记录
|
|
|
prev_pct = df_target.groupby(['gid', 'baggage_weight'], group_keys=False)['price_change_percent'].shift(1)
|
|
|
prev_amo = df_target.groupby(['gid', 'baggage_weight'], group_keys=False)['price_change_amount'].shift(1)
|
|
|
prev_dur = df_target.groupby(['gid', 'baggage_weight'], group_keys=False)['price_duration_hours'].shift(1)
|
|
|
prev_price = df_target.groupby(['gid', 'baggage_weight'], group_keys=False)['price_total'].shift(1)
|
|
|
- drop_mask = (prev_pct > 0) & (df_target['price_change_percent'] < 0)
|
|
|
+
|
|
|
+ # 对于先升后降(先降再降)的分析
|
|
|
+ seg_start_mask = df_target['price_duration_hours'].eq(1) # 开始变价节点
|
|
|
+ drop_mask = seg_start_mask & ((prev_pct > 0) | (prev_pct < 0)) & (df_target['price_change_percent'] < 0)
|
|
|
|
|
|
df_drop_nodes = df_target.loc[drop_mask, ['gid', 'baggage_weight', 'hours_until_departure', 'days_to_departure', 'update_hour', 'update_week']].copy()
|
|
|
df_drop_nodes.rename(columns={'hours_until_departure': 'drop_hours_until_departure'}, inplace=True)
|
|
|
@@ -124,9 +128,9 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
# 按顺序排列 去掉gid
|
|
|
df_drop_nodes = df_drop_nodes[flight_info_cols + ['baggage_weight'] + drop_info_cols]
|
|
|
|
|
|
- # 对于“上涨后再次上涨”的分析(连续两个正向变价段)
|
|
|
- seg_start_mask = df_target['price_duration_hours'].eq(1)
|
|
|
- rise_mask = seg_start_mask & (prev_pct > 0) & (df_target['price_change_percent'] > 0)
|
|
|
+ # 对于先升再升(先降再升)的分析
|
|
|
+ # seg_start_mask = df_target['price_duration_hours'].eq(1)
|
|
|
+ rise_mask = seg_start_mask & ((prev_pct > 0) | (prev_pct < 0)) & (df_target['price_change_percent'] > 0)
|
|
|
|
|
|
df_rise_nodes = df_target.loc[rise_mask, ['gid', 'baggage_weight', 'hours_until_departure', 'days_to_departure', 'update_hour', 'update_week']].copy()
|
|
|
df_rise_nodes.rename(columns={'hours_until_departure': 'rise_hours_until_departure'}, inplace=True)
|
|
|
@@ -169,4 +173,46 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
return df_input, df_drop_nodes, df_rise_nodes, df_envelope
|
|
|
|
|
|
return df_input, None, None, None
|
|
|
-
|
|
|
+
|
|
|
+
|
|
|
def _read_history_csv(csv_path):
    """Load a history CSV, falling back to an empty DataFrame.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        The parsed DataFrame, or an empty DataFrame when the file does not
        exist or exists but contains no parsable columns (zero-byte file).
    """
    if not os.path.exists(csv_path):
        return pd.DataFrame()
    try:
        return pd.read_csv(csv_path)
    except pd.errors.EmptyDataError:
        # A file that exists but is empty carries no history; treat it the
        # same way as a missing file instead of aborting the prediction run.
        return pd.DataFrame()


def predict_data_simple(df_input, city_pair, output_dir, predict_dir=".", pred_time_str=""):
    """Prepare the inputs for a simple prediction pass (work in progress).

    The function currently sorts the input, keeps rows whose
    ``hours_until_departure`` lies in ``[24, 360]``, picks the row with the
    smallest ``hours_until_departure`` for each ``(gid, baggage_weight)``
    pair, and loads the historical drop / rise node tables for ``city_pair``
    from ``output_dir``. The cross-flight-day envelope step is announced but
    not implemented yet, so none of the intermediate results are returned.

    Args:
        df_input: DataFrame with at least the columns ``gid``,
            ``baggage_weight`` and ``hours_until_departure``; may be ``None``.
        city_pair: Prefix of the ``<city_pair>_drop_info.csv`` and
            ``<city_pair>_rise_info.csv`` history files.
        output_dir: Directory that holds the history CSV files.
        predict_dir: Currently unused.
        pred_time_str: Currently unused.

    Returns:
        An empty DataFrame when ``df_input`` is ``None`` or empty; otherwise
        ``None``, because the remaining steps are still to be written.
    """
    if df_input is None or df_input.empty:
        return pd.DataFrame()

    group_cols = ['gid', 'baggage_weight']

    # Within each (gid, baggage_weight) group, order rows from far-from-departure
    # to close-to-departure so the last row of a group is the most recent one.
    df_sorted = df_input.sort_values(
        by=group_cols + ['hours_until_departure'],
        ascending=[True, True, False],
    ).reset_index(drop=True)

    # Keep only the 24h..360h-before-departure window (bounds inclusive).
    in_window = df_sorted['hours_until_departure'].between(24, 360)
    df_sorted = df_sorted[in_window].reset_index(drop=True)

    # For every gid / baggage_weight keep the row with the smallest
    # hours_until_departure (the last row of the group after the sort above).
    df_min_hours = (
        df_sorted.drop_duplicates(subset=group_cols, keep='last')
        .reset_index(drop=True)
    )

    # Historical "rise then drop" nodes.
    df_drop_nodes = _read_history_csv(
        os.path.join(output_dir, f'{city_pair}_drop_info.csv')
    )

    # Historical "rise then rise" nodes.
    df_rise_nodes = _read_history_csv(
        os.path.join(output_dir, f'{city_pair}_rise_info.csv')
    )

    # ==================== cross-flight-day envelope + drop potential ====================
    print(">>> 构建跨航班日价格包络线")
    flight_key = ['citypair', 'flight_numbers', 'baggage_weight']
    day_key = flight_key + ['from_date']

    # TODO: the envelope / drop-potential computation is not implemented yet;
    # df_min_hours, df_drop_nodes, df_rise_nodes and day_key are prepared for it.
    return None
|