|
|
@@ -831,3 +831,101 @@ def standardization(df, feature_scaler, target_scaler=None, is_training=True, is
|
|
|
print(">>> 基于固定范围的特征数据归一化完成")
|
|
|
|
|
|
return df, feature_scaler, target_scaler
|
|
|
+
|
|
|
+
|
|
|
def preprocess_data_simple(df_input, is_train=False, output_dir='.'):
    """Derive price-change features per (gid, baggage) and, when training, a price-drop table.

    The input is first passed through ``preprocess_data_first_half``; the
    result is sorted, restricted to rows with ``hours_until_departure <= 480``
    and ``baggage == 30``, and enriched with three columns:

    * ``price_change_amount``  - last non-zero difference of
      ``adult_total_price`` against the previous row of the group (2 decimals).
    * ``price_change_percent`` - the same, as a relative change (4 decimals).
    * ``price_duration_hours`` - 1-based position of the row inside the current
      run of identical ``price_change_amount`` values.

    Args:
        df_input: DataFrame accepted by ``preprocess_data_first_half``.
        is_train: When False, rows with ``is_filled != 0`` are discarded and no
            drop table is produced. When True, filled rows are kept and the
            drop table is built from rows with
            ``18 <= hours_until_departure <= 54``.
        output_dir: Not used by this function.

    Returns:
        A tuple ``(features, drop_nodes)``. ``drop_nodes`` is ``None`` when
        ``is_train`` is False; otherwise it is a DataFrame with one row per
        detected drop (a negative ``price_change_percent`` whose preceding row
        in the same ``gid`` had a positive one), holding the flight
        description columns followed by the drop / preceding-rise columns.
    """
    itinerary_keys = ['gid', 'baggage']

    def carry_last_nonzero(changes):
        # A zero change means "nothing new happened": blank the zeros, carry
        # the last non-zero change forward, and use 0 where none exists yet.
        return changes.replace(0, np.nan).ffill().fillna(0)

    def per_itinerary(column):
        return frame.groupby(itinerary_keys, group_keys=False)[column]

    frame = preprocess_data_first_half(df_input)

    # Within each gid / baggage, order rows by descending hours until departure.
    frame = frame.sort_values(
        by=itinerary_keys + ['hours_until_departure'],
        ascending=[True, True, False],
    )
    frame = frame.reset_index(drop=True)

    keep = (frame['hours_until_departure'] <= 480) & (frame['baggage'] == 30)
    if not is_train:
        # Keep only real observations, not the filled-in ones.
        keep &= frame['is_filled'] == 0
    frame = frame[keep]

    # Absolute price change against the previous row of the group.
    frame['price_change_amount'] = per_itinerary('adult_total_price').apply(
        lambda prices: carry_last_nonzero(prices.diff())
    ).round(2)

    # Relative price change against the previous row of the group.
    frame['price_change_percent'] = per_itinerary('adult_total_price').apply(
        lambda prices: carry_last_nonzero(prices.pct_change())
    ).round(4)

    # Step 1: number the runs of identical price_change_amount values.
    frame['price_change_segment'] = per_itinerary('price_change_amount').apply(
        lambda amounts: amounts.ne(amounts.shift()).cumsum()
    )

    # Step 2: 1-based row position inside each run.
    run_position = frame.groupby(
        itinerary_keys + ['price_change_segment'], group_keys=False
    ).cumcount()
    frame['price_duration_hours'] = run_position.add(1)

    # The run id was only a helper column.
    frame = frame.drop(columns=['price_change_segment'])

    # Move these two columns to the end, then append the Baggage copy.
    for moved in ('Adult_Total_Price', 'Hours_Until_Departure'):
        frame[moved] = frame.pop(moved)
    frame['Baggage'] = frame['baggage']

    if not is_train:
        return frame, None

    in_window = frame['hours_until_departure'].between(18, 54)
    window = frame[in_window].copy()
    window = window.sort_values(
        by=['gid', 'hours_until_departure'],
        ascending=[True, False],
    ).reset_index(drop=True)

    # Values of the preceding row within the same gid.
    by_flight = window.groupby('gid', group_keys=False)
    prev_percent = by_flight['price_change_percent'].shift(1)
    prev_amount = by_flight['price_change_amount'].shift(1)
    prev_duration = by_flight['price_duration_hours'].shift(1)

    # A drop node: the previous row carried a rise and this row carries a fall.
    is_drop = (prev_percent > 0) & (window['price_change_percent'] < 0)

    nodes = (
        window.loc[is_drop, ['gid', 'hours_until_departure']]
        .rename(columns={'hours_until_departure': 'drop_hours_until_departure'})
        .reset_index(drop=True)
    )

    # (output column, source series, decimals to round to or None)
    node_features = (
        ('drop_price_change_percent', window['price_change_percent'], 4),
        ('drop_price_change_amount', window['price_change_amount'], 2),
        ('high_price_duration_hours', prev_duration, None),
        ('high_price_change_percent', prev_percent, 4),
        ('high_price_change_amount', prev_amount, 2),
    )
    for column, source, decimals in node_features:
        values = source.loc[is_drop].astype(float)
        if decimals is not None:
            values = values.round(decimals)
        nodes[column] = values.to_numpy()

    flight_info_cols = [
        'city_pair',
        'flight_number_1', 'seg1_dep_air_port', 'seg1_dep_time', 'seg1_arr_air_port', 'seg1_arr_time',
        'flight_number_2', 'seg2_dep_air_port', 'seg2_dep_time', 'seg2_arr_air_port', 'seg2_arr_time',
        'currency', 'baggage', 'flight_day',
    ]
    drop_info_cols = ['drop_hours_until_departure'] + [spec[0] for spec in node_features]

    # One description row per gid, attached to every drop node of that gid.
    gid_info = (
        window[['gid'] + flight_info_cols]
        .drop_duplicates(subset=['gid'])
        .reset_index(drop=True)
    )
    nodes = nodes.merge(gid_info, on='gid', how='left')

    # Fixed column order; gid itself is left out of the result.
    nodes = nodes[flight_info_cols + drop_info_cols]

    return frame, nodes
|