|
|
@@ -7,7 +7,10 @@ import os
|
|
|
def preprocess_data_simple(df_input, is_train=False):
|
|
|
|
|
|
print(">>> 开始数据预处理")
|
|
|
- # 城市码映射成数字
|
|
|
+ # 城市码映射成数字(不用)
|
|
|
+
|
|
|
+ # 更新日期是周几
|
|
|
+ df_input['update_week'] = df_input['update_hour'].dt.dayofweek + 1
|
|
|
|
|
|
# gid:基于指定字段的分组标记(整数)
|
|
|
df_input['gid'] = (
|
|
|
@@ -93,8 +96,11 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
prev_price = df_target.groupby(['gid', 'baggage_weight'], group_keys=False)['price_total'].shift(1)
|
|
|
drop_mask = (prev_pct > 0) & (df_target['price_change_percent'] < 0)
|
|
|
|
|
|
- df_drop_nodes = df_target.loc[drop_mask, ['gid', 'baggage_weight', 'hours_until_departure']].copy()
|
|
|
+ df_drop_nodes = df_target.loc[drop_mask, ['gid', 'baggage_weight', 'hours_until_departure', 'days_to_departure', 'update_hour', 'update_week']].copy()
|
|
|
df_drop_nodes.rename(columns={'hours_until_departure': 'drop_hours_until_departure'}, inplace=True)
|
|
|
+ df_drop_nodes.rename(columns={'days_to_departure': 'drop_days_to_departure'}, inplace=True)
|
|
|
+ df_drop_nodes.rename(columns={'update_hour': 'drop_update_hour'}, inplace=True)
|
|
|
+ df_drop_nodes.rename(columns={'update_week': 'drop_update_week'}, inplace=True)
|
|
|
df_drop_nodes['drop_price_change_percent'] = df_target.loc[drop_mask, 'price_change_percent'].astype(float).round(4).to_numpy()
|
|
|
df_drop_nodes['drop_price_change_amount'] = df_target.loc[drop_mask, 'price_change_amount'].astype(float).round(2).to_numpy()
|
|
|
df_drop_nodes['high_price_duration_hours'] = prev_dur.loc[drop_mask].astype(float).to_numpy()
|
|
|
@@ -111,7 +117,8 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
df_drop_nodes = df_drop_nodes.merge(df_gid_info, on=['gid', 'baggage_weight'], how='left')
|
|
|
|
|
|
drop_info_cols = [
|
|
|
- 'drop_hours_until_departure', 'drop_price_change_percent', 'drop_price_change_amount',
|
|
|
+ 'drop_update_hour', 'drop_update_week',
|
|
|
+ 'drop_days_to_departure', 'drop_hours_until_departure', 'drop_price_change_percent', 'drop_price_change_amount',
|
|
|
'high_price_duration_hours', 'high_price_change_percent', 'high_price_change_amount', 'high_price_amount',
|
|
|
]
|
|
|
# 按顺序排列 去掉gid
|
|
|
@@ -121,8 +128,11 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
seg_start_mask = df_target['price_duration_hours'].eq(1)
|
|
|
rise_mask = seg_start_mask & (prev_pct > 0) & (df_target['price_change_percent'] > 0)
|
|
|
|
|
|
- df_rise_nodes = df_target.loc[rise_mask, ['gid', 'baggage_weight', 'hours_until_departure']].copy()
|
|
|
+ df_rise_nodes = df_target.loc[rise_mask, ['gid', 'baggage_weight', 'hours_until_departure', 'days_to_departure', 'update_hour', 'update_week']].copy()
|
|
|
df_rise_nodes.rename(columns={'hours_until_departure': 'rise_hours_until_departure'}, inplace=True)
|
|
|
+ df_rise_nodes.rename(columns={'days_to_departure': 'rise_days_to_departure'}, inplace=True)
|
|
|
+ df_rise_nodes.rename(columns={'update_hour': 'rise_update_hour'}, inplace=True)
|
|
|
+ df_rise_nodes.rename(columns={'update_week': 'rise_update_week'}, inplace=True)
|
|
|
df_rise_nodes['rise_price_change_percent'] = df_target.loc[rise_mask, 'price_change_percent'].astype(float).round(4).to_numpy()
|
|
|
df_rise_nodes['rise_price_change_amount'] = df_target.loc[rise_mask, 'price_change_amount'].astype(float).round(2).to_numpy()
|
|
|
df_rise_nodes['prev_rise_duration_hours'] = prev_dur.loc[rise_mask].astype(float).to_numpy()
|
|
|
@@ -134,7 +144,8 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
df_rise_nodes = df_rise_nodes.merge(df_gid_info, on=['gid', 'baggage_weight'], how='left')
|
|
|
|
|
|
rise_info_cols = [
|
|
|
- 'rise_hours_until_departure', 'rise_price_change_percent', 'rise_price_change_amount',
|
|
|
+ 'rise_update_hour', 'rise_update_week',
|
|
|
+ 'rise_days_to_departure', 'rise_hours_until_departure', 'rise_price_change_percent', 'rise_price_change_amount',
|
|
|
'prev_rise_duration_hours', 'prev_rise_change_percent', 'prev_rise_change_amount', 'prev_rise_amount',
|
|
|
]
|
|
|
df_rise_nodes = df_rise_nodes[flight_info_cols + ['baggage_weight'] + rise_info_cols]
|
|
|
@@ -143,10 +154,13 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
envelope_group = ['citypair', 'flight_numbers', 'from_date', 'baggage_weight']
|
|
|
idx_peak = df_input.groupby(envelope_group)['price_total'].idxmax()
|
|
|
df_envelope = df_input.loc[idx_peak, envelope_group + [
|
|
|
- 'price_total', 'hours_until_departure'
|
|
|
+ 'from_time', 'price_total', 'hours_until_departure', 'days_to_departure', 'update_hour', 'update_week',
|
|
|
]].rename(columns={
|
|
|
'price_total': 'peak_price',
|
|
|
'hours_until_departure': 'peak_hours',
|
|
|
+ 'days_to_departure': 'peak_days',
|
|
|
+ 'update_hour': 'peak_time',
|
|
|
+ 'update_week': 'peak_week',
|
|
|
}).reset_index(drop=True)
|
|
|
|
|
|
del df_gid_info
|