|
@@ -847,9 +847,12 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
df_input = df_input[df_input['hours_until_departure'] <= 480]
|
|
df_input = df_input[df_input['hours_until_departure'] <= 480]
|
|
|
df_input = df_input[df_input['baggage'] == 30]
|
|
df_input = df_input[df_input['baggage'] == 30]
|
|
|
|
|
|
|
|
- # 保留真实的而不是补齐的数据
|
|
|
|
|
|
|
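+ # Outside training: per (gid, baggage) group, drop only the trailing run of filled rows (is_filled == 1) at the end of the hours_until_departure sequence; real rows and filled rows followed by real data are kept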
|
|
|
if not is_train:
|
|
if not is_train:
|
|
|
- df_input = df_input[df_input['is_filled'] == 0]
|
|
|
|
|
|
|
+ _tail_filled = df_input.groupby(['gid', 'baggage'])['is_filled'].transform(
|
|
|
|
|
+ lambda s: s.iloc[::-1].cummin().iloc[::-1]
|
|
|
|
|
+ )
|
|
|
|
|
+ df_input = df_input[~((df_input['is_filled'] == 1) & (_tail_filled == 1))]
|
|
|
|
|
|
|
|
# 计算价格变化量
|
|
# 计算价格变化量
|
|
|
df_input['price_change_amount'] = (
|
|
df_input['price_change_amount'] = (
|
|
@@ -878,6 +881,18 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
|
|
|
|
|
# 可选:删除临时列
|
|
# 可选:删除临时列
|
|
|
df_input = df_input.drop(columns=['price_change_segment'])
|
|
df_input = df_input.drop(columns=['price_change_segment'])
|
|
|
|
|
+
|
|
|
|
|
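+ # Record the change in seats_remaining only on rows where adult_total_price changed within its (gid, baggage) group; every other row is set to NaN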
|
|
|
|
|
+ _price_diff = df_input.groupby(['gid', 'baggage'], group_keys=False)['adult_total_price'].diff()
|
|
|
|
|
+ _price_changed = _price_diff.notna() & _price_diff.ne(0)
|
|
|
|
|
+ _seats_diff = df_input.groupby(['gid', 'baggage'], group_keys=False)['seats_remaining'].diff()
|
|
|
|
|
+ df_input['seats_remaining_change_amount'] = _seats_diff.where(_price_changed).round(0)
|
|
|
|
|
+ # Forward-fill within each (gid, baggage) group, then replace any NaN that remains with 0
|
|
|
|
|
+ df_input['seats_remaining_change_amount'] = (
|
|
|
|
|
+ df_input.groupby(['gid', 'baggage'], group_keys=False)['seats_remaining_change_amount']
|
|
|
|
|
+ .ffill()
|
|
|
|
|
+ .fillna(0)
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
adult_price = df_input.pop('Adult_Total_Price')
|
|
adult_price = df_input.pop('Adult_Total_Price')
|
|
|
hours_until = df_input.pop('Hours_Until_Departure')
|
|
hours_until = df_input.pop('Hours_Until_Departure')
|
|
@@ -887,7 +902,7 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
|
|
|
|
|
# 训练过程
|
|
# 训练过程
|
|
|
if is_train:
|
|
if is_train:
|
|
|
- df_target = df_input[(df_input['hours_until_departure'] >= 18) & (df_input['hours_until_departure'] <= 54)].copy()
|
|
|
|
|
|
|
+ df_target = df_input[(df_input['hours_until_departure'] >= 12) & (df_input['hours_until_departure'] <= 60)].copy()
|
|
|
df_target = df_target.sort_values(
|
|
df_target = df_target.sort_values(
|
|
|
by=['gid', 'hours_until_departure'],
|
|
by=['gid', 'hours_until_departure'],
|
|
|
ascending=[True, False]
|
|
ascending=[True, False]
|
|
@@ -897,6 +912,7 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
prev_pct = df_target.groupby('gid', group_keys=False)['price_change_percent'].shift(1)
|
|
prev_pct = df_target.groupby('gid', group_keys=False)['price_change_percent'].shift(1)
|
|
|
prev_amo = df_target.groupby('gid', group_keys=False)['price_change_amount'].shift(1)
|
|
prev_amo = df_target.groupby('gid', group_keys=False)['price_change_amount'].shift(1)
|
|
|
prev_dur = df_target.groupby('gid', group_keys=False)['price_duration_hours'].shift(1)
|
|
prev_dur = df_target.groupby('gid', group_keys=False)['price_duration_hours'].shift(1)
|
|
|
|
|
+ prev_seats_amo = df_target.groupby('gid', group_keys=False)['seats_remaining_change_amount'].shift(1)
|
|
|
drop_mask = (prev_pct > 0) & (df_target['price_change_percent'] < 0)
|
|
drop_mask = (prev_pct > 0) & (df_target['price_change_percent'] < 0)
|
|
|
|
|
|
|
|
df_drop_nodes = df_target.loc[drop_mask, ['gid', 'hours_until_departure']].copy()
|
|
df_drop_nodes = df_target.loc[drop_mask, ['gid', 'hours_until_departure']].copy()
|
|
@@ -906,7 +922,7 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
df_drop_nodes['high_price_duration_hours'] = prev_dur.loc[drop_mask].astype(float).to_numpy()
|
|
df_drop_nodes['high_price_duration_hours'] = prev_dur.loc[drop_mask].astype(float).to_numpy()
|
|
|
df_drop_nodes['high_price_change_percent'] = prev_pct.loc[drop_mask].astype(float).round(4).to_numpy()
|
|
df_drop_nodes['high_price_change_percent'] = prev_pct.loc[drop_mask].astype(float).round(4).to_numpy()
|
|
|
df_drop_nodes['high_price_change_amount'] = prev_amo.loc[drop_mask].astype(float).round(2).to_numpy()
|
|
df_drop_nodes['high_price_change_amount'] = prev_amo.loc[drop_mask].astype(float).round(2).to_numpy()
|
|
|
-
|
|
|
|
|
|
|
+ df_drop_nodes['high_price_seats_remaining_change_amount'] = prev_seats_amo.loc[drop_mask].astype(float).round(1).to_numpy()
|
|
|
df_drop_nodes = df_drop_nodes.reset_index(drop=True)
|
|
df_drop_nodes = df_drop_nodes.reset_index(drop=True)
|
|
|
|
|
|
|
|
flight_info_cols = [
|
|
flight_info_cols = [
|
|
@@ -922,17 +938,20 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
df_drop_nodes = df_drop_nodes.merge(df_gid_info, on='gid', how='left')
|
|
df_drop_nodes = df_drop_nodes.merge(df_gid_info, on='gid', how='left')
|
|
|
|
|
|
|
|
drop_info_cols = ['drop_hours_until_departure', 'drop_price_change_percent', 'drop_price_change_amount',
|
|
drop_info_cols = ['drop_hours_until_departure', 'drop_price_change_percent', 'drop_price_change_amount',
|
|
|
- 'high_price_duration_hours', 'high_price_change_percent', 'high_price_change_amount'
|
|
|
|
|
|
|
+ 'high_price_duration_hours', 'high_price_change_percent', 'high_price_change_amount',
|
|
|
|
|
+ 'high_price_seats_remaining_change_amount',
|
|
|
]
|
|
]
|
|
|
# 按顺序排列 去掉gid
|
|
# 按顺序排列 去掉gid
|
|
|
df_drop_nodes = df_drop_nodes[flight_info_cols + drop_info_cols]
|
|
df_drop_nodes = df_drop_nodes[flight_info_cols + drop_info_cols]
|
|
|
|
|
+ df_drop_nodes = df_drop_nodes[df_drop_nodes['drop_price_change_percent'] <= -0.01] # 太低的降幅不计
|
|
|
|
|
|
|
|
# 对于没有先升后降的gid进行分析
|
|
# 对于没有先升后降的gid进行分析
|
|
|
gids_with_drop = df_target.loc[drop_mask, 'gid'].unique()
|
|
gids_with_drop = df_target.loc[drop_mask, 'gid'].unique()
|
|
|
df_no_drop = df_target[~df_target['gid'].isin(gids_with_drop)].copy()
|
|
df_no_drop = df_target[~df_target['gid'].isin(gids_with_drop)].copy()
|
|
|
|
|
|
|
|
keep_info_cols = [
|
|
keep_info_cols = [
|
|
|
- 'keep_hours_until_departure', 'keep_price_change_percent', 'keep_price_change_amount', 'keep_price_duration_hours'
|
|
|
|
|
|
|
+ 'keep_hours_until_departure', 'keep_price_change_percent', 'keep_price_change_amount',
|
|
|
|
|
+ 'keep_price_duration_hours', 'keep_seats_remaining_change_amount',
|
|
|
]
|
|
]
|
|
|
|
|
|
|
|
if df_no_drop.empty:
|
|
if df_no_drop.empty:
|
|
@@ -954,7 +973,8 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
df_keep_nodes = df_keep_row[
|
|
df_keep_nodes = df_keep_row[
|
|
|
- ['gid', 'hours_until_departure', 'price_change_percent', 'price_change_amount', 'price_duration_hours']
|
|
|
|
|
|
|
+ ['gid', 'hours_until_departure', 'price_change_percent', 'price_change_amount',
|
|
|
|
|
+ 'price_duration_hours', 'seats_remaining_change_amount']
|
|
|
].copy()
|
|
].copy()
|
|
|
df_keep_nodes.rename(
|
|
df_keep_nodes.rename(
|
|
|
columns={
|
|
columns={
|
|
@@ -962,6 +982,7 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
'price_change_percent': 'keep_price_change_percent',
|
|
'price_change_percent': 'keep_price_change_percent',
|
|
|
'price_change_amount': 'keep_price_change_amount',
|
|
'price_change_amount': 'keep_price_change_amount',
|
|
|
'price_duration_hours': 'keep_price_duration_hours',
|
|
'price_duration_hours': 'keep_price_duration_hours',
|
|
|
|
|
+ 'seats_remaining_change_amount': 'keep_seats_remaining_change_amount',
|
|
|
},
|
|
},
|
|
|
inplace=True,
|
|
inplace=True,
|
|
|
)
|
|
)
|