|
@@ -906,28 +906,28 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
# 第三步:段级余票变化(上一段终点余票 -> 当前段起点余票),首段为 n->n
|
|
# 第三步:段级余票变化(上一段终点余票 -> 当前段起点余票),首段为 n->n
|
|
|
- _seg_keys = ['gid', 'baggage', 'price_change_segment']
|
|
|
|
|
- _seg_seats = (
|
|
|
|
|
- df_input.groupby(_seg_keys, as_index=False)['seats_remaining']
|
|
|
|
|
- .agg(_seg_first_seats='first', _seg_last_seats='last')
|
|
|
|
|
- )
|
|
|
|
|
- _seg_seats['_prev_seg_last_seats'] = (
|
|
|
|
|
- _seg_seats.groupby(['gid', 'baggage'], group_keys=False)['_seg_last_seats']
|
|
|
|
|
- .shift(1)
|
|
|
|
|
- .fillna(_seg_seats['_seg_first_seats'])
|
|
|
|
|
- )
|
|
|
|
|
|
|
+ # _seg_keys = ['gid', 'baggage', 'price_change_segment']
|
|
|
|
|
+ # _seg_seats = (
|
|
|
|
|
+ # df_input.groupby(_seg_keys, as_index=False)['seats_remaining']
|
|
|
|
|
+ # .agg(_seg_first_seats='first', _seg_last_seats='last')
|
|
|
|
|
+ # )
|
|
|
|
|
+ # _seg_seats['_prev_seg_last_seats'] = (
|
|
|
|
|
+ # _seg_seats.groupby(['gid', 'baggage'], group_keys=False)['_seg_last_seats']
|
|
|
|
|
+ # .shift(1)
|
|
|
|
|
+ # .fillna(_seg_seats['_seg_first_seats'])
|
|
|
|
|
+ # )
|
|
|
|
|
|
|
|
- _seg_seats['_seg_first_seats'] = pd.to_numeric(_seg_seats['_seg_first_seats'], errors='coerce').round().astype('Int64')
|
|
|
|
|
- _seg_seats['_prev_seg_last_seats'] = pd.to_numeric(_seg_seats['_prev_seg_last_seats'], errors='coerce').round().astype('Int64')
|
|
|
|
|
- _seg_seats['seats_remaining_transition'] = (
|
|
|
|
|
- _seg_seats['_prev_seg_last_seats'].astype(str) + '->' + _seg_seats['_seg_first_seats'].astype(str)
|
|
|
|
|
- )
|
|
|
|
|
|
|
+ # _seg_seats['_seg_first_seats'] = pd.to_numeric(_seg_seats['_seg_first_seats'], errors='coerce').round().astype('Int64')
|
|
|
|
|
+ # _seg_seats['_prev_seg_last_seats'] = pd.to_numeric(_seg_seats['_prev_seg_last_seats'], errors='coerce').round().astype('Int64')
|
|
|
|
|
+ # _seg_seats['seats_remaining_transition'] = (
|
|
|
|
|
+ # _seg_seats['_prev_seg_last_seats'].astype(str) + '->' + _seg_seats['_seg_first_seats'].astype(str)
|
|
|
|
|
+ # )
|
|
|
|
|
|
|
|
- df_input = df_input.merge(
|
|
|
|
|
- _seg_seats[_seg_keys + ['seats_remaining_transition']],
|
|
|
|
|
- on=_seg_keys,
|
|
|
|
|
- how='left'
|
|
|
|
|
- )
|
|
|
|
|
|
|
+ # df_input = df_input.merge(
|
|
|
|
|
+ # _seg_seats[_seg_keys + ['seats_remaining_transition']],
|
|
|
|
|
+ # on=_seg_keys,
|
|
|
|
|
+ # how='left'
|
|
|
|
|
+ # )
|
|
|
|
|
|
|
|
# 可选:删除临时列
|
|
# 可选:删除临时列
|
|
|
# df_input = df_input.drop(columns=['price_change_segment'])
|
|
# df_input = df_input.drop(columns=['price_change_segment'])
|
|
@@ -964,30 +964,31 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
prev_amo = df_target.groupby('gid', group_keys=False)['price_change_amount'].shift(1)
|
|
prev_amo = df_target.groupby('gid', group_keys=False)['price_change_amount'].shift(1)
|
|
|
prev_dur = df_target.groupby('gid', group_keys=False)['price_duration_hours'].shift(1)
|
|
prev_dur = df_target.groupby('gid', group_keys=False)['price_duration_hours'].shift(1)
|
|
|
prev_price = df_target.groupby('gid', group_keys=False)['adult_total_price'].shift(1)
|
|
prev_price = df_target.groupby('gid', group_keys=False)['adult_total_price'].shift(1)
|
|
|
- prev_seats_trans = df_target.groupby('gid', group_keys=False)['seats_remaining_transition'].shift(1)
|
|
|
|
|
|
|
+ prev_seats = df_target.groupby('gid', group_keys=False)['seats_remaining'].shift(1)
|
|
|
|
|
|
|
|
# 对于先升后降(先降后降)的分析
|
|
# 对于先升后降(先降后降)的分析
|
|
|
seg_start_mask = df_target['price_duration_hours'].eq(1) # 开始变价节点
|
|
seg_start_mask = df_target['price_duration_hours'].eq(1) # 开始变价节点
|
|
|
- # 正例库仅保留24小时内发生的降价:上一价格段持续时长需<=24h
|
|
|
|
|
|
|
+ # 正例库
|
|
|
prev_pct_num = pd.to_numeric(prev_pct, errors='coerce')
|
|
prev_pct_num = pd.to_numeric(prev_pct, errors='coerce')
|
|
|
drop_mask = (
|
|
drop_mask = (
|
|
|
seg_start_mask
|
|
seg_start_mask
|
|
|
& prev_pct_num.notna()
|
|
& prev_pct_num.notna()
|
|
|
& (df_target['price_change_percent'] < 0)
|
|
& (df_target['price_change_percent'] < 0)
|
|
|
- & prev_dur.le(24)
|
|
|
|
|
|
|
+ # & prev_dur.le(24) # 仅保留24小时内发生的降价:上一价格段持续时长需<=24h
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
- df_drop_nodes = df_target.loc[drop_mask, ['gid', 'hours_until_departure', 'days_to_departure', 'update_hour']].copy()
|
|
|
|
|
|
|
+ df_drop_nodes = df_target.loc[drop_mask, ['gid', 'hours_until_departure', 'days_to_departure', 'update_hour', 'seats_remaining']].copy()
|
|
|
df_drop_nodes.rename(columns={'hours_until_departure': 'drop_hours_until_departure'}, inplace=True)
|
|
df_drop_nodes.rename(columns={'hours_until_departure': 'drop_hours_until_departure'}, inplace=True)
|
|
|
df_drop_nodes.rename(columns={'days_to_departure': 'drop_days_to_departure'}, inplace=True)
|
|
df_drop_nodes.rename(columns={'days_to_departure': 'drop_days_to_departure'}, inplace=True)
|
|
|
df_drop_nodes.rename(columns={'update_hour': 'drop_update_hour'}, inplace=True)
|
|
df_drop_nodes.rename(columns={'update_hour': 'drop_update_hour'}, inplace=True)
|
|
|
|
|
+ df_drop_nodes.rename(columns={'seats_remaining': 'drop_seats_remaining'}, inplace=True)
|
|
|
df_drop_nodes['drop_price_change_percent'] = df_target.loc[drop_mask, 'price_change_percent'].astype(float).round(4).to_numpy()
|
|
df_drop_nodes['drop_price_change_percent'] = df_target.loc[drop_mask, 'price_change_percent'].astype(float).round(4).to_numpy()
|
|
|
df_drop_nodes['drop_price_change_amount'] = df_target.loc[drop_mask, 'price_change_amount'].astype(float).round(2).to_numpy()
|
|
df_drop_nodes['drop_price_change_amount'] = df_target.loc[drop_mask, 'price_change_amount'].astype(float).round(2).to_numpy()
|
|
|
df_drop_nodes['high_price_duration_hours'] = prev_dur.loc[drop_mask].astype(float).to_numpy()
|
|
df_drop_nodes['high_price_duration_hours'] = prev_dur.loc[drop_mask].astype(float).to_numpy()
|
|
|
df_drop_nodes['high_price_change_percent'] = prev_pct.loc[drop_mask].astype(float).round(4).to_numpy()
|
|
df_drop_nodes['high_price_change_percent'] = prev_pct.loc[drop_mask].astype(float).round(4).to_numpy()
|
|
|
df_drop_nodes['high_price_change_amount'] = prev_amo.loc[drop_mask].astype(float).round(2).to_numpy()
|
|
df_drop_nodes['high_price_change_amount'] = prev_amo.loc[drop_mask].astype(float).round(2).to_numpy()
|
|
|
df_drop_nodes['high_price_amount'] = prev_price.loc[drop_mask].astype(float).round(2).to_numpy()
|
|
df_drop_nodes['high_price_amount'] = prev_price.loc[drop_mask].astype(float).round(2).to_numpy()
|
|
|
- df_drop_nodes['high_price_seats_remaining_transition'] = prev_seats_trans.loc[drop_mask].astype(str)
|
|
|
|
|
|
|
+ df_drop_nodes['high_price_seats_remaining'] = prev_seats.loc[drop_mask].astype(int)
|
|
|
df_drop_nodes = df_drop_nodes.reset_index(drop=True)
|
|
df_drop_nodes = df_drop_nodes.reset_index(drop=True)
|
|
|
|
|
|
|
|
flight_info_cols = [
|
|
flight_info_cols = [
|
|
@@ -1003,9 +1004,9 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
df_drop_nodes = df_drop_nodes.merge(df_gid_info, on='gid', how='left')
|
|
df_drop_nodes = df_drop_nodes.merge(df_gid_info, on='gid', how='left')
|
|
|
|
|
|
|
|
drop_info_cols = ['drop_update_hour', 'drop_days_to_departure',
|
|
drop_info_cols = ['drop_update_hour', 'drop_days_to_departure',
|
|
|
- 'drop_hours_until_departure', 'drop_price_change_percent', 'drop_price_change_amount',
|
|
|
|
|
|
|
+ 'drop_hours_until_departure', 'drop_price_change_percent', 'drop_price_change_amount', 'drop_seats_remaining',
|
|
|
'high_price_duration_hours', 'high_price_change_percent', 'high_price_change_amount',
|
|
'high_price_duration_hours', 'high_price_change_percent', 'high_price_change_amount',
|
|
|
- 'high_price_amount', 'high_price_seats_remaining_transition',
|
|
|
|
|
|
|
+ 'high_price_amount', 'high_price_seats_remaining',
|
|
|
]
|
|
]
|
|
|
# 按顺序排列 保留gid
|
|
# 按顺序排列 保留gid
|
|
|
df_drop_nodes = df_drop_nodes[flight_info_cols + drop_info_cols]
|
|
df_drop_nodes = df_drop_nodes[flight_info_cols + drop_info_cols]
|
|
@@ -1013,7 +1014,7 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
df_drop_nodes = df_drop_nodes[df_drop_nodes['drop_hours_until_departure'] <= 360]
|
|
df_drop_nodes = df_drop_nodes[df_drop_nodes['drop_hours_until_departure'] <= 360]
|
|
|
df_drop_nodes = df_drop_nodes[df_drop_nodes['start_hours_until_departure'] >= 72]
|
|
df_drop_nodes = df_drop_nodes[df_drop_nodes['start_hours_until_departure'] >= 72]
|
|
|
df_drop_nodes = df_drop_nodes[df_drop_nodes['high_price_duration_hours'] > 1.0] # 维持时间太短的不计
|
|
df_drop_nodes = df_drop_nodes[df_drop_nodes['high_price_duration_hours'] > 1.0] # 维持时间太短的不计
|
|
|
- df_drop_nodes = df_drop_nodes[df_drop_nodes['drop_price_change_amount'].abs() >= 1] # 1$之内的降价不计
|
|
|
|
|
|
|
+ df_drop_nodes = df_drop_nodes[df_drop_nodes['drop_price_change_amount'].abs() > 1] # 1$之内的降价不计
|
|
|
|
|
|
|
|
# 反例库:所有有效节点(不限升价)中,未来24小时内未发生降价
|
|
# 反例库:所有有效节点(不限升价)中,未来24小时内未发生降价
|
|
|
# seg_start_mask = df_target['price_duration_hours'].eq(1)
|
|
# seg_start_mask = df_target['price_duration_hours'].eq(1)
|
|
@@ -1022,51 +1023,40 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
valid_mask = seg_start_mask & prev_pct_num.notna()
|
|
valid_mask = seg_start_mask & prev_pct_num.notna()
|
|
|
|
|
|
|
|
curr_pct = pd.to_numeric(df_target['price_change_percent'], errors='coerce')
|
|
curr_pct = pd.to_numeric(df_target['price_change_percent'], errors='coerce')
|
|
|
- prev_dur_num = pd.to_numeric(prev_dur, errors='coerce')
|
|
|
|
|
|
|
+ # prev_dur_num = pd.to_numeric(prev_dur, errors='coerce')
|
|
|
pos_case_mask = curr_pct.ge(0)
|
|
pos_case_mask = curr_pct.ge(0)
|
|
|
- neg_case_mask = curr_pct.lt(0) & prev_dur_num.gt(24)
|
|
|
|
|
|
|
+ # neg_case_mask = curr_pct.lt(0) & prev_dur_num.gt(24)
|
|
|
|
|
|
|
|
- # next_seg_hours = pd.Series(index=df_target.index, dtype='float64')
|
|
|
|
|
- # next_seg_pct = pd.Series(index=df_target.index, dtype='float64')
|
|
|
|
|
- # next_seg_hours.loc[seg_start_mask] = (
|
|
|
|
|
- # df_target.loc[seg_start_mask].groupby('gid')['hours_until_departure'].shift(-1).to_numpy()
|
|
|
|
|
- # )
|
|
|
|
|
- # next_seg_pct.loc[seg_start_mask] = (
|
|
|
|
|
- # df_target.loc[seg_start_mask].groupby('gid')['price_change_percent'].shift(-1).to_numpy()
|
|
|
|
|
- # )
|
|
|
|
|
-
|
|
|
|
|
- # hours_to_next_seg = df_target['hours_until_departure'] - next_seg_hours
|
|
|
|
|
- # drop_within_24h = next_seg_pct.lt(0) & hours_to_next_seg.ge(0) & hours_to_next_seg.le(24)
|
|
|
|
|
-
|
|
|
|
|
- rise_mask = valid_mask & (pos_case_mask | neg_case_mask)
|
|
|
|
|
|
|
+ rise_mask = valid_mask & pos_case_mask # (pos_case_mask | neg_case_mask)
|
|
|
# rise_mask = seg_start_mask & ((prev_pct > 0) | (prev_pct < 0)) & (df_target['price_change_percent'] > 0)
|
|
# rise_mask = seg_start_mask & ((prev_pct > 0) | (prev_pct < 0)) & (df_target['price_change_percent'] > 0)
|
|
|
|
|
|
|
|
- df_rise_nodes = df_target.loc[rise_mask, ['gid', 'hours_until_departure', 'days_to_departure', 'update_hour']].copy()
|
|
|
|
|
|
|
+ df_rise_nodes = df_target.loc[rise_mask, ['gid', 'hours_until_departure', 'days_to_departure', 'update_hour', 'seats_remaining']].copy()
|
|
|
df_rise_nodes.rename(columns={'hours_until_departure': 'rise_hours_until_departure'}, inplace=True)
|
|
df_rise_nodes.rename(columns={'hours_until_departure': 'rise_hours_until_departure'}, inplace=True)
|
|
|
df_rise_nodes.rename(columns={'days_to_departure': 'rise_days_to_departure'}, inplace=True)
|
|
df_rise_nodes.rename(columns={'days_to_departure': 'rise_days_to_departure'}, inplace=True)
|
|
|
df_rise_nodes.rename(columns={'update_hour': 'rise_update_hour'}, inplace=True)
|
|
df_rise_nodes.rename(columns={'update_hour': 'rise_update_hour'}, inplace=True)
|
|
|
|
|
+ df_rise_nodes.rename(columns={'seats_remaining': 'rise_seats_remaining'}, inplace=True)
|
|
|
df_rise_nodes['rise_price_change_percent'] = df_target.loc[rise_mask, 'price_change_percent'].astype(float).round(4).to_numpy()
|
|
df_rise_nodes['rise_price_change_percent'] = df_target.loc[rise_mask, 'price_change_percent'].astype(float).round(4).to_numpy()
|
|
|
df_rise_nodes['rise_price_change_amount'] = df_target.loc[rise_mask, 'price_change_amount'].astype(float).round(2).to_numpy()
|
|
df_rise_nodes['rise_price_change_amount'] = df_target.loc[rise_mask, 'price_change_amount'].astype(float).round(2).to_numpy()
|
|
|
df_rise_nodes['prev_rise_duration_hours'] = prev_dur.loc[rise_mask].astype(float).to_numpy()
|
|
df_rise_nodes['prev_rise_duration_hours'] = prev_dur.loc[rise_mask].astype(float).to_numpy()
|
|
|
df_rise_nodes['prev_rise_change_percent'] = prev_pct.loc[rise_mask].astype(float).round(4).to_numpy()
|
|
df_rise_nodes['prev_rise_change_percent'] = prev_pct.loc[rise_mask].astype(float).round(4).to_numpy()
|
|
|
df_rise_nodes['prev_rise_change_amount'] = prev_amo.loc[rise_mask].astype(float).round(2).to_numpy()
|
|
df_rise_nodes['prev_rise_change_amount'] = prev_amo.loc[rise_mask].astype(float).round(2).to_numpy()
|
|
|
df_rise_nodes['prev_rise_amount'] = prev_price.loc[rise_mask].astype(float).round(2).to_numpy()
|
|
df_rise_nodes['prev_rise_amount'] = prev_price.loc[rise_mask].astype(float).round(2).to_numpy()
|
|
|
- df_rise_nodes['prev_rise_seats_remaining_transition'] = prev_seats_trans.loc[rise_mask].astype(str)
|
|
|
|
|
|
|
+ df_rise_nodes['prev_rise_seats_remaining'] = prev_seats.loc[rise_mask].astype(int)
|
|
|
df_rise_nodes = df_rise_nodes.reset_index(drop=True)
|
|
df_rise_nodes = df_rise_nodes.reset_index(drop=True)
|
|
|
|
|
|
|
|
df_rise_nodes = df_rise_nodes.merge(df_gid_info, on='gid', how='left')
|
|
df_rise_nodes = df_rise_nodes.merge(df_gid_info, on='gid', how='left')
|
|
|
rise_info_cols = [
|
|
rise_info_cols = [
|
|
|
'rise_update_hour', 'rise_days_to_departure',
|
|
'rise_update_hour', 'rise_days_to_departure',
|
|
|
- 'rise_hours_until_departure', 'rise_price_change_percent', 'rise_price_change_amount',
|
|
|
|
|
|
|
+ 'rise_hours_until_departure', 'rise_price_change_percent', 'rise_price_change_amount', 'rise_seats_remaining',
|
|
|
'prev_rise_duration_hours', 'prev_rise_change_percent', 'prev_rise_change_amount',
|
|
'prev_rise_duration_hours', 'prev_rise_change_percent', 'prev_rise_change_amount',
|
|
|
- 'prev_rise_amount', 'prev_rise_seats_remaining_transition',
|
|
|
|
|
|
|
+ 'prev_rise_amount', 'prev_rise_seats_remaining',
|
|
|
]
|
|
]
|
|
|
df_rise_nodes = df_rise_nodes[flight_info_cols + rise_info_cols]
|
|
df_rise_nodes = df_rise_nodes[flight_info_cols + rise_info_cols]
|
|
|
df_rise_nodes['start_hours_until_departure'] = (df_rise_nodes['rise_hours_until_departure'] + df_rise_nodes['prev_rise_duration_hours']).round().astype('Int64')
|
|
df_rise_nodes['start_hours_until_departure'] = (df_rise_nodes['rise_hours_until_departure'] + df_rise_nodes['prev_rise_duration_hours']).round().astype('Int64')
|
|
|
df_rise_nodes = df_rise_nodes[df_rise_nodes['rise_hours_until_departure'] <= 360]
|
|
df_rise_nodes = df_rise_nodes[df_rise_nodes['rise_hours_until_departure'] <= 360]
|
|
|
df_rise_nodes = df_rise_nodes[df_rise_nodes['start_hours_until_departure'] >= 72]
|
|
df_rise_nodes = df_rise_nodes[df_rise_nodes['start_hours_until_departure'] >= 72]
|
|
|
df_rise_nodes = df_rise_nodes[df_rise_nodes['prev_rise_duration_hours'] > 1.0] # 维持时间太短的不计
|
|
df_rise_nodes = df_rise_nodes[df_rise_nodes['prev_rise_duration_hours'] > 1.0] # 维持时间太短的不计
|
|
|
- df_rise_nodes = df_rise_nodes[df_rise_nodes['rise_price_change_amount'].abs() >= 1] # 1$之内的改变不计
|
|
|
|
|
|
|
+ df_rise_nodes = df_rise_nodes[df_rise_nodes['rise_price_change_amount'].abs() > 1] # 1$之内的改变不计
|
|
|
|
|
|
|
|
# 制作历史包络线
|
|
# 制作历史包络线
|
|
|
envelope_group = ['city_pair', 'flight_number_1', 'flight_number_2', 'flight_day']
|
|
envelope_group = ['city_pair', 'flight_number_1', 'flight_number_2', 'flight_day']
|
|
@@ -1325,7 +1315,7 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
# del df_hist, df_future, df_envelope_all, df_envelope_agg, df_top # df_drop_freq, df_rise_freq
|
|
# del df_hist, df_future, df_envelope_all, df_envelope_agg, df_top # df_drop_freq, df_rise_freq
|
|
|
|
|
|
|
|
total_cnt_before = len(df_min_hours) # 记录下过滤前的总数
|
|
total_cnt_before = len(df_min_hours) # 记录下过滤前的总数
|
|
|
- df_min_hours = df_min_hours[(df_min_hours['is_good_target'] == 1) & (df_min_hours['seats_remaining'] >= 1)].reset_index(drop=True) # 保留值得投放的
|
|
|
|
|
|
|
+ df_min_hours = df_min_hours[(df_min_hours['is_good_target'] == 1) & (df_min_hours['seats_remaining'] >= 2)].reset_index(drop=True) # 保留值得投放的
|
|
|
total_cnt_after = len(df_min_hours) # 记录下过滤后的总数
|
|
total_cnt_after = len(df_min_hours) # 记录下过滤后的总数
|
|
|
|
|
|
|
|
# 余票为1的样本去掉
|
|
# 余票为1的样本去掉
|
|
@@ -1349,9 +1339,9 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
df_min_hours['rise_price_sample_size'] = 0
|
|
df_min_hours['rise_price_sample_size'] = 0
|
|
|
|
|
|
|
|
# 这个阈值取多少?
|
|
# 这个阈值取多少?
|
|
|
- pct_threshold = 0.1
|
|
|
|
|
|
|
+ pct_threshold = 0.3
|
|
|
# pct_threshold = 2
|
|
# pct_threshold = 2
|
|
|
- pct_threshold_1 = 0.1
|
|
|
|
|
|
|
+ pct_threshold_1 = 0.3
|
|
|
# pct_threshold_c = 0.001
|
|
# pct_threshold_c = 0.001
|
|
|
|
|
|
|
|
for idx, row in df_min_hours.iterrows():
|
|
for idx, row in df_min_hours.iterrows():
|
|
@@ -1369,7 +1359,7 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
# seats_remaining_change_amount = row['seats_remaining_change_amount']
|
|
# seats_remaining_change_amount = row['seats_remaining_change_amount']
|
|
|
price_amount = row['adult_total_price']
|
|
price_amount = row['adult_total_price']
|
|
|
seats_remaining = row['seats_remaining']
|
|
seats_remaining = row['seats_remaining']
|
|
|
- seats_remaining_transition = row['seats_remaining_transition']
|
|
|
|
|
|
|
+ # seats_remaining_transition = row['seats_remaining_transition']
|
|
|
# envelope_position = row['envelope_position']
|
|
# envelope_position = row['envelope_position']
|
|
|
|
|
|
|
|
length_drop = 0
|
|
length_drop = 0
|
|
@@ -1402,9 +1392,10 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
pct_vals = pd.to_numeric(df_drop_nodes_part['high_price_change_percent'], errors='coerce')
|
|
pct_vals = pd.to_numeric(df_drop_nodes_part['high_price_change_percent'], errors='coerce')
|
|
|
df_drop_gap = df_drop_nodes_part.loc[
|
|
df_drop_gap = df_drop_nodes_part.loc[
|
|
|
pct_vals.notna(),
|
|
pct_vals.notna(),
|
|
|
- ['drop_days_to_departure', 'drop_hours_until_departure', 'drop_price_change_percent', 'drop_price_change_amount',
|
|
|
|
|
|
|
+ ['flight_day',
|
|
|
|
|
+ 'drop_days_to_departure', 'drop_hours_until_departure', 'drop_price_change_percent', 'drop_price_change_amount',
|
|
|
'high_price_duration_hours', 'high_price_change_percent',
|
|
'high_price_duration_hours', 'high_price_change_percent',
|
|
|
- 'high_price_change_amount', 'high_price_amount', 'high_price_seats_remaining_transition']
|
|
|
|
|
|
|
+ 'high_price_change_amount', 'high_price_amount', 'high_price_seats_remaining', 'start_hours_until_departure']
|
|
|
].copy()
|
|
].copy()
|
|
|
df_drop_gap['pct_gap'] = (pct_vals.loc[pct_vals.notna()] - pct_base)
|
|
df_drop_gap['pct_gap'] = (pct_vals.loc[pct_vals.notna()] - pct_base)
|
|
|
df_drop_gap['pct_abs_gap'] = df_drop_gap['pct_gap'].abs()
|
|
df_drop_gap['pct_abs_gap'] = df_drop_gap['pct_gap'].abs()
|
|
@@ -1423,7 +1414,8 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
(df_drop_gap['pct_abs_gap'] <= pct_threshold)
|
|
(df_drop_gap['pct_abs_gap'] <= pct_threshold)
|
|
|
& (df_drop_gap['price_abs_gap'] <= 0.001)
|
|
& (df_drop_gap['price_abs_gap'] <= 0.001)
|
|
|
& same_sign_mask
|
|
& same_sign_mask
|
|
|
- & (df_drop_gap['high_price_seats_remaining_transition'] == seats_remaining_transition)
|
|
|
|
|
|
|
+ # & (df_drop_gap['high_price_seats_remaining'] == seats_remaining)
|
|
|
|
|
+ & (df_drop_gap['high_price_duration_hours'] <= 48)
|
|
|
].copy()
|
|
].copy()
|
|
|
# df_match = df_drop_gap[(df_drop_gap['pct_abs_gap'] <= pct_threshold) & (df_drop_gap['price_abs_gap'] <= 1.0)].copy()
|
|
# df_match = df_drop_gap[(df_drop_gap['pct_abs_gap'] <= pct_threshold) & (df_drop_gap['price_abs_gap'] <= 1.0)].copy()
|
|
|
# df_drop_gap = df_drop_gap.sort_values(['price_abs_gap'], ascending=[True])
|
|
# df_drop_gap = df_drop_gap.sort_values(['price_abs_gap'], ascending=[True])
|
|
@@ -1441,16 +1433,21 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
# df_match_chk = df_match_chk.loc[dur_vals.notna()].copy()
|
|
# df_match_chk = df_match_chk.loc[dur_vals.notna()].copy()
|
|
|
# df_match_chk = df_match_chk.loc[(dur_vals.loc[dur_vals.notna()] - float(dur_base)).abs() <= 36].copy()
|
|
# df_match_chk = df_match_chk.loc[(dur_vals.loc[dur_vals.notna()] - float(dur_base)).abs() <= 36].copy()
|
|
|
|
|
|
|
|
- # drop_hud_vals = pd.to_numeric(df_match_chk['drop_hours_until_departure'], errors='coerce')
|
|
|
|
|
- # df_match_chk = df_match_chk.loc[drop_hud_vals.notna()].copy()
|
|
|
|
|
- # df_match_chk = df_match_chk.loc[(float(hud_base) - drop_hud_vals.loc[drop_hud_vals.notna()]) >= -24].copy()
|
|
|
|
|
|
|
+ # 正例收紧 (距离起飞的小时数)
|
|
|
|
|
+ drop_hud_vals = pd.to_numeric(df_match_chk['drop_hours_until_departure'], errors='coerce')
|
|
|
|
|
+ df_match_chk = df_match_chk.loc[drop_hud_vals.notna()].copy()
|
|
|
|
|
+ df_match_chk = df_match_chk.loc[(float(hud_base) - drop_hud_vals.loc[drop_hud_vals.notna()]) >= 0].copy()
|
|
|
|
|
+
|
|
|
|
|
+ start_hud_vals = pd.to_numeric(df_match_chk['start_hours_until_departure'], errors='coerce')
|
|
|
|
|
+ df_match_chk = df_match_chk.loc[start_hud_vals.notna()].copy()
|
|
|
|
|
+ df_match_chk = df_match_chk.loc[(float(hud_base) - start_hud_vals.loc[start_hud_vals.notna()]) <= 0].copy()
|
|
|
|
|
|
|
|
- # 正例收紧
|
|
|
|
|
|
|
+ # 正例收紧 (持续小时数)
|
|
|
dur_num_chk = pd.to_numeric(df_match_chk['high_price_duration_hours'], errors='coerce')
|
|
dur_num_chk = pd.to_numeric(df_match_chk['high_price_duration_hours'], errors='coerce')
|
|
|
dur_delta = dur_num_chk - float(dur_base)
|
|
dur_delta = dur_num_chk - float(dur_base)
|
|
|
df_match_chk = df_match_chk.assign(dur_delta=dur_delta)
|
|
df_match_chk = df_match_chk.assign(dur_delta=dur_delta)
|
|
|
df_match_chk = df_match_chk.loc[df_match_chk['dur_delta'].notna()].copy()
|
|
df_match_chk = df_match_chk.loc[df_match_chk['dur_delta'].notna()].copy()
|
|
|
- df_match_chk = df_match_chk.loc[df_match_chk['dur_delta'].abs() <= 72].copy()
|
|
|
|
|
|
|
+ # df_match_chk = df_match_chk.loc[df_match_chk['dur_delta'].abs() <= 24].copy()
|
|
|
|
|
|
|
|
# seats_vals = pd.to_numeric(df_match_chk['high_price_seats_remaining_change_amount'], errors='coerce')
|
|
# seats_vals = pd.to_numeric(df_match_chk['high_price_seats_remaining_change_amount'], errors='coerce')
|
|
|
# df_match_chk = df_match_chk.loc[seats_vals.notna()].copy()
|
|
# df_match_chk = df_match_chk.loc[seats_vals.notna()].copy()
|
|
@@ -1521,9 +1518,10 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
pct_vals_1 = pd.to_numeric(df_rise_nodes_part['prev_rise_change_percent'], errors='coerce')
|
|
pct_vals_1 = pd.to_numeric(df_rise_nodes_part['prev_rise_change_percent'], errors='coerce')
|
|
|
df_rise_gap_1 = df_rise_nodes_part.loc[
|
|
df_rise_gap_1 = df_rise_nodes_part.loc[
|
|
|
pct_vals_1.notna(),
|
|
pct_vals_1.notna(),
|
|
|
- ['rise_days_to_departure', 'rise_hours_until_departure', 'rise_price_change_percent', 'rise_price_change_amount',
|
|
|
|
|
|
|
+ ['flight_day',
|
|
|
|
|
+ 'rise_days_to_departure', 'rise_hours_until_departure', 'rise_price_change_percent', 'rise_price_change_amount',
|
|
|
'prev_rise_duration_hours', 'prev_rise_change_percent',
|
|
'prev_rise_duration_hours', 'prev_rise_change_percent',
|
|
|
- 'prev_rise_change_amount', 'prev_rise_amount', 'prev_rise_seats_remaining_transition']
|
|
|
|
|
|
|
+ 'prev_rise_change_amount', 'prev_rise_amount', 'prev_rise_seats_remaining', 'start_hours_until_departure']
|
|
|
].copy()
|
|
].copy()
|
|
|
df_rise_gap_1['pct_gap'] = (pct_vals_1.loc[pct_vals_1.notna()] - pct_base_1)
|
|
df_rise_gap_1['pct_gap'] = (pct_vals_1.loc[pct_vals_1.notna()] - pct_base_1)
|
|
|
df_rise_gap_1['pct_abs_gap'] = df_rise_gap_1['pct_gap'].abs()
|
|
df_rise_gap_1['pct_abs_gap'] = df_rise_gap_1['pct_gap'].abs()
|
|
@@ -1542,7 +1540,7 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
(df_rise_gap_1['pct_abs_gap'] <= pct_threshold_1)
|
|
(df_rise_gap_1['pct_abs_gap'] <= pct_threshold_1)
|
|
|
& (df_rise_gap_1['price_abs_gap'] <= 0.001)
|
|
& (df_rise_gap_1['price_abs_gap'] <= 0.001)
|
|
|
& same_sign_mask_1
|
|
& same_sign_mask_1
|
|
|
- & (df_rise_gap_1['prev_rise_seats_remaining_transition'] == seats_remaining_transition)
|
|
|
|
|
|
|
+ # & (df_rise_gap_1['prev_rise_seats_remaining'] == seats_remaining)
|
|
|
].copy()
|
|
].copy()
|
|
|
# df_match_1 = df_rise_gap_1.loc[(df_rise_gap_1['pct_abs_gap'] <= pct_threshold_1) & (df_rise_gap_1['price_abs_gap'] <= 1.0)].copy()
|
|
# df_match_1 = df_rise_gap_1.loc[(df_rise_gap_1['pct_abs_gap'] <= pct_threshold_1) & (df_rise_gap_1['price_abs_gap'] <= 1.0)].copy()
|
|
|
# df_rise_gap_1 = df_rise_gap_1.sort_values(['price_abs_gap'], ascending=[True])
|
|
# df_rise_gap_1 = df_rise_gap_1.sort_values(['price_abs_gap'], ascending=[True])
|
|
@@ -1562,18 +1560,23 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
df_match_chk_1 = df_match_1.copy()
|
|
df_match_chk_1 = df_match_1.copy()
|
|
|
|
|
|
|
|
# 反例收紧:48小时内发生降价的不算显著反例
|
|
# 反例收紧:48小时内发生降价的不算显著反例
|
|
|
- _rise_pct_chk = pd.to_numeric(df_match_chk_1['rise_price_change_percent'], errors='coerce')
|
|
|
|
|
- _prev_dur_chk = pd.to_numeric(df_match_chk_1['prev_rise_duration_hours'], errors='coerce')
|
|
|
|
|
- _exclude_mask = _rise_pct_chk.lt(0) & _prev_dur_chk.lt(48)
|
|
|
|
|
- df_match_chk_1 = df_match_chk_1.loc[~_exclude_mask.fillna(False)].copy()
|
|
|
|
|
-
|
|
|
|
|
|
|
+ # _rise_pct_chk = pd.to_numeric(df_match_chk_1['rise_price_change_percent'], errors='coerce')
|
|
|
|
|
+ # _prev_dur_chk = pd.to_numeric(df_match_chk_1['prev_rise_duration_hours'], errors='coerce')
|
|
|
|
|
+ # _exclude_mask = _rise_pct_chk.lt(0) & _prev_dur_chk.lt(48)
|
|
|
|
|
+ # df_match_chk_1 = df_match_chk_1.loc[~_exclude_mask.fillna(False)].copy()
|
|
|
|
|
+
|
|
|
# dur_vals_1 = pd.to_numeric(df_match_chk_1['modify_rise_price_duration_hours'], errors='coerce')
|
|
# dur_vals_1 = pd.to_numeric(df_match_chk_1['modify_rise_price_duration_hours'], errors='coerce')
|
|
|
# df_match_chk_1 = df_match_chk_1.loc[dur_vals_1.notna()].copy()
|
|
# df_match_chk_1 = df_match_chk_1.loc[dur_vals_1.notna()].copy()
|
|
|
# df_match_chk_1 = df_match_chk_1.loc[(dur_vals_1.loc[dur_vals_1.notna()] - float(dur_base_1)).abs() <= 24].copy()
|
|
# df_match_chk_1 = df_match_chk_1.loc[(dur_vals_1.loc[dur_vals_1.notna()] - float(dur_base_1)).abs() <= 24].copy()
|
|
|
|
|
|
|
|
- # rise_hud_vals_1 = pd.to_numeric(df_match_chk_1['rise_hours_until_departure'], errors='coerce')
|
|
|
|
|
- # df_match_chk_1 = df_match_chk_1.loc[rise_hud_vals_1.notna()].copy()
|
|
|
|
|
- # df_match_chk_1 = df_match_chk_1.loc[(float(hud_base_1) - rise_hud_vals_1.loc[rise_hud_vals_1.notna()]) >= -24].copy()
|
|
|
|
|
|
|
+ # 反例收紧 (距离起飞的小时数)
|
|
|
|
|
+ rise_hud_vals_1 = pd.to_numeric(df_match_chk_1['rise_hours_until_departure'], errors='coerce')
|
|
|
|
|
+ df_match_chk_1 = df_match_chk_1.loc[rise_hud_vals_1.notna()].copy()
|
|
|
|
|
+ df_match_chk_1 = df_match_chk_1.loc[(float(hud_base_1) - rise_hud_vals_1.loc[rise_hud_vals_1.notna()]) >= 0].copy()
|
|
|
|
|
+
|
|
|
|
|
+ start_hud_vals_1 = pd.to_numeric(df_match_chk_1['start_hours_until_departure'], errors='coerce')
|
|
|
|
|
+ df_match_chk_1 = df_match_chk_1.loc[start_hud_vals_1.notna()].copy()
|
|
|
|
|
+ df_match_chk_1 = df_match_chk_1.loc[(float(hud_base_1) - start_hud_vals_1.loc[start_hud_vals_1.notna()]) <= 0].copy()
|
|
|
|
|
|
|
|
# seats_vals_1 = pd.to_numeric(df_match_chk_1['rise_seats_remaining_change_amount'], errors='coerce')
|
|
# seats_vals_1 = pd.to_numeric(df_match_chk_1['rise_seats_remaining_change_amount'], errors='coerce')
|
|
|
# df_match_chk_1 = df_match_chk_1.loc[seats_vals_1.notna()].copy()
|
|
# df_match_chk_1 = df_match_chk_1.loc[seats_vals_1.notna()].copy()
|