|
@@ -855,7 +855,7 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
df_input = df_input[~((df_input['is_filled'] == 1) & (_tail_filled == 1))]
|
|
df_input = df_input[~((df_input['is_filled'] == 1) & (_tail_filled == 1))]
|
|
|
|
|
|
|
|
# 价格变化最小量阈值
|
|
# 价格变化最小量阈值
|
|
|
- price_change_amount_threshold = 1
|
|
|
|
|
|
|
+ price_change_amount_threshold = 0.001
|
|
|
df_input['_raw_price_diff'] = df_input.groupby(['gid', 'baggage'], group_keys=False)['adult_total_price'].diff()
|
|
df_input['_raw_price_diff'] = df_input.groupby(['gid', 'baggage'], group_keys=False)['adult_total_price'].diff()
|
|
|
|
|
|
|
|
# 计算价格变化量
|
|
# 计算价格变化量
|
|
@@ -905,6 +905,30 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
.add(1)
|
|
.add(1)
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
|
|
+ # 第三步:段级余票变化(上一段终点余票 -> 当前段起点余票),首段为 n->n
|
|
|
|
|
+ _seg_keys = ['gid', 'baggage', 'price_change_segment']
|
|
|
|
|
+ _seg_seats = (
|
|
|
|
|
+ df_input.groupby(_seg_keys, as_index=False)['seats_remaining']
|
|
|
|
|
+ .agg(_seg_first_seats='first', _seg_last_seats='last')
|
|
|
|
|
+ )
|
|
|
|
|
+ _seg_seats['_prev_seg_last_seats'] = (
|
|
|
|
|
+ _seg_seats.groupby(['gid', 'baggage'], group_keys=False)['_seg_last_seats']
|
|
|
|
|
+ .shift(1)
|
|
|
|
|
+ .fillna(_seg_seats['_seg_first_seats'])
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ _seg_seats['_seg_first_seats'] = pd.to_numeric(_seg_seats['_seg_first_seats'], errors='coerce').round().astype('Int64')
|
|
|
|
|
+ _seg_seats['_prev_seg_last_seats'] = pd.to_numeric(_seg_seats['_prev_seg_last_seats'], errors='coerce').round().astype('Int64')
|
|
|
|
|
+ _seg_seats['seats_remaining_transition'] = (
|
|
|
|
|
+ _seg_seats['_prev_seg_last_seats'].astype(str) + '->' + _seg_seats['_seg_first_seats'].astype(str)
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ df_input = df_input.merge(
|
|
|
|
|
+ _seg_seats[_seg_keys + ['seats_remaining_transition']],
|
|
|
|
|
+ on=_seg_keys,
|
|
|
|
|
+ how='left'
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
# 可选:删除临时列
|
|
# 可选:删除临时列
|
|
|
# df_input = df_input.drop(columns=['price_change_segment'])
|
|
# df_input = df_input.drop(columns=['price_change_segment'])
|
|
|
df_input = df_input.drop(columns=['price_change_segment', '_raw_price_diff'])
|
|
df_input = df_input.drop(columns=['price_change_segment', '_raw_price_diff'])
|
|
@@ -929,7 +953,7 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
|
|
|
|
|
# 训练过程
|
|
# 训练过程
|
|
|
if is_train:
|
|
if is_train:
|
|
|
- df_target = df_input[(df_input['hours_until_departure'] >= 72) & (df_input['hours_until_departure'] <= 360)].copy() # 扩展至360小时(15天)
|
|
|
|
|
|
|
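+ df_target = df_input[(df_input['hours_until_departure'] >= 48) & (df_input['hours_until_departure'] <= 384)].copy() # Window is 48-384 hours before departure; the drop/rise node tables are filtered to the 72/360-hour bounds further below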
|
|
|
df_target = df_target.sort_values(
|
|
df_target = df_target.sort_values(
|
|
|
by=['gid', 'hours_until_departure'],
|
|
by=['gid', 'hours_until_departure'],
|
|
|
ascending=[True, False]
|
|
ascending=[True, False]
|
|
@@ -940,7 +964,7 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
prev_amo = df_target.groupby('gid', group_keys=False)['price_change_amount'].shift(1)
|
|
prev_amo = df_target.groupby('gid', group_keys=False)['price_change_amount'].shift(1)
|
|
|
prev_dur = df_target.groupby('gid', group_keys=False)['price_duration_hours'].shift(1)
|
|
prev_dur = df_target.groupby('gid', group_keys=False)['price_duration_hours'].shift(1)
|
|
|
prev_price = df_target.groupby('gid', group_keys=False)['adult_total_price'].shift(1)
|
|
prev_price = df_target.groupby('gid', group_keys=False)['adult_total_price'].shift(1)
|
|
|
- prev_seats = df_target.groupby('gid', group_keys=False)['seats_remaining'].shift(1)
|
|
|
|
|
|
|
+ prev_seats_trans = df_target.groupby('gid', group_keys=False)['seats_remaining_transition'].shift(1)
|
|
|
|
|
|
|
|
# 对于先升后降(先降后降)的分析
|
|
# 对于先升后降(先降后降)的分析
|
|
|
seg_start_mask = df_target['price_duration_hours'].eq(1) # 开始变价节点
|
|
seg_start_mask = df_target['price_duration_hours'].eq(1) # 开始变价节点
|
|
@@ -963,7 +987,7 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
df_drop_nodes['high_price_change_percent'] = prev_pct.loc[drop_mask].astype(float).round(4).to_numpy()
|
|
df_drop_nodes['high_price_change_percent'] = prev_pct.loc[drop_mask].astype(float).round(4).to_numpy()
|
|
|
df_drop_nodes['high_price_change_amount'] = prev_amo.loc[drop_mask].astype(float).round(2).to_numpy()
|
|
df_drop_nodes['high_price_change_amount'] = prev_amo.loc[drop_mask].astype(float).round(2).to_numpy()
|
|
|
df_drop_nodes['high_price_amount'] = prev_price.loc[drop_mask].astype(float).round(2).to_numpy()
|
|
df_drop_nodes['high_price_amount'] = prev_price.loc[drop_mask].astype(float).round(2).to_numpy()
|
|
|
- df_drop_nodes['high_price_seats_remaining'] = prev_seats.loc[drop_mask].astype(int).to_numpy()
|
|
|
|
|
|
|
+ df_drop_nodes['high_price_seats_remaining_transition'] = prev_seats_trans.loc[drop_mask].astype(str)
|
|
|
df_drop_nodes = df_drop_nodes.reset_index(drop=True)
|
|
df_drop_nodes = df_drop_nodes.reset_index(drop=True)
|
|
|
|
|
|
|
|
flight_info_cols = [
|
|
flight_info_cols = [
|
|
@@ -981,11 +1005,15 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
drop_info_cols = ['drop_update_hour', 'drop_days_to_departure',
|
|
drop_info_cols = ['drop_update_hour', 'drop_days_to_departure',
|
|
|
'drop_hours_until_departure', 'drop_price_change_percent', 'drop_price_change_amount',
|
|
'drop_hours_until_departure', 'drop_price_change_percent', 'drop_price_change_amount',
|
|
|
'high_price_duration_hours', 'high_price_change_percent', 'high_price_change_amount',
|
|
'high_price_duration_hours', 'high_price_change_percent', 'high_price_change_amount',
|
|
|
- 'high_price_amount', 'high_price_seats_remaining',
|
|
|
|
|
|
|
+ 'high_price_amount', 'high_price_seats_remaining_transition',
|
|
|
]
|
|
]
|
|
|
# 按顺序排列 保留gid
|
|
# 按顺序排列 保留gid
|
|
|
df_drop_nodes = df_drop_nodes[flight_info_cols + drop_info_cols]
|
|
df_drop_nodes = df_drop_nodes[flight_info_cols + drop_info_cols]
|
|
|
- # df_drop_nodes = df_drop_nodes[df_drop_nodes['drop_price_change_percent'] <= -0.01] # 太低的降幅不计
|
|
|
|
|
|
|
+ df_drop_nodes['start_hours_until_departure'] = (df_drop_nodes['drop_hours_until_departure'] + df_drop_nodes['high_price_duration_hours']).round().astype('Int64')
|
|
|
|
|
+ df_drop_nodes = df_drop_nodes[df_drop_nodes['drop_hours_until_departure'] <= 360]
|
|
|
|
|
+ df_drop_nodes = df_drop_nodes[df_drop_nodes['start_hours_until_departure'] >= 72]
|
|
|
|
|
+ df_drop_nodes = df_drop_nodes[df_drop_nodes['high_price_duration_hours'] > 1.0] # 维持时间太短的不计
|
|
|
|
|
+ df_drop_nodes = df_drop_nodes[df_drop_nodes['drop_price_change_amount'].abs() >= 1] # 1$之内的降价不计
|
|
|
|
|
|
|
|
# 反例库:所有有效节点(不限升价)中,未来24小时内未发生降价
|
|
# 反例库:所有有效节点(不限升价)中,未来24小时内未发生降价
|
|
|
# seg_start_mask = df_target['price_duration_hours'].eq(1)
|
|
# seg_start_mask = df_target['price_duration_hours'].eq(1)
|
|
@@ -1023,7 +1051,7 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
df_rise_nodes['prev_rise_change_percent'] = prev_pct.loc[rise_mask].astype(float).round(4).to_numpy()
|
|
df_rise_nodes['prev_rise_change_percent'] = prev_pct.loc[rise_mask].astype(float).round(4).to_numpy()
|
|
|
df_rise_nodes['prev_rise_change_amount'] = prev_amo.loc[rise_mask].astype(float).round(2).to_numpy()
|
|
df_rise_nodes['prev_rise_change_amount'] = prev_amo.loc[rise_mask].astype(float).round(2).to_numpy()
|
|
|
df_rise_nodes['prev_rise_amount'] = prev_price.loc[rise_mask].astype(float).round(2).to_numpy()
|
|
df_rise_nodes['prev_rise_amount'] = prev_price.loc[rise_mask].astype(float).round(2).to_numpy()
|
|
|
- df_rise_nodes['prev_rise_seats_remaining'] = prev_seats.loc[rise_mask].astype(int).to_numpy()
|
|
|
|
|
|
|
+ df_rise_nodes['prev_rise_seats_remaining_transition'] = prev_seats_trans.loc[rise_mask].astype(str)
|
|
|
df_rise_nodes = df_rise_nodes.reset_index(drop=True)
|
|
df_rise_nodes = df_rise_nodes.reset_index(drop=True)
|
|
|
|
|
|
|
|
df_rise_nodes = df_rise_nodes.merge(df_gid_info, on='gid', how='left')
|
|
df_rise_nodes = df_rise_nodes.merge(df_gid_info, on='gid', how='left')
|
|
@@ -1031,9 +1059,14 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
'rise_update_hour', 'rise_days_to_departure',
|
|
'rise_update_hour', 'rise_days_to_departure',
|
|
|
'rise_hours_until_departure', 'rise_price_change_percent', 'rise_price_change_amount',
|
|
'rise_hours_until_departure', 'rise_price_change_percent', 'rise_price_change_amount',
|
|
|
'prev_rise_duration_hours', 'prev_rise_change_percent', 'prev_rise_change_amount',
|
|
'prev_rise_duration_hours', 'prev_rise_change_percent', 'prev_rise_change_amount',
|
|
|
- 'prev_rise_amount', 'prev_rise_seats_remaining',
|
|
|
|
|
|
|
+ 'prev_rise_amount', 'prev_rise_seats_remaining_transition',
|
|
|
]
|
|
]
|
|
|
df_rise_nodes = df_rise_nodes[flight_info_cols + rise_info_cols]
|
|
df_rise_nodes = df_rise_nodes[flight_info_cols + rise_info_cols]
|
|
|
|
|
+ df_rise_nodes['start_hours_until_departure'] = (df_rise_nodes['rise_hours_until_departure'] + df_rise_nodes['prev_rise_duration_hours']).round().astype('Int64')
|
|
|
|
|
+ df_rise_nodes = df_rise_nodes[df_rise_nodes['rise_hours_until_departure'] <= 360]
|
|
|
|
|
+ df_rise_nodes = df_rise_nodes[df_rise_nodes['start_hours_until_departure'] >= 72]
|
|
|
|
|
+ df_rise_nodes = df_rise_nodes[df_rise_nodes['prev_rise_duration_hours'] > 1.0] # 维持时间太短的不计
|
|
|
|
|
+ df_rise_nodes = df_rise_nodes[df_rise_nodes['rise_price_change_amount'].abs() >= 1] # 1$之内的改变不计
|
|
|
|
|
|
|
|
# 制作历史包络线
|
|
# 制作历史包络线
|
|
|
envelope_group = ['city_pair', 'flight_number_1', 'flight_number_2', 'flight_day']
|
|
envelope_group = ['city_pair', 'flight_number_1', 'flight_number_2', 'flight_day']
|
|
@@ -1090,58 +1123,117 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
df_rise_nodes = pd.read_csv(rise_info_csv_path)
|
|
df_rise_nodes = pd.read_csv(rise_info_csv_path)
|
|
|
else:
|
|
else:
|
|
|
df_rise_nodes = pd.DataFrame()
|
|
df_rise_nodes = pd.DataFrame()
|
|
|
|
|
+
|
|
|
|
|
+ # 联合价格分布 ==========================================================
|
|
|
|
|
+ # 统一初始化
|
|
|
|
|
+ df_min_hours['relative_position'] = np.nan
|
|
|
|
|
+ if not df_drop_nodes.empty:
|
|
|
|
|
+ df_drop_nodes['relative_position'] = np.nan
|
|
|
|
|
+ if not df_rise_nodes.empty:
|
|
|
|
|
+ df_rise_nodes['relative_position'] = np.nan
|
|
|
|
|
+
|
|
|
|
|
+ parts = []
|
|
|
|
|
+
|
|
|
|
|
+ # 当前待预测
|
|
|
|
|
+ if not df_min_hours.empty and 'adult_total_price' in df_min_hours.columns:
|
|
|
|
|
+ cur = df_min_hours[['adult_total_price']].copy()
|
|
|
|
|
+ cur['price'] = pd.to_numeric(cur['adult_total_price'], errors='coerce')
|
|
|
|
|
+ cur['source'] = 'min'
|
|
|
|
|
+ cur['row_id'] = cur.index
|
|
|
|
|
+ parts.append(cur[['price', 'source', 'row_id']])
|
|
|
|
|
+
|
|
|
|
|
+ # 历史降价
|
|
|
|
|
+ if not df_drop_nodes.empty and 'high_price_amount' in df_drop_nodes.columns:
|
|
|
|
|
+ drop = df_drop_nodes[['high_price_amount']].copy()
|
|
|
|
|
+ drop['price'] = pd.to_numeric(drop['high_price_amount'], errors='coerce')
|
|
|
|
|
+ drop['source'] = 'drop'
|
|
|
|
|
+ drop['row_id'] = drop.index
|
|
|
|
|
+ parts.append(drop[['price', 'source', 'row_id']])
|
|
|
|
|
+
|
|
|
|
|
+ # 历史升价
|
|
|
|
|
+ if not df_rise_nodes.empty and 'prev_rise_amount' in df_rise_nodes.columns:
|
|
|
|
|
+ rise = df_rise_nodes[['prev_rise_amount']].copy()
|
|
|
|
|
+ rise['price'] = pd.to_numeric(rise['prev_rise_amount'], errors='coerce')
|
|
|
|
|
+ rise['source'] = 'rise'
|
|
|
|
|
+ rise['row_id'] = rise.index
|
|
|
|
|
+ parts.append(rise[['price', 'source', 'row_id']])
|
|
|
|
|
+
|
|
|
|
|
+ if parts:
|
|
|
|
|
+ all_prices = pd.concat(parts, ignore_index=True)
|
|
|
|
|
+ all_prices = all_prices.dropna(subset=['price']).reset_index(drop=True)
|
|
|
|
|
+
|
|
|
|
|
+ # 计算价格百分位
|
|
|
|
|
+ dense_rank = all_prices['price'].rank(method='dense')
|
|
|
|
|
+ max_rank = dense_rank.max()
|
|
|
|
|
+ if pd.notna(max_rank) and max_rank > 1:
|
|
|
|
|
+ all_prices['relative_position'] = (dense_rank - 1) / (max_rank - 1)
|
|
|
|
|
+ else:
|
|
|
|
|
+ all_prices['relative_position'] = 1.0
|
|
|
|
|
+ all_prices['relative_position'] = all_prices['relative_position'].round(4)
|
|
|
|
|
+
|
|
|
|
|
+ # 回填到三个表
|
|
|
|
|
+ m = all_prices['source'] == 'min'
|
|
|
|
|
+ df_min_hours.loc[all_prices.loc[m, 'row_id'], 'relative_position'] = all_prices.loc[m, 'relative_position'].values
|
|
|
|
|
|
|
|
- # ==================== 跨航班日包络线 + 降价潜力 ====================
|
|
|
|
|
- print(">>> 构建跨航班日价格包络线")
|
|
|
|
|
- flight_key = ['city_pair', 'flight_number_1', 'flight_number_2']
|
|
|
|
|
- day_key = flight_key + ['flight_day']
|
|
|
|
|
-
|
|
|
|
|
- # 1. 历史侧:加载训练阶段的峰值数据
|
|
|
|
|
- envelope_csv_path = os.path.join(output_dir, f'{group_route_str}_envelope_info.csv')
|
|
|
|
|
- if os.path.exists(envelope_csv_path):
|
|
|
|
|
- df_hist = pd.read_csv(envelope_csv_path)
|
|
|
|
|
- df_hist = df_hist[day_key + ['peak_price', 'peak_hours']]
|
|
|
|
|
- df_hist['source'] = 'hist'
|
|
|
|
|
- else:
|
|
|
|
|
- df_hist = pd.DataFrame()
|
|
|
|
|
-
|
|
|
|
|
- # 2. 未来侧:当前在售价格
|
|
|
|
|
- df_future = df_min_hours[day_key + ['adult_total_price', 'hours_until_departure']].copy().rename(
|
|
|
|
|
- columns={'adult_total_price': 'peak_price', 'hours_until_departure': 'peak_hours'}
|
|
|
|
|
- )
|
|
|
|
|
- df_future['source'] = 'future'
|
|
|
|
|
-
|
|
|
|
|
- # 3. 合并包络线数据点
|
|
|
|
|
- df_envelope_all = pd.concat(
|
|
|
|
|
- [x for x in [df_hist, df_future] if not x.empty], ignore_index=True
|
|
|
|
|
- ).drop_duplicates(subset=day_key, keep='last')
|
|
|
|
|
-
|
|
|
|
|
- # 4. 包络线统计 + 找高点起飞日
|
|
|
|
|
- df_envelope_agg = df_envelope_all.groupby(flight_key).agg(
|
|
|
|
|
- envelope_max=('peak_price', 'max'), # 峰值最大
|
|
|
|
|
- envelope_min=('peak_price', 'min'), # 峰值最小
|
|
|
|
|
- envelope_mean=('peak_price', 'mean'), # 峰值平均
|
|
|
|
|
- envelope_count=('peak_price', 'count'), # 峰值统计总数
|
|
|
|
|
- envelope_avg_peak_hours=('peak_hours', 'mean'), # 峰值发生的距离起飞小时数, 做一下平均
|
|
|
|
|
- ).reset_index()
|
|
|
|
|
|
|
+ if not df_drop_nodes.empty:
|
|
|
|
|
+ m = all_prices['source'] == 'drop'
|
|
|
|
|
+ df_drop_nodes.loc[all_prices.loc[m, 'row_id'], 'relative_position'] = all_prices.loc[m, 'relative_position'].values
|
|
|
|
|
|
|
|
- # 对数值列保留两位小数
|
|
|
|
|
- df_envelope_agg[['envelope_mean', 'envelope_avg_peak_hours']] = df_envelope_agg[['envelope_mean', 'envelope_avg_peak_hours']].round(2)
|
|
|
|
|
|
|
+ if not df_rise_nodes.empty:
|
|
|
|
|
+ m = all_prices['source'] == 'rise'
|
|
|
|
|
+ df_rise_nodes.loc[all_prices.loc[m, 'row_id'], 'relative_position'] = all_prices.loc[m, 'relative_position'].values
|
|
|
|
|
|
|
|
- idx_top = df_envelope_all.groupby(flight_key)['peak_price'].idxmax()
|
|
|
|
|
- df_top = df_envelope_all.loc[idx_top, flight_key + ['flight_day', 'peak_price', 'peak_hours']].rename(
|
|
|
|
|
- columns={'flight_day': 'target_flight_day', 'peak_price': 'target_price', 'peak_hours': 'target_peak_hours'}
|
|
|
|
|
- )
|
|
|
|
|
- df_envelope_agg = df_envelope_agg.merge(df_top, on=flight_key, how='left')
|
|
|
|
|
|
|
+ # ==================== 跨航班日包络线 + 降价潜力 ====================
|
|
|
|
|
+ # print(">>> 构建跨航班日价格包络线")
|
|
|
|
|
+ # flight_key = ['city_pair', 'flight_number_1', 'flight_number_2']
|
|
|
|
|
+ # day_key = flight_key + ['flight_day']
|
|
|
|
|
+
|
|
|
|
|
+ # # 1. 历史侧:加载训练阶段的峰值数据
|
|
|
|
|
+ # envelope_csv_path = os.path.join(output_dir, f'{group_route_str}_envelope_info.csv')
|
|
|
|
|
+ # if os.path.exists(envelope_csv_path):
|
|
|
|
|
+ # df_hist = pd.read_csv(envelope_csv_path)
|
|
|
|
|
+ # df_hist = df_hist[day_key + ['peak_price', 'peak_hours']]
|
|
|
|
|
+ # df_hist['source'] = 'hist'
|
|
|
|
|
+ # else:
|
|
|
|
|
+ # df_hist = pd.DataFrame()
|
|
|
|
|
+
|
|
|
|
|
+ # # 2. 未来侧:当前在售价格
|
|
|
|
|
+ # df_future = df_min_hours[day_key + ['adult_total_price', 'hours_until_departure']].copy().rename(
|
|
|
|
|
+ # columns={'adult_total_price': 'peak_price', 'hours_until_departure': 'peak_hours'}
|
|
|
|
|
+ # )
|
|
|
|
|
+ # df_future['source'] = 'future'
|
|
|
|
|
+
|
|
|
|
|
+ # # 3. 合并包络线数据点
|
|
|
|
|
+ # df_envelope_all = pd.concat(
|
|
|
|
|
+ # [x for x in [df_hist, df_future] if not x.empty], ignore_index=True
|
|
|
|
|
+ # ).drop_duplicates(subset=day_key, keep='last')
|
|
|
|
|
+
|
|
|
|
|
+ # # 4. 包络线统计 + 找高点起飞日
|
|
|
|
|
+ # df_envelope_agg = df_envelope_all.groupby(flight_key).agg(
|
|
|
|
|
+ # envelope_max=('peak_price', 'max'), # 峰值最大
|
|
|
|
|
+ # envelope_min=('peak_price', 'min'), # 峰值最小
|
|
|
|
|
+ # envelope_mean=('peak_price', 'mean'), # 峰值平均
|
|
|
|
|
+ # envelope_count=('peak_price', 'count'), # 峰值统计总数
|
|
|
|
|
+ # envelope_avg_peak_hours=('peak_hours', 'mean'), # 峰值发生的距离起飞小时数, 做一下平均
|
|
|
|
|
+ # ).reset_index()
|
|
|
|
|
+
|
|
|
|
|
+ # # 对数值列保留两位小数
|
|
|
|
|
+ # df_envelope_agg[['envelope_mean', 'envelope_avg_peak_hours']] = df_envelope_agg[['envelope_mean', 'envelope_avg_peak_hours']].round(2)
|
|
|
|
|
+
|
|
|
|
|
+ # idx_top = df_envelope_all.groupby(flight_key)['peak_price'].idxmax()
|
|
|
|
|
+ # df_top = df_envelope_all.loc[idx_top, flight_key + ['flight_day', 'peak_price', 'peak_hours']].rename(
|
|
|
|
|
+ # columns={'flight_day': 'target_flight_day', 'peak_price': 'target_price', 'peak_hours': 'target_peak_hours'}
|
|
|
|
|
+ # )
|
|
|
|
|
+ # df_envelope_agg = df_envelope_agg.merge(df_top, on=flight_key, how='left')
|
|
|
|
|
|
|
|
- # 5. 合并到 df_min_hours
|
|
|
|
|
- df_min_hours = df_min_hours.merge(df_envelope_agg, on=flight_key, how='left')
|
|
|
|
|
- price_range = (df_min_hours['envelope_max'] - df_min_hours['envelope_min']).replace(0, 1) # 计算当前价格在包络区间的百分位
|
|
|
|
|
- df_min_hours['envelope_position'] = (
|
|
|
|
|
- (df_min_hours['adult_total_price'] - df_min_hours['envelope_min']) / price_range
|
|
|
|
|
- ).clip(0, 1).round(4)
|
|
|
|
|
- # df_min_hours['is_envelope_peak'] = (df_min_hours['envelope_position'] >= 0.75).astype(int) # 0.95 -> 0.75
|
|
|
|
|
- df_min_hours['is_target_day'] = (df_min_hours['flight_day'] == df_min_hours['target_flight_day']).astype(int)
|
|
|
|
|
|
|
+ # # 5. 合并到 df_min_hours
|
|
|
|
|
+ # df_min_hours = df_min_hours.merge(df_envelope_agg, on=flight_key, how='left')
|
|
|
|
|
+ # price_range = (df_min_hours['envelope_max'] - df_min_hours['envelope_min']).replace(0, 1) # 计算当前价格在包络区间的百分位
|
|
|
|
|
+ # df_min_hours['envelope_position'] = (
|
|
|
|
|
+ # (df_min_hours['adult_total_price'] - df_min_hours['envelope_min']) / price_range
|
|
|
|
|
+ # ).clip(0, 1).round(4)
|
|
|
|
|
+ # # df_min_hours['is_envelope_peak'] = (df_min_hours['envelope_position'] >= 0.75).astype(int) # 0.95 -> 0.75
|
|
|
|
|
+ # df_min_hours['is_target_day'] = (df_min_hours['flight_day'] == df_min_hours['target_flight_day']).astype(int)
|
|
|
|
|
|
|
|
# # ==================== 目标二:降价潜力评分 ====================
|
|
# # ==================== 目标二:降价潜力评分 ====================
|
|
|
# # 用“上涨后回落倾向”替代简单计数:drop / (drop + rise)
|
|
# # 用“上涨后回落倾向”替代简单计数:drop / (drop + rise)
|
|
@@ -1192,16 +1284,20 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
# ).round(4)
|
|
# ).round(4)
|
|
|
|
|
|
|
|
# 综合评分阈值:大于阈值的都认为值得投放
|
|
# 综合评分阈值:大于阈值的都认为值得投放
|
|
|
- target_score_threshold = 0.5
|
|
|
|
|
|
|
+ target_score_threshold = 0.4
|
|
|
# df_min_hours['target_score_threshold'] = target_score_threshold
|
|
# df_min_hours['target_score_threshold'] = target_score_threshold
|
|
|
- df_min_hours['is_good_target'] = (df_min_hours['envelope_position'] >= target_score_threshold).astype(int)
|
|
|
|
|
|
|
+ df_min_hours['is_good_target'] = (df_min_hours['relative_position'] >= target_score_threshold).astype(int)
|
|
|
|
|
|
|
|
print(f">>> 包络线+降价潜力评分完成")
|
|
print(f">>> 包络线+降价潜力评分完成")
|
|
|
- del df_hist, df_future, df_envelope_all, df_envelope_agg, df_top # df_drop_freq, df_rise_freq
|
|
|
|
|
|
|
+ # del df_hist, df_future, df_envelope_all, df_envelope_agg, df_top # df_drop_freq, df_rise_freq
|
|
|
|
|
|
|
|
total_cnt_before = len(df_min_hours) # 记录下过滤前的总数
|
|
total_cnt_before = len(df_min_hours) # 记录下过滤前的总数
|
|
|
- df_min_hours = df_min_hours[(df_min_hours['is_good_target'] == 1) & (df_min_hours['seats_remaining'] >= 3)].reset_index(drop=True) # 保留值得投放的
|
|
|
|
|
|
|
+ df_min_hours = df_min_hours[(df_min_hours['is_good_target'] == 1) & (df_min_hours['seats_remaining'] >= 1)].reset_index(drop=True) # 保留值得投放的
|
|
|
total_cnt_after = len(df_min_hours) # 记录下过滤后的总数
|
|
total_cnt_after = len(df_min_hours) # 记录下过滤后的总数
|
|
|
|
|
+
|
|
|
|
|
+ # 余票为1的样本去掉
|
|
|
|
|
+ # df_drop_nodes = df_drop_nodes[df_drop_nodes['high_price_seats_remaining'] >= 2]
|
|
|
|
|
+ # df_rise_nodes = df_rise_nodes[df_rise_nodes['prev_rise_seats_remaining'] >= 2]
|
|
|
# =====================================================================
|
|
# =====================================================================
|
|
|
# df_min_hours = df_min_hours[(df_min_hours['seats_remaining'] >= 5)].reset_index(drop=True)
|
|
# df_min_hours = df_min_hours[(df_min_hours['seats_remaining'] >= 5)].reset_index(drop=True)
|
|
|
|
|
|
|
@@ -1240,6 +1336,7 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
# seats_remaining_change_amount = row['seats_remaining_change_amount']
|
|
# seats_remaining_change_amount = row['seats_remaining_change_amount']
|
|
|
price_amount = row['adult_total_price']
|
|
price_amount = row['adult_total_price']
|
|
|
seats_remaining = row['seats_remaining']
|
|
seats_remaining = row['seats_remaining']
|
|
|
|
|
+ seats_remaining_transition = row['seats_remaining_transition']
|
|
|
# envelope_position = row['envelope_position']
|
|
# envelope_position = row['envelope_position']
|
|
|
|
|
|
|
|
length_drop = 0
|
|
length_drop = 0
|
|
@@ -1274,7 +1371,7 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
pct_vals.notna(),
|
|
pct_vals.notna(),
|
|
|
['drop_days_to_departure', 'drop_hours_until_departure', 'drop_price_change_percent', 'drop_price_change_amount',
|
|
['drop_days_to_departure', 'drop_hours_until_departure', 'drop_price_change_percent', 'drop_price_change_amount',
|
|
|
'high_price_duration_hours', 'high_price_change_percent',
|
|
'high_price_duration_hours', 'high_price_change_percent',
|
|
|
- 'high_price_change_amount', 'high_price_amount', 'high_price_seats_remaining']
|
|
|
|
|
|
|
+ 'high_price_change_amount', 'high_price_amount', 'high_price_seats_remaining_transition']
|
|
|
].copy()
|
|
].copy()
|
|
|
df_drop_gap['pct_gap'] = (pct_vals.loc[pct_vals.notna()] - pct_base)
|
|
df_drop_gap['pct_gap'] = (pct_vals.loc[pct_vals.notna()] - pct_base)
|
|
|
df_drop_gap['pct_abs_gap'] = df_drop_gap['pct_gap'].abs()
|
|
df_drop_gap['pct_abs_gap'] = df_drop_gap['pct_gap'].abs()
|
|
@@ -1291,8 +1388,9 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
)
|
|
)
|
|
|
df_match = df_drop_gap[
|
|
df_match = df_drop_gap[
|
|
|
(df_drop_gap['pct_abs_gap'] <= pct_threshold)
|
|
(df_drop_gap['pct_abs_gap'] <= pct_threshold)
|
|
|
- & (df_drop_gap['price_abs_gap'] <= 0.1)
|
|
|
|
|
|
|
+ & (df_drop_gap['price_abs_gap'] <= 0.001)
|
|
|
& same_sign_mask
|
|
& same_sign_mask
|
|
|
|
|
+ & (df_drop_gap['high_price_seats_remaining_transition'] == seats_remaining_transition)
|
|
|
].copy()
|
|
].copy()
|
|
|
# df_match = df_drop_gap[(df_drop_gap['pct_abs_gap'] <= pct_threshold) & (df_drop_gap['price_abs_gap'] <= 1.0)].copy()
|
|
# df_match = df_drop_gap[(df_drop_gap['pct_abs_gap'] <= pct_threshold) & (df_drop_gap['price_abs_gap'] <= 1.0)].copy()
|
|
|
# df_drop_gap = df_drop_gap.sort_values(['price_abs_gap'], ascending=[True])
|
|
# df_drop_gap = df_drop_gap.sort_values(['price_abs_gap'], ascending=[True])
|
|
@@ -1392,7 +1490,7 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
pct_vals_1.notna(),
|
|
pct_vals_1.notna(),
|
|
|
['rise_days_to_departure', 'rise_hours_until_departure', 'rise_price_change_percent', 'rise_price_change_amount',
|
|
['rise_days_to_departure', 'rise_hours_until_departure', 'rise_price_change_percent', 'rise_price_change_amount',
|
|
|
'prev_rise_duration_hours', 'prev_rise_change_percent',
|
|
'prev_rise_duration_hours', 'prev_rise_change_percent',
|
|
|
- 'prev_rise_change_amount', 'prev_rise_amount', 'prev_rise_seats_remaining']
|
|
|
|
|
|
|
+ 'prev_rise_change_amount', 'prev_rise_amount', 'prev_rise_seats_remaining_transition']
|
|
|
].copy()
|
|
].copy()
|
|
|
df_rise_gap_1['pct_gap'] = (pct_vals_1.loc[pct_vals_1.notna()] - pct_base_1)
|
|
df_rise_gap_1['pct_gap'] = (pct_vals_1.loc[pct_vals_1.notna()] - pct_base_1)
|
|
|
df_rise_gap_1['pct_abs_gap'] = df_rise_gap_1['pct_gap'].abs()
|
|
df_rise_gap_1['pct_abs_gap'] = df_rise_gap_1['pct_gap'].abs()
|
|
@@ -1409,8 +1507,9 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
)
|
|
)
|
|
|
df_match_1 = df_rise_gap_1[
|
|
df_match_1 = df_rise_gap_1[
|
|
|
(df_rise_gap_1['pct_abs_gap'] <= pct_threshold_1)
|
|
(df_rise_gap_1['pct_abs_gap'] <= pct_threshold_1)
|
|
|
- & (df_rise_gap_1['price_abs_gap'] <= 0.1)
|
|
|
|
|
|
|
+ & (df_rise_gap_1['price_abs_gap'] <= 0.001)
|
|
|
& same_sign_mask_1
|
|
& same_sign_mask_1
|
|
|
|
|
+ & (df_rise_gap_1['prev_rise_seats_remaining_transition'] == seats_remaining_transition)
|
|
|
].copy()
|
|
].copy()
|
|
|
# df_match_1 = df_rise_gap_1.loc[(df_rise_gap_1['pct_abs_gap'] <= pct_threshold_1) & (df_rise_gap_1['price_abs_gap'] <= 1.0)].copy()
|
|
# df_match_1 = df_rise_gap_1.loc[(df_rise_gap_1['pct_abs_gap'] <= pct_threshold_1) & (df_rise_gap_1['price_abs_gap'] <= 1.0)].copy()
|
|
|
# df_rise_gap_1 = df_rise_gap_1.sort_values(['price_abs_gap'], ascending=[True])
|
|
# df_rise_gap_1 = df_rise_gap_1.sort_values(['price_abs_gap'], ascending=[True])
|
|
@@ -1532,11 +1631,11 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
'flag_dist',
|
|
'flag_dist',
|
|
|
'drop_price_change_upper', 'drop_price_change_lower', 'drop_price_sample_size',
|
|
'drop_price_change_upper', 'drop_price_change_lower', 'drop_price_sample_size',
|
|
|
'rise_price_change_upper', 'rise_price_change_lower', 'rise_price_sample_size',
|
|
'rise_price_change_upper', 'rise_price_change_lower', 'rise_price_sample_size',
|
|
|
- 'envelope_max', 'envelope_min', 'envelope_mean', 'envelope_count',
|
|
|
|
|
- 'envelope_avg_peak_hours', 'envelope_position', # 包络线特征
|
|
|
|
|
- 'target_flight_day', 'target_price', 'target_peak_hours', 'is_target_day', # 高点起飞日(纯包络线高点)
|
|
|
|
|
|
|
+ # 'envelope_max', 'envelope_min', 'envelope_mean', 'envelope_count',
|
|
|
|
|
+ # 'envelope_avg_peak_hours', 'envelope_position', # 包络线特征
|
|
|
|
|
+ # 'target_flight_day', 'target_price', 'target_peak_hours', 'is_target_day', # 高点起飞日(纯包络线高点)
|
|
|
# 'drop_freq_count', 'drop_potential', 'target_score', # 降价潜力
|
|
# 'drop_freq_count', 'drop_potential', 'target_score', # 降价潜力
|
|
|
- 'is_good_target', # 综合目标评分()
|
|
|
|
|
|
|
+ 'relative_position', 'is_good_target', # 综合目标评分()
|
|
|
]
|
|
]
|
|
|
df_predict = df_min_hours[order_cols]
|
|
df_predict = df_min_hours[order_cols]
|
|
|
df_predict = df_predict.rename(columns={
|
|
df_predict = df_predict.rename(columns={
|