|
|
@@ -996,6 +996,16 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
]
|
|
|
df_rise_nodes = df_rise_nodes[flight_info_cols + rise_info_cols]
|
|
|
|
|
|
+ # 制作历史包络线
|
|
|
+ envelope_group = ['city_pair', 'flight_number_1', 'flight_number_2', 'flight_day']
|
|
|
+ idx_peak = df_input.groupby(envelope_group)['adult_total_price'].idxmax()
|
|
|
+ df_envelope = df_input.loc[idx_peak, envelope_group + [
|
|
|
+ 'adult_total_price', 'hours_until_departure'
|
|
|
+ ]].rename(columns={
|
|
|
+ 'adult_total_price': 'peak_price',
|
|
|
+ 'hours_until_departure': 'peak_hours',
|
|
|
+ }).reset_index(drop=True)
|
|
|
+
|
|
|
# 对于没有先升后降的gid进行分析
|
|
|
# gids_with_drop = df_target.loc[drop_mask, 'gid'].unique()
|
|
|
# df_no_drop = df_target[~df_target['gid'].isin(gids_with_drop)].copy()
|
|
|
@@ -1048,9 +1058,9 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
del df_target
|
|
|
# del df_no_drop
|
|
|
|
|
|
- return df_input, df_drop_nodes, df_rise_nodes
|
|
|
+ return df_input, df_drop_nodes, df_rise_nodes, df_envelope
|
|
|
|
|
|
- return df_input, None, None
|
|
|
+ return df_input, None, None, None
|
|
|
|
|
|
|
|
|
def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".", pred_time_str=""):
|
|
|
@@ -1089,25 +1099,137 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
else:
|
|
|
df_rise_nodes = pd.DataFrame()
|
|
|
|
|
|
+ # ==================== 跨航班日包络线 + 降价潜力 ====================
|
|
|
+ print(">>> 构建跨航班日价格包络线")
|
|
|
+ flight_key = ['city_pair', 'flight_number_1', 'flight_number_2']
|
|
|
+ day_key = flight_key + ['flight_day']
|
|
|
+
|
|
|
+ # 1. 历史侧:加载训练阶段的峰值数据
|
|
|
+ envelope_csv_path = os.path.join(output_dir, f'{group_route_str}_envelope_info.csv')
|
|
|
+ if os.path.exists(envelope_csv_path):
|
|
|
+ df_hist = pd.read_csv(envelope_csv_path)
|
|
|
+ df_hist = df_hist[day_key + ['peak_price', 'peak_hours']]
|
|
|
+ df_hist['source'] = 'hist'
|
|
|
+ else:
|
|
|
+ df_hist = pd.DataFrame()
|
|
|
+
|
|
|
+ # 2. 未来侧:当前在售价格
|
|
|
+ df_future = df_min_hours[day_key + ['adult_total_price', 'hours_until_departure']].copy().rename(
|
|
|
+ columns={'adult_total_price': 'peak_price', 'hours_until_departure': 'peak_hours'}
|
|
|
+ )
|
|
|
+ df_future['source'] = 'future'
|
|
|
+
|
|
|
+ # 3. 合并包络线数据点
|
|
|
+ df_envelope_all = pd.concat(
|
|
|
+ [x for x in [df_hist, df_future] if not x.empty], ignore_index=True
|
|
|
+ ).drop_duplicates(subset=day_key, keep='last')
|
|
|
+
|
|
|
+ # 4. 包络线统计 + 找高点起飞日
|
|
|
+ df_envelope_agg = df_envelope_all.groupby(flight_key).agg(
|
|
|
+ envelope_max=('peak_price', 'max'), # 峰值最大
|
|
|
+ envelope_min=('peak_price', 'min'), # 峰值最小
|
|
|
+ envelope_mean=('peak_price', 'mean'), # 峰值平均
|
|
|
+ envelope_count=('peak_price', 'count'), # 峰值统计总数
|
|
|
+ envelope_avg_peak_hours=('peak_hours', 'mean'), # 峰值发生的距离起飞小时数, 做一下平均
|
|
|
+ ).reset_index()
|
|
|
+
|
|
|
+ # 对数值列保留两位小数
|
|
|
+ df_envelope_agg[['envelope_mean', 'envelope_avg_peak_hours']] = df_envelope_agg[['envelope_mean', 'envelope_avg_peak_hours']].round(2)
|
|
|
+
|
|
|
+ idx_top = df_envelope_all.groupby(flight_key)['peak_price'].idxmax()
|
|
|
+ df_top = df_envelope_all.loc[idx_top, flight_key + ['flight_day', 'peak_price', 'peak_hours']].rename(
|
|
|
+ columns={'flight_day': 'target_flight_day', 'peak_price': 'target_price', 'peak_hours': 'target_peak_hours'}
|
|
|
+ )
|
|
|
+ df_envelope_agg = df_envelope_agg.merge(df_top, on=flight_key, how='left')
|
|
|
+
|
|
|
+ # 5. 合并到 df_min_hours
|
|
|
+ df_min_hours = df_min_hours.merge(df_envelope_agg, on=flight_key, how='left')
|
|
|
+ price_range = (df_min_hours['envelope_max'] - df_min_hours['envelope_min']).replace(0, 1) # 计算当前价格在包络区间的百分位
|
|
|
+ df_min_hours['envelope_position'] = (
|
|
|
+ (df_min_hours['adult_total_price'] - df_min_hours['envelope_min']) / price_range
|
|
|
+ ).clip(0, 1).round(4)
|
|
|
+ df_min_hours['is_envelope_peak'] = (df_min_hours['envelope_position'] >= 0.75).astype(int) # 0.95 -> 0.75
|
|
|
+ df_min_hours['is_target_day'] = (df_min_hours['flight_day'] == df_min_hours['target_flight_day']).astype(int)
|
|
|
+
|
|
|
+ # ==================== 目标二:降价潜力评分 ====================
|
|
|
+ # 用“上涨后回落倾向”替代简单计数:drop / (drop + rise)
|
|
|
+ # drop_count 来自 _drop_info.csv(上涨段后转跌),rise_count 来自 _rise_info.csv(上涨段后继续涨)
|
|
|
+ df_min_hours['drop_potential'] = 0.0
|
|
|
+
|
|
|
+ # 先保证相关列一定存在,避免后续选列 KeyError
|
|
|
+ # df_min_hours['drop_freq_count'] = 0.0
|
|
|
+ # df_min_hours['rise_freq_count'] = 0.0
|
|
|
+
|
|
|
+ df_drop_freq = pd.DataFrame(columns=flight_key + ['drop_freq_count'])
|
|
|
+ df_rise_freq = pd.DataFrame(columns=flight_key + ['rise_freq_count'])
|
|
|
+
|
|
|
+ if not df_drop_nodes.empty:
|
|
|
+ df_drop_freq = (
|
|
|
+ df_drop_nodes.groupby(flight_key)
|
|
|
+ .size()
|
|
|
+ .reset_index(name='drop_freq_count')
|
|
|
+ )
|
|
|
+
|
|
|
+ if not df_rise_nodes.empty:
|
|
|
+ df_rise_freq = (
|
|
|
+ df_rise_nodes.groupby(flight_key)
|
|
|
+ .size()
|
|
|
+ .reset_index(name='rise_freq_count')
|
|
|
+ )
|
|
|
+
|
|
|
+ if (not df_drop_freq.empty) or (not df_rise_freq.empty):
|
|
|
+ df_min_hours = df_min_hours.merge(df_drop_freq, on=flight_key, how='left')
|
|
|
+ df_min_hours = df_min_hours.merge(df_rise_freq, on=flight_key, how='left')
|
|
|
+
|
|
|
+ df_min_hours['drop_freq_count'] = df_min_hours['drop_freq_count'].fillna(0).astype(float)
|
|
|
+ df_min_hours['rise_freq_count'] = df_min_hours['rise_freq_count'].fillna(0).astype(float)
|
|
|
+
|
|
|
+ # 轻微平滑,避免样本很少时出现 0/0 或过度极端
|
|
|
+ alpha = 1.0
|
|
|
+ denom = df_min_hours['drop_freq_count'] + df_min_hours['rise_freq_count'] + 2.0 * alpha
|
|
|
+ df_min_hours['drop_potential'] = (
|
|
|
+ (df_min_hours['drop_freq_count'] + alpha) / denom.replace(0, np.nan)
|
|
|
+ ).fillna(0.0).clip(0, 1).round(4)
|
|
|
+
|
|
|
+ # ==================== Combined score: weighted sum of envelope position and drop potential ====================
|
|
|
+ # target_score = envelope_position * thres_ep + drop_potential * thres_dp (weighted sum; higher is better for both terms)
|
|
|
+ thres_ep = 0.7
|
|
|
+ thres_dp = 0.3
|
|
|
+ df_min_hours['target_score'] = (
|
|
|
+ df_min_hours['envelope_position'] * thres_ep + df_min_hours['drop_potential'] * thres_dp
|
|
|
+ ).round(4)
|
|
|
+
|
|
|
+ # 综合评分阈值:大于阈值的都认为值得投放
|
|
|
+ target_score_threshold = 0.75
|
|
|
+ # df_min_hours['target_score_threshold'] = target_score_threshold
|
|
|
+ df_min_hours['is_good_target'] = (df_min_hours['target_score'] >= target_score_threshold).astype(int)
|
|
|
+
|
|
|
+ print(f">>> 包络线+降价潜力评分完成")
|
|
|
+ del df_hist, df_future, df_envelope_all, df_envelope_agg, df_top, df_drop_freq, df_rise_freq
|
|
|
+
|
|
|
+ df_min_hours = df_min_hours[df_min_hours['is_good_target'] == 1].reset_index(drop=True) # 保留值得投放的
|
|
|
+
|
|
|
+ # =====================================================================
|
|
|
+
|
|
|
df_min_hours['simple_will_price_drop'] = 0
|
|
|
df_min_hours['simple_drop_in_hours'] = 0
|
|
|
df_min_hours['simple_drop_in_hours_prob'] = 0.0
|
|
|
df_min_hours['simple_drop_in_hours_dist'] = '' # 空串 表示未知
|
|
|
df_min_hours['flag_dist'] = ''
|
|
|
df_min_hours['drop_price_change_upper'] = 0.0
|
|
|
- df_min_hours['drop_price_change_mode'] = 0.0
|
|
|
+ # df_min_hours['drop_price_change_mode'] = 0.0
|
|
|
df_min_hours['drop_price_change_lower'] = 0.0
|
|
|
df_min_hours['drop_price_sample_size'] = 0
|
|
|
df_min_hours['rise_price_change_upper'] = 0.0
|
|
|
- df_min_hours['rise_price_change_mode'] = 0.0
|
|
|
+ # df_min_hours['rise_price_change_mode'] = 0.0
|
|
|
df_min_hours['rise_price_change_lower'] = 0.0
|
|
|
df_min_hours['rise_price_sample_size'] = 0
|
|
|
|
|
|
# 这个阈值取多少?
|
|
|
- pct_threshold = 0.001
|
|
|
+ pct_threshold = 0.01
|
|
|
# pct_threshold = 2
|
|
|
- pct_threshold_1 = 0.001
|
|
|
- pct_threshold_c = 0.001
|
|
|
+ pct_threshold_1 = 0.01
|
|
|
+ # pct_threshold_c = 0.001
|
|
|
|
|
|
for idx, row in df_min_hours.iterrows():
|
|
|
city_pair = row['city_pair']
|
|
|
@@ -1180,9 +1302,9 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
# df_match_chk = df_match_chk.loc[dur_vals.notna()].copy()
|
|
|
# df_match_chk = df_match_chk.loc[(dur_vals.loc[dur_vals.notna()] - float(dur_base)).abs() <= 36].copy()
|
|
|
|
|
|
- drop_hud_vals = pd.to_numeric(df_match_chk['drop_hours_until_departure'], errors='coerce')
|
|
|
- df_match_chk = df_match_chk.loc[drop_hud_vals.notna()].copy()
|
|
|
- df_match_chk = df_match_chk.loc[(drop_hud_vals.loc[drop_hud_vals.notna()] - float(hud_base)).abs() <= 24].copy()
|
|
|
+ # drop_hud_vals = pd.to_numeric(df_match_chk['drop_hours_until_departure'], errors='coerce')
|
|
|
+ # df_match_chk = df_match_chk.loc[drop_hud_vals.notna()].copy()
|
|
|
+ # df_match_chk = df_match_chk.loc[(drop_hud_vals.loc[drop_hud_vals.notna()] - float(hud_base)).abs() <= 24].copy()
|
|
|
|
|
|
# seats_vals = pd.to_numeric(df_match_chk['high_price_seats_remaining_change_amount'], errors='coerce')
|
|
|
# df_match_chk = df_match_chk.loc[seats_vals.notna()].copy()
|
|
|
@@ -1198,9 +1320,9 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
df_min_hours.loc[idx, 'drop_price_change_upper'] = round(drop_price_change_upper, 2)
|
|
|
df_min_hours.loc[idx, 'drop_price_change_lower'] = round(drop_price_change_lower, 2)
|
|
|
|
|
|
- drop_mode_values = df_match_chk['drop_price_change_amount'].mode() # 降价众数
|
|
|
- if len(drop_mode_values) > 0:
|
|
|
- df_min_hours.loc[idx, 'drop_price_change_mode'] = round(float(drop_mode_values[0]), 2)
|
|
|
+ # drop_mode_values = df_match_chk['drop_price_change_amount'].mode() # 降价众数
|
|
|
+ # if len(drop_mode_values) > 0:
|
|
|
+ # df_min_hours.loc[idx, 'drop_price_change_mode'] = round(float(drop_mode_values[0]), 2)
|
|
|
|
|
|
remaining_hours = (
|
|
|
pd.to_numeric(df_match_chk['high_price_duration_hours'], errors='coerce') - float(dur_base)
|
|
|
@@ -1379,9 +1501,9 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
# df_match_chk_1 = df_match_chk_1.loc[dur_vals_1.notna()].copy()
|
|
|
# df_match_chk_1 = df_match_chk_1.loc[(dur_vals_1.loc[dur_vals_1.notna()] - float(dur_base_1)).abs() <= 24].copy()
|
|
|
|
|
|
- rise_hud_vals_1 = pd.to_numeric(df_match_chk_1['rise_hours_until_departure'], errors='coerce')
|
|
|
- df_match_chk_1 = df_match_chk_1.loc[rise_hud_vals_1.notna()].copy()
|
|
|
- df_match_chk_1 = df_match_chk_1.loc[(rise_hud_vals_1.loc[rise_hud_vals_1.notna()] - float(hud_base_1)).abs() <= 24].copy()
|
|
|
+ # rise_hud_vals_1 = pd.to_numeric(df_match_chk_1['rise_hours_until_departure'], errors='coerce')
|
|
|
+ # df_match_chk_1 = df_match_chk_1.loc[rise_hud_vals_1.notna()].copy()
|
|
|
+ # df_match_chk_1 = df_match_chk_1.loc[(rise_hud_vals_1.loc[rise_hud_vals_1.notna()] - float(hud_base_1)).abs() <= 24].copy()
|
|
|
|
|
|
# seats_vals_1 = pd.to_numeric(df_match_chk_1['rise_seats_remaining_change_amount'], errors='coerce')
|
|
|
# df_match_chk_1 = df_match_chk_1.loc[seats_vals_1.notna()].copy()
|
|
|
@@ -1397,9 +1519,9 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
df_min_hours.loc[idx, 'rise_price_change_upper'] = round(rise_price_change_upper, 2)
|
|
|
df_min_hours.loc[idx, 'rise_price_change_lower'] = round(rise_price_change_lower, 2)
|
|
|
|
|
|
- rise_mode_values = df_match_chk_1['rise_price_change_amount'].mode() # 涨价众数
|
|
|
- if len(rise_mode_values) > 0:
|
|
|
- df_min_hours.loc[idx, 'rise_price_change_mode'] = round(float(rise_mode_values[0]), 2)
|
|
|
+ # rise_mode_values = df_match_chk_1['rise_price_change_amount'].mode() # 涨价众数
|
|
|
+ # if len(rise_mode_values) > 0:
|
|
|
+ # df_min_hours.loc[idx, 'rise_price_change_mode'] = round(float(rise_mode_values[0]), 2)
|
|
|
|
|
|
# 可以明确的判定不降价
|
|
|
if length_drop == 0:
|
|
|
@@ -1478,7 +1600,27 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
# df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.0
|
|
|
# df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'n1'
|
|
|
pass
|
|
|
- print("判定过程结束")
|
|
|
+ print("判定循环结束")
|
|
|
+ # 按航班号统一其降价/涨价的上限与下限, 上限统一取最大, 下限统一取最小
|
|
|
+ # _grp_cols = ['city_pair', 'flight_number_1', 'flight_number_2']
|
|
|
+ # _g = df_min_hours.groupby(_grp_cols, dropna=False)
|
|
|
+ # df_min_hours['drop_price_change_upper'] = pd.to_numeric(
|
|
|
+ # _g['drop_price_change_upper'].transform('max'),
|
|
|
+ # errors='coerce'
|
|
|
+ # ).fillna(0.0).round(2)
|
|
|
+ # df_min_hours['drop_price_change_lower'] = pd.to_numeric(
|
|
|
+ # _g['drop_price_change_lower'].transform('min'),
|
|
|
+ # errors='coerce'
|
|
|
+ # ).fillna(0.0).round(2)
|
|
|
+ # df_min_hours['rise_price_change_upper'] = pd.to_numeric(
|
|
|
+ # _g['rise_price_change_upper'].transform('max'),
|
|
|
+ # errors='coerce'
|
|
|
+ # ).fillna(0.0).round(2)
|
|
|
+ # df_min_hours['rise_price_change_lower'] = pd.to_numeric(
|
|
|
+ # _g['rise_price_change_lower'].transform('min'),
|
|
|
+ # errors='coerce'
|
|
|
+ # ).fillna(0.0).round(2)
|
|
|
+
|
|
|
df_min_hours = df_min_hours.rename(columns={'seg1_dep_time': 'from_time'})
|
|
|
_pred_dt = pd.to_datetime(str(pred_time_str), format="%Y%m%d%H%M", errors="coerce")
|
|
|
df_min_hours["update_hour"] = _pred_dt.strftime("%Y-%m-%d %H:%M:%S")
|
|
|
@@ -1494,8 +1636,13 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
'valid_begin_hour', 'valid_end_hour',
|
|
|
'simple_will_price_drop', 'simple_drop_in_hours', 'simple_drop_in_hours_prob', 'simple_drop_in_hours_dist',
|
|
|
'flag_dist',
|
|
|
- 'drop_price_change_upper', 'drop_price_change_mode', 'drop_price_change_lower', 'drop_price_sample_size',
|
|
|
- 'rise_price_change_upper', 'rise_price_change_mode', 'rise_price_change_lower', 'rise_price_sample_size',
|
|
|
+ 'drop_price_change_upper', 'drop_price_change_lower', 'drop_price_sample_size',
|
|
|
+ 'rise_price_change_upper', 'rise_price_change_lower', 'rise_price_sample_size',
|
|
|
+ 'envelope_max', 'envelope_min', 'envelope_mean', 'envelope_count',
|
|
|
+ 'envelope_avg_peak_hours', 'envelope_position', 'is_envelope_peak', # 包络线特征
|
|
|
+ 'target_flight_day', 'target_price', 'target_peak_hours', 'is_target_day', # 高点起飞日(纯包络线高点)
|
|
|
+ 'drop_freq_count', 'drop_potential', # 降价潜力
|
|
|
+ 'target_score', 'is_good_target', # 综合目标评分(高点 × 降价潜力 = 最终投放目标)
|
|
|
]
|
|
|
df_predict = df_min_hours[order_cols]
|
|
|
df_predict = df_predict.rename(columns={
|
|
|
@@ -1513,15 +1660,25 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
na_position='last',
|
|
|
).reset_index(drop=True)
|
|
|
|
|
|
- # 时间段过滤 过滤掉异常时间(update_hour 早于 crawl_date)因为现在有实时验价, 不做8小时之内的过滤
|
|
|
+ # Time-window filter: keep only rows where update_hour is between 0 and 12 hours after crawl_date (drops update_hour earlier than crawl_date)
|
|
|
update_dt = pd.to_datetime(df_predict["update_hour"], errors="coerce")
|
|
|
crawl_dt = pd.to_datetime(df_predict["crawl_date"], errors="coerce")
|
|
|
dt_diff = update_dt - crawl_dt
|
|
|
df_predict = df_predict.loc[
|
|
|
- # (dt_diff >= pd.Timedelta(0)) & (dt_diff <= pd.Timedelta(hours=8))
|
|
|
- (dt_diff >= pd.Timedelta(0))
|
|
|
+ (dt_diff >= pd.Timedelta(0)) & (dt_diff <= pd.Timedelta(hours=12))
|
|
|
+ # (dt_diff >= pd.Timedelta(0))
|
|
|
].reset_index(drop=True)
|
|
|
- print("更新时间过滤")
|
|
|
+ print("更新时间过滤完成")
|
|
|
+
|
|
|
+ total_cnt = len(df_predict)
|
|
|
+ if "will_price_drop" in df_predict.columns:
|
|
|
+ _wpd = pd.to_numeric(df_predict["will_price_drop"], errors="coerce")
|
|
|
+ drop_1_cnt = int((_wpd == 1).sum())
|
|
|
+ drop_0_cnt = int((_wpd == 0).sum())
|
|
|
+ else:
|
|
|
+ drop_1_cnt = 0
|
|
|
+ drop_0_cnt = 0
|
|
|
+ print(f"will_price_drop 分类数量统计: 1(会降)={drop_1_cnt}, 0(不降)={drop_0_cnt}, 总数={total_cnt}")
|
|
|
|
|
|
csv_path1 = os.path.join(predict_dir, f'future_predictions_{pred_time_str}.csv')
|
|
|
df_predict.to_csv(csv_path1, mode='a', index=False, header=not os.path.exists(csv_path1), encoding='utf-8-sig')
|