|
@@ -66,10 +66,13 @@ def preprocess_data_simple(df_input, is_train=False, hourly_time=None):
|
|
|
.round(4)
|
|
.round(4)
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
- # 第一步:标记价格变化段
|
|
|
|
|
|
|
+ # 第一步:标记价格变化段(按“是否发生新的实际变价事件”切段)
|
|
|
|
|
+ # 这样即使连续两次变价金额相同(如 -50, -50),也会分到不同段
|
|
|
|
|
+ _price_change_event = df_input['_raw_price_diff'].abs().ge(price_change_amount_threshold)
|
|
|
df_input['price_change_segment'] = (
|
|
df_input['price_change_segment'] = (
|
|
|
- df_input.groupby(['gid', 'baggage_weight'], group_keys=False)['price_change_amount']
|
|
|
|
|
- .apply(lambda s: (s != s.shift()).cumsum())
|
|
|
|
|
|
|
+ _price_change_event
|
|
|
|
|
+ .groupby([df_input['gid'], df_input['baggage_weight']], group_keys=False)
|
|
|
|
|
+ .cumsum()
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
# 第二步:计算每个变化段内的持续时间
|
|
# 第二步:计算每个变化段内的持续时间
|
|
@@ -99,7 +102,14 @@ def preprocess_data_simple(df_input, is_train=False, hourly_time=None):
|
|
|
|
|
|
|
|
# 对于先升后降(先降再降)的分析
|
|
# 对于先升后降(先降再降)的分析
|
|
|
seg_start_mask = df_target['price_duration_hours'].eq(1) # 开始变价节点
|
|
seg_start_mask = df_target['price_duration_hours'].eq(1) # 开始变价节点
|
|
|
- drop_mask = seg_start_mask & ((prev_pct > 0) | (prev_pct < 0)) & (df_target['price_change_percent'] < 0)
|
|
|
|
|
|
|
+ # 正例库仅保留24小时内发生的降价:上一价格段持续时长需<=24h
|
|
|
|
|
+ prev_pct_num = pd.to_numeric(prev_pct, errors='coerce')
|
|
|
|
|
+ drop_mask = (
|
|
|
|
|
+ seg_start_mask
|
|
|
|
|
+ & prev_pct_num.notna()
|
|
|
|
|
+ & (df_target['price_change_percent'] < 0)
|
|
|
|
|
+ & prev_dur.le(24)
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
df_drop_nodes = df_target.loc[drop_mask, ['gid', 'baggage_weight', 'hours_until_departure', 'days_to_departure', 'update_hour', 'update_week', 'cabins']].copy()
|
|
df_drop_nodes = df_target.loc[drop_mask, ['gid', 'baggage_weight', 'hours_until_departure', 'days_to_departure', 'update_hour', 'update_week', 'cabins']].copy()
|
|
|
df_drop_nodes.rename(columns={'hours_until_departure': 'drop_hours_until_departure'}, inplace=True)
|
|
df_drop_nodes.rename(columns={'hours_until_departure': 'drop_hours_until_departure'}, inplace=True)
|
|
@@ -117,10 +127,10 @@ def preprocess_data_simple(df_input, is_train=False, hourly_time=None):
|
|
|
df_drop_nodes = df_drop_nodes.reset_index(drop=True)
|
|
df_drop_nodes = df_drop_nodes.reset_index(drop=True)
|
|
|
|
|
|
|
|
flight_info_cols = [
|
|
flight_info_cols = [
|
|
|
- 'citypair', 'flight_numbers', 'from_time', 'from_date', 'currency',
|
|
|
|
|
|
|
+ 'gid', 'baggage_weight', 'citypair', 'flight_numbers', 'from_time', 'from_date', 'currency',
|
|
|
]
|
|
]
|
|
|
flight_info_cols = [c for c in flight_info_cols if c in df_target.columns]
|
|
flight_info_cols = [c for c in flight_info_cols if c in df_target.columns]
|
|
|
- df_gid_info = df_target[['gid', 'baggage_weight'] + flight_info_cols].drop_duplicates(subset=['gid', 'baggage_weight']).reset_index(drop=True)
|
|
|
|
|
|
|
+ df_gid_info = df_target[flight_info_cols].drop_duplicates(subset=['gid', 'baggage_weight']).reset_index(drop=True)
|
|
|
df_drop_nodes = df_drop_nodes.merge(df_gid_info, on=['gid', 'baggage_weight'], how='left')
|
|
df_drop_nodes = df_drop_nodes.merge(df_gid_info, on=['gid', 'baggage_weight'], how='left')
|
|
|
|
|
|
|
|
drop_info_cols = [
|
|
drop_info_cols = [
|
|
@@ -128,12 +138,20 @@ def preprocess_data_simple(df_input, is_train=False, hourly_time=None):
|
|
|
'drop_days_to_departure', 'drop_hours_until_departure', 'drop_price_change_percent', 'drop_price_change_amount',
|
|
'drop_days_to_departure', 'drop_hours_until_departure', 'drop_price_change_percent', 'drop_price_change_amount',
|
|
|
'high_price_duration_hours', 'high_price_change_percent', 'high_price_change_amount', 'high_price_amount', 'high_price_cabins',
|
|
'high_price_duration_hours', 'high_price_change_percent', 'high_price_change_amount', 'high_price_amount', 'high_price_cabins',
|
|
|
]
|
|
]
|
|
|
- # 按顺序排列 去掉gid
|
|
|
|
|
- df_drop_nodes = df_drop_nodes[flight_info_cols + ['baggage_weight'] + drop_info_cols]
|
|
|
|
|
|
|
+ # 按顺序排列 保留gid
|
|
|
|
|
+ df_drop_nodes = df_drop_nodes[flight_info_cols + drop_info_cols]
|
|
|
|
|
|
|
|
- # 对于先升再升(先降再升)的分析
|
|
|
|
|
|
|
+ # 反例库:所有有效节点(不限升价)中,未来24小时内未发生降价
|
|
|
# seg_start_mask = df_target['price_duration_hours'].eq(1)
|
|
# seg_start_mask = df_target['price_duration_hours'].eq(1)
|
|
|
- rise_mask = seg_start_mask & ((prev_pct > 0) | (prev_pct < 0)) & (df_target['price_change_percent'] > 0)
|
|
|
|
|
|
|
+ # rise_mask = seg_start_mask & ((prev_pct > 0) | (prev_pct < 0)) & (df_target['price_change_percent'] > 0)
|
|
|
|
|
+ prev_pct_num = pd.to_numeric(prev_pct, errors='coerce')
|
|
|
|
|
+ valid_mask = seg_start_mask & prev_pct_num.notna()
|
|
|
|
|
+
|
|
|
|
|
+ curr_pct = pd.to_numeric(df_target['price_change_percent'], errors='coerce')
|
|
|
|
|
+ prev_dur_num = pd.to_numeric(prev_dur, errors='coerce')
|
|
|
|
|
+ pos_case_mask = curr_pct.ge(0)
|
|
|
|
|
+ neg_case_mask = curr_pct.lt(0) & prev_dur_num.gt(24)
|
|
|
|
|
+ rise_mask = valid_mask & (pos_case_mask | neg_case_mask)
|
|
|
|
|
|
|
|
df_rise_nodes = df_target.loc[rise_mask, ['gid', 'baggage_weight', 'hours_until_departure', 'days_to_departure', 'update_hour', 'update_week', 'cabins']].copy()
|
|
df_rise_nodes = df_target.loc[rise_mask, ['gid', 'baggage_weight', 'hours_until_departure', 'days_to_departure', 'update_hour', 'update_week', 'cabins']].copy()
|
|
|
df_rise_nodes.rename(columns={'hours_until_departure': 'rise_hours_until_departure'}, inplace=True)
|
|
df_rise_nodes.rename(columns={'hours_until_departure': 'rise_hours_until_departure'}, inplace=True)
|
|
@@ -157,7 +175,7 @@ def preprocess_data_simple(df_input, is_train=False, hourly_time=None):
|
|
|
'rise_days_to_departure', 'rise_hours_until_departure', 'rise_price_change_percent', 'rise_price_change_amount',
|
|
'rise_days_to_departure', 'rise_hours_until_departure', 'rise_price_change_percent', 'rise_price_change_amount',
|
|
|
'prev_rise_duration_hours', 'prev_rise_change_percent', 'prev_rise_change_amount', 'prev_rise_amount', 'prev_rise_cabins'
|
|
'prev_rise_duration_hours', 'prev_rise_change_percent', 'prev_rise_change_amount', 'prev_rise_amount', 'prev_rise_cabins'
|
|
|
]
|
|
]
|
|
|
- df_rise_nodes = df_rise_nodes[flight_info_cols + ['baggage_weight'] + rise_info_cols]
|
|
|
|
|
|
|
+ df_rise_nodes = df_rise_nodes[flight_info_cols + rise_info_cols]
|
|
|
|
|
|
|
|
# 制作历史包络线
|
|
# 制作历史包络线
|
|
|
envelope_group = ['citypair', 'flight_numbers', 'from_date', 'baggage_weight']
|
|
envelope_group = ['citypair', 'flight_numbers', 'from_date', 'baggage_weight']
|
|
@@ -199,6 +217,7 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
|
|
|
.reset_index(drop=True)
|
|
.reset_index(drop=True)
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
|
|
+ # 余票不能太少
|
|
|
df_min_hours = df_min_hours[(df_min_hours['ticket_amount'] >= 2)].reset_index(drop=True)
|
|
df_min_hours = df_min_hours[(df_min_hours['ticket_amount'] >= 2)].reset_index(drop=True)
|
|
|
|
|
|
|
|
# 读历史降价场景
|
|
# 读历史降价场景
|
|
@@ -215,7 +234,7 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
|
|
|
else:
|
|
else:
|
|
|
df_rise_nodes = pd.DataFrame()
|
|
df_rise_nodes = pd.DataFrame()
|
|
|
|
|
|
|
|
- # 联合价格分布
|
|
|
|
|
|
|
+ # 联合价格分布 ==========================================================
|
|
|
# 统一初始化
|
|
# 统一初始化
|
|
|
df_min_hours['relative_position'] = np.nan
|
|
df_min_hours['relative_position'] = np.nan
|
|
|
if not df_drop_nodes.empty:
|
|
if not df_drop_nodes.empty:
|
|
@@ -274,7 +293,67 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
|
|
|
m = all_prices['source'] == 'rise'
|
|
m = all_prices['source'] == 'rise'
|
|
|
df_rise_nodes.loc[all_prices.loc[m, 'row_id'], 'relative_position'] = all_prices.loc[m, 'relative_position'].values
|
|
df_rise_nodes.loc[all_prices.loc[m, 'row_id'], 'relative_position'] = all_prices.loc[m, 'relative_position'].values
|
|
|
|
|
|
|
|
- pass
|
|
|
|
|
|
|
+ # ====================================================================================================
|
|
|
|
|
+
|
|
|
|
|
+ # print(">>> 构建跨航班日价格包络线")
|
|
|
|
|
+ # flight_key = ['citypair', 'flight_numbers', 'baggage_weight']
|
|
|
|
|
+ # day_key = flight_key + ['from_date']
|
|
|
|
|
+
|
|
|
|
|
+ # # 1. 历史侧:加载训练阶段的峰值数据
|
|
|
|
|
+ # envelope_csv_path = os.path.join(object_dir, f'{city_pair}_envelope_info.csv')
|
|
|
|
|
+ # if os.path.exists(envelope_csv_path):
|
|
|
|
|
+ # df_hist = pd.read_csv(envelope_csv_path)
|
|
|
|
|
+ # df_hist = df_hist[day_key + ['peak_price', 'peak_hours']]
|
|
|
|
|
+ # df_hist['source'] = 'hist'
|
|
|
|
|
+ # else:
|
|
|
|
|
+ # df_hist = pd.DataFrame()
|
|
|
|
|
+
|
|
|
|
|
+ # # 2. 未来侧:当前在售价格
|
|
|
|
|
+ # # df_future = df_min_hours[day_key + ['price_total', 'hours_until_departure']].copy().rename(
|
|
|
|
|
+ # # columns={'price_total': 'peak_price', 'hours_until_departure': 'peak_hours'}
|
|
|
|
|
+ # # )
|
|
|
|
|
+ # # df_future['source'] = 'future'
|
|
|
|
|
+ # df_future = pd.DataFrame()
|
|
|
|
|
+
|
|
|
|
|
+ # # 3. 合并包络线数据点
|
|
|
|
|
+ # df_envelope_all = pd.concat(
|
|
|
|
|
+ # [x for x in [df_hist, df_future] if not x.empty], ignore_index=True
|
|
|
|
|
+ # ).drop_duplicates(subset=day_key, keep='last')
|
|
|
|
|
+
|
|
|
|
|
+ # # 4. 包络线统计 + 找高点起飞日
|
|
|
|
|
+ # df_envelope_agg = df_envelope_all.groupby(flight_key).agg(
|
|
|
|
|
+ # envelope_max=('peak_price', 'max'), # 峰值最大
|
|
|
|
|
+ # envelope_min=('peak_price', 'min'), # 峰值最小
|
|
|
|
|
+ # envelope_mean=('peak_price', 'mean'), # 峰值平均
|
|
|
|
|
+ # envelope_count=('peak_price', 'count'), # 峰值统计总数
|
|
|
|
|
+ # envelope_avg_peak_hours=('peak_hours', 'mean'), # 峰值发生的距离起飞小时数, 做一下平均
|
|
|
|
|
+ # ).reset_index()
|
|
|
|
|
+
|
|
|
|
|
+ # # 对数值列保留两位小数
|
|
|
|
|
+ # df_envelope_agg[['envelope_mean', 'envelope_avg_peak_hours']] = df_envelope_agg[['envelope_mean', 'envelope_avg_peak_hours']].round(2)
|
|
|
|
|
+
|
|
|
|
|
+ # idx_top = df_envelope_all.groupby(flight_key)['peak_price'].idxmax()
|
|
|
|
|
+ # df_top = df_envelope_all.loc[idx_top, flight_key + ['from_date', 'peak_price', 'peak_hours']].rename(
|
|
|
|
|
+ # columns={'from_date': 'target_flight_day', 'peak_price': 'target_price', 'peak_hours': 'target_peak_hours'}
|
|
|
|
|
+ # )
|
|
|
|
|
+ # df_envelope_agg = df_envelope_agg.merge(df_top, on=flight_key, how='left')
|
|
|
|
|
+
|
|
|
|
|
+ # # 5. 合并到 df_min_hours
|
|
|
|
|
+ # df_min_hours = df_min_hours.merge(df_envelope_agg, on=flight_key, how='left')
|
|
|
|
|
+ # price_range = (df_min_hours['envelope_max'] - df_min_hours['envelope_min']).replace(0, 1) # 计算当前价格在包络区间的百分位
|
|
|
|
|
+ # df_min_hours['envelope_position'] = (
|
|
|
|
|
+ # (df_min_hours['price_total'] - df_min_hours['envelope_min']) / price_range
|
|
|
|
|
+ # ).clip(0, 1).round(4)
|
|
|
|
|
+ # # df_min_hours['is_envelope_peak'] = (df_min_hours['envelope_position'] >= 0.75).astype(int) # 0.95 -> 0.75
|
|
|
|
|
+ # df_min_hours['is_target_day'] = (df_min_hours['from_date'] == df_min_hours['target_flight_day']).astype(int)
|
|
|
|
|
+
|
|
|
|
|
+ # 综合评分阈值:大于阈值的都认为值得投放
|
|
|
|
|
+ relative_position_threshold = 0.5
|
|
|
|
|
+ df_min_hours['is_good_target'] = (df_min_hours['relative_position'] >= relative_position_threshold).astype(int)
|
|
|
|
|
+ total_cnt_before = len(df_min_hours) # 记录下过滤前的总数
|
|
|
|
|
+ df_min_hours = df_min_hours[(df_min_hours['is_good_target'] == 1)].reset_index(drop=True) # 保留值得投放的
|
|
|
|
|
+ total_cnt_after = len(df_min_hours) # 记录下过滤后的总数
|
|
|
|
|
+
|
|
|
# =====================================================================
|
|
# =====================================================================
|
|
|
|
|
|
|
|
df_min_hours['simple_will_price_drop'] = 0
|
|
df_min_hours['simple_will_price_drop'] = 0
|
|
@@ -290,8 +369,8 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
|
|
|
df_min_hours['rise_price_sample_size'] = 0
|
|
df_min_hours['rise_price_sample_size'] = 0
|
|
|
|
|
|
|
|
# 这个阈值取多少?
|
|
# 这个阈值取多少?
|
|
|
- pct_threshold = 0.01
|
|
|
|
|
- pct_threshold_1 = 0.01
|
|
|
|
|
|
|
+ pct_threshold = 0.1
|
|
|
|
|
+ pct_threshold_1 = 0.1
|
|
|
|
|
|
|
|
for idx, row in df_min_hours.iterrows():
|
|
for idx, row in df_min_hours.iterrows():
|
|
|
city_pair = row['citypair']
|
|
city_pair = row['citypair']
|
|
@@ -338,7 +417,15 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
|
|
|
df_drop_gap['price_abs_gap'] = df_drop_gap['price_gap'].abs()
|
|
df_drop_gap['price_abs_gap'] = df_drop_gap['price_gap'].abs()
|
|
|
|
|
|
|
|
df_drop_gap = df_drop_gap.sort_values(['price_abs_gap', 'pct_abs_gap'], ascending=[True, True])
|
|
df_drop_gap = df_drop_gap.sort_values(['price_abs_gap', 'pct_abs_gap'], ascending=[True, True])
|
|
|
- df_match = df_drop_gap[(df_drop_gap['pct_abs_gap'] <= pct_threshold) & (df_drop_gap['price_abs_gap'] <= 3.0)].copy()
|
|
|
|
|
|
|
+ same_sign_mask = (
|
|
|
|
|
+ np.sign(pd.to_numeric(df_drop_gap['high_price_change_percent'], errors='coerce'))
|
|
|
|
|
+ == np.sign(pct_base)
|
|
|
|
|
+ )
|
|
|
|
|
+ df_match = df_drop_gap[
|
|
|
|
|
+ (df_drop_gap['pct_abs_gap'] <= pct_threshold)
|
|
|
|
|
+ & (df_drop_gap['price_abs_gap'] <= 3.0)
|
|
|
|
|
+ & same_sign_mask
|
|
|
|
|
+ ].copy()
|
|
|
|
|
|
|
|
# 历史上出现的极近似的增长(下降)幅度后的降价场景
|
|
# 历史上出现的极近似的增长(下降)幅度后的降价场景
|
|
|
if not df_match.empty:
|
|
if not df_match.empty:
|
|
@@ -353,17 +440,18 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
|
|
|
# df_match_chk = df_match_chk.loc[drop_dtd_vals.notna()].copy()
|
|
# df_match_chk = df_match_chk.loc[drop_dtd_vals.notna()].copy()
|
|
|
# df_match_chk = df_match_chk.loc[(drop_dtd_vals.loc[drop_dtd_vals.notna()] - float(dtd_base)).abs() <= 3].copy()
|
|
# df_match_chk = df_match_chk.loc[(drop_dtd_vals.loc[drop_dtd_vals.notna()] - float(dtd_base)).abs() <= 3].copy()
|
|
|
|
|
|
|
|
- drop_hud_vals = pd.to_numeric(df_match_chk['drop_hours_until_departure'], errors='coerce')
|
|
|
|
|
- df_match_chk = df_match_chk.loc[drop_hud_vals.notna()].copy()
|
|
|
|
|
- df_match_chk = df_match_chk.loc[(float(hud_base) - drop_hud_vals.loc[drop_hud_vals.notna()]) >= -24].copy()
|
|
|
|
|
|
|
+ # drop_hud_vals = pd.to_numeric(df_match_chk['drop_hours_until_departure'], errors='coerce')
|
|
|
|
|
+ # df_match_chk = df_match_chk.loc[drop_hud_vals.notna()].copy()
|
|
|
|
|
+ # df_match_chk = df_match_chk.loc[(float(hud_base) - drop_hud_vals.loc[drop_hud_vals.notna()]) >= -24].copy()
|
|
|
|
|
|
|
|
|
|
+ # 正例收紧
|
|
|
dur_num_chk = pd.to_numeric(df_match_chk['high_price_duration_hours'], errors='coerce')
|
|
dur_num_chk = pd.to_numeric(df_match_chk['high_price_duration_hours'], errors='coerce')
|
|
|
dur_delta = dur_num_chk - float(dur_base)
|
|
dur_delta = dur_num_chk - float(dur_base)
|
|
|
df_match_chk = df_match_chk.assign(dur_delta=dur_delta)
|
|
df_match_chk = df_match_chk.assign(dur_delta=dur_delta)
|
|
|
df_match_chk = df_match_chk.loc[df_match_chk['dur_delta'].notna()].copy()
|
|
df_match_chk = df_match_chk.loc[df_match_chk['dur_delta'].notna()].copy()
|
|
|
- df_match_chk = df_match_chk.loc[df_match_chk['dur_delta'].abs() <= 48].copy()
|
|
|
|
|
|
|
+ df_match_chk = df_match_chk.loc[df_match_chk['dur_delta'].abs() <= 72].copy()
|
|
|
|
|
|
|
|
- # 距离起飞天数也对的上
|
|
|
|
|
|
|
+ # 所有条件都对的上
|
|
|
if not df_match_chk.empty:
|
|
if not df_match_chk.empty:
|
|
|
length_drop = df_match_chk.shape[0]
|
|
length_drop = df_match_chk.shape[0]
|
|
|
df_min_hours.loc[idx, 'drop_price_sample_size'] = length_drop
|
|
df_min_hours.loc[idx, 'drop_price_sample_size'] = length_drop
|
|
@@ -425,7 +513,15 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
|
|
|
df_rise_gap_1['price_abs_gap'] = df_rise_gap_1['price_gap'].abs()
|
|
df_rise_gap_1['price_abs_gap'] = df_rise_gap_1['price_gap'].abs()
|
|
|
|
|
|
|
|
df_rise_gap_1 = df_rise_gap_1.sort_values(['price_abs_gap', 'pct_abs_gap'], ascending=[True, True])
|
|
df_rise_gap_1 = df_rise_gap_1.sort_values(['price_abs_gap', 'pct_abs_gap'], ascending=[True, True])
|
|
|
- df_match_1 = df_rise_gap_1.loc[(df_rise_gap_1['pct_abs_gap'] <= pct_threshold_1) & (df_rise_gap_1['price_abs_gap'] <= 3.0)].copy()
|
|
|
|
|
|
|
+ same_sign_mask_1 = (
|
|
|
|
|
+ np.sign(pd.to_numeric(df_rise_gap_1['prev_rise_change_percent'], errors='coerce'))
|
|
|
|
|
+ == np.sign(pct_base_1)
|
|
|
|
|
+ )
|
|
|
|
|
+ df_match_1 = df_rise_gap_1.loc[
|
|
|
|
|
+ (df_rise_gap_1['pct_abs_gap'] <= pct_threshold_1)
|
|
|
|
|
+ & (df_rise_gap_1['price_abs_gap'] <= 3.0)
|
|
|
|
|
+ & same_sign_mask_1
|
|
|
|
|
+ ].copy()
|
|
|
|
|
|
|
|
# 历史上出现的极近似的增长(下降)幅度后的升价场景
|
|
# 历史上出现的极近似的增长(下降)幅度后的升价场景
|
|
|
if not df_match_1.empty:
|
|
if not df_match_1.empty:
|
|
@@ -440,11 +536,17 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
|
|
|
# df_match_chk_1 = df_match_chk_1.loc[drop_dtd_vals_1.notna()].copy()
|
|
# df_match_chk_1 = df_match_chk_1.loc[drop_dtd_vals_1.notna()].copy()
|
|
|
# df_match_chk_1 = df_match_chk_1.loc[(drop_dtd_vals_1.loc[drop_dtd_vals_1.notna()] - float(dtd_base_1)).abs() <= 3].copy()
|
|
# df_match_chk_1 = df_match_chk_1.loc[(drop_dtd_vals_1.loc[drop_dtd_vals_1.notna()] - float(dtd_base_1)).abs() <= 3].copy()
|
|
|
|
|
|
|
|
- rise_hud_vals_1 = pd.to_numeric(df_match_chk_1['rise_hours_until_departure'], errors='coerce')
|
|
|
|
|
- df_match_chk_1 = df_match_chk_1.loc[rise_hud_vals_1.notna()].copy()
|
|
|
|
|
- df_match_chk_1 = df_match_chk_1.loc[(float(hud_base_1) - rise_hud_vals_1.loc[rise_hud_vals_1.notna()]) >= -24].copy()
|
|
|
|
|
|
|
+ # rise_hud_vals_1 = pd.to_numeric(df_match_chk_1['rise_hours_until_departure'], errors='coerce')
|
|
|
|
|
+ # df_match_chk_1 = df_match_chk_1.loc[rise_hud_vals_1.notna()].copy()
|
|
|
|
|
+ # df_match_chk_1 = df_match_chk_1.loc[(float(hud_base_1) - rise_hud_vals_1.loc[rise_hud_vals_1.notna()]) >= -24].copy()
|
|
|
|
|
+
|
|
|
|
|
+ # 反例收紧:48小时内发生降价的不算显著反例
|
|
|
|
|
+ _rise_pct_chk = pd.to_numeric(df_match_chk_1['rise_price_change_percent'], errors='coerce')
|
|
|
|
|
+ _prev_dur_chk = pd.to_numeric(df_match_chk_1['prev_rise_duration_hours'], errors='coerce')
|
|
|
|
|
+ _exclude_mask = _rise_pct_chk.lt(0) & _prev_dur_chk.lt(48)
|
|
|
|
|
+ df_match_chk_1 = df_match_chk_1.loc[~_exclude_mask.fillna(False)].copy()
|
|
|
|
|
|
|
|
- # 距离起飞天数也对的上
|
|
|
|
|
|
|
+ # 所有条件都对的上
|
|
|
if not df_match_chk_1.empty:
|
|
if not df_match_chk_1.empty:
|
|
|
length_rise = df_match_chk_1.shape[0]
|
|
length_rise = df_match_chk_1.shape[0]
|
|
|
df_min_hours.loc[idx, 'rise_price_sample_size'] = length_rise
|
|
df_min_hours.loc[idx, 'rise_price_sample_size'] = length_rise
|
|
@@ -465,7 +567,7 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
|
|
|
else:
|
|
else:
|
|
|
drop_prob = round(length_drop / (length_rise + length_drop), 2)
|
|
drop_prob = round(length_drop / (length_rise + length_drop), 2)
|
|
|
# 依旧保持之前的降价判定,概率修改
|
|
# 依旧保持之前的降价判定,概率修改
|
|
|
- if drop_prob >= 0.7:
|
|
|
|
|
|
|
+ if drop_prob > 0.5:
|
|
|
df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
|
|
df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
|
|
|
# df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'd1'
|
|
# df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'd1'
|
|
|
df_min_hours.loc[idx, 'flag_dist'] = 'd1'
|
|
df_min_hours.loc[idx, 'flag_dist'] = 'd1'
|
|
@@ -487,7 +589,7 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
|
|
|
order_cols = [
|
|
order_cols = [
|
|
|
"citypair", "flight_numbers", "baggage_weight", "from_date", "from_time",
|
|
"citypair", "flight_numbers", "baggage_weight", "from_date", "from_time",
|
|
|
"cabins", "ticket_amount", "currency", "price_base", "price_tax",
|
|
"cabins", "ticket_amount", "currency", "price_base", "price_tax",
|
|
|
- "price_total", 'relative_position', 'days_to_departure', 'hours_until_departure',
|
|
|
|
|
|
|
+ "price_total", 'relative_position', 'is_good_target', 'days_to_departure', 'hours_until_departure',
|
|
|
'price_change_amount', 'price_change_percent', 'price_duration_hours',
|
|
'price_change_amount', 'price_change_percent', 'price_duration_hours',
|
|
|
"update_hour", "create_time",
|
|
"update_hour", "create_time",
|
|
|
'valid_begin_hour', 'valid_end_hour',
|
|
'valid_begin_hour', 'valid_end_hour',
|
|
@@ -519,7 +621,7 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
|
|
|
else:
|
|
else:
|
|
|
drop_1_cnt = 0
|
|
drop_1_cnt = 0
|
|
|
drop_0_cnt = 0
|
|
drop_0_cnt = 0
|
|
|
- print(f"will_price_drop 分类数量统计: 1(会降)={drop_1_cnt}, 0(不降)={drop_0_cnt}, 总数={total_cnt}")
|
|
|
|
|
|
|
+ print(f"will_price_drop 分类数量统计: 1(会降)={drop_1_cnt}, 0(不降)={drop_0_cnt}, 总数={total_cnt}, 过滤前总数={total_cnt_before}")
|
|
|
|
|
|
|
|
csv_path1 = os.path.join(predict_dir, f'future_predictions_{pred_time_str}.csv')
|
|
csv_path1 = os.path.join(predict_dir, f'future_predictions_{pred_time_str}.csv')
|
|
|
df_predict.to_csv(csv_path1, mode='a', index=False, header=not os.path.exists(csv_path1), encoding='utf-8-sig')
|
|
df_predict.to_csv(csv_path1, mode='a', index=False, header=not os.path.exists(csv_path1), encoding='utf-8-sig')
|