|
@@ -102,13 +102,13 @@ def preprocess_data_simple(df_input, is_train=False, hourly_time=None):
|
|
|
|
|
|
|
|
# 对于先升后降(先降再降)的分析
|
|
# 对于先升后降(先降再降)的分析
|
|
|
seg_start_mask = df_target['price_duration_hours'].eq(1) # 开始变价节点
|
|
seg_start_mask = df_target['price_duration_hours'].eq(1) # 开始变价节点
|
|
|
- # 正例库仅保留24小时内发生的降价:上一价格段持续时长需<=24h
|
|
|
|
|
|
|
+ # 正例库
|
|
|
prev_pct_num = pd.to_numeric(prev_pct, errors='coerce')
|
|
prev_pct_num = pd.to_numeric(prev_pct, errors='coerce')
|
|
|
drop_mask = (
|
|
drop_mask = (
|
|
|
seg_start_mask
|
|
seg_start_mask
|
|
|
& prev_pct_num.notna()
|
|
& prev_pct_num.notna()
|
|
|
& (df_target['price_change_percent'] < 0)
|
|
& (df_target['price_change_percent'] < 0)
|
|
|
- & prev_dur.le(24)
|
|
|
|
|
|
|
+ # & prev_dur.le(24) # 仅保留24小时内发生的降价:上一价格段持续时长需<=24h
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
df_drop_nodes = df_target.loc[drop_mask, ['gid', 'baggage_weight', 'hours_until_departure', 'days_to_departure', 'update_hour', 'update_week', 'cabins']].copy()
|
|
df_drop_nodes = df_target.loc[drop_mask, ['gid', 'baggage_weight', 'hours_until_departure', 'days_to_departure', 'update_hour', 'update_week', 'cabins']].copy()
|
|
@@ -142,19 +142,21 @@ def preprocess_data_simple(df_input, is_train=False, hourly_time=None):
|
|
|
df_drop_nodes = df_drop_nodes[flight_info_cols + drop_info_cols]
|
|
df_drop_nodes = df_drop_nodes[flight_info_cols + drop_info_cols]
|
|
|
df_drop_nodes['start_hours_until_departure'] = (df_drop_nodes['drop_hours_until_departure'] + df_drop_nodes['high_price_duration_hours']).round().astype('Int64')
|
|
df_drop_nodes['start_hours_until_departure'] = (df_drop_nodes['drop_hours_until_departure'] + df_drop_nodes['high_price_duration_hours']).round().astype('Int64')
|
|
|
df_drop_nodes = df_drop_nodes[df_drop_nodes['drop_hours_until_departure'] <= 360]
|
|
df_drop_nodes = df_drop_nodes[df_drop_nodes['drop_hours_until_departure'] <= 360]
|
|
|
- df_drop_nodes = df_drop_nodes[df_drop_nodes['start_hours_until_departure'] >= 72]
|
|
|
|
|
|
|
+ df_drop_nodes = df_drop_nodes[df_drop_nodes['start_hours_until_departure'] >= 72]
|
|
|
|
|
+ df_drop_nodes = df_drop_nodes[df_drop_nodes['high_price_duration_hours'] > 2.0] # 维持时间太短的不计
|
|
|
|
|
+ df_drop_nodes = df_drop_nodes[df_drop_nodes['drop_price_change_amount'].abs() > 1] # 1¥之内的降价不计
|
|
|
|
|
|
|
|
- # 反例库:所有有效节点(不限升价)中,未来24小时内未发生降价
|
|
|
|
|
|
|
+ # 反例库:所有升价节点
|
|
|
# seg_start_mask = df_target['price_duration_hours'].eq(1)
|
|
# seg_start_mask = df_target['price_duration_hours'].eq(1)
|
|
|
# rise_mask = seg_start_mask & ((prev_pct > 0) | (prev_pct < 0)) & (df_target['price_change_percent'] > 0)
|
|
# rise_mask = seg_start_mask & ((prev_pct > 0) | (prev_pct < 0)) & (df_target['price_change_percent'] > 0)
|
|
|
prev_pct_num = pd.to_numeric(prev_pct, errors='coerce')
|
|
prev_pct_num = pd.to_numeric(prev_pct, errors='coerce')
|
|
|
valid_mask = seg_start_mask & prev_pct_num.notna()
|
|
valid_mask = seg_start_mask & prev_pct_num.notna()
|
|
|
|
|
|
|
|
curr_pct = pd.to_numeric(df_target['price_change_percent'], errors='coerce')
|
|
curr_pct = pd.to_numeric(df_target['price_change_percent'], errors='coerce')
|
|
|
- prev_dur_num = pd.to_numeric(prev_dur, errors='coerce')
|
|
|
|
|
|
|
+ # prev_dur_num = pd.to_numeric(prev_dur, errors='coerce')
|
|
|
pos_case_mask = curr_pct.ge(0)
|
|
pos_case_mask = curr_pct.ge(0)
|
|
|
- neg_case_mask = curr_pct.lt(0) & prev_dur_num.gt(24)
|
|
|
|
|
- rise_mask = valid_mask & (pos_case_mask | neg_case_mask)
|
|
|
|
|
|
|
+ # neg_case_mask = curr_pct.lt(0) & prev_dur_num.gt(24)
|
|
|
|
|
+ rise_mask = valid_mask & pos_case_mask # (pos_case_mask | neg_case_mask)
|
|
|
|
|
|
|
|
df_rise_nodes = df_target.loc[rise_mask, ['gid', 'baggage_weight', 'hours_until_departure', 'days_to_departure', 'update_hour', 'update_week', 'cabins']].copy()
|
|
df_rise_nodes = df_target.loc[rise_mask, ['gid', 'baggage_weight', 'hours_until_departure', 'days_to_departure', 'update_hour', 'update_week', 'cabins']].copy()
|
|
|
df_rise_nodes.rename(columns={'hours_until_departure': 'rise_hours_until_departure'}, inplace=True)
|
|
df_rise_nodes.rename(columns={'hours_until_departure': 'rise_hours_until_departure'}, inplace=True)
|
|
@@ -182,6 +184,8 @@ def preprocess_data_simple(df_input, is_train=False, hourly_time=None):
|
|
|
df_rise_nodes['start_hours_until_departure'] = (df_rise_nodes['rise_hours_until_departure'] + df_rise_nodes['prev_rise_duration_hours']).round().astype('Int64')
|
|
df_rise_nodes['start_hours_until_departure'] = (df_rise_nodes['rise_hours_until_departure'] + df_rise_nodes['prev_rise_duration_hours']).round().astype('Int64')
|
|
|
df_rise_nodes = df_rise_nodes[df_rise_nodes['rise_hours_until_departure'] <= 360]
|
|
df_rise_nodes = df_rise_nodes[df_rise_nodes['rise_hours_until_departure'] <= 360]
|
|
|
df_rise_nodes = df_rise_nodes[df_rise_nodes['start_hours_until_departure'] >= 72]
|
|
df_rise_nodes = df_rise_nodes[df_rise_nodes['start_hours_until_departure'] >= 72]
|
|
|
|
|
+ df_rise_nodes = df_rise_nodes[df_rise_nodes['prev_rise_duration_hours'] > 2.0] # 维持时间太短的不计
|
|
|
|
|
+ df_rise_nodes = df_rise_nodes[df_rise_nodes['rise_price_change_amount'].abs() > 1] # 1¥之内的改变不计
|
|
|
|
|
|
|
|
# 制作历史包络线
|
|
# 制作历史包络线
|
|
|
envelope_group = ['citypair', 'flight_numbers', 'from_date', 'baggage_weight']
|
|
envelope_group = ['citypair', 'flight_numbers', 'from_date', 'baggage_weight']
|
|
@@ -408,8 +412,8 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
|
|
|
df_min_hours['rise_price_sample_size'] = 0
|
|
df_min_hours['rise_price_sample_size'] = 0
|
|
|
|
|
|
|
|
# 这个阈值取多少?
|
|
# 这个阈值取多少?
|
|
|
- pct_threshold = 0.1
|
|
|
|
|
- pct_threshold_1 = 0.1
|
|
|
|
|
|
|
+ pct_threshold = 0.2
|
|
|
|
|
+ pct_threshold_1 = 0.2
|
|
|
|
|
|
|
|
for idx, row in df_min_hours.iterrows():
|
|
for idx, row in df_min_hours.iterrows():
|
|
|
city_pair = row['citypair']
|
|
city_pair = row['citypair']
|
|
@@ -444,8 +448,10 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
|
|
|
pct_vals = pd.to_numeric(df_drop_nodes_part['high_price_change_percent'], errors='coerce')
|
|
pct_vals = pd.to_numeric(df_drop_nodes_part['high_price_change_percent'], errors='coerce')
|
|
|
df_drop_gap = df_drop_nodes_part.loc[
|
|
df_drop_gap = df_drop_nodes_part.loc[
|
|
|
pct_vals.notna(),
|
|
pct_vals.notna(),
|
|
|
- ['drop_days_to_departure', 'drop_hours_until_departure', 'drop_price_change_percent', 'drop_price_change_amount',
|
|
|
|
|
- 'high_price_duration_hours', 'high_price_change_percent', 'high_price_change_amount', 'high_price_amount', 'high_price_cabins', 'relative_position'
|
|
|
|
|
|
|
+ ['from_date',
|
|
|
|
|
+ 'drop_days_to_departure', 'drop_hours_until_departure', 'drop_price_change_percent', 'drop_price_change_amount',
|
|
|
|
|
+ 'high_price_duration_hours', 'high_price_change_percent', 'high_price_change_amount', 'high_price_amount', 'relative_position',
|
|
|
|
|
+ 'high_price_cabins', 'start_hours_until_departure',
|
|
|
]
|
|
]
|
|
|
].copy()
|
|
].copy()
|
|
|
df_drop_gap['pct_gap'] = (pct_vals.loc[pct_vals.notna()] - pct_base)
|
|
df_drop_gap['pct_gap'] = (pct_vals.loc[pct_vals.notna()] - pct_base)
|
|
@@ -463,34 +469,40 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
|
|
|
)
|
|
)
|
|
|
df_match = df_drop_gap[
|
|
df_match = df_drop_gap[
|
|
|
(df_drop_gap['pct_abs_gap'] <= pct_threshold)
|
|
(df_drop_gap['pct_abs_gap'] <= pct_threshold)
|
|
|
- & (df_drop_gap['price_abs_gap'] <= 5.0)
|
|
|
|
|
|
|
+ & (df_drop_gap['price_abs_gap'] <= 10.0)
|
|
|
& same_sign_mask
|
|
& same_sign_mask
|
|
|
& (df_drop_gap['high_price_cabins'] == cabins)
|
|
& (df_drop_gap['high_price_cabins'] == cabins)
|
|
|
|
|
+ & (df_drop_gap['high_price_duration_hours'] <= 48)
|
|
|
].copy()
|
|
].copy()
|
|
|
|
|
|
|
|
# 历史上出现的极近似的增长(下降)幅度后的降价场景
|
|
# 历史上出现的极近似的增长(下降)幅度后的降价场景
|
|
|
if not df_match.empty:
|
|
if not df_match.empty:
|
|
|
dur_base = pd.to_numeric(price_duration_hours, errors='coerce')
|
|
dur_base = pd.to_numeric(price_duration_hours, errors='coerce')
|
|
|
hud_base = pd.to_numeric(hours_until_departure, errors='coerce')
|
|
hud_base = pd.to_numeric(hours_until_departure, errors='coerce')
|
|
|
- dtd_base = pd.to_numeric(days_to_departure, errors='coerce')
|
|
|
|
|
|
|
+ # dtd_base = pd.to_numeric(days_to_departure, errors='coerce')
|
|
|
|
|
|
|
|
- if pd.notna(dur_base) and pd.notna(dtd_base) and pd.notna(hud_base):
|
|
|
|
|
|
|
+ if pd.notna(dur_base) and pd.notna(hud_base):
|
|
|
df_match_chk = df_match.copy()
|
|
df_match_chk = df_match.copy()
|
|
|
|
|
|
|
|
# drop_dtd_vals = pd.to_numeric(df_match_chk['drop_days_to_departure'], errors='coerce')
|
|
# drop_dtd_vals = pd.to_numeric(df_match_chk['drop_days_to_departure'], errors='coerce')
|
|
|
# df_match_chk = df_match_chk.loc[drop_dtd_vals.notna()].copy()
|
|
# df_match_chk = df_match_chk.loc[drop_dtd_vals.notna()].copy()
|
|
|
# df_match_chk = df_match_chk.loc[(drop_dtd_vals.loc[drop_dtd_vals.notna()] - float(dtd_base)).abs() <= 3].copy()
|
|
# df_match_chk = df_match_chk.loc[(drop_dtd_vals.loc[drop_dtd_vals.notna()] - float(dtd_base)).abs() <= 3].copy()
|
|
|
|
|
|
|
|
- # drop_hud_vals = pd.to_numeric(df_match_chk['drop_hours_until_departure'], errors='coerce')
|
|
|
|
|
- # df_match_chk = df_match_chk.loc[drop_hud_vals.notna()].copy()
|
|
|
|
|
- # df_match_chk = df_match_chk.loc[(float(hud_base) - drop_hud_vals.loc[drop_hud_vals.notna()]) >= -24].copy()
|
|
|
|
|
|
|
+ # 正例收紧 (距离起飞的小时数)
|
|
|
|
|
+ drop_hud_vals = pd.to_numeric(df_match_chk['drop_hours_until_departure'], errors='coerce')
|
|
|
|
|
+ df_match_chk = df_match_chk.loc[drop_hud_vals.notna()].copy()
|
|
|
|
|
+ df_match_chk = df_match_chk.loc[(float(hud_base) - drop_hud_vals.loc[drop_hud_vals.notna()]) >= 0].copy()
|
|
|
|
|
|
|
|
- # 正例收紧
|
|
|
|
|
|
|
+ start_hud_vals = pd.to_numeric(df_match_chk['start_hours_until_departure'], errors='coerce')
|
|
|
|
|
+ df_match_chk = df_match_chk.loc[start_hud_vals.notna()].copy()
|
|
|
|
|
+ df_match_chk = df_match_chk.loc[(float(hud_base) - start_hud_vals.loc[start_hud_vals.notna()]) <= 0].copy()
|
|
|
|
|
+
|
|
|
|
|
+ # 正例收紧 (持续小时数)
|
|
|
dur_num_chk = pd.to_numeric(df_match_chk['high_price_duration_hours'], errors='coerce')
|
|
dur_num_chk = pd.to_numeric(df_match_chk['high_price_duration_hours'], errors='coerce')
|
|
|
dur_delta = dur_num_chk - float(dur_base)
|
|
dur_delta = dur_num_chk - float(dur_base)
|
|
|
df_match_chk = df_match_chk.assign(dur_delta=dur_delta)
|
|
df_match_chk = df_match_chk.assign(dur_delta=dur_delta)
|
|
|
df_match_chk = df_match_chk.loc[df_match_chk['dur_delta'].notna()].copy()
|
|
df_match_chk = df_match_chk.loc[df_match_chk['dur_delta'].notna()].copy()
|
|
|
- df_match_chk = df_match_chk.loc[df_match_chk['dur_delta'].abs() <= 72].copy()
|
|
|
|
|
|
|
+ # df_match_chk = df_match_chk.loc[df_match_chk['dur_delta'].abs() <= 72].copy()
|
|
|
|
|
|
|
|
# 所有条件都对的上
|
|
# 所有条件都对的上
|
|
|
if not df_match_chk.empty:
|
|
if not df_match_chk.empty:
|
|
@@ -542,8 +554,11 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
|
|
|
pct_vals_1 = pd.to_numeric(df_rise_nodes_part['prev_rise_change_percent'], errors='coerce')
|
|
pct_vals_1 = pd.to_numeric(df_rise_nodes_part['prev_rise_change_percent'], errors='coerce')
|
|
|
df_rise_gap_1 = df_rise_nodes_part.loc[
|
|
df_rise_gap_1 = df_rise_nodes_part.loc[
|
|
|
pct_vals_1.notna(),
|
|
pct_vals_1.notna(),
|
|
|
- ['rise_days_to_departure', 'rise_hours_until_departure', 'rise_price_change_percent', 'rise_price_change_amount',
|
|
|
|
|
- 'prev_rise_duration_hours', 'prev_rise_change_percent', 'prev_rise_change_amount', 'prev_rise_amount', 'prev_rise_cabins', 'relative_position']
|
|
|
|
|
|
|
+ ['from_date',
|
|
|
|
|
+ 'rise_days_to_departure', 'rise_hours_until_departure', 'rise_price_change_percent', 'rise_price_change_amount',
|
|
|
|
|
+ 'prev_rise_duration_hours', 'prev_rise_change_percent', 'prev_rise_change_amount', 'prev_rise_amount', 'relative_position',
|
|
|
|
|
+ 'prev_rise_cabins', 'start_hours_until_departure',
|
|
|
|
|
+ ]
|
|
|
].copy()
|
|
].copy()
|
|
|
df_rise_gap_1['pct_gap'] = (pct_vals_1.loc[pct_vals_1.notna()] - pct_base_1)
|
|
df_rise_gap_1['pct_gap'] = (pct_vals_1.loc[pct_vals_1.notna()] - pct_base_1)
|
|
|
df_rise_gap_1['pct_abs_gap'] = df_rise_gap_1['pct_gap'].abs()
|
|
df_rise_gap_1['pct_abs_gap'] = df_rise_gap_1['pct_gap'].abs()
|
|
@@ -560,7 +575,7 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
|
|
|
)
|
|
)
|
|
|
df_match_1 = df_rise_gap_1.loc[
|
|
df_match_1 = df_rise_gap_1.loc[
|
|
|
(df_rise_gap_1['pct_abs_gap'] <= pct_threshold_1)
|
|
(df_rise_gap_1['pct_abs_gap'] <= pct_threshold_1)
|
|
|
- & (df_rise_gap_1['price_abs_gap'] <= 5.0)
|
|
|
|
|
|
|
+ & (df_rise_gap_1['price_abs_gap'] <= 10.0)
|
|
|
& same_sign_mask_1
|
|
& same_sign_mask_1
|
|
|
& (df_rise_gap_1['prev_rise_cabins'] == cabins)
|
|
& (df_rise_gap_1['prev_rise_cabins'] == cabins)
|
|
|
].copy()
|
|
].copy()
|
|
@@ -569,24 +584,29 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
|
|
|
if not df_match_1.empty:
|
|
if not df_match_1.empty:
|
|
|
dur_base_1 = pd.to_numeric(price_duration_hours, errors='coerce')
|
|
dur_base_1 = pd.to_numeric(price_duration_hours, errors='coerce')
|
|
|
hud_base_1 = pd.to_numeric(hours_until_departure, errors='coerce')
|
|
hud_base_1 = pd.to_numeric(hours_until_departure, errors='coerce')
|
|
|
- dtd_base_1 = pd.to_numeric(days_to_departure, errors='coerce')
|
|
|
|
|
|
|
+ # dtd_base_1 = pd.to_numeric(days_to_departure, errors='coerce')
|
|
|
|
|
|
|
|
- if pd.notna(dur_base_1) and pd.notna(dtd_base_1) and pd.notna(hud_base_1):
|
|
|
|
|
|
|
+ if pd.notna(dur_base_1) and pd.notna(hud_base_1):
|
|
|
df_match_chk_1 = df_match_1.copy()
|
|
df_match_chk_1 = df_match_1.copy()
|
|
|
|
|
|
|
|
# drop_dtd_vals_1 = pd.to_numeric(df_match_chk_1['rise_days_to_departure'], errors='coerce')
|
|
# drop_dtd_vals_1 = pd.to_numeric(df_match_chk_1['rise_days_to_departure'], errors='coerce')
|
|
|
# df_match_chk_1 = df_match_chk_1.loc[drop_dtd_vals_1.notna()].copy()
|
|
# df_match_chk_1 = df_match_chk_1.loc[drop_dtd_vals_1.notna()].copy()
|
|
|
# df_match_chk_1 = df_match_chk_1.loc[(drop_dtd_vals_1.loc[drop_dtd_vals_1.notna()] - float(dtd_base_1)).abs() <= 3].copy()
|
|
# df_match_chk_1 = df_match_chk_1.loc[(drop_dtd_vals_1.loc[drop_dtd_vals_1.notna()] - float(dtd_base_1)).abs() <= 3].copy()
|
|
|
|
|
|
|
|
- # rise_hud_vals_1 = pd.to_numeric(df_match_chk_1['rise_hours_until_departure'], errors='coerce')
|
|
|
|
|
- # df_match_chk_1 = df_match_chk_1.loc[rise_hud_vals_1.notna()].copy()
|
|
|
|
|
- # df_match_chk_1 = df_match_chk_1.loc[(float(hud_base_1) - rise_hud_vals_1.loc[rise_hud_vals_1.notna()]) >= -24].copy()
|
|
|
|
|
|
|
+ # 反例收紧 (距离起飞的小时数)
|
|
|
|
|
+ rise_hud_vals_1 = pd.to_numeric(df_match_chk_1['rise_hours_until_departure'], errors='coerce')
|
|
|
|
|
+ df_match_chk_1 = df_match_chk_1.loc[rise_hud_vals_1.notna()].copy()
|
|
|
|
|
+ df_match_chk_1 = df_match_chk_1.loc[(float(hud_base_1) - rise_hud_vals_1.loc[rise_hud_vals_1.notna()]) >= 0].copy()
|
|
|
|
|
+
|
|
|
|
|
+ start_hud_vals_1 = pd.to_numeric(df_match_chk_1['start_hours_until_departure'], errors='coerce')
|
|
|
|
|
+ df_match_chk_1 = df_match_chk_1.loc[start_hud_vals_1.notna()].copy()
|
|
|
|
|
+ df_match_chk_1 = df_match_chk_1.loc[(float(hud_base_1) - start_hud_vals_1.loc[start_hud_vals_1.notna()]) <= 0].copy()
|
|
|
|
|
|
|
|
# 反例收紧:48小时内发生降价的不算显著反例
|
|
# 反例收紧:48小时内发生降价的不算显著反例
|
|
|
- _rise_pct_chk = pd.to_numeric(df_match_chk_1['rise_price_change_percent'], errors='coerce')
|
|
|
|
|
- _prev_dur_chk = pd.to_numeric(df_match_chk_1['prev_rise_duration_hours'], errors='coerce')
|
|
|
|
|
- _exclude_mask = _rise_pct_chk.lt(0) & _prev_dur_chk.lt(48)
|
|
|
|
|
- df_match_chk_1 = df_match_chk_1.loc[~_exclude_mask.fillna(False)].copy()
|
|
|
|
|
|
|
+ # _rise_pct_chk = pd.to_numeric(df_match_chk_1['rise_price_change_percent'], errors='coerce')
|
|
|
|
|
+ # _prev_dur_chk = pd.to_numeric(df_match_chk_1['prev_rise_duration_hours'], errors='coerce')
|
|
|
|
|
+ # _exclude_mask = _rise_pct_chk.lt(0) & _prev_dur_chk.lt(48)
|
|
|
|
|
+ # df_match_chk_1 = df_match_chk_1.loc[~_exclude_mask.fillna(False)].copy()
|
|
|
|
|
|
|
|
# 所有条件都对的上
|
|
# 所有条件都对的上
|
|
|
if not df_match_chk_1.empty:
|
|
if not df_match_chk_1.empty:
|