|
@@ -29,7 +29,7 @@ def preprocess_data_simple(df_input, is_train=False, hourly_time=None):
|
|
|
).reset_index(drop=True)
|
|
).reset_index(drop=True)
|
|
|
|
|
|
|
|
df_input = df_input[df_input['hours_until_departure'] <= 480]
|
|
df_input = df_input[df_input['hours_until_departure'] <= 480]
|
|
|
- df_input = df_input[df_input['baggage_weight'] == 20] # 先保留20公斤行李的
|
|
|
|
|
|
|
+ df_input = df_input[df_input['baggage_weight'] == 0] # 先保留0公斤行李的
|
|
|
|
|
|
|
|
# 在hours_until_departure 的末尾 保留到当前时刻的数据
|
|
# 在hours_until_departure 的末尾 保留到当前时刻的数据
|
|
|
if not is_train:
|
|
if not is_train:
|
|
@@ -161,8 +161,8 @@ def preprocess_data_simple(df_input, is_train=False, hourly_time=None):
|
|
|
|
|
|
|
|
# 制作历史包络线
|
|
# 制作历史包络线
|
|
|
envelope_group = ['citypair', 'flight_numbers', 'from_date', 'baggage_weight']
|
|
envelope_group = ['citypair', 'flight_numbers', 'from_date', 'baggage_weight']
|
|
|
- idx_peak = df_input.groupby(envelope_group)['price_total'].idxmax()
|
|
|
|
|
- df_envelope = df_input.loc[idx_peak, envelope_group + [
|
|
|
|
|
|
|
+ idx_peak = df_target.groupby(envelope_group)['price_total'].idxmax()
|
|
|
|
|
+ df_envelope = df_target.loc[idx_peak, envelope_group + [
|
|
|
'from_time', 'price_total', 'hours_until_departure', 'days_to_departure', 'update_hour', 'update_week',
|
|
'from_time', 'price_total', 'hours_until_departure', 'days_to_departure', 'update_hour', 'update_week',
|
|
|
]].rename(columns={
|
|
]].rename(columns={
|
|
|
'price_total': 'peak_price',
|
|
'price_total': 'peak_price',
|
|
@@ -288,13 +288,16 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
|
|
|
df_min_hours['rise_price_sample_size'] = 0
|
|
df_min_hours['rise_price_sample_size'] = 0
|
|
|
|
|
|
|
|
# 这个阈值取多少?
|
|
# 这个阈值取多少?
|
|
|
- # pct_threshold = 0.01
|
|
|
|
|
- # pct_threshold_1 = 0.01
|
|
|
|
|
|
|
+ pct_threshold = 0.01
|
|
|
|
|
+ pct_threshold_1 = 0.01
|
|
|
|
|
|
|
|
for idx, row in df_min_hours.iterrows():
|
|
for idx, row in df_min_hours.iterrows():
|
|
|
city_pair = row['citypair']
|
|
city_pair = row['citypair']
|
|
|
flight_numbers = row['flight_numbers']
|
|
flight_numbers = row['flight_numbers']
|
|
|
baggage_weight = row['baggage_weight']
|
|
baggage_weight = row['baggage_weight']
|
|
|
|
|
+ from_date = row['from_date']
|
|
|
|
|
+ if flight_numbers == "UO235" and from_date == "2026-04-25": # 调试时用
|
|
|
|
|
+ pass
|
|
|
days_to_departure = row['days_to_departure']
|
|
days_to_departure = row['days_to_departure']
|
|
|
hours_until_departure = row['hours_until_departure']
|
|
hours_until_departure = row['hours_until_departure']
|
|
|
price_change_percent = row['price_change_percent']
|
|
price_change_percent = row['price_change_percent']
|
|
@@ -314,26 +317,26 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
|
|
|
(df_drop_nodes['baggage_weight'] == baggage_weight)
|
|
(df_drop_nodes['baggage_weight'] == baggage_weight)
|
|
|
]
|
|
]
|
|
|
# 降价前 增量阈值、当前阈值 的匹配
|
|
# 降价前 增量阈值、当前阈值 的匹配
|
|
|
- if not df_drop_nodes_part.empty and pd.notna(price_change_amount):
|
|
|
|
|
|
|
+ if not df_drop_nodes_part.empty and pd.notna(price_change_percent):
|
|
|
|
|
|
|
|
- pca_base = float(price_change_amount)
|
|
|
|
|
- pca_vals = pd.to_numeric(df_drop_nodes_part['high_price_change_amount'], errors='coerce')
|
|
|
|
|
|
|
+ pct_base = float(price_change_percent)
|
|
|
|
|
+ pct_vals = pd.to_numeric(df_drop_nodes_part['high_price_change_percent'], errors='coerce')
|
|
|
df_drop_gap = df_drop_nodes_part.loc[
|
|
df_drop_gap = df_drop_nodes_part.loc[
|
|
|
- pca_vals.notna(),
|
|
|
|
|
|
|
+ pct_vals.notna(),
|
|
|
['drop_days_to_departure', 'drop_hours_until_departure', 'drop_price_change_percent', 'drop_price_change_amount',
|
|
['drop_days_to_departure', 'drop_hours_until_departure', 'drop_price_change_percent', 'drop_price_change_amount',
|
|
|
'high_price_duration_hours', 'high_price_change_percent', 'high_price_change_amount', 'high_price_amount', 'relative_position'
|
|
'high_price_duration_hours', 'high_price_change_percent', 'high_price_change_amount', 'high_price_amount', 'relative_position'
|
|
|
]
|
|
]
|
|
|
].copy()
|
|
].copy()
|
|
|
- df_drop_gap['pca_gap'] = (pca_vals.loc[pca_vals.notna()] - pca_base)
|
|
|
|
|
- df_drop_gap['pca_abs_gap'] = df_drop_gap['pca_gap'].abs()
|
|
|
|
|
|
|
+ df_drop_gap['pct_gap'] = (pct_vals.loc[pct_vals.notna()] - pct_base)
|
|
|
|
|
+ df_drop_gap['pct_abs_gap'] = df_drop_gap['pct_gap'].abs()
|
|
|
|
|
|
|
|
price_base = pd.to_numeric(price_amount, errors='coerce')
|
|
price_base = pd.to_numeric(price_amount, errors='coerce')
|
|
|
high_price_vals = pd.to_numeric(df_drop_gap['high_price_amount'], errors='coerce')
|
|
high_price_vals = pd.to_numeric(df_drop_gap['high_price_amount'], errors='coerce')
|
|
|
df_drop_gap['price_gap'] = high_price_vals - price_base
|
|
df_drop_gap['price_gap'] = high_price_vals - price_base
|
|
|
df_drop_gap['price_abs_gap'] = df_drop_gap['price_gap'].abs()
|
|
df_drop_gap['price_abs_gap'] = df_drop_gap['price_gap'].abs()
|
|
|
|
|
|
|
|
- df_drop_gap = df_drop_gap.sort_values(['price_abs_gap', 'pca_abs_gap'], ascending=[True, True])
|
|
|
|
|
- df_match = df_drop_gap[(df_drop_gap['price_abs_gap'] <= 5.0) & (df_drop_gap['pca_abs_gap'] <= 10.0)].copy()
|
|
|
|
|
|
|
+ df_drop_gap = df_drop_gap.sort_values(['price_abs_gap', 'pct_abs_gap'], ascending=[True, True])
|
|
|
|
|
+ df_match = df_drop_gap[(df_drop_gap['pct_abs_gap'] <= pct_threshold) & (df_drop_gap['price_abs_gap'] <= 3.0)].copy()
|
|
|
|
|
|
|
|
# 历史上出现的极近似的增长(下降)幅度后的降价场景
|
|
# 历史上出现的极近似的增长(下降)幅度后的降价场景
|
|
|
if not df_match.empty:
|
|
if not df_match.empty:
|
|
@@ -390,24 +393,24 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
|
|
|
(df_rise_nodes['baggage_weight'] == baggage_weight)
|
|
(df_rise_nodes['baggage_weight'] == baggage_weight)
|
|
|
]
|
|
]
|
|
|
# 升价前 增量阈值、当前阈值 的匹配
|
|
# 升价前 增量阈值、当前阈值 的匹配
|
|
|
- if not df_rise_nodes_part.empty and pd.notna(price_change_amount):
|
|
|
|
|
- pca_base_1 = float(price_change_amount)
|
|
|
|
|
- pca_vals_1 = pd.to_numeric(df_rise_nodes_part['prev_rise_change_amount'], errors='coerce')
|
|
|
|
|
|
|
+ if not df_rise_nodes_part.empty and pd.notna(price_change_percent):
|
|
|
|
|
+ pct_base_1 = float(price_change_percent)
|
|
|
|
|
+ pct_vals_1 = pd.to_numeric(df_rise_nodes_part['prev_rise_change_percent'], errors='coerce')
|
|
|
df_rise_gap_1 = df_rise_nodes_part.loc[
|
|
df_rise_gap_1 = df_rise_nodes_part.loc[
|
|
|
- pca_vals_1.notna(),
|
|
|
|
|
|
|
+ pct_vals_1.notna(),
|
|
|
['rise_days_to_departure', 'rise_hours_until_departure', 'rise_price_change_percent', 'rise_price_change_amount',
|
|
['rise_days_to_departure', 'rise_hours_until_departure', 'rise_price_change_percent', 'rise_price_change_amount',
|
|
|
'prev_rise_duration_hours', 'prev_rise_change_percent', 'prev_rise_change_amount', 'prev_rise_amount', 'relative_position']
|
|
'prev_rise_duration_hours', 'prev_rise_change_percent', 'prev_rise_change_amount', 'prev_rise_amount', 'relative_position']
|
|
|
].copy()
|
|
].copy()
|
|
|
- df_rise_gap_1['pca_gap'] = (pca_vals_1.loc[pca_vals_1.notna()] - pca_base_1)
|
|
|
|
|
- df_rise_gap_1['pca_abs_gap'] = df_rise_gap_1['pca_gap'].abs()
|
|
|
|
|
|
|
+ df_rise_gap_1['pct_gap'] = (pct_vals_1.loc[pct_vals_1.notna()] - pct_base_1)
|
|
|
|
|
+ df_rise_gap_1['pct_abs_gap'] = df_rise_gap_1['pct_gap'].abs()
|
|
|
|
|
|
|
|
price_base_1 = pd.to_numeric(price_amount, errors='coerce')
|
|
price_base_1 = pd.to_numeric(price_amount, errors='coerce')
|
|
|
rise_price_vals_1 = pd.to_numeric(df_rise_gap_1['prev_rise_amount'], errors='coerce')
|
|
rise_price_vals_1 = pd.to_numeric(df_rise_gap_1['prev_rise_amount'], errors='coerce')
|
|
|
df_rise_gap_1['price_gap'] = rise_price_vals_1 - price_base_1
|
|
df_rise_gap_1['price_gap'] = rise_price_vals_1 - price_base_1
|
|
|
df_rise_gap_1['price_abs_gap'] = df_rise_gap_1['price_gap'].abs()
|
|
df_rise_gap_1['price_abs_gap'] = df_rise_gap_1['price_gap'].abs()
|
|
|
|
|
|
|
|
- df_rise_gap_1 = df_rise_gap_1.sort_values(['price_abs_gap', 'pca_abs_gap'], ascending=[True, True])
|
|
|
|
|
- df_match_1 = df_rise_gap_1.loc[(df_rise_gap_1['price_abs_gap'] <= 5.0) & (df_rise_gap_1['pca_abs_gap'] <= 10.0)].copy()
|
|
|
|
|
|
|
+ df_rise_gap_1 = df_rise_gap_1.sort_values(['price_abs_gap', 'pct_abs_gap'], ascending=[True, True])
|
|
|
|
|
+ df_match_1 = df_rise_gap_1.loc[(df_rise_gap_1['pct_abs_gap'] <= pct_threshold_1) & (df_rise_gap_1['price_abs_gap'] <= 3.0)].copy()
|
|
|
|
|
|
|
|
# 历史上出现的极近似的增长(下降)幅度后的升价场景
|
|
# 历史上出现的极近似的增长(下降)幅度后的升价场景
|
|
|
if not df_match_1.empty:
|
|
if not df_match_1.empty:
|
|
@@ -443,7 +446,7 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
|
|
|
else:
|
|
else:
|
|
|
drop_prob = round(length_drop / (length_rise + length_drop), 2)
|
|
drop_prob = round(length_drop / (length_rise + length_drop), 2)
|
|
|
# 依旧保持之前的降价判定,概率修改
|
|
# 依旧保持之前的降价判定,概率修改
|
|
|
- if drop_prob >= 0.4:
|
|
|
|
|
|
|
+ if drop_prob >= 0.6:
|
|
|
df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
|
|
df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
|
|
|
# df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'd1'
|
|
# df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'd1'
|
|
|
df_min_hours.loc[idx, 'flag_dist'] = 'd1'
|
|
df_min_hours.loc[idx, 'flag_dist'] = 'd1'
|