|
|
@@ -1081,7 +1081,7 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
# 降价前 增幅阈值的匹配 与 高价历史持续时间 得出降价时间的概率
|
|
|
if not df_drop_nodes_part.empty and pd.notna(price_change_percent):
|
|
|
# 增幅太小的去掉
|
|
|
- # df_drop_nodes_part = df_drop_nodes_part[df_drop_nodes_part['high_price_change_percent'] >= 0.1]
|
|
|
+ df_drop_nodes_part = df_drop_nodes_part[df_drop_nodes_part['high_price_change_percent'] >= 0.01]
|
|
|
# pct_diff = (df_drop_nodes_part['high_price_change_percent'] - float(price_change_percent)).abs()
|
|
|
# df_match = df_drop_nodes_part.loc[pct_diff <= pct_threshold, ['high_price_duration_hours', 'high_price_change_percent']].copy()
|
|
|
|
|
|
@@ -1101,9 +1101,9 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
if not df_match.empty:
|
|
|
dur_base = pd.to_numeric(price_duration_hours, errors='coerce')
|
|
|
hud_base = pd.to_numeric(hours_until_departure, errors='coerce')
|
|
|
- seats_base = pd.to_numeric(seats_remaining_change_amount, errors='coerce')
|
|
|
+ # seats_base = pd.to_numeric(seats_remaining_change_amount, errors='coerce')
|
|
|
|
|
|
- if pd.notna(dur_base) and pd.notna(hud_base) and pd.notna(seats_base):
|
|
|
+ if pd.notna(dur_base) and pd.notna(hud_base): # and pd.notna(seats_base)
|
|
|
df_match_chk = df_match.copy()
|
|
|
dur_vals = pd.to_numeric(df_match_chk['high_price_duration_hours'], errors='coerce')
|
|
|
df_match_chk = df_match_chk.loc[dur_vals.notna()].copy()
|
|
|
@@ -1113,9 +1113,9 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
df_match_chk = df_match_chk.loc[drop_hud_vals.notna()].copy()
|
|
|
df_match_chk = df_match_chk.loc[(drop_hud_vals.loc[drop_hud_vals.notna()] - float(hud_base)).abs() <= 12].copy()
|
|
|
|
|
|
- seats_vals = pd.to_numeric(df_match_chk['high_price_seats_remaining_change_amount'], errors='coerce')
|
|
|
- df_match_chk = df_match_chk.loc[seats_vals.notna()].copy()
|
|
|
- df_match_chk = df_match_chk.loc[seats_vals.loc[seats_vals.notna()] == float(seats_base)].copy()
|
|
|
+ # seats_vals = pd.to_numeric(df_match_chk['high_price_seats_remaining_change_amount'], errors='coerce')
|
|
|
+ # df_match_chk = df_match_chk.loc[seats_vals.notna()].copy()
|
|
|
+ # df_match_chk = df_match_chk.loc[seats_vals.loc[seats_vals.notna()] == float(seats_base)].copy()
|
|
|
|
|
|
# Duration and hours-until-departure both matched (the seat-change check is currently disabled)
|
|
|
if not df_match_chk.empty:
|
|
|
@@ -1202,63 +1202,63 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
# continue
|
|
|
|
|
|
# 特殊判定场景
|
|
|
- if price_change_percent < 0:
|
|
|
-
|
|
|
- df_tmp = df_keep_nodes_part.copy()
|
|
|
- # 确保组内顺序正确(如果前面已经排过,这行可省略)
|
|
|
- df_tmp = df_tmp.sort_values(
|
|
|
- by=["flight_day", "keep_hours_until_departure"],
|
|
|
- ascending=[True, False]
|
|
|
- )
|
|
|
- # 是否为负值
|
|
|
- df_tmp["is_negative"] = df_tmp["keep_price_change_percent"] < 0
|
|
|
+ # if price_change_percent < 0:
|
|
|
+
|
|
|
+ # df_tmp = df_keep_nodes_part.copy()
|
|
|
+ # # 确保组内顺序正确(如果前面已经排过,这行可省略)
|
|
|
+ # df_tmp = df_tmp.sort_values(
|
|
|
+ # by=["flight_day", "keep_hours_until_departure"],
|
|
|
+ # ascending=[True, False]
|
|
|
+ # )
|
|
|
+ # # 是否为负值
|
|
|
+ # df_tmp["is_negative"] = df_tmp["keep_price_change_percent"] < 0
|
|
|
|
|
|
- if df_tmp["is_negative"].any():
|
|
|
- # 标记“负值段”的开始
|
|
|
- # 当 is_negative 为 True 且 前一行不是负值时,认为是一个新段
|
|
|
- df_tmp["neg_block_id"] = (
|
|
|
- df_tmp["is_negative"]
|
|
|
- & ~df_tmp.groupby("flight_day")["is_negative"].shift(fill_value=False)
|
|
|
- ).groupby(df_tmp["flight_day"]).cumsum()
|
|
|
- # 在每个负值段内计数(第几个负值)
|
|
|
- df_tmp["neg_rank_in_block"] = (
|
|
|
- df_tmp.groupby(["flight_day", "neg_block_id"])
|
|
|
- .cumcount() + 1
|
|
|
- )
|
|
|
- # 每个连续负值段的长度
|
|
|
- df_tmp["neg_block_size"] = (
|
|
|
- df_tmp.groupby(["flight_day", "neg_block_id"])["is_negative"]
|
|
|
- .transform("sum")
|
|
|
- )
|
|
|
- # 只保留:
|
|
|
- # 1) 是负值
|
|
|
- # 2) 且不是该连续负值段的最后一个
|
|
|
- df_continuous_price_drop = df_tmp[
|
|
|
- (df_tmp["is_negative"]) &
|
|
|
- (df_tmp["neg_rank_in_block"] < df_tmp["neg_block_size"])
|
|
|
- ].drop(
|
|
|
- columns=[
|
|
|
- "is_negative",
|
|
|
- "neg_block_id",
|
|
|
- "neg_rank_in_block",
|
|
|
- "neg_block_size",
|
|
|
- ]
|
|
|
- )
|
|
|
- pct_diff_c = (df_continuous_price_drop['keep_price_change_percent'] - float(price_change_percent)).abs()
|
|
|
- df_match_c = df_continuous_price_drop.loc[pct_diff_c <= pct_threshold_c, ['flight_day', 'keep_hours_until_departure', 'keep_price_duration_hours', 'keep_price_change_percent']].copy()
|
|
|
-
|
|
|
- # 符合连续降价条件
|
|
|
- if not df_match_c.empty and pd.notna(price_duration_hours):
|
|
|
- vals_c = df_match_c['keep_price_duration_hours'].replace([np.inf, -np.inf], np.nan).dropna()
|
|
|
- if not vals_c.empty:
|
|
|
- min_val_c = vals_c.min()
|
|
|
- if min_val_c <= float(price_duration_hours):
|
|
|
- df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
|
|
|
- df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
|
|
|
- df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.5
|
|
|
- df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'c1'
|
|
|
- length_drop = df_match_c.shape[0]
|
|
|
- # continue # 已经判定降价 后面不再做
|
|
|
+ # if df_tmp["is_negative"].any():
|
|
|
+ # # 标记“负值段”的开始
|
|
|
+ # # 当 is_negative 为 True 且 前一行不是负值时,认为是一个新段
|
|
|
+ # df_tmp["neg_block_id"] = (
|
|
|
+ # df_tmp["is_negative"]
|
|
|
+ # & ~df_tmp.groupby("flight_day")["is_negative"].shift(fill_value=False)
|
|
|
+ # ).groupby(df_tmp["flight_day"]).cumsum()
|
|
|
+ # # 在每个负值段内计数(第几个负值)
|
|
|
+ # df_tmp["neg_rank_in_block"] = (
|
|
|
+ # df_tmp.groupby(["flight_day", "neg_block_id"])
|
|
|
+ # .cumcount() + 1
|
|
|
+ # )
|
|
|
+ # # 每个连续负值段的长度
|
|
|
+ # df_tmp["neg_block_size"] = (
|
|
|
+ # df_tmp.groupby(["flight_day", "neg_block_id"])["is_negative"]
|
|
|
+ # .transform("sum")
|
|
|
+ # )
|
|
|
+ # # 只保留:
|
|
|
+ # # 1) 是负值
|
|
|
+ # # 2) 且不是该连续负值段的最后一个
|
|
|
+ # df_continuous_price_drop = df_tmp[
|
|
|
+ # (df_tmp["is_negative"]) &
|
|
|
+ # (df_tmp["neg_rank_in_block"] < df_tmp["neg_block_size"])
|
|
|
+ # ].drop(
|
|
|
+ # columns=[
|
|
|
+ # "is_negative",
|
|
|
+ # "neg_block_id",
|
|
|
+ # "neg_rank_in_block",
|
|
|
+ # "neg_block_size",
|
|
|
+ # ]
|
|
|
+ # )
|
|
|
+ # pct_diff_c = (df_continuous_price_drop['keep_price_change_percent'] - float(price_change_percent)).abs()
|
|
|
+ # df_match_c = df_continuous_price_drop.loc[pct_diff_c <= pct_threshold_c, ['flight_day', 'keep_hours_until_departure', 'keep_price_duration_hours', 'keep_price_change_percent']].copy()
|
|
|
+
|
|
|
+ # # 符合连续降价条件
|
|
|
+ # if not df_match_c.empty and pd.notna(price_duration_hours):
|
|
|
+ # vals_c = df_match_c['keep_price_duration_hours'].replace([np.inf, -np.inf], np.nan).dropna()
|
|
|
+ # if not vals_c.empty:
|
|
|
+ # min_val_c = vals_c.min()
|
|
|
+ # if min_val_c <= float(price_duration_hours):
|
|
|
+ # df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
|
|
|
+ # df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
|
|
|
+ # df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.5
|
|
|
+ # df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'c1'
|
|
|
+ # length_drop = df_match_c.shape[0]
|
|
|
+ # # continue # 已经判定降价 后面不再做
|
|
|
|
|
|
# 一般判定场景
|
|
|
pct_base_1 = float(price_change_percent)
|
|
|
@@ -1281,21 +1281,21 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
|
|
|
dur_base_1 = pd.to_numeric(price_duration_hours, errors='coerce')
|
|
|
# hud_base_1 = pd.to_numeric(hours_until_departure, errors='coerce')
|
|
|
- seats_base_1 = pd.to_numeric(seats_remaining_change_amount, errors='coerce')
|
|
|
+ # seats_base_1 = pd.to_numeric(seats_remaining_change_amount, errors='coerce')
|
|
|
|
|
|
- if pd.notna(dur_base_1) and pd.notna(seats_base_1):
|
|
|
+ if pd.notna(dur_base_1): # and pd.notna(seats_base_1)
|
|
|
df_match_chk_1 = df_match_1.copy()
|
|
|
dur_vals_1 = pd.to_numeric(df_match_chk_1['modify_keep_price_duration_hours'], errors='coerce')
|
|
|
df_match_chk_1 = df_match_chk_1.loc[dur_vals_1.notna()].copy()
|
|
|
- df_match_chk_1 = df_match_chk_1.loc[(dur_vals_1.loc[dur_vals_1.notna()] - float(dur_base_1)).abs() <= 6].copy()
|
|
|
+ df_match_chk_1 = df_match_chk_1.loc[(dur_vals_1.loc[dur_vals_1.notna()] - float(dur_base_1)).abs() <= 12].copy()
|
|
|
|
|
|
# drop_hud_vals_1 = pd.to_numeric(df_match_chk_1['keep_hours_until_departure'], errors='coerce')
|
|
|
# df_match_chk_1 = df_match_chk_1.loc[drop_hud_vals_1.notna()].copy()
|
|
|
# df_match_chk_1 = df_match_chk_1.loc[(drop_hud_vals_1.loc[drop_hud_vals_1.notna()] - float(hud_base_1)).abs() <= 12].copy()
|
|
|
|
|
|
- seats_vals_1 = pd.to_numeric(df_match_chk_1['keep_seats_remaining_change_amount'], errors='coerce')
|
|
|
- df_match_chk_1 = df_match_chk_1.loc[seats_vals_1.notna()].copy()
|
|
|
- df_match_chk_1 = df_match_chk_1.loc[seats_vals_1.loc[seats_vals_1.notna()] == float(seats_base_1)].copy()
|
|
|
+ # seats_vals_1 = pd.to_numeric(df_match_chk_1['keep_seats_remaining_change_amount'], errors='coerce')
|
|
|
+ # df_match_chk_1 = df_match_chk_1.loc[seats_vals_1.notna()].copy()
|
|
|
+ # df_match_chk_1 = df_match_chk_1.loc[seats_vals_1.loc[seats_vals_1.notna()] == float(seats_base_1)].copy()
|
|
|
|
|
|
# Duration matched (the hours-until-departure and seat-change checks are currently disabled)
|
|
|
if not df_match_chk_1.empty:
|