|
|
@@ -1244,6 +1244,7 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
# seats_remaining_change_amount = row['seats_remaining_change_amount']
|
|
|
price_amount = row['adult_total_price']
|
|
|
seats_remaining = row['seats_remaining']
|
|
|
+ # envelope_position = row['envelope_position']
|
|
|
|
|
|
length_drop = 0
|
|
|
length_rise = 0
|
|
|
@@ -1287,8 +1288,10 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
df_drop_gap['price_gap'] = high_price_vals - price_base
|
|
|
df_drop_gap['price_abs_gap'] = df_drop_gap['price_gap'].abs()
|
|
|
|
|
|
- df_drop_gap = df_drop_gap.sort_values(['pct_abs_gap', 'price_abs_gap'], ascending=[True, True])
|
|
|
- df_match = df_drop_gap[(df_drop_gap['pct_abs_gap'] <= pct_threshold) & (df_drop_gap['price_abs_gap'] <= 10.0)].copy()
|
|
|
+ # df_drop_gap = df_drop_gap.sort_values(['pct_abs_gap', 'price_abs_gap'], ascending=[True, True])
|
|
|
+ # df_match = df_drop_gap[(df_drop_gap['pct_abs_gap'] <= pct_threshold) & (df_drop_gap['price_abs_gap'] <= 10.0)].copy()
|
|
|
+ df_drop_gap = df_drop_gap.sort_values(['price_abs_gap'], ascending=[True])
|
|
|
+ df_match = df_drop_gap[(df_drop_gap['price_abs_gap'] <= 5.0)].copy()
|
|
|
|
|
|
# 历史上出现的极近似的增长幅度后的降价场景
|
|
|
if not df_match.empty:
|
|
|
@@ -1350,34 +1353,6 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
# 历史上未出现的极近似的增长幅度后的降价场景
|
|
|
else:
|
|
|
pass
|
|
|
- # if pd.notna(price_duration_hours) and price_change_percent >= 0.1:
|
|
|
- # pct_vals = pd.to_numeric(
|
|
|
- # df_drop_nodes_part['high_price_change_percent'],
|
|
|
- # errors='coerce'
|
|
|
- # ).replace([np.inf, -np.inf], np.nan).dropna()
|
|
|
- # dur_vals = pd.to_numeric(
|
|
|
- # df_drop_nodes_part['high_price_duration_hours'],
|
|
|
- # errors='coerce'
|
|
|
- # ).replace([np.inf, -np.inf], np.nan).dropna()
|
|
|
-
|
|
|
- # if not pct_vals.empty and not dur_vals.empty:
|
|
|
- # pct_min = float(pct_vals.min())
|
|
|
- # pct_max = float(pct_vals.max())
|
|
|
- # dur_min = float(dur_vals.min())
|
|
|
- # dur_max = float(dur_vals.max())
|
|
|
-
|
|
|
- # if (pct_min <= float(price_change_percent) <= pct_max) and (dur_min <= float(price_duration_hours) <= dur_max):
|
|
|
- # df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
|
|
|
- # df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
|
|
|
- # df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.5
|
|
|
- # df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = '0h->0.5'
|
|
|
- # continue # 已经判定降价 后面不再做
|
|
|
- # elif (pct_min <= float(price_change_percent)) and (dur_min <= float(price_duration_hours)):
|
|
|
- # df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
|
|
|
- # df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
|
|
|
- # df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.3
|
|
|
- # df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = '0h->0.3'
|
|
|
- # continue # 已经判定降价 后面不再做
|
|
|
|
|
|
# 针对历史上发生的 连续涨价
|
|
|
if not df_rise_nodes.empty:
|
|
|
@@ -1395,76 +1370,6 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
]
|
|
|
|
|
|
if not df_rise_nodes_part.empty and pd.notna(price_change_percent):
|
|
|
- # pct_vals_1 = df_keep_nodes_part['keep_price_change_percent'].replace([np.inf, -np.inf], np.nan).dropna()
|
|
|
- # # 保留百分位 10% ~ 90% 之间的 数据
|
|
|
- # if not pct_vals_1.empty:
|
|
|
- # q10_1 = float(pct_vals_1.quantile(0.10))
|
|
|
- # q90_1 = float(pct_vals_1.quantile(0.90))
|
|
|
- # df_keep_nodes_part = df_keep_nodes_part[
|
|
|
- # df_keep_nodes_part['keep_price_change_percent'].between(q10_1, q90_1)
|
|
|
- # ]
|
|
|
- # if df_keep_nodes_part.empty:
|
|
|
- # continue
|
|
|
-
|
|
|
- # 特殊判定场景
|
|
|
- # if price_change_percent < 0:
|
|
|
-
|
|
|
- # df_tmp = df_keep_nodes_part.copy()
|
|
|
- # # 确保组内顺序正确(如果前面已经排过,这行可省略)
|
|
|
- # df_tmp = df_tmp.sort_values(
|
|
|
- # by=["flight_day", "keep_hours_until_departure"],
|
|
|
- # ascending=[True, False]
|
|
|
- # )
|
|
|
- # # 是否为负值
|
|
|
- # df_tmp["is_negative"] = df_tmp["keep_price_change_percent"] < 0
|
|
|
-
|
|
|
- # if df_tmp["is_negative"].any():
|
|
|
- # # 标记“负值段”的开始
|
|
|
- # # 当 is_negative 为 True 且 前一行不是负值时,认为是一个新段
|
|
|
- # df_tmp["neg_block_id"] = (
|
|
|
- # df_tmp["is_negative"]
|
|
|
- # & ~df_tmp.groupby("flight_day")["is_negative"].shift(fill_value=False)
|
|
|
- # ).groupby(df_tmp["flight_day"]).cumsum()
|
|
|
- # # 在每个负值段内计数(第几个负值)
|
|
|
- # df_tmp["neg_rank_in_block"] = (
|
|
|
- # df_tmp.groupby(["flight_day", "neg_block_id"])
|
|
|
- # .cumcount() + 1
|
|
|
- # )
|
|
|
- # # 每个连续负值段的长度
|
|
|
- # df_tmp["neg_block_size"] = (
|
|
|
- # df_tmp.groupby(["flight_day", "neg_block_id"])["is_negative"]
|
|
|
- # .transform("sum")
|
|
|
- # )
|
|
|
- # # 只保留:
|
|
|
- # # 1) 是负值
|
|
|
- # # 2) 且不是该连续负值段的最后一个
|
|
|
- # df_continuous_price_drop = df_tmp[
|
|
|
- # (df_tmp["is_negative"]) &
|
|
|
- # (df_tmp["neg_rank_in_block"] < df_tmp["neg_block_size"])
|
|
|
- # ].drop(
|
|
|
- # columns=[
|
|
|
- # "is_negative",
|
|
|
- # "neg_block_id",
|
|
|
- # "neg_rank_in_block",
|
|
|
- # "neg_block_size",
|
|
|
- # ]
|
|
|
- # )
|
|
|
- # pct_diff_c = (df_continuous_price_drop['keep_price_change_percent'] - float(price_change_percent)).abs()
|
|
|
- # df_match_c = df_continuous_price_drop.loc[pct_diff_c <= pct_threshold_c, ['flight_day', 'keep_hours_until_departure', 'keep_price_duration_hours', 'keep_price_change_percent']].copy()
|
|
|
-
|
|
|
- # # 符合连续降价条件
|
|
|
- # if not df_match_c.empty and pd.notna(price_duration_hours):
|
|
|
- # vals_c = df_match_c['keep_price_duration_hours'].replace([np.inf, -np.inf], np.nan).dropna()
|
|
|
- # if not vals_c.empty:
|
|
|
- # min_val_c = vals_c.min()
|
|
|
- # if min_val_c <= float(price_duration_hours):
|
|
|
- # df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
|
|
|
- # df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
|
|
|
- # df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.5
|
|
|
- # df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'c1'
|
|
|
- # length_drop = df_match_c.shape[0]
|
|
|
- # # continue # 已经判定降价 后面不再做
|
|
|
-
|
|
|
# 一般判定场景
|
|
|
pct_base_1 = float(price_change_percent)
|
|
|
pct_vals_1 = pd.to_numeric(df_rise_nodes_part['prev_rise_change_percent'], errors='coerce')
|
|
|
@@ -1482,8 +1387,10 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
df_rise_gap_1['price_gap'] = rise_price_vals_1 - price_base_1
|
|
|
df_rise_gap_1['price_abs_gap'] = df_rise_gap_1['price_gap'].abs()
|
|
|
|
|
|
- df_rise_gap_1 = df_rise_gap_1.sort_values(['pct_abs_gap', 'price_abs_gap'], ascending=[True, True])
|
|
|
- df_match_1 = df_rise_gap_1.loc[(df_rise_gap_1['pct_abs_gap'] <= pct_threshold_1) & (df_rise_gap_1['price_abs_gap'] <= 10.0)].copy()
|
|
|
+ # df_rise_gap_1 = df_rise_gap_1.sort_values(['pct_abs_gap', 'price_abs_gap'], ascending=[True, True])
|
|
|
+ # df_match_1 = df_rise_gap_1.loc[(df_rise_gap_1['pct_abs_gap'] <= pct_threshold_1) & (df_rise_gap_1['price_abs_gap'] <= 10.0)].copy()
|
|
|
+ df_rise_gap_1 = df_rise_gap_1.sort_values(['price_abs_gap'], ascending=[True])
|
|
|
+ df_match_1 = df_rise_gap_1.loc[(df_rise_gap_1['price_abs_gap'] <= 5.0)].copy()
|
|
|
|
|
|
# 历史上出现过近似变化幅度后继续涨价场景
|
|
|
if not df_match_1.empty:
|
|
|
@@ -1547,59 +1454,15 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
# df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
|
|
|
df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = drop_prob
|
|
|
|
|
|
- # elif length_keep == length_drop: # 不降价与降价相同, 取0.5概率
|
|
|
-
|
|
|
- # df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
|
|
|
- # df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
|
|
|
- # df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.5
|
|
|
- # df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'k1'
|
|
|
-
|
|
|
- # df_match_1['hours_delta'] = hours_until_departure - df_match_1['keep_hours_until_departure']
|
|
|
- # df_match_1['modify_keep_price_duration_hours'] = df_match_1['keep_price_duration_hours'] - df_match_1['hours_delta']
|
|
|
- # df_match_1 = df_match_1[df_match_1['modify_keep_price_duration_hours'] > 0]
|
|
|
-
|
|
|
- # 比较 price_duration_hours 在 modify_keep_price_duration_hours 的百分位
|
|
|
- # vals = df_match_1['modify_keep_price_duration_hours'].replace([np.inf, -np.inf], np.nan).dropna()
|
|
|
- # if not vals.empty:
|
|
|
- # # q10_11 = float(vals.quantile(0.10))
|
|
|
- # min_val = vals.min()
|
|
|
- # if min_val <= float(price_duration_hours):
|
|
|
- # df_min_hours.loc[idx, 'simple_will_price_drop'] = 0
|
|
|
- # df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
|
|
|
- # df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.0
|
|
|
- # df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'k1'
|
|
|
-
|
|
|
# 历史上未出现过近似变化幅度后保持低价场景
|
|
|
else:
|
|
|
pass
|
|
|
- # df_min_hours.loc[idx, 'simple_will_price_drop'] = 0
|
|
|
- # df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
|
|
|
- # df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.0
|
|
|
- # df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'n0'
|
|
|
-
|
|
|
- # if pd.notna(price_duration_hours) and price_change_percent <= 0.1:
|
|
|
- # df_keep_nodes_part_1 = df_keep_nodes_part[df_keep_nodes_part['keep_price_change_percent'] <= 0.1]
|
|
|
- # pct_vals_1 = pd.to_numeric(
|
|
|
- # df_keep_nodes_part_1['keep_price_change_percent'],
|
|
|
- # errors='coerce'
|
|
|
- # ).replace([np.inf, -np.inf], np.nan).dropna()
|
|
|
- # dur_vals_1 = pd.to_numeric(
|
|
|
- # df_keep_nodes_part_1['keep_price_duration_hours'],
|
|
|
- # errors='coerce'
|
|
|
- # ).replace([np.inf, -np.inf], np.nan).dropna()
|
|
|
-
|
|
|
- # if not pct_vals_1.empty and not dur_vals_1.empty:
|
|
|
- # pct_min_1 = float(pct_vals_1.min())
|
|
|
- # pct_max_1 = float(pct_vals_1.max())
|
|
|
- # dur_min_1 = float(dur_vals_1.min())
|
|
|
- # dur_max_1 = float(dur_vals_1.max())
|
|
|
-
|
|
|
- # if (pct_min_1 <= float(price_change_percent) <= pct_max_1) and (dur_min_1 <= float(price_duration_hours) <= dur_max_1):
|
|
|
- # df_min_hours.loc[idx, 'simple_will_price_drop'] = 0
|
|
|
- # df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
|
|
|
- # df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.0
|
|
|
- # df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'n1'
|
|
|
- pass
|
|
|
+ # 根据价格包络位置统一判定
|
|
|
+ # if envelope_position >= 0.97: # 在0.97分位之上的认为必降价?
|
|
|
+ # df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
|
|
|
+ # df_min_hours.loc[idx, 'flag_dist'] = 'dd'
|
|
|
+ # df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 1
|
|
|
+
|
|
|
print("判定循环结束")
|
|
|
# 按航班号统一其降价/涨价的上限与下限, 上限统一取最大, 下限统一取最小
|
|
|
# _grp_cols = ['city_pair', 'flight_number_1', 'flight_number_2']
|