|
|
@@ -1011,7 +1011,7 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
).reset_index(drop=True)
|
|
|
|
|
|
df_sorted = df_sorted[
|
|
|
- df_sorted['hours_until_departure'].between(18, 54)
|
|
|
+ df_sorted['hours_until_departure'].between(12, 60)
|
|
|
].reset_index(drop=True)
|
|
|
|
|
|
# 每个 gid 取 hours_until_departure 最小的一条
|
|
|
@@ -1020,9 +1020,9 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
.reset_index(drop=True)
|
|
|
)
|
|
|
|
|
|
- # 确保 hours_until_departure 在 [18, 54] 的 范围内
|
|
|
+ # 确保 hours_until_departure 在 [12, 60] 的 范围内
|
|
|
# df_min_hours = df_min_hours[
|
|
|
- # df_min_hours['hours_until_departure'].between(18, 54)
|
|
|
+ # df_min_hours['hours_until_departure'].between(12, 60)
|
|
|
# ].reset_index(drop=True)
|
|
|
|
|
|
drop_info_csv_path = os.path.join(output_dir, f'{group_route_str}_drop_info.csv')
|
|
|
@@ -1043,18 +1043,26 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
df_min_hours['simple_drop_in_hours_dist'] = '' # 空串 表示未知
|
|
|
|
|
|
# 这个阈值取多少?
|
|
|
- pct_threshold = 0.01
|
|
|
+ pct_threshold = 0.001
|
|
|
# pct_threshold = 2
|
|
|
- pct_threshold_1 = 0.01
|
|
|
+ pct_threshold_1 = 0.001
|
|
|
pct_threshold_c = 0.001
|
|
|
|
|
|
for idx, row in df_min_hours.iterrows():
|
|
|
city_pair = row['city_pair']
|
|
|
flight_number_1 = row['flight_number_1']
|
|
|
flight_number_2 = row['flight_number_2']
|
|
|
+ if flight_number_1 == 'VJ878': # 调试时用
|
|
|
+ pass
|
|
|
price_change_percent = row['price_change_percent']
|
|
|
+ price_change_amount = row['price_change_amount']
|
|
|
price_duration_hours = row['price_duration_hours']
|
|
|
hours_until_departure = row['hours_until_departure']
|
|
|
+ seats_remaining_change_amount = row['seats_remaining_change_amount']
|
|
|
+
|
|
|
+ length_drop = 0
|
|
|
+ length_keep = 0
|
|
|
+
|
|
|
# 针对历史上发生的 高价->低价
|
|
|
if not df_drop_nodes.empty:
|
|
|
# 对准航班号, 不同起飞日期
|
|
|
@@ -1073,72 +1081,98 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
# 降价前 增幅阈值的匹配 与 高价历史持续时间 得出降价时间的概率
|
|
|
if not df_drop_nodes_part.empty and pd.notna(price_change_percent):
|
|
|
# 增幅太小的去掉
|
|
|
- df_drop_nodes_part = df_drop_nodes_part[df_drop_nodes_part['high_price_change_percent'] >= 0.1]
|
|
|
- # pct_vals = df_drop_nodes_part['high_price_change_percent'].replace([np.inf, -np.inf], np.nan).dropna()
|
|
|
- # # 保留百分位 10% ~ 90% 之间的 数据
|
|
|
- # if not pct_vals.empty:
|
|
|
- # q10 = float(pct_vals.quantile(0.10))
|
|
|
- # q90 = float(pct_vals.quantile(0.90))
|
|
|
- # df_drop_nodes_part = df_drop_nodes_part[
|
|
|
- # df_drop_nodes_part['high_price_change_percent'].between(q10, q90)
|
|
|
- # ]
|
|
|
- # if df_drop_nodes_part.empty:
|
|
|
- # continue
|
|
|
- pct_diff = (df_drop_nodes_part['high_price_change_percent'] - float(price_change_percent)).abs()
|
|
|
- df_match = df_drop_nodes_part.loc[pct_diff <= pct_threshold, ['high_price_duration_hours', 'high_price_change_percent']].copy()
|
|
|
-
|
|
|
- # 历史上出现的降价幅度
|
|
|
- if not df_match.empty and pd.notna(price_duration_hours):
|
|
|
- remaining_hours = (df_match['high_price_duration_hours'] - float(price_duration_hours)).clip(lower=0)
|
|
|
- remaining_hours = remaining_hours.round().astype(int)
|
|
|
-
|
|
|
- counts = remaining_hours.value_counts().sort_index()
|
|
|
- probs = (counts / counts.sum()).round(4)
|
|
|
-
|
|
|
- top_hours = int(probs.idxmax())
|
|
|
- top_prob = float(probs.max())
|
|
|
-
|
|
|
- dist_items = list(zip(probs.index.tolist(), probs.tolist()))
|
|
|
- dist_items = dist_items[:10]
|
|
|
- dist_str = ' '.join([f"{int(h)}h->{float(p)}" for h, p in dist_items])
|
|
|
-
|
|
|
- df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
|
|
|
- df_min_hours.loc[idx, 'simple_drop_in_hours'] = top_hours
|
|
|
- df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = top_prob
|
|
|
- df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = dist_str
|
|
|
-
|
|
|
- continue # 已经判定降价 后面不再做
|
|
|
+ # df_drop_nodes_part = df_drop_nodes_part[df_drop_nodes_part['high_price_change_percent'] >= 0.1]
|
|
|
+ # pct_diff = (df_drop_nodes_part['high_price_change_percent'] - float(price_change_percent)).abs()
|
|
|
+ # df_match = df_drop_nodes_part.loc[pct_diff <= pct_threshold, ['high_price_duration_hours', 'high_price_change_percent']].copy()
|
|
|
+
|
|
|
+ pct_base = float(price_change_percent)
|
|
|
+ pct_vals = pd.to_numeric(df_drop_nodes_part['high_price_change_percent'], errors='coerce')
|
|
|
+ df_drop_gap = df_drop_nodes_part.loc[
|
|
|
+ pct_vals.notna(),
|
|
|
+ ['drop_hours_until_departure', 'high_price_duration_hours', 'high_price_change_percent',
|
|
|
+ 'high_price_change_amount', 'high_price_seats_remaining_change_amount']
|
|
|
+ ].copy()
|
|
|
+ df_drop_gap['pct_gap'] = (pct_vals.loc[pct_vals.notna()] - pct_base)
|
|
|
+ df_drop_gap['pct_abs_gap'] = df_drop_gap['pct_gap'].abs()
|
|
|
+ df_drop_gap = df_drop_gap.sort_values(['pct_abs_gap'], ascending=True)
|
|
|
+ df_match = df_drop_gap[df_drop_gap['pct_abs_gap'] <= pct_threshold]
|
|
|
+
|
|
|
+ # 历史上出现的极近似的增长幅度后的降价场景
|
|
|
+ if not df_match.empty:
|
|
|
+ dur_base = pd.to_numeric(price_duration_hours, errors='coerce')
|
|
|
+ hud_base = pd.to_numeric(hours_until_departure, errors='coerce')
|
|
|
+ seats_base = pd.to_numeric(seats_remaining_change_amount, errors='coerce')
|
|
|
+
|
|
|
+ if pd.notna(dur_base) and pd.notna(hud_base) and pd.notna(seats_base):
|
|
|
+ df_match_chk = df_match.copy()
|
|
|
+ dur_vals = pd.to_numeric(df_match_chk['high_price_duration_hours'], errors='coerce')
|
|
|
+ df_match_chk = df_match_chk.loc[dur_vals.notna()].copy()
|
|
|
+ df_match_chk = df_match_chk.loc[dur_vals.loc[dur_vals.notna()] - 12 <= float(dur_base)].copy()
|
|
|
+
|
|
|
+ drop_hud_vals = pd.to_numeric(df_match_chk['drop_hours_until_departure'], errors='coerce')
|
|
|
+ df_match_chk = df_match_chk.loc[drop_hud_vals.notna()].copy()
|
|
|
+ df_match_chk = df_match_chk.loc[(drop_hud_vals.loc[drop_hud_vals.notna()] - float(hud_base)).abs() <= 12].copy()
|
|
|
+
|
|
|
+ seats_vals = pd.to_numeric(df_match_chk['high_price_seats_remaining_change_amount'], errors='coerce')
|
|
|
+ df_match_chk = df_match_chk.loc[seats_vals.notna()].copy()
|
|
|
+ df_match_chk = df_match_chk.loc[seats_vals.loc[seats_vals.notna()] == float(seats_base)].copy()
|
|
|
+
|
|
|
+ # 持续时间、距离起飞时间、座位变化都匹配上
|
|
|
+ if not df_match_chk.empty:
|
|
|
+ remaining_hours = (
|
|
|
+ pd.to_numeric(df_match_chk['high_price_duration_hours'], errors='coerce') - float(dur_base)
|
|
|
+ ).clip(lower=0)
|
|
|
+ remaining_hours = remaining_hours.round().astype(int)
|
|
|
+
|
|
|
+ counts = remaining_hours.value_counts().sort_index()
|
|
|
+ probs = (counts / counts.sum()).round(4)
|
|
|
+
|
|
|
+ top_hours = int(probs.idxmax())
|
|
|
+ top_prob = float(probs.max())
|
|
|
+
|
|
|
+ dist_items = list(zip(probs.index.tolist(), probs.tolist()))
|
|
|
+ dist_items = dist_items[:10]
|
|
|
+ dist_str = ' '.join([f"{int(h)}h->{float(p)}" for h, p in dist_items])
|
|
|
+
|
|
|
+ df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
|
|
|
+ df_min_hours.loc[idx, 'simple_drop_in_hours'] = top_hours
|
|
|
+ df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = top_prob
|
|
|
+ df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = dist_str
|
|
|
+
|
|
|
+ length_drop = df_match_chk.shape[0]
|
|
|
+ # continue # 已经判定降价 后面不再做
|
|
|
|
|
|
- # 历史上未出现的降价幅度
|
|
|
+ # 历史上未出现的极近似的增长幅度后的降价场景
|
|
|
else:
|
|
|
- if pd.notna(price_duration_hours) and price_change_percent >= 0.1:
|
|
|
- pct_vals = pd.to_numeric(
|
|
|
- df_drop_nodes_part['high_price_change_percent'],
|
|
|
- errors='coerce'
|
|
|
- ).replace([np.inf, -np.inf], np.nan).dropna()
|
|
|
- dur_vals = pd.to_numeric(
|
|
|
- df_drop_nodes_part['high_price_duration_hours'],
|
|
|
- errors='coerce'
|
|
|
- ).replace([np.inf, -np.inf], np.nan).dropna()
|
|
|
-
|
|
|
- if not pct_vals.empty and not dur_vals.empty:
|
|
|
- pct_min = float(pct_vals.min())
|
|
|
- pct_max = float(pct_vals.max())
|
|
|
- dur_min = float(dur_vals.min())
|
|
|
- dur_max = float(dur_vals.max())
|
|
|
-
|
|
|
- if (pct_min <= float(price_change_percent) <= pct_max) and (dur_min <= float(price_duration_hours) <= dur_max):
|
|
|
- df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
|
|
|
- df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
|
|
|
- df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.5
|
|
|
- df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = '0h->0.5'
|
|
|
- continue # 已经判定降价 后面不再做
|
|
|
- elif (pct_min <= float(price_change_percent)) and (dur_min <= float(price_duration_hours)):
|
|
|
- df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
|
|
|
- df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
|
|
|
- df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.3
|
|
|
- df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = '0h->0.3'
|
|
|
- continue # 已经判定降价 后面不再做
|
|
|
+ pass
|
|
|
+ # if pd.notna(price_duration_hours) and price_change_percent >= 0.1:
|
|
|
+ # pct_vals = pd.to_numeric(
|
|
|
+ # df_drop_nodes_part['high_price_change_percent'],
|
|
|
+ # errors='coerce'
|
|
|
+ # ).replace([np.inf, -np.inf], np.nan).dropna()
|
|
|
+ # dur_vals = pd.to_numeric(
|
|
|
+ # df_drop_nodes_part['high_price_duration_hours'],
|
|
|
+ # errors='coerce'
|
|
|
+ # ).replace([np.inf, -np.inf], np.nan).dropna()
|
|
|
+
|
|
|
+ # if not pct_vals.empty and not dur_vals.empty:
|
|
|
+ # pct_min = float(pct_vals.min())
|
|
|
+ # pct_max = float(pct_vals.max())
|
|
|
+ # dur_min = float(dur_vals.min())
|
|
|
+ # dur_max = float(dur_vals.max())
|
|
|
+
|
|
|
+ # if (pct_min <= float(price_change_percent) <= pct_max) and (dur_min <= float(price_duration_hours) <= dur_max):
|
|
|
+ # df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
|
|
|
+ # df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
|
|
|
+ # df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.5
|
|
|
+ # df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = '0h->0.5'
|
|
|
+ # continue # 已经判定降价 后面不再做
|
|
|
+ # elif (pct_min <= float(price_change_percent)) and (dur_min <= float(price_duration_hours)):
|
|
|
+ # df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
|
|
|
+ # df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
|
|
|
+ # df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.3
|
|
|
+ # df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = '0h->0.3'
|
|
|
+ # continue # 已经判定降价 后面不再做
|
|
|
|
|
|
# 针对历史上发生 一直低价、一直高价、低价->高价、连续低价 等
|
|
|
if not df_keep_nodes.empty:
|
|
|
@@ -1223,71 +1257,116 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
|
|
|
df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.5
|
|
|
df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'c1'
|
|
|
- continue # 已经判定降价 后面不再做
|
|
|
+ length_drop = df_match_c.shape[0]
|
|
|
+ # continue # 已经判定降价 后面不再做
|
|
|
|
|
|
# 一般判定场景
|
|
|
- pct_diff_1 = (df_keep_nodes_part['keep_price_change_percent'] - float(price_change_percent)).abs()
|
|
|
- df_match_1 = df_keep_nodes_part.loc[pct_diff_1 <= pct_threshold_1, ['flight_day', 'keep_hours_until_departure', 'keep_price_duration_hours', 'keep_price_change_percent']].copy()
|
|
|
-
|
|
|
- # 历史上出现过的保持低价场景
|
|
|
- if not df_match_1.empty and pd.notna(price_duration_hours):
|
|
|
- df_min_hours.loc[idx, 'simple_will_price_drop'] = 0
|
|
|
- df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
|
|
|
- df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.0
|
|
|
- df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'k0'
|
|
|
-
|
|
|
+ pct_base_1 = float(price_change_percent)
|
|
|
+ pct_vals_1 = pd.to_numeric(df_keep_nodes_part['keep_price_change_percent'], errors='coerce')
|
|
|
+ df_drop_gap_1 = df_keep_nodes_part.loc[
|
|
|
+ pct_vals_1.notna(),
|
|
|
+ ['keep_hours_until_departure', 'keep_price_duration_hours', 'keep_price_change_percent',
|
|
|
+ 'keep_price_change_amount', 'keep_seats_remaining_change_amount']
|
|
|
+ ].copy()
|
|
|
+ df_drop_gap_1['pct_gap'] = (pct_vals_1.loc[pct_vals_1.notna()] - pct_base_1)
|
|
|
+ df_drop_gap_1['pct_abs_gap'] = df_drop_gap_1['pct_gap'].abs()
|
|
|
+ df_drop_gap_1 = df_drop_gap_1.sort_values(['pct_abs_gap'], ascending=True)
|
|
|
+ df_match_1 = df_drop_gap_1.loc[df_drop_gap_1['pct_abs_gap'] <= pct_threshold_1].copy()
|
|
|
+
|
|
|
+ # 历史上出现过近似变化幅度后保持低价场景
|
|
|
+ if not df_match_1.empty:
|
|
|
df_match_1['hours_delta'] = hours_until_departure - df_match_1['keep_hours_until_departure']
|
|
|
df_match_1['modify_keep_price_duration_hours'] = df_match_1['keep_price_duration_hours'] - df_match_1['hours_delta']
|
|
|
# df_match_1 = df_match_1[df_match_1['modify_keep_price_duration_hours'] > 0]
|
|
|
|
|
|
- # 比较 price_duration_hours 在 modify_keep_price_duration_hours 的百分位
|
|
|
- vals = df_match_1['modify_keep_price_duration_hours'].replace([np.inf, -np.inf], np.nan).dropna()
|
|
|
- if not vals.empty:
|
|
|
- # q10_11 = float(vals.quantile(0.10))
|
|
|
- min_val = vals.min()
|
|
|
- if min_val <= float(price_duration_hours):
|
|
|
- df_min_hours.loc[idx, 'simple_will_price_drop'] = 0
|
|
|
- df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
|
|
|
- df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.0
|
|
|
- df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'k1'
|
|
|
-
|
|
|
- # 历史上没有出现过的保持低价场景
|
|
|
- else:
|
|
|
- df_min_hours.loc[idx, 'simple_will_price_drop'] = 0
|
|
|
- df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
|
|
|
- df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.0
|
|
|
- df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'n0'
|
|
|
-
|
|
|
- if pd.notna(price_duration_hours) and price_change_percent <= 0.1:
|
|
|
- df_keep_nodes_part_1 = df_keep_nodes_part[df_keep_nodes_part['keep_price_change_percent'] <= 0.1]
|
|
|
- pct_vals_1 = pd.to_numeric(
|
|
|
- df_keep_nodes_part_1['keep_price_change_percent'],
|
|
|
- errors='coerce'
|
|
|
- ).replace([np.inf, -np.inf], np.nan).dropna()
|
|
|
- dur_vals_1 = pd.to_numeric(
|
|
|
- df_keep_nodes_part_1['keep_price_duration_hours'],
|
|
|
- errors='coerce'
|
|
|
- ).replace([np.inf, -np.inf], np.nan).dropna()
|
|
|
-
|
|
|
- if not pct_vals_1.empty and not dur_vals_1.empty:
|
|
|
- pct_min_1 = float(pct_vals_1.min())
|
|
|
- pct_max_1 = float(pct_vals_1.max())
|
|
|
- dur_min_1 = float(dur_vals_1.min())
|
|
|
- dur_max_1 = float(dur_vals_1.max())
|
|
|
-
|
|
|
- if (pct_min_1 <= float(price_change_percent) <= pct_max_1) and (dur_min_1 <= float(price_duration_hours) <= dur_max_1):
|
|
|
+ dur_base_1 = pd.to_numeric(price_duration_hours, errors='coerce')
|
|
|
+ # hud_base_1 = pd.to_numeric(hours_until_departure, errors='coerce')
|
|
|
+ seats_base_1 = pd.to_numeric(seats_remaining_change_amount, errors='coerce')
|
|
|
+
|
|
|
+ if pd.notna(dur_base_1) and pd.notna(seats_base_1):
|
|
|
+ df_match_chk_1 = df_match_1.copy()
|
|
|
+ dur_vals_1 = pd.to_numeric(df_match_chk_1['modify_keep_price_duration_hours'], errors='coerce')
|
|
|
+ df_match_chk_1 = df_match_chk_1.loc[dur_vals_1.notna()].copy()
|
|
|
+ df_match_chk_1 = df_match_chk_1.loc[(dur_vals_1.loc[dur_vals_1.notna()] - float(dur_base_1)).abs() <= 6].copy()
|
|
|
+
|
|
|
+ # drop_hud_vals_1 = pd.to_numeric(df_match_chk_1['keep_hours_until_departure'], errors='coerce')
|
|
|
+ # df_match_chk_1 = df_match_chk_1.loc[drop_hud_vals_1.notna()].copy()
|
|
|
+ # df_match_chk_1 = df_match_chk_1.loc[(drop_hud_vals_1.loc[drop_hud_vals_1.notna()] - float(hud_base_1)).abs() <= 12].copy()
|
|
|
+
|
|
|
+ seats_vals_1 = pd.to_numeric(df_match_chk_1['keep_seats_remaining_change_amount'], errors='coerce')
|
|
|
+ df_match_chk_1 = df_match_chk_1.loc[seats_vals_1.notna()].copy()
|
|
|
+ df_match_chk_1 = df_match_chk_1.loc[seats_vals_1.loc[seats_vals_1.notna()] == float(seats_base_1)].copy()
|
|
|
+
|
|
|
+ # 持续时间、座位变化都匹配上(距离起飞时间的匹配在此分支已被注释停用)
|
|
|
+ if not df_match_chk_1.empty:
|
|
|
+ length_keep = df_match_chk_1.shape[0]
|
|
|
+ if length_keep > length_drop: # 不降价的多数压倒降价的少数
|
|
|
+
|
|
|
df_min_hours.loc[idx, 'simple_will_price_drop'] = 0
|
|
|
df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
|
|
|
df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.0
|
|
|
- df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'n1'
|
|
|
+ df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'k0'
|
|
|
+
|
|
|
+ elif length_keep == length_drop: # 不降价与降价相同, 取0.5概率
|
|
|
+
|
|
|
+ df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
|
|
|
+ df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
|
|
|
+ df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.5
|
|
|
+ df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'k1'
|
|
|
+
|
|
|
+ # df_match_1['hours_delta'] = hours_until_departure - df_match_1['keep_hours_until_departure']
|
|
|
+ # df_match_1['modify_keep_price_duration_hours'] = df_match_1['keep_price_duration_hours'] - df_match_1['hours_delta']
|
|
|
+ # df_match_1 = df_match_1[df_match_1['modify_keep_price_duration_hours'] > 0]
|
|
|
+
|
|
|
+ # 比较 price_duration_hours 在 modify_keep_price_duration_hours 的百分位
|
|
|
+ # vals = df_match_1['modify_keep_price_duration_hours'].replace([np.inf, -np.inf], np.nan).dropna()
|
|
|
+ # if not vals.empty:
|
|
|
+ # # q10_11 = float(vals.quantile(0.10))
|
|
|
+ # min_val = vals.min()
|
|
|
+ # if min_val <= float(price_duration_hours):
|
|
|
+ # df_min_hours.loc[idx, 'simple_will_price_drop'] = 0
|
|
|
+ # df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
|
|
|
+ # df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.0
|
|
|
+ # df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'k1'
|
|
|
+
|
|
|
+ # 历史上未出现过近似变化幅度后保持低价场景
|
|
|
+ else:
|
|
|
+ pass
|
|
|
+ # df_min_hours.loc[idx, 'simple_will_price_drop'] = 0
|
|
|
+ # df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
|
|
|
+ # df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.0
|
|
|
+ # df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'n0'
|
|
|
+
|
|
|
+ # if pd.notna(price_duration_hours) and price_change_percent <= 0.1:
|
|
|
+ # df_keep_nodes_part_1 = df_keep_nodes_part[df_keep_nodes_part['keep_price_change_percent'] <= 0.1]
|
|
|
+ # pct_vals_1 = pd.to_numeric(
|
|
|
+ # df_keep_nodes_part_1['keep_price_change_percent'],
|
|
|
+ # errors='coerce'
|
|
|
+ # ).replace([np.inf, -np.inf], np.nan).dropna()
|
|
|
+ # dur_vals_1 = pd.to_numeric(
|
|
|
+ # df_keep_nodes_part_1['keep_price_duration_hours'],
|
|
|
+ # errors='coerce'
|
|
|
+ # ).replace([np.inf, -np.inf], np.nan).dropna()
|
|
|
+
|
|
|
+ # if not pct_vals_1.empty and not dur_vals_1.empty:
|
|
|
+ # pct_min_1 = float(pct_vals_1.min())
|
|
|
+ # pct_max_1 = float(pct_vals_1.max())
|
|
|
+ # dur_min_1 = float(dur_vals_1.min())
|
|
|
+ # dur_max_1 = float(dur_vals_1.max())
|
|
|
+
|
|
|
+ # if (pct_min_1 <= float(price_change_percent) <= pct_max_1) and (dur_min_1 <= float(price_duration_hours) <= dur_max_1):
|
|
|
+ # df_min_hours.loc[idx, 'simple_will_price_drop'] = 0
|
|
|
+ # df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
|
|
|
+ # df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.0
|
|
|
+ # df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'n1'
|
|
|
pass
|
|
|
|
|
|
df_min_hours = df_min_hours.rename(columns={'seg1_dep_time': 'from_time'})
|
|
|
_pred_dt = pd.to_datetime(str(pred_time_str), format="%Y%m%d%H%M", errors="coerce")
|
|
|
- df_min_hours["update_hour"] = _pred_dt
|
|
|
+ df_min_hours["update_hour"] = _pred_dt.strftime("%Y-%m-%d %H:%M:%S")
|
|
|
_dep_hour = pd.to_datetime(df_min_hours["from_time"], errors="coerce").dt.floor("h")
|
|
|
- df_min_hours["valid_begin_hour"] = _dep_hour - pd.to_timedelta(54, unit="h")
|
|
|
- df_min_hours["valid_end_hour"] = _dep_hour - pd.to_timedelta(18, unit="h")
|
|
|
+ df_min_hours["valid_begin_hour"] = (_dep_hour - pd.to_timedelta(60, unit="h")).dt.strftime("%Y-%m-%d %H:%M:%S")
|
|
|
+ df_min_hours["valid_end_hour"] = (_dep_hour - pd.to_timedelta(12, unit="h")).dt.strftime("%Y-%m-%d %H:%M:%S")
|
|
|
|
|
|
order_cols = ['city_pair', 'flight_day', 'flight_number_1', 'flight_number_2', 'from_time', 'baggage', 'currency',
|
|
|
'adult_total_price', 'hours_until_departure', 'price_change_percent', 'price_duration_hours',
|
|
|
@@ -1304,6 +1383,13 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
}
|
|
|
)
|
|
|
|
|
|
+ # 排序
|
|
|
+ df_predict = df_predict.sort_values(
|
|
|
+ by=['city_pair', 'flight_number_1', 'flight_number_2', 'flight_day'],
|
|
|
+ kind='mergesort',
|
|
|
+ na_position='last',
|
|
|
+ ).reset_index(drop=True)
|
|
|
+
|
|
|
csv_path1 = os.path.join(predict_dir, f'future_predictions_{pred_time_str}.csv')
|
|
|
df_predict.to_csv(csv_path1, mode='a', index=False, header=not os.path.exists(csv_path1), encoding='utf-8-sig')
|
|
|
|