|
|
@@ -2,6 +2,7 @@ import pandas as pd
|
|
|
import numpy as np
|
|
|
import bisect
|
|
|
import gc
|
|
|
+import os
|
|
|
from datetime import datetime, timedelta
|
|
|
from sklearn.preprocessing import StandardScaler
|
|
|
from config import city_to_country, vj_city_code_map, vi_flight_number_map, build_country_holidays
|
|
|
@@ -833,7 +834,7 @@ def standardization(df, feature_scaler, target_scaler=None, is_training=True, is
|
|
|
return df, feature_scaler, target_scaler
|
|
|
|
|
|
|
|
|
-def preprocess_data_simple(df_input, is_train=False, output_dir='.'):
|
|
|
+def preprocess_data_simple(df_input, is_train=False):
|
|
|
|
|
|
df_input = preprocess_data_first_half(df_input)
|
|
|
|
|
|
@@ -884,6 +885,7 @@ def preprocess_data_simple(df_input, is_train=False, output_dir='.'):
|
|
|
df_input['Hours_Until_Departure'] = hours_until
|
|
|
df_input['Baggage'] = df_input['baggage']
|
|
|
|
|
|
+ # 训练过程
|
|
|
if is_train:
|
|
|
df_target = df_input[(df_input['hours_until_departure'] >= 18) & (df_input['hours_until_departure'] <= 54)].copy()
|
|
|
df_target = df_target.sort_values(
|
|
|
@@ -891,6 +893,7 @@ def preprocess_data_simple(df_input, is_train=False, output_dir='.'):
|
|
|
ascending=[True, False]
|
|
|
).reset_index(drop=True)
|
|
|
|
|
|
+ # 对于先升后降的分析
|
|
|
prev_pct = df_target.groupby('gid', group_keys=False)['price_change_percent'].shift(1)
|
|
|
prev_amo = df_target.groupby('gid', group_keys=False)['price_change_amount'].shift(1)
|
|
|
prev_dur = df_target.groupby('gid', group_keys=False)['price_duration_hours'].shift(1)
|
|
|
@@ -907,11 +910,13 @@ def preprocess_data_simple(df_input, is_train=False, output_dir='.'):
|
|
|
df_drop_nodes = df_drop_nodes.reset_index(drop=True)
|
|
|
|
|
|
flight_info_cols = [
|
|
|
- 'city_pair',
|
|
|
+ 'city_pair',
|
|
|
'flight_number_1', 'seg1_dep_air_port', 'seg1_dep_time', 'seg1_arr_air_port', 'seg1_arr_time',
|
|
|
'flight_number_2', 'seg2_dep_air_port', 'seg2_dep_time', 'seg2_arr_air_port', 'seg2_arr_time',
|
|
|
'currency', 'baggage', 'flight_day',
|
|
|
]
|
|
|
+
|
|
|
+ flight_info_cols = [c for c in flight_info_cols if c in df_target.columns]
|
|
|
|
|
|
df_gid_info = df_target[['gid'] + flight_info_cols].drop_duplicates(subset=['gid']).reset_index(drop=True)
|
|
|
df_drop_nodes = df_drop_nodes.merge(df_gid_info, on='gid', how='left')
|
|
|
@@ -920,12 +925,298 @@ def preprocess_data_simple(df_input, is_train=False, output_dir='.'):
|
|
|
'high_price_duration_hours', 'high_price_change_percent', 'high_price_change_amount'
|
|
|
]
|
|
|
# 按顺序排列 去掉gid
|
|
|
- order_columns = flight_info_cols + drop_info_cols
|
|
|
- df_drop_nodes = df_drop_nodes[order_columns]
|
|
|
+ df_drop_nodes = df_drop_nodes[flight_info_cols + drop_info_cols]
|
|
|
+
|
|
|
+ # 对于没有先升后降的gid进行分析
|
|
|
+ gids_with_drop = df_target.loc[drop_mask, 'gid'].unique()
|
|
|
+ df_no_drop = df_target[~df_target['gid'].isin(gids_with_drop)].copy()
|
|
|
+
|
|
|
+ keep_info_cols = [
|
|
|
+ 'keep_hours_until_departure', 'keep_price_change_percent', 'keep_price_change_amount', 'keep_price_duration_hours'
|
|
|
+ ]
|
|
|
+
|
|
|
+ if df_no_drop.empty:
|
|
|
+ df_keep_nodes = pd.DataFrame(columns=flight_info_cols + keep_info_cols)
|
|
|
+ else:
|
|
|
+ df_no_drop = df_no_drop.sort_values(
|
|
|
+ by=['gid', 'hours_until_departure'],
|
|
|
+ ascending=[True, False]
|
|
|
+ ).reset_index(drop=True)
|
|
|
+
|
|
|
+ df_no_drop['keep_segment'] = df_no_drop.groupby('gid')['price_change_percent'].transform(
|
|
|
+ lambda s: (s != s.shift()).cumsum()
|
|
|
+ )
|
|
|
+
|
|
|
+ df_keep_row = (
|
|
|
+ df_no_drop.groupby(['gid', 'keep_segment'], as_index=False)
|
|
|
+ .tail(1)
|
|
|
+ .reset_index(drop=True)
|
|
|
+ )
|
|
|
+
|
|
|
+ df_keep_nodes = df_keep_row[
|
|
|
+ ['gid', 'hours_until_departure', 'price_change_percent', 'price_change_amount', 'price_duration_hours']
|
|
|
+ ].copy()
|
|
|
+ df_keep_nodes.rename(
|
|
|
+ columns={
|
|
|
+ 'hours_until_departure': 'keep_hours_until_departure',
|
|
|
+ 'price_change_percent': 'keep_price_change_percent',
|
|
|
+ 'price_change_amount': 'keep_price_change_amount',
|
|
|
+ 'price_duration_hours': 'keep_price_duration_hours',
|
|
|
+ },
|
|
|
+ inplace=True,
|
|
|
+ )
|
|
|
+
|
|
|
+ df_keep_nodes = df_keep_nodes.merge(df_gid_info, on='gid', how='left')
|
|
|
+ df_keep_nodes = df_keep_nodes[flight_info_cols + keep_info_cols]
|
|
|
+
|
|
|
+ del df_keep_row
|
|
|
|
|
|
del df_gid_info
|
|
|
del df_target
|
|
|
+ del df_no_drop
|
|
|
+
|
|
|
+ return df_input, df_drop_nodes, df_keep_nodes
|
|
|
+
|
|
|
+ return df_input, None, None
|
|
|
+
|
|
|
+
|
|
|
def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".", pred_time_str=""):
    """Predict, per flight (gid), whether its price will drop before departure.

    For each gid the latest snapshot (smallest ``hours_until_departure``
    within the 18-54h window) is compared against two historical pattern
    files produced by the training pass:

    * ``{group_route_str}_drop_info.csv`` -- past "high price -> drop" nodes.
    * ``{group_route_str}_keep_info.csv`` -- past "price held" nodes
      (always low, always high, low -> high, consecutive drops, ...).

    Matching rules, in priority order:

    1. Drop history: a current rise matching a historical pre-drop rise
       yields a probability distribution over remaining hours until the
       drop (``will_price_drop = 1``).
    2. Keep history, consecutive-drop special case: a currently negative
       change matching the middle of a historical run of consecutive drops
       is flagged as "will drop again" (``will_price_drop = 1``).
    3. Keep history, general case: if the current price has already lasted
       past the low percentile of comparable historical durations, the
       price is predicted to hold (``will_price_drop = 0``).

    Rows matching no rule keep ``will_price_drop = -1`` (unknown).

    The result is appended to ``future_predictions_{pred_time_str}.csv``
    under ``predict_dir`` and also returned.

    Parameters
    ----------
    df_input : pandas.DataFrame or None
        Raw snapshots; must contain ``gid``, ``hours_until_departure``,
        ``city_pair``, ``flight_number_1``/``_2``, ``price_change_percent``,
        ``price_duration_hours`` and the flight-info columns listed in
        ``order_cols`` below.
    group_route_str : str
        Route key used to locate the historical info CSVs.
    output_dir : str
        Directory holding the historical info CSVs.
    predict_dir : str
        Directory the prediction CSV is appended to.
    pred_time_str : str
        Prediction timestamp, formatted ``YYYYMMDDHHMM``.

    Returns
    -------
    pandas.DataFrame
        The appended prediction rows (empty frame when input is empty).
    """
    if df_input is None or df_input.empty:
        return pd.DataFrame()

    df_sorted = df_input.sort_values(
        by=['gid', 'hours_until_departure'],
        ascending=[True, False],
    ).reset_index(drop=True)

    # Only the 18-54 hours-before-departure window is predicted.
    df_sorted = df_sorted[
        df_sorted['hours_until_departure'].between(18, 54)
    ].reset_index(drop=True)

    # One row per gid: the smallest hours_until_departure (latest snapshot);
    # 'last' works because rows are sorted descending within each gid.
    df_min_hours = (
        df_sorted.drop_duplicates(subset=['gid'], keep='last')
        .reset_index(drop=True)
    )

    # Historical "high price -> drop" nodes (file may not exist yet).
    drop_info_csv_path = os.path.join(output_dir, f'{group_route_str}_drop_info.csv')
    if os.path.exists(drop_info_csv_path):
        df_drop_nodes = pd.read_csv(drop_info_csv_path)
    else:
        df_drop_nodes = pd.DataFrame()

    # Historical "price held" nodes (file may not exist yet).
    keep_info_csv_path = os.path.join(output_dir, f'{group_route_str}_keep_info.csv')
    if os.path.exists(keep_info_csv_path):
        df_keep_nodes = pd.read_csv(keep_info_csv_path)
    else:
        df_keep_nodes = pd.DataFrame()

    df_min_hours['simple_will_price_drop'] = -1  # -1 means unknown
    df_min_hours['simple_drop_in_hours'] = 0
    df_min_hours['simple_drop_in_hours_prob'] = 0.0
    df_min_hours['simple_drop_in_hours_dist'] = ''

    # Matching tolerances.  TODO(review): these are ad hoc -- tune them.
    pct_threshold = 0.01     # drop-history match on rise percentage
    pct_threshold_1 = 0.001  # keep-history general match
    pct_threshold_c = 0.001  # keep-history consecutive-drop match

    for idx, row in df_min_hours.iterrows():
        city_pair = row['city_pair']
        flight_number_1 = row['flight_number_1']
        flight_number_2 = row['flight_number_2']
        price_change_percent = row['price_change_percent']
        price_duration_hours = row['price_duration_hours']
        hours_until_departure = row['hours_until_departure']

        # A missing second leg may be stored as NaN or the bare 'VJ' carrier
        # code.  pd.notna() guards the NaN case: NaN is truthy, so a bare
        # truthiness test would take the two-leg branch and then the
        # `== NaN` filter would silently match nothing.
        has_second_leg = (
            pd.notna(flight_number_2) and bool(flight_number_2) and flight_number_2 != 'VJ'
        )

        # --- Rule 1: historical high price -> drop -------------------------
        if not df_drop_nodes.empty:
            # Same flight number(s), across departure dates.
            if has_second_leg:
                df_drop_nodes_part = df_drop_nodes[
                    (df_drop_nodes['city_pair'] == city_pair) &
                    (df_drop_nodes['flight_number_1'] == flight_number_1) &
                    (df_drop_nodes['flight_number_2'] == flight_number_2)
                ]
            else:
                df_drop_nodes_part = df_drop_nodes[
                    (df_drop_nodes['city_pair'] == city_pair) &
                    (df_drop_nodes['flight_number_1'] == flight_number_1)
                ]

            # Match the pre-drop rise percentage, then derive a distribution
            # of "hours until the drop" from historical high-price durations.
            if not df_drop_nodes_part.empty and pd.notna(price_change_percent):
                # Discard historical rises too small to be meaningful.
                df_drop_nodes_part = df_drop_nodes_part[
                    df_drop_nodes_part['high_price_change_percent'] >= 0.1
                ]
                pct_diff = (
                    df_drop_nodes_part['high_price_change_percent'] - float(price_change_percent)
                ).abs()
                df_match = df_drop_nodes_part.loc[
                    pct_diff <= pct_threshold,
                    ['high_price_duration_hours', 'high_price_change_percent'],
                ].copy()

                if not df_match.empty and pd.notna(price_duration_hours):
                    remaining_hours = (
                        df_match['high_price_duration_hours'] - float(price_duration_hours)
                    ).clip(lower=0)
                    remaining_hours = remaining_hours.round().astype(int)

                    counts = remaining_hours.value_counts().sort_index()
                    probs = (counts / counts.sum()).round(4)

                    top_hours = int(probs.idxmax())
                    top_prob = float(probs.max())

                    # Keep at most 10 distribution entries for the report.
                    dist_items = list(zip(probs.index.tolist(), probs.tolist()))[:10]
                    dist_str = ' | '.join(f"{int(h)}:{float(p)}" for h, p in dist_items)

                    df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
                    df_min_hours.loc[idx, 'simple_drop_in_hours'] = top_hours
                    df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = top_prob
                    df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = dist_str

                    continue  # drop decided; skip the keep-history rules

        # --- Rules 2 & 3: historical "price held" patterns -----------------
        if not df_keep_nodes.empty:
            # Same flight number(s), across departure dates.
            if has_second_leg:
                df_keep_nodes_part = df_keep_nodes[
                    (df_keep_nodes['city_pair'] == city_pair) &
                    (df_keep_nodes['flight_number_1'] == flight_number_1) &
                    (df_keep_nodes['flight_number_2'] == flight_number_2)
                ]
            else:
                df_keep_nodes_part = df_keep_nodes[
                    (df_keep_nodes['city_pair'] == city_pair) &
                    (df_keep_nodes['flight_number_1'] == flight_number_1)
                ]

            if not df_keep_nodes_part.empty and pd.notna(price_change_percent):
                # Rule 2 (special case): price is currently falling -- look
                # for historical runs of consecutive drops.
                if price_change_percent < 0:
                    df_tmp = df_keep_nodes_part.copy()
                    # Restore chronological order within each flight day.
                    df_tmp = df_tmp.sort_values(
                        by=["flight_day", "keep_hours_until_departure"],
                        ascending=[True, False],
                    )
                    df_tmp["is_negative"] = df_tmp["keep_price_change_percent"] < 0

                    # Guard: without any negative history there is no
                    # consecutive-drop block to match (also prevents a
                    # NameError on df_continuous_price_drop below).
                    if df_tmp["is_negative"].any():
                        # A new negative block starts where is_negative turns
                        # True after a non-negative row (per flight_day).
                        df_tmp["neg_block_id"] = (
                            df_tmp["is_negative"]
                            & ~df_tmp.groupby("flight_day")["is_negative"].shift(fill_value=False)
                        ).groupby(df_tmp["flight_day"]).cumsum()
                        # 1-based position inside the block ...
                        df_tmp["neg_rank_in_block"] = (
                            df_tmp.groupby(["flight_day", "neg_block_id"]).cumcount() + 1
                        )
                        # ... and the block's total length.
                        df_tmp["neg_block_size"] = (
                            df_tmp.groupby(["flight_day", "neg_block_id"])["is_negative"]
                            .transform("sum")
                        )
                        # Keep negative rows that were followed by another
                        # drop (i.e. not the last row of their block).
                        df_continuous_price_drop = df_tmp[
                            df_tmp["is_negative"]
                            & (df_tmp["neg_rank_in_block"] < df_tmp["neg_block_size"])
                        ].drop(
                            columns=[
                                "is_negative",
                                "neg_block_id",
                                "neg_rank_in_block",
                                "neg_block_size",
                            ]
                        )

                        pct_diff_c = (
                            df_continuous_price_drop['keep_price_change_percent']
                            - float(price_change_percent)
                        ).abs()
                        df_match_c = df_continuous_price_drop.loc[
                            pct_diff_c <= pct_threshold_c,
                            ['flight_day', 'keep_hours_until_departure',
                             'keep_price_duration_hours', 'keep_price_change_percent'],
                        ].copy()

                        # Consecutive-drop pattern confirmed.
                        if not df_match_c.empty and pd.notna(price_duration_hours):
                            vals_c = (
                                df_match_c['keep_price_duration_hours']
                                .replace([np.inf, -np.inf], np.nan)
                                .dropna()
                            )
                            if not vals_c.empty and vals_c.min() <= float(price_duration_hours):
                                df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
                                df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
                                df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.5
                                df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = ''
                                continue

                # Rule 3 (general case): match keep history on change
                # percentage and compare remaining durations.
                pct_diff_1 = (
                    df_keep_nodes_part['keep_price_change_percent'] - float(price_change_percent)
                ).abs()
                df_match_1 = df_keep_nodes_part.loc[
                    pct_diff_1 <= pct_threshold_1,
                    ['flight_day', 'keep_hours_until_departure',
                     'keep_price_duration_hours', 'keep_price_change_percent'],
                ].copy()

                if not df_match_1.empty and pd.notna(price_duration_hours):
                    # Align historical durations with the current observation
                    # point; drop the ones already exhausted.
                    df_match_1['hours_delta'] = hours_until_departure - df_match_1['keep_hours_until_departure']
                    df_match_1['modify_keep_price_duration_hours'] = (
                        df_match_1['keep_price_duration_hours'] - df_match_1['hours_delta']
                    )
                    df_match_1 = df_match_1[df_match_1['modify_keep_price_duration_hours'] > 0]

                    # Where does price_duration_hours sit within the adjusted
                    # historical durations?  At/above the 10th percentile is
                    # taken to mean the price will hold.
                    vals = (
                        df_match_1['modify_keep_price_duration_hours']
                        .replace([np.inf, -np.inf], np.nan)
                        .dropna()
                    )
                    if not vals.empty:
                        q10_11 = float(vals.quantile(0.10))
                        if q10_11 <= float(price_duration_hours):
                            df_min_hours.loc[idx, 'simple_will_price_drop'] = 0
                            df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
                            df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.0
                            df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = ''

    # --- Assemble and persist the output --------------------------------
    df_min_hours = df_min_hours.rename(columns={'seg1_dep_time': 'from_time'})
    _pred_dt = pd.to_datetime(str(pred_time_str), format="%Y%m%d%H%M", errors="coerce")
    df_min_hours["update_hour"] = _pred_dt
    _dep_hour = pd.to_datetime(df_min_hours["from_time"], errors="coerce").dt.floor("h")
    # The validity window mirrors the 18-54h filter applied above.
    df_min_hours["valid_begin_hour"] = _dep_hour - pd.to_timedelta(54, unit="h")
    df_min_hours["valid_end_hour"] = _dep_hour - pd.to_timedelta(18, unit="h")

    order_cols = ['city_pair', 'flight_day', 'flight_number_1', 'flight_number_2', 'from_time', 'baggage', 'currency',
                  'adult_total_price', 'hours_until_departure', 'price_change_percent', 'price_duration_hours',
                  'update_hour', 'crawl_date',
                  'valid_begin_hour', 'valid_end_hour',
                  'simple_will_price_drop', 'simple_drop_in_hours', 'simple_drop_in_hours_prob', 'simple_drop_in_hours_dist'
                  ]
    df_predict = df_min_hours[order_cols]
    df_predict = df_predict.rename(columns={
        'simple_will_price_drop': 'will_price_drop',
        'simple_drop_in_hours': 'drop_in_hours',
        'simple_drop_in_hours_prob': 'drop_in_hours_prob',
        'simple_drop_in_hours_dist': 'drop_in_hours_dist',
    })

    # Append so repeated runs accumulate in one file; write the header only
    # when the file does not exist yet.
    csv_path1 = os.path.join(predict_dir, f'future_predictions_{pred_time_str}.csv')
    df_predict.to_csv(csv_path1, mode='a', index=False, header=not os.path.exists(csv_path1), encoding='utf-8-sig')

    print("预测结果已追加")
    return df_predict
|