|
@@ -0,0 +1,158 @@
|
|
|
|
|
+import pandas as pd
|
|
|
|
|
+import numpy as np
|
|
|
|
|
+import gc
|
|
|
|
|
+import os
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def _sticky_change(values, is_minor, group_ids, decimals):
    """Carry the most recent significant change forward within each group.

    Rows flagged in ``is_minor`` (and exact zeros) are blanked out, the last
    remaining value is forward-filled inside each group, rows before the
    first significant change become 0, and the result is rounded to
    ``decimals`` places.
    """
    return (
        values
        .mask(is_minor, 0)
        .replace(0, np.nan)
        .groupby(group_ids, group_keys=False)
        .ffill()
        .fillna(0)
        .round(decimals)
    )


def _collect_transition_nodes(df_target, previous, mask, current_prefix,
                              previous_names, df_gid_info, flight_info_cols):
    """Build one row per transition selected by ``mask``.

    ``previous`` holds the per-group, one-row-shifted feature columns of
    ``df_target``.  ``current_prefix`` names the columns describing the
    selected row itself; ``previous_names`` is the 4-tuple of output names
    for the previous row's (duration, change percent, change amount, price).
    The flight description columns are joined in and ``gid`` is dropped.
    """
    group_keys = ['gid', 'baggage_weight']
    hours_col = f'{current_prefix}_hours_until_departure'
    pct_col = f'{current_prefix}_price_change_percent'
    amount_col = f'{current_prefix}_price_change_amount'
    prev_dur_col, prev_pct_col, prev_amount_col, prev_price_col = previous_names

    current_rows = df_target.loc[mask]
    previous_rows = previous.loc[mask]

    nodes = current_rows[group_keys + ['hours_until_departure']].copy()
    nodes = nodes.rename(columns={'hours_until_departure': hours_col})
    # Both sides share the index of the selected rows, so plain column
    # assignment lines the values up row by row.
    nodes[pct_col] = current_rows['price_change_percent'].astype(float).round(4)
    nodes[amount_col] = current_rows['price_change_amount'].astype(float).round(2)
    nodes[prev_dur_col] = previous_rows['price_duration_hours'].astype(float)
    nodes[prev_pct_col] = previous_rows['price_change_percent'].astype(float).round(4)
    nodes[prev_amount_col] = previous_rows['price_change_amount'].astype(float).round(2)
    nodes[prev_price_col] = previous_rows['price_total'].astype(float).round(2)

    nodes = nodes.reset_index(drop=True).merge(df_gid_info, on=group_keys, how='left')

    # Fixed column order; gid is intentionally left out of the result.
    ordered_cols = flight_info_cols + ['baggage_weight'] + [
        hours_col, pct_col, amount_col,
        prev_dur_col, prev_pct_col, prev_amount_col, prev_price_col,
    ]
    return nodes[ordered_cols]


def preprocess_data_simple(df_input, is_train=False):
    """Derive per-flight price-change features and, for training, transition tables.

    The input frame must contain the columns ``citypair``, ``flight_numbers``,
    ``from_date``, ``baggage_weight``, ``hours_until_departure`` and
    ``price_total``; ``is_filled`` is additionally required when
    ``is_train`` is false.

    Steps:
      1. Add an integer ``gid`` identifying each
         (citypair, flight_numbers, from_date) combination.  NOTE: this
         column is written onto the frame passed in by the caller.
      2. Sort by gid / baggage_weight ascending and hours_until_departure
         descending, then keep rows with ``hours_until_departure <= 480``
         and ``baggage_weight == 20``.
      3. When not training, drop the trailing run of rows (at the end of
         each group in that order) whose ``is_filled`` is 1.
      4. Add ``price_change_amount`` / ``price_change_percent`` (the last
         change of at least 5 in absolute price, carried forward; 0 before
         the first such change) and ``price_duration_hours`` (1-based row
         count since that change; this equals hours only if rows are
         hourly, which is an assumption to confirm against the data).

    Args:
        df_input: pandas DataFrame with the columns listed above.
        is_train: when true, additionally build the drop / rise / envelope
            tables from the processed rows.

    Returns:
        A 4-tuple ``(df, df_drop_nodes, df_rise_nodes, df_envelope)``.
        The last three are ``None`` when ``is_train`` is false.  In
        training mode:
          * ``df_drop_nodes``: rows in the 24..360 hour window where a
            positive carried change is followed by a negative one.
          * ``df_rise_nodes``: rows in the same window that start a new
            change segment with a positive change directly after a
            positive one.
          * ``df_envelope``: per (citypair, flight_numbers, from_date,
            baggage_weight), the maximum ``price_total`` (``peak_price``)
            and the ``hours_until_departure`` at which it occurs
            (``peak_hours``).
    """
    print(">>> 开始数据预处理")

    group_keys = ['gid', 'baggage_weight']
    time_order = group_keys + ['hours_until_departure']

    # gid: integer group label; baggage_weight is deliberately not part of
    # the key yet.  Assigned in place so the caller's frame also gains it.
    df_input['gid'] = (
        df_input
        .groupby(['citypair', 'flight_numbers', 'from_date'], sort=False)
        .ngroup()
    )

    # Within gid and baggage_weight, order from far-from-departure to near.
    df_input = df_input.sort_values(
        by=time_order,
        ascending=[True, True, False]
    ).reset_index(drop=True)

    # Only the last 480 hours and, for now, only 20 kg baggage fares.
    # The explicit copy keeps the column assignments below from writing
    # onto a slice (SettingWithCopyWarning).
    keep = (df_input['hours_until_departure'] <= 480) & (df_input['baggage_weight'] == 20)
    df_input = df_input.loc[keep].copy()

    # At the tail of each series keep real observations, not padded ones:
    # the reversed cumulative minimum is 1 only while every remaining row
    # of the group is filled.
    if not is_train:
        tail_filled = df_input.groupby(group_keys)['is_filled'].transform(
            lambda s: s.iloc[::-1].cummin().iloc[::-1]
        )
        trailing_fill = (df_input['is_filled'] == 1) & (tail_filled == 1)
        df_input = df_input.loc[~trailing_fill].copy()

    # Smallest absolute price move that counts as a real change.
    price_change_amount_threshold = 5

    group_ids = [df_input['gid'], df_input['baggage_weight']]
    grouped_price = df_input.groupby(group_keys, group_keys=False)['price_total']
    raw_price_diff = grouped_price.diff()
    abs_diff = raw_price_diff.abs()
    is_minor = abs_diff < price_change_amount_threshold
    is_change = abs_diff >= price_change_amount_threshold

    # Last significant change (absolute, and relative to the previous row).
    df_input['price_change_amount'] = _sticky_change(
        raw_price_diff, is_minor, group_ids, 2
    )
    df_input['price_change_percent'] = _sticky_change(
        grouped_price.pct_change(), is_minor, group_ids, 4
    )

    # Step 1: label change segments.  A new segment starts at every
    # significant change; comparing the carried amount with its previous
    # row would merge two consecutive changes of the same size.
    segment = (
        is_change.astype('int64')
        .groupby(group_ids)
        .cumsum()
        .rename('price_change_segment')
    )

    # Step 2: 1-based position of each row inside its segment.
    df_input['price_duration_hours'] = (
        df_input.groupby(group_ids + [segment]).cumcount().add(1)
    )

    if not is_train:
        return df_input, None, None, None

    # Training: analyse transitions inside the 24..360 hour window.
    in_window = df_input['hours_until_departure'].between(24, 360)
    df_target = df_input.loc[in_window].sort_values(
        by=time_order,
        ascending=[True, True, False]
    ).reset_index(drop=True)

    # Feature values of the previous row within each group.
    previous = df_target.groupby(group_keys)[
        ['price_change_percent', 'price_change_amount',
         'price_duration_hours', 'price_total']
    ].shift(1)
    prev_pct = previous['price_change_percent']
    current_pct = df_target['price_change_percent']

    flight_info_cols = [
        c for c in ('citypair', 'flight_numbers', 'from_time', 'from_date', 'currency')
        if c in df_target.columns
    ]
    df_gid_info = (
        df_target[group_keys + flight_info_cols]
        .drop_duplicates(subset=group_keys)
        .reset_index(drop=True)
    )

    # Rise followed by a drop.
    drop_mask = (prev_pct > 0) & (current_pct < 0)
    df_drop_nodes = _collect_transition_nodes(
        df_target, previous, drop_mask, 'drop',
        ('high_price_duration_hours', 'high_price_change_percent',
         'high_price_change_amount', 'high_price_amount'),
        df_gid_info, flight_info_cols,
    )

    # Rise followed by another rise (two consecutive positive segments):
    # only the first row of the new segment counts.
    rise_mask = df_target['price_duration_hours'].eq(1) & (prev_pct > 0) & (current_pct > 0)
    df_rise_nodes = _collect_transition_nodes(
        df_target, previous, rise_mask, 'rise',
        ('prev_rise_duration_hours', 'prev_rise_change_percent',
         'prev_rise_change_amount', 'prev_rise_amount'),
        df_gid_info, flight_info_cols,
    )

    # Historical envelope: peak price per flight and when it occurred.
    # Rows without a price can never be the maximum; excluding them keeps
    # an all-NaN group from producing a NaN label for the lookup below.
    envelope_group = ['citypair', 'flight_numbers', 'from_date', 'baggage_weight']
    priced = df_input.loc[df_input['price_total'].notna()]
    idx_peak = priced.groupby(envelope_group)['price_total'].idxmax()
    df_envelope = df_input.loc[
        idx_peak, envelope_group + ['price_total', 'hours_until_departure']
    ].rename(columns={
        'price_total': 'peak_price',
        'hours_until_departure': 'peak_hours',
    }).reset_index(drop=True)

    return df_input, df_drop_nodes, df_rise_nodes, df_envelope
|
|
|
|
|
+
|