|
@@ -4,7 +4,8 @@ import bisect
|
|
|
import gc
|
|
import gc
|
|
|
from datetime import datetime, timedelta
|
|
from datetime import datetime, timedelta
|
|
|
from sklearn.preprocessing import StandardScaler
|
|
from sklearn.preprocessing import StandardScaler
|
|
|
-from config import city_to_country, vj_city_code_map, build_country_holidays
|
|
|
|
|
|
|
+from config import city_to_country, vj_city_code_map, vi_flight_number_map, build_country_holidays
|
|
|
|
|
+from utils import insert_df_col
|
|
|
|
|
|
|
|
COUNTRY_HOLIDAYS = build_country_holidays(city_to_country)
|
|
COUNTRY_HOLIDAYS = build_country_holidays(city_to_country)
|
|
|
|
|
|
|
@@ -16,8 +17,22 @@ def preprocess_data(df_input, features, categorical_features, is_training=True,
|
|
|
df_input['city_pair'] = (
|
|
df_input['city_pair'] = (
|
|
|
df_input['from_city_code'].astype(str) + "-" + df_input['to_city_code'].astype(str)
|
|
df_input['from_city_code'].astype(str) + "-" + df_input['to_city_code'].astype(str)
|
|
|
)
|
|
)
|
|
|
|
|
+ # 城市码映射成数字
|
|
|
df_input['from_city_num'] = df_input['from_city_code'].map(vj_city_code_map)
|
|
df_input['from_city_num'] = df_input['from_city_code'].map(vj_city_code_map)
|
|
|
df_input['to_city_num'] = df_input['to_city_code'].map(vj_city_code_map)
|
|
df_input['to_city_num'] = df_input['to_city_code'].map(vj_city_code_map)
|
|
|
|
|
+
|
|
|
|
|
+ missing_from = (
|
|
|
|
|
+ df_input.loc[df_input['from_city_num'].isna(), 'from_city_code']
|
|
|
|
|
+ .unique()
|
|
|
|
|
+ )
|
|
|
|
|
+ missing_to = (
|
|
|
|
|
+ df_input.loc[df_input['to_city_num'].isna(), 'to_city_code']
|
|
|
|
|
+ .unique()
|
|
|
|
|
+ )
|
|
|
|
|
+ if missing_from:
|
|
|
|
|
+ print("未映射的 from_city:", missing_from)
|
|
|
|
|
+ if missing_to:
|
|
|
|
|
+ print("未映射的 to_city:", missing_to)
|
|
|
|
|
|
|
|
# 把 city_pair、from_city_code、from_city_num, to_city_code, to_city_num 放到前几列
|
|
# 把 city_pair、from_city_code、from_city_num, to_city_code, to_city_num 放到前几列
|
|
|
cols = df_input.columns.tolist()
|
|
cols = df_input.columns.tolist()
|
|
@@ -26,6 +41,7 @@ def preprocess_data(df_input, features, categorical_features, is_training=True,
|
|
|
cols.remove(c)
|
|
cols.remove(c)
|
|
|
# 这几列插入到最前面
|
|
# 这几列插入到最前面
|
|
|
df_input = df_input[['city_pair', 'from_city_code', 'from_city_num', 'to_city_code', 'to_city_num'] + cols]
|
|
df_input = df_input[['city_pair', 'from_city_code', 'from_city_num', 'to_city_code', 'to_city_num'] + cols]
|
|
|
|
|
+ pass
|
|
|
|
|
|
|
|
# 转格式
|
|
# 转格式
|
|
|
df_input['search_dep_time'] = pd.to_datetime(
|
|
df_input['search_dep_time'] = pd.to_datetime(
|
|
@@ -48,6 +64,42 @@ def preprocess_data(df_input, features, categorical_features, is_training=True,
|
|
|
df_input['flight_number_1'] = df_input['flight_number_1'].fillna('VJ')
|
|
df_input['flight_number_1'] = df_input['flight_number_1'].fillna('VJ')
|
|
|
df_input['flight_number_2'] = df_input['flight_number_2'].fillna('VJ')
|
|
df_input['flight_number_2'] = df_input['flight_number_2'].fillna('VJ')
|
|
|
|
|
|
|
|
|
|
+ # 航班号转数字
|
|
|
|
|
+ df_input['flight_1_num'] = df_input['flight_number_1'].map(vi_flight_number_map)
|
|
|
|
|
+ df_input['flight_2_num'] = df_input['flight_number_2'].map(vi_flight_number_map)
|
|
|
|
|
+
|
|
|
|
|
+ missing_flight_1 = (
|
|
|
|
|
+ df_input.loc[df_input['flight_1_num'].isna(), 'flight_number_1']
|
|
|
|
|
+ .unique()
|
|
|
|
|
+ )
|
|
|
|
|
+ missing_flight_2 = (
|
|
|
|
|
+ df_input.loc[df_input['flight_2_num'].isna(), 'flight_number_2']
|
|
|
|
|
+ .unique()
|
|
|
|
|
+ )
|
|
|
|
|
+ if missing_flight_1:
|
|
|
|
|
+ print("未映射的 flight_1:", missing_flight_1)
|
|
|
|
|
+ if missing_flight_2:
|
|
|
|
|
+ print("未映射的 flight_2:", missing_flight_2)
|
|
|
|
|
+
|
|
|
|
|
+ # flight_1_num 放在 seg1_dep_air_port 之前
|
|
|
|
|
+ insert_df_col(df_input, 'flight_1_num', 'seg1_dep_air_port')
|
|
|
|
|
+
|
|
|
|
|
+ # flight_2_num 放在 seg2_dep_air_port 之前
|
|
|
|
|
+ insert_df_col(df_input, 'flight_2_num', 'seg2_dep_air_port')
|
|
|
|
|
+
|
|
|
|
|
+ df_input['baggage_level'] = (df_input['baggage'] == 30).astype(int) # 30--> 1 20--> 0
|
|
|
|
|
+ # baggage_level 放在 flight_number_2 之前
|
|
|
|
|
+ insert_df_col(df_input, 'baggage_level', 'flight_number_2')
|
|
|
|
|
+
|
|
|
|
|
+ df_input['Adult_Total_Price'] = df_input['adult_total_price']
|
|
|
|
|
+ # Adult_Total_Price 放在 seats_remaining 之前 保存缩放前的原始值
|
|
|
|
|
+ insert_df_col(df_input, 'Adult_Total_Price', 'seats_remaining')
|
|
|
|
|
+
|
|
|
|
|
+ df_input['Hours_Until_Departure'] = df_input['hours_until_departure']
|
|
|
|
|
+ # Hours_Until_Departure 放在 days_to_departure 之前 保存缩放前的原始值
|
|
|
|
|
+ insert_df_col(df_input, 'Hours_Until_Departure', 'days_to_departure')
|
|
|
|
|
+ pass
|
|
|
|
|
+
|
|
|
# gid:基于指定字段的分组标记(整数)
|
|
# gid:基于指定字段的分组标记(整数)
|
|
|
df_input['gid'] = (
|
|
df_input['gid'] = (
|
|
|
df_input
|
|
df_input
|
|
@@ -116,9 +168,7 @@ def preprocess_data(df_input, features, categorical_features, is_training=True,
|
|
|
# 删除原始第一机场码
|
|
# 删除原始第一机场码
|
|
|
df_input.drop(columns=['seg1_dep_air_port', 'seg1_arr_air_port'], inplace=True)
|
|
df_input.drop(columns=['seg1_dep_air_port', 'seg1_arr_air_port'], inplace=True)
|
|
|
# 第一机场对 放到 seg1_dep_time 列的前面
|
|
# 第一机场对 放到 seg1_dep_time 列的前面
|
|
|
- insert_idx = df_input.columns.get_loc('seg1_dep_time')
|
|
|
|
|
- airport_pair_1 = df_input.pop('airport_pair_1')
|
|
|
|
|
- df_input.insert(insert_idx, 'airport_pair_1', airport_pair_1)
|
|
|
|
|
|
|
+ insert_df_col(df_input, 'airport_pair_1', 'seg1_dep_time')
|
|
|
|
|
|
|
|
# 生成第二机场对(带缺失兜底)
|
|
# 生成第二机场对(带缺失兜底)
|
|
|
df_input['airport_pair_2'] = np.where(
|
|
df_input['airport_pair_2'] = np.where(
|
|
@@ -130,15 +180,12 @@ def preprocess_data(df_input, features, categorical_features, is_training=True,
|
|
|
# 删除原始第二机场码
|
|
# 删除原始第二机场码
|
|
|
df_input.drop(columns=['seg2_dep_air_port', 'seg2_arr_air_port'], inplace=True)
|
|
df_input.drop(columns=['seg2_dep_air_port', 'seg2_arr_air_port'], inplace=True)
|
|
|
# 第二机场对 放到 seg2_dep_time 列的前面
|
|
# 第二机场对 放到 seg2_dep_time 列的前面
|
|
|
- insert_idx = df_input.columns.get_loc('seg2_dep_time')
|
|
|
|
|
- airport_pair_2 = df_input.pop('airport_pair_2')
|
|
|
|
|
- df_input.insert(insert_idx, 'airport_pair_2', airport_pair_2)
|
|
|
|
|
-
|
|
|
|
|
|
|
+ insert_df_col(df_input, 'airport_pair_2', 'seg2_dep_time')
|
|
|
|
|
+
|
|
|
# 是否转乘
|
|
# 是否转乘
|
|
|
df_input['is_transfer'] = np.where(df_input['flight_number_2'] == 'VJ', 0, 1)
|
|
df_input['is_transfer'] = np.where(df_input['flight_number_2'] == 'VJ', 0, 1)
|
|
|
- insert_idx = df_input.columns.get_loc('flight_number_2')
|
|
|
|
|
- is_transfer = df_input.pop('is_transfer')
|
|
|
|
|
- df_input.insert(insert_idx, 'is_transfer', is_transfer)
|
|
|
|
|
|
|
+ # 是否转乘 放到 flight_number_2 列的前面
|
|
|
|
|
+ insert_df_col(df_input, 'is_transfer', 'flight_number_2')
|
|
|
|
|
|
|
|
# 重命名起飞时刻与到达时刻
|
|
# 重命名起飞时刻与到达时刻
|
|
|
df_input.rename(
|
|
df_input.rename(
|
|
@@ -236,7 +283,7 @@ def preprocess_data(df_input, features, categorical_features, is_training=True,
|
|
|
).astype(int)
|
|
).astype(int)
|
|
|
|
|
|
|
|
# 在任一侧是否节假日
|
|
# 在任一侧是否节假日
|
|
|
- df_input['flight_day_is_holiday'] = (
|
|
|
|
|
|
|
+ df_input['any_country_is_holiday'] = (
|
|
|
df_input[['dep_country_is_holiday', 'arr_country_is_holiday']]
|
|
df_input[['dep_country_is_holiday', 'arr_country_is_holiday']]
|
|
|
.max(axis=1)
|
|
.max(axis=1)
|
|
|
)
|
|
)
|
|
@@ -275,9 +322,7 @@ def preprocess_data(df_input, features, categorical_features, is_training=True,
|
|
|
# df_input['days_to_holiday'] = df_input['days_to_holiday'].fillna(999)
|
|
# df_input['days_to_holiday'] = df_input['days_to_holiday'].fillna(999)
|
|
|
|
|
|
|
|
# days_to_holiday 插在 update_hour 前面
|
|
# days_to_holiday 插在 update_hour 前面
|
|
|
- insert_idx = df_input.columns.get_loc('update_hour')
|
|
|
|
|
- days_to_holiday = df_input.pop('days_to_holiday')
|
|
|
|
|
- df_input.insert(insert_idx, 'days_to_holiday', days_to_holiday)
|
|
|
|
|
|
|
+ insert_df_col(df_input, 'days_to_holiday', 'update_hour')
|
|
|
|
|
|
|
|
# 制作targets
|
|
# 制作targets
|
|
|
print(f"\n>>> 开始处理 对应区间: n_hours = {current_n_hours}")
|
|
print(f"\n>>> 开始处理 对应区间: n_hours = {current_n_hours}")
|
|
@@ -372,5 +417,72 @@ def preprocess_data(df_input, features, categorical_features, is_training=True,
|
|
|
print(">>> 合并后 df_input 样例:")
|
|
print(">>> 合并后 df_input 样例:")
|
|
|
print(df_input[['gid', 'hours_until_departure', 'adult_total_price', 'target_will_price_drop', 'target_amount_of_drop', 'target_time_to_drop']].head(5))
|
|
print(df_input[['gid', 'hours_until_departure', 'adult_total_price', 'target_will_price_drop', 'target_amount_of_drop', 'target_time_to_drop']].head(5))
|
|
|
|
|
|
|
|
-
|
|
|
|
|
|
|
+ # 按顺序排列
|
|
|
|
|
+ order_columns = [
|
|
|
|
|
+ "city_pair", "from_city_code", "from_city_num", "to_city_code", "to_city_num", "flight_day",
|
|
|
|
|
+ "seats_remaining", "baggage", "baggage_level",
|
|
|
|
|
+ "price_change_times_total", "price_last_change_hours", "adult_total_price", "Adult_Total_Price", "target_will_price_drop", "target_time_to_drop",
|
|
|
|
|
+ "days_to_departure", "days_to_holiday", "hours_until_departure", "Hours_Until_Departure", "update_hour", "gid",
|
|
|
|
|
+ "flight_number_1", "flight_1_num", "airport_pair_1", "dep_time_1", "arr_time_1", "fly_duration_1",
|
|
|
|
|
+ "flight_by_hour", "flight_by_day", "flight_day_of_month", "flight_day_of_week", "flight_day_of_quarter", "flight_day_is_weekend", "is_transfer",
|
|
|
|
|
+ "flight_number_2", "flight_2_num", "airport_pair_2", "dep_time_2", "arr_time_2", "fly_duration_2", "fly_duration", "stop_duration",
|
|
|
|
|
+ "global_dep_time", "dep_country", "dep_country_is_holiday", "is_cross_country",
|
|
|
|
|
+ "global_arr_time", "arr_country", "arr_country_is_holiday", "any_country_is_holiday",
|
|
|
|
|
+ ]
|
|
|
|
|
+ df_input = df_input[order_columns]
|
|
|
|
|
+
|
|
|
return df_input
|
|
return df_input
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def standardization(df, feature_scaler, target_scaler, is_training=True, is_test=False):
    """Standardise and min-max normalise feature columns of ``df`` in place.

    Two independent steps are applied:

    1. ``adult_total_price``, ``fly_duration`` and ``stop_duration`` are
       passed through ``feature_scaler`` (fitted here only when
       ``is_training`` is true and ``is_test`` is false).
    2. Every column listed in the fixed-range table below that is present in
       ``df`` is mapped with ``(x - min) / (max - min)`` and clipped to
       ``[0, 1]``.

    Args:
        df: DataFrame holding the feature columns; it is modified in place.
        feature_scaler: Scaler exposing ``fit``/``transform``. May be ``None``
            only when it is going to be fitted by this call.
        target_scaler: Not used by this function; returned unchanged.
        is_training: When true, the training branch is taken.
        is_test: When true inside the training branch, the scaler is applied
            without being re-fitted.

    Returns:
        The tuple ``(df, feature_scaler, target_scaler)``.

    Raises:
        ValueError: If ``feature_scaler`` is ``None`` and this call would not
            fit one (prediction mode, or training mode with ``is_test=True``).
    """
    print(">>> 开始标准化处理")

    # Columns that go through the (z-score) feature scaler.
    scaler_features = ['adult_total_price', 'fly_duration', 'stop_duration']

    if is_training:
        print(">>> 特征数据标准化开始")
        if feature_scaler is None:
            if is_test:
                # A brand-new scaler would be used unfitted; fail with a clear
                # message instead of an error from deep inside transform().
                raise ValueError(
                    "feature_scaler must be a fitted scaler when is_test=True"
                )
            feature_scaler = StandardScaler()
        if not is_test:
            feature_scaler.fit(df[scaler_features])
        df[scaler_features] = feature_scaler.transform(df[scaler_features])
        print(">>> 特征数据标准化完成")
    else:
        if feature_scaler is None:
            raise ValueError(
                "feature_scaler must be a fitted scaler when is_training=False"
            )
        df[scaler_features] = feature_scaler.transform(df[scaler_features])
        print(">>> 预测模式下特征标准化处理完成")

    # Columns that are min-max normalised against hand-picked, fixed
    # (min, max) ranges rather than ranges learned from the data.
    fixed_ranges = {
        'hours_until_departure': (0, 480),  # 480 hours = 20 days
        'from_city_num': (0, 38),
        'to_city_num': (0, 38),
        'flight_1_num': (0, 341),
        'flight_2_num': (0, 341),
        'seats_remaining': (1, 5),
        # Original author's assumption: at most 30 price changes.
        'price_change_times_total': (0, 30),
        'price_last_change_hours': (0, 480),
        'days_to_departure': (0, 30),
        # Original author's note: longest gap between Vietnamese holidays
        # is 120 days.
        'days_to_holiday': (0, 120),
        'flight_by_hour': (0, 23),
        'flight_by_day': (1, 31),
        # NOTE(review): (1, 12) looks like a month number despite the column
        # name -- confirm against where this column is produced.
        'flight_day_of_month': (1, 12),
        'flight_day_of_week': (0, 6),
        'flight_day_of_quarter': (1, 4),
    }
    normal_features = list(fixed_ranges.keys())

    print(">>> 归一化特征列: ", normal_features)
    print(">>> 基于固定范围的特征数据归一化开始")
    for col, (col_min, col_max) in fixed_ranges.items():
        if col not in df.columns:
            continue
        # (x - min) / (max - min), then force out-of-range values into [0, 1].
        df[col] = ((df[col] - col_min) / (col_max - col_min)).clip(0, 1)
    print(">>> 基于固定范围的特征数据归一化完成")

    return df, feature_scaler, target_scaler
|