import bisect
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

from config import city_to_country, build_country_holidays

# Holiday lookup built once at import time. It is queried below with
# ``.get(country_code, set())`` followed by a ``date in ...`` membership test.
COUNTRY_HOLIDAYS = build_country_holidays(city_to_country)

# Placeholder written into missing flight numbers; a second-segment flight
# number equal to this value is what marks a row as "no transfer".
_MISSING_FLIGHT_NUMBER = 'VJ'


def _insert_before(df, anchor, name, values):
    """Place column *name* (holding *values*) right before *anchor*, in place."""
    if name in df.columns:
        # Drop a stale copy first so the anchor position is computed on the
        # final layout and ``insert`` does not reject a duplicate name.
        del df[name]
    df.insert(df.columns.get_loc(anchor), name, values)


def _hours_between(start, end):
    """Return ``end - start`` in (unrounded) hours; both must be datetime Series."""
    return (end - start).dt.total_seconds() / 3600


def _holiday_flags(timestamps, countries):
    """Return a 0/1 Series: is each timestamp's date a holiday in its country?

    Countries without an entry in ``COUNTRY_HOLIDAYS`` never match.
    """
    no_holidays = set()
    # A plain zip loop evaluates the same per-row expression as a row-wise
    # ``DataFrame.apply`` without building a Series per row, and it is also
    # well-defined when the frame has no rows.
    flags = [
        ts.date() in COUNTRY_HOLIDAYS.get(country, no_holidays)
        for ts, country in zip(timestamps, countries)
    ]
    return pd.Series(flags, index=timestamps.index, dtype=bool).astype(int)


def preprocess_data(df_train: pd.DataFrame, features, categorical_features,
                    is_training: bool = True) -> pd.DataFrame:
    """Derive route, duration, calendar and holiday features for flight rows.

    Required input columns: ``from_city_code``, ``to_city_code``,
    ``search_dep_time`` (parsed with format ``%Y%m%d``),
    ``seg1_flight_number``, ``seg2_flight_number``, ``seg1_dep_air_port``,
    ``seg1_arr_air_port``, ``seg2_dep_air_port``, ``seg2_arr_air_port``,
    ``seg1_dep_time``, ``seg1_arr_time``, ``seg2_dep_time``,
    ``seg2_arr_time`` and ``is_filled``.  The four ``seg*_time`` columns must
    already have a datetime dtype because the ``.dt`` accessor is used on them.

    Args:
        df_train: Raw flight rows.  As a side effect a ``city_pair`` column is
            added to this very object; every later step works on a copy.
        features: Not used by the code in this function.
        categorical_features: Not used by the code in this function.
        is_training: Not used by the code in this function.

    Returns:
        A new DataFrame containing the renamed and derived columns.
    """
    print(">>> 开始数据预处理")

    # City pair, e.g. "<from>-<to>".
    df_train['city_pair'] = (
        df_train['from_city_code'].astype(str)
        + "-"
        + df_train['to_city_code'].astype(str)
    )

    # Bring city_pair / from_city_code / to_city_code to the front.  The
    # explicit copy keeps the in-place edits below from acting on a selection
    # of the caller's frame (SettingWithCopyWarning).
    leading = ['city_pair', 'from_city_code', 'to_city_code']
    others = [c for c in df_train.columns if c not in leading]
    df_train = df_train[leading + others].copy()

    # Normalise the departure date to a 'YYYY-MM-DD' string; values that do
    # not parse are coerced to missing.
    df_train['search_dep_time'] = pd.to_datetime(
        df_train['search_dep_time'], format='%Y%m%d', errors='coerce'
    ).dt.strftime('%Y-%m-%d')

    # Rename the departure date and the flight numbers.
    df_train.rename(
        columns={
            'search_dep_time': 'flight_day',
            'seg1_flight_number': 'flight_number_1',
            'seg2_flight_number': 'flight_number_2',
        },
        inplace=True,
    )

    # Fill missing flight numbers, each column separately.
    for col in ('flight_number_1', 'flight_number_2'):
        df_train[col] = df_train[col].fillna(_MISSING_FLIGHT_NUMBER)

    # First airport pair: replaces the raw airport codes and sits right
    # before seg1_dep_time.
    airport_pair_1 = (
        df_train['seg1_dep_air_port'].astype(str)
        + "-"
        + df_train['seg1_arr_air_port'].astype(str)
    )
    df_train.drop(columns=['seg1_dep_air_port', 'seg1_arr_air_port'], inplace=True)
    _insert_before(df_train, 'seg1_dep_time', 'airport_pair_1', airport_pair_1)

    # Second airport pair, with 'NA' when either airport code is missing.
    seg2_missing = (
        df_train['seg2_dep_air_port'].isna() | df_train['seg2_arr_air_port'].isna()
    )
    airport_pair_2 = np.where(
        seg2_missing,
        'NA',
        df_train['seg2_dep_air_port'].astype(str)
        + "-"
        + df_train['seg2_arr_air_port'].astype(str)
    )
    df_train.drop(columns=['seg2_dep_air_port', 'seg2_arr_air_port'], inplace=True)
    _insert_before(df_train, 'seg2_dep_time', 'airport_pair_2', airport_pair_2)

    # Transfer flag: 0 when the second flight number is the placeholder.
    is_transfer = np.where(
        df_train['flight_number_2'] == _MISSING_FLIGHT_NUMBER, 0, 1
    )
    _insert_before(df_train, 'flight_number_2', 'is_transfer', is_transfer)

    # Rename departure / arrival timestamps.
    df_train.rename(
        columns={
            'seg1_dep_time': 'dep_time_1',
            'seg1_arr_time': 'arr_time_1',
            'seg2_dep_time': 'dep_time_2',
            'seg2_arr_time': 'arr_time_2',
        },
        inplace=True,
    )

    # Durations in hours, rounded to 2 decimals.  Second-leg and layover
    # values fall back to 0 when a timestamp is missing (no transfer).
    fly_duration_1 = _hours_between(
        df_train['dep_time_1'], df_train['arr_time_1']
    ).round(2)
    fly_duration_2 = _hours_between(
        df_train['dep_time_2'], df_train['arr_time_2']
    ).fillna(0).round(2)
    stop_duration = _hours_between(
        df_train['arr_time_1'], df_train['dep_time_2']
    ).fillna(0).round(2)
    durations = {
        'fly_duration_1': fly_duration_1,
        'fly_duration_2': fly_duration_2,
        'fly_duration': (fly_duration_1 + fly_duration_2).round(2),
        'stop_duration': stop_duration,
    }
    # Inserting each column directly before is_filled, in this order, leaves
    # them as one contiguous group ahead of is_filled.
    for name, values in durations.items():
        _insert_before(df_train, 'is_filled', name, values)

    # Calendar features taken from the first-leg departure timestamp.
    dep_t1 = df_train['dep_time_1']
    df_train['flight_by_hour'] = dep_t1.dt.hour            # 0-23
    df_train['flight_by_day'] = dep_t1.dt.day              # day of month, 1-31
    # NOTE: despite its name this column stores the month number (1-12).
    df_train['flight_day_of_month'] = dep_t1.dt.month
    df_train['flight_day_of_week'] = dep_t1.dt.weekday     # 0=Monday .. 6=Sunday
    df_train['flight_day_of_quarter'] = dep_t1.dt.quarter  # 1-4
    df_train['flight_day_is_weekend'] = dep_t1.dt.weekday.isin([5, 6]).astype(int)

    # Country codes of the departure and arrival cities.
    df_train['dep_country'] = df_train['from_city_code'].map(city_to_country)
    df_train['arr_country'] = df_train['to_city_code'].map(city_to_country)

    # Overall departure is the first-leg departure; overall arrival is the
    # second-leg arrival when present, otherwise the first-leg arrival.
    df_train['global_dep_time'] = df_train['dep_time_1']
    df_train['global_arr_time'] = df_train['arr_time_2'].fillna(df_train['arr_time_1'])

    # Holiday flags: departure date in the departure country, arrival date in
    # the arrival country, and "holiday on either side".
    df_train['dep_country_is_holiday'] = _holiday_flags(
        df_train['global_dep_time'], df_train['dep_country']
    )
    df_train['arr_country_is_holiday'] = _holiday_flags(
        df_train['global_arr_time'], df_train['arr_country']
    )
    df_train['flight_day_is_holiday'] = (
        df_train[['dep_country_is_holiday', 'arr_country_is_holiday']]
        .max(axis=1)
    )

    # Cross-country route flag.
    df_train['is_cross_country'] = (
        df_train['dep_country'] != df_train['arr_country']
    ).astype(int)

    # Hand the engineered frame back: df_train was rebound to a new object
    # above, so without this the caller would never see the result.
    return df_train