|
|
@@ -10,9 +10,37 @@ from utils import insert_df_col
|
|
|
COUNTRY_HOLIDAYS = build_country_holidays(city_to_country)
|
|
|
|
|
|
|
|
|
-def preprocess_data(df_input, features, categorical_features, is_training=True, current_n_hours=36):
|
|
|
- print(">>> 开始数据预处理")
|
|
|
+def preprocess_data_cycle(df_input, interval_hours=8, feature_length=240, target_length=24, is_training=True):
+    """Run the shared preprocessing once, then build one data part per crop window.
+
+    For every lower bound in ``crop_lower_limit_list`` the hour boundaries
+    handed to ``preprocess_data`` are derived as::
+
+        target_n_hours   = crop_lower_limit + target_length
+        feature_n_hours  = target_n_hours + interval_hours
+        crop_upper_limit = feature_n_hours + feature_length
+
+    Args:
+        df_input: Raw input frame; it is passed through
+            ``preprocess_data_first_half`` before any window is processed.
+        interval_hours: Gap, in hours, added on top of ``target_n_hours`` to
+            obtain ``feature_n_hours``.
+        feature_length: Span, in hours, added on top of ``feature_n_hours`` to
+            obtain ``crop_upper_limit``.
+        target_length: Span, in hours, added on top of ``crop_lower_limit`` to
+            obtain ``target_n_hours``.
+        is_training: Forwarded to ``preprocess_data``. When false, only the
+            first window is processed.
+
+    Returns:
+        The non-empty parts concatenated with a fresh 0..n-1 index, or an
+        empty ``pd.DataFrame()`` when no window produced any rows.
+    """
+    df_input = preprocess_data_first_half(df_input)
+
+    # Collects the processed part of every window.
+    list_df_parts = []
+
+    # Only the first window is enabled; the other candidates are kept for reference.
+    crop_lower_limit_list = [4]  # [4, 28, 52, 76, 100]
+    for crop_lower_limit in crop_lower_limit_list:
+        target_n_hours = crop_lower_limit + target_length
+        feature_n_hours = target_n_hours + interval_hours
+        crop_upper_limit = feature_n_hours + feature_length
+        df_input_part = preprocess_data(df_input, is_training=is_training, crop_upper_limit=crop_upper_limit, feature_n_hours=feature_n_hours,
+                                        target_n_hours=target_n_hours, crop_lower_limit=crop_lower_limit)
+        # preprocess_data returns an empty frame when a window has no target
+        # rows; skip those so the "no data" fallback below is reachable and
+        # pd.concat never has to deal with empty inputs.
+        if not df_input_part.empty:
+            list_df_parts.append(df_input_part)
+        if not is_training:
+            # Outside training only the first window is used.
+            break
+
+    # No window produced data: hand back an empty frame.
+    if not list_df_parts:
+        return pd.DataFrame()
+
+    # Merge all processed parts.
+    return pd.concat(list_df_parts, ignore_index=True)
|
|
|
|
|
|
+def preprocess_data_first_half(df_input):
|
|
|
+ '''前半部分'''
|
|
|
+ print(">>> 开始数据预处理")
|
|
|
# 生成 城市对
|
|
|
df_input['city_pair'] = (
|
|
|
df_input['from_city_code'].astype(str) + "-" + df_input['to_city_code'].astype(str)
|
|
|
@@ -110,9 +138,14 @@ def preprocess_data(df_input, features, categorical_features, is_training=True,
|
|
|
.ngroup()
|
|
|
)
|
|
|
|
|
|
+ return df_input
|
|
|
+
|
|
|
+def preprocess_data(df_input, is_training=True, crop_upper_limit=480, feature_n_hours=36, target_n_hours=28, crop_lower_limit=4):
|
|
|
+ print(f"裁剪范围: [{crop_lower_limit}, {crop_upper_limit}], 间隔窗口: [{target_n_hours}, {feature_n_hours}]")
|
|
|
+
|
|
|
# 做一下时间段裁剪, 保留起飞前480小时之内且大于等于4小时的
|
|
|
- df_input = df_input[(df_input['hours_until_departure'] < 480) &
|
|
|
- (df_input['hours_until_departure'] >= 4)].reset_index(drop=True)
|
|
|
+ df_input = df_input[(df_input['hours_until_departure'] < crop_upper_limit) &
|
|
|
+ (df_input['hours_until_departure'] >= crop_lower_limit)].reset_index(drop=True)
|
|
|
|
|
|
# 在 gid 与 baggage 内按时间降序
|
|
|
df_input = df_input.sort_values(
|
|
|
@@ -120,34 +153,115 @@ def preprocess_data(df_input, features, categorical_features, is_training=True,
|
|
|
ascending=[True, True, False]
|
|
|
).reset_index(drop=True)
|
|
|
|
|
|
+ # 价格幅度阈值
|
|
|
+ VALID_DROP_MIN = 5
|
|
|
+
|
|
|
# 价格变化掩码
|
|
|
g = df_input.groupby(['gid', 'baggage'])
|
|
|
diff = g['adult_total_price'].transform('diff')
|
|
|
- change_mask = diff.abs() >= 5 # 变化太小的不计入
|
|
|
-
|
|
|
+ # change_mask = diff.abs() >= VALID_DROP_MIN # 变化太小的不计入
|
|
|
+ decrease_mask = diff <= -VALID_DROP_MIN # 降价(变化太小的不计入)
|
|
|
+ increase_mask = diff >= VALID_DROP_MIN # 升价(变化太小的不计入)
|
|
|
+
|
|
|
+ df_input['_price_event_dir'] = np.where(increase_mask, 1, np.where(decrease_mask, -1, 0))
|
|
|
+
|
|
|
+ # 计算连续升价/降价次数
|
|
|
+    def _calc_price_streaks(df_group):
+        """Return, per row, the length of the current run of price increases and decreases.
+
+        Reads the group's ``_price_event_dir`` column in row order
+        (1 = increase event, -1 = decrease event, anything else = no event).
+        An increase event extends the increase run and zeroes the decrease
+        run; a decrease event does the opposite. Rows without an event carry
+        the previous row's values forward, and rows before the first event of
+        the group get 0.
+
+        Returns a DataFrame indexed like ``df_group`` with the integer columns
+        ``price_increase_times_consecutive`` and
+        ``price_decrease_times_consecutive``.
+        """
+        directions = df_group['_price_event_dir'].to_numpy()
+        up_run = np.full(len(directions), np.nan)
+        down_run = np.full(len(directions), np.nan)
+
+        # Each counter is already 0 whenever the previous event went the other
+        # way (or there was none), so a plain "+ 1" restarts the run at 1.
+        ups = 0
+        downs = 0
+        for pos, direction in enumerate(directions):
+            if direction == 1:
+                ups, downs = ups + 1, 0
+            elif direction == -1:
+                ups, downs = 0, downs + 1
+            else:
+                # No event on this row: leave NaN so it is forward-filled below.
+                continue
+            up_run[pos] = ups
+            down_run[pos] = downs
+
+        def _as_filled_ints(values):
+            # Carry the last recorded run forward; leading gaps become 0.
+            return pd.Series(values, index=df_group.index).ffill().fillna(0).astype(int)
+
+        return pd.DataFrame(
+            {
+                'price_increase_times_consecutive': _as_filled_ints(up_run),
+                'price_decrease_times_consecutive': _as_filled_ints(down_run),
+            },
+            index=df_group.index,
+        )
|
|
|
+
|
|
|
+ streak_df = df_input.groupby(['gid', 'baggage'], sort=False, group_keys=False).apply(_calc_price_streaks)
|
|
|
+ df_input = df_input.join(streak_df)
|
|
|
+ df_input.drop(columns=['_price_event_dir'], inplace=True)
|
|
|
+
|
|
|
# 价格变化次数
|
|
|
- df_input['price_change_times_total'] = (
|
|
|
- change_mask.groupby([df_input['gid'], df_input['baggage']]).cumsum()
|
|
|
+ # df_input['price_change_times_total'] = (
|
|
|
+ # change_mask.groupby([df_input['gid'], df_input['baggage']]).cumsum()
|
|
|
+ # )
|
|
|
+ # 价格下降次数
|
|
|
+ df_input['price_decrease_times_total'] = (
|
|
|
+ decrease_mask.groupby([df_input['gid'], df_input['baggage']]).cumsum()
|
|
|
+ )
|
|
|
+ # 价格上升次数
|
|
|
+ df_input['price_increase_times_total'] = (
|
|
|
+ increase_mask.groupby([df_input['gid'], df_input['baggage']]).cumsum()
|
|
|
)
|
|
|
|
|
|
# 上次发生变价的小时数
|
|
|
- last_change_hour = (
|
|
|
+ # last_change_hour = (
|
|
|
+ # df_input['hours_until_departure']
|
|
|
+ # .where(change_mask)
|
|
|
+ # .groupby([df_input['gid'], df_input['baggage']])
|
|
|
+ # .ffill() # 前向填充
|
|
|
+ # )
|
|
|
+ # 上次发生降价的小时数
|
|
|
+ last_decrease_hour = (
|
|
|
+ df_input['hours_until_departure']
|
|
|
+ .where(decrease_mask)
|
|
|
+ .groupby([df_input['gid'], df_input['baggage']])
|
|
|
+ .ffill() # 前向填充
|
|
|
+ )
|
|
|
+ # 上次发生升价的小时数
|
|
|
+ last_increase_hour = (
|
|
|
df_input['hours_until_departure']
|
|
|
- .where(change_mask)
|
|
|
+ .where(increase_mask)
|
|
|
.groupby([df_input['gid'], df_input['baggage']])
|
|
|
.ffill() # 前向填充
|
|
|
)
|
|
|
|
|
|
# 当前距离上一次变价过去多少小时
|
|
|
- df_input['price_last_change_hours'] = (
|
|
|
- last_change_hour - df_input['hours_until_departure']
|
|
|
+ # df_input['price_last_change_hours'] = (
|
|
|
+ # last_change_hour - df_input['hours_until_departure']
|
|
|
+ # ).fillna(0)
|
|
|
+ # 当前距离上一次降价过去多少小时
|
|
|
+ df_input['price_last_decrease_hours'] = (
|
|
|
+ last_decrease_hour - df_input['hours_until_departure']
|
|
|
+ ).fillna(0)
|
|
|
+ # 当前距离上一次升价过去多少小时
|
|
|
+ df_input['price_last_increase_hours'] = (
|
|
|
+ last_increase_hour - df_input['hours_until_departure']
|
|
|
).fillna(0)
|
|
|
pass
|
|
|
|
|
|
# 想插入到 seats_remaining 前面的新列
|
|
|
new_cols = [
|
|
|
- 'price_change_times_total',
|
|
|
- 'price_last_change_hours'
|
|
|
+ # 'price_change_times_total',
|
|
|
+ # 'price_last_change_hours',
|
|
|
+ 'price_decrease_times_total',
|
|
|
+ 'price_decrease_times_consecutive',
|
|
|
+ 'price_last_decrease_hours',
|
|
|
+ 'price_increase_times_total',
|
|
|
+ 'price_increase_times_consecutive',
|
|
|
+ 'price_last_increase_hours',
|
|
|
]
|
|
|
# 当前所有列
|
|
|
cols = df_input.columns.tolist()
|
|
|
@@ -481,9 +595,9 @@ def preprocess_data(df_input, features, categorical_features, is_training=True,
|
|
|
# 训练模式
|
|
|
if is_training:
|
|
|
print(">>> 训练模式:计算 target 相关列")
|
|
|
- print(f"\n>>> 开始处理 对应区间: n_hours = {current_n_hours}")
|
|
|
- target_lower_limit = 4
|
|
|
- target_upper_limit = current_n_hours
|
|
|
+ print(f"\n>>> 开始处理 对应区间: n_hours = {target_n_hours}")
|
|
|
+ target_lower_limit = crop_lower_limit
|
|
|
+ target_upper_limit = target_n_hours
|
|
|
mask_targets = (df_input['hours_until_departure'] >= target_lower_limit) & (df_input['hours_until_departure'] < target_upper_limit) & (df_input['baggage'] == 30)
|
|
|
df_targets = df_input.loc[mask_targets].copy()
|
|
|
|
|
|
@@ -491,11 +605,11 @@ def preprocess_data(df_input, features, categorical_features, is_training=True,
|
|
|
print(f"当前 目标区间数据量: {targets_amout}, 区间: [{target_lower_limit}, {target_upper_limit})")
|
|
|
|
|
|
if targets_amout == 0:
|
|
|
- print(f">>> n_hours = {current_n_hours} 无有效数据,跳过")
|
|
|
+ print(f">>> n_hours = {target_n_hours} 无有效数据,跳过")
|
|
|
return pd.DataFrame()
|
|
|
|
|
|
print(">>> 计算 price_at_n_hours")
|
|
|
- df_input_object = df_input[(df_input['hours_until_departure'] >= current_n_hours) & (df_input['baggage'] == 30)].copy()
|
|
|
+ df_input_object = df_input[(df_input['hours_until_departure'] >= feature_n_hours) & (df_input['baggage'] == 30)].copy()
|
|
|
df_last = df_input_object.groupby('gid', observed=True).last().reset_index() # 一般落在起飞前36\32\30小时
|
|
|
|
|
|
# 提取并重命名 price 列
|
|
|
@@ -514,14 +628,14 @@ def preprocess_data(df_input, features, categorical_features, is_training=True,
|
|
|
g = df_targets.groupby('gid', group_keys=False)
|
|
|
df_targets['price_diff'] = g['adult_total_price'].diff()
|
|
|
|
|
|
- VALID_DROP_MIN = 10
|
|
|
- LOWER_HOUR = 4
|
|
|
- UPPER_HOUR = 28
|
|
|
+ # VALID_DROP_MIN = 5
|
|
|
+ # LOWER_HOUR = 4
|
|
|
+ # UPPER_HOUR = 28
|
|
|
|
|
|
valid_drop_mask = (
|
|
|
- (df_targets['price_diff'] <= -VALID_DROP_MIN) &
|
|
|
- (df_targets['hours_until_departure'] >= LOWER_HOUR) &
|
|
|
- (df_targets['hours_until_departure'] <= UPPER_HOUR)
|
|
|
+ (df_targets['price_diff'] <= -VALID_DROP_MIN)
|
|
|
+ # (df_targets['hours_until_departure'] >= LOWER_HOUR) &
|
|
|
+ # (df_targets['hours_until_departure'] <= UPPER_HOUR)
|
|
|
)
|
|
|
# 有效的降价
|
|
|
df_valid_drops = df_targets.loc[valid_drop_mask]
|
|
|
@@ -639,7 +753,9 @@ def preprocess_data(df_input, features, categorical_features, is_training=True,
|
|
|
order_columns = [
|
|
|
"city_pair", "from_city_code", "from_city_num", "to_city_code", "to_city_num", "flight_day",
|
|
|
"seats_remaining", "baggage", "baggage_level",
|
|
|
- "price_change_times_total", "price_last_change_hours", "adult_total_price", "Adult_Total_Price", "target_will_price_drop", "target_amount_of_drop", "target_time_to_drop",
|
|
|
+ "price_decrease_times_total", "price_decrease_times_consecutive", "price_last_decrease_hours",
|
|
|
+ "price_increase_times_total", "price_increase_times_consecutive", "price_last_increase_hours",
|
|
|
+ "adult_total_price", "Adult_Total_Price", "target_will_price_drop", "target_amount_of_drop", "target_time_to_drop",
|
|
|
"days_to_departure", "days_to_holiday", "hours_until_departure", "Hours_Until_Departure", "update_hour", "crawl_date", "gid",
|
|
|
"flight_number_1", "flight_1_num", "airport_pair_1", "dep_time_1", "arr_time_1", "fly_duration_1",
|
|
|
"flight_by_hour", "flight_by_day", "flight_day_of_month", "flight_day_of_week", "flight_day_of_quarter", "flight_day_is_weekend", "is_transfer",
|
|
|
@@ -654,7 +770,7 @@ def preprocess_data(df_input, features, categorical_features, is_training=True,
|
|
|
return df_input
|
|
|
|
|
|
|
|
|
-def standardization(df, feature_scaler, target_scaler=None, is_training=True, is_val=False):
|
|
|
+def standardization(df, feature_scaler, target_scaler=None, is_training=True, is_val=False, feature_length=240):
|
|
|
print(">>> 开始标准化处理")
|
|
|
|
|
|
# 准备走标准化的特征
|
|
|
@@ -684,8 +800,14 @@ def standardization(df, feature_scaler, target_scaler=None, is_training=True, is
|
|
|
'flight_1_num': (0, 341),
|
|
|
'flight_2_num': (0, 341),
|
|
|
'seats_remaining': (1, 5),
|
|
|
- 'price_change_times_total': (0, 30), # 假设价格变更次数不会超过30次
|
|
|
- 'price_last_change_hours': (0, 480),
|
|
|
+ # 'price_change_times_total': (0, 30), # 假设价格变更次数不会超过30次
|
|
|
+ # 'price_last_change_hours': (0, 480),
|
|
|
+ 'price_decrease_times_total': (0, 20), # 假设价格下降次数不会超过20次
|
|
|
+ 'price_decrease_times_consecutive': (0, 10), # 假设价格连续下降次数不会超过10次
|
|
|
+ 'price_last_decrease_hours': (0, feature_length), #(0-240小时)
|
|
|
+ 'price_increase_times_total': (0, 20), # 假设价格上升次数不会超过20次
|
|
|
+ 'price_increase_times_consecutive': (0, 10), # 假设价格连续上升次数不会超过10次
|
|
|
+ 'price_last_increase_hours': (0, feature_length), #(0-240小时)
|
|
|
'price_zone_comprehensive': (0, 5),
|
|
|
'days_to_departure': (0, 30),
|
|
|
'days_to_holiday': (0, 120), # 最长的越南节假日间隔120天
|