|
|
@@ -10,7 +10,7 @@ from utils import insert_df_col
|
|
|
COUNTRY_HOLIDAYS = build_country_holidays(city_to_country)
|
|
|
|
|
|
|
|
|
-def preprocess_data(df_input, features, categorical_features, is_training=True, current_n_hours=28):
|
|
|
+def preprocess_data(df_input, features, categorical_features, is_training=True, current_n_hours=48):
|
|
|
print(">>> 开始数据预处理")
|
|
|
|
|
|
# 生成 城市对
|
|
|
@@ -339,58 +339,113 @@ def preprocess_data(df_input, features, categorical_features, is_training=True,
|
|
|
if targets_amout == 0:
|
|
|
print(f">>> n_hours = {current_n_hours} 无有效数据,跳过")
|
|
|
return pd.DataFrame()
|
|
|
-
|
|
|
+
|
|
|
print(">>> 计算 price_at_n_hours")
|
|
|
df_input_object = df_input[(df_input['hours_until_departure'] >= current_n_hours) & (df_input['baggage'] == 30)].copy()
|
|
|
- df_last = df_input_object.groupby('gid', observed=True).last().reset_index() # 一般落在起飞前28小时
|
|
|
+ df_last = df_input_object.groupby('gid', observed=True).last().reset_index() # 一般落在起飞前48小时
|
|
|
|
|
|
# 提取并重命名 price 列
|
|
|
df_last_price_at_n_hours = df_last[['gid', 'adult_total_price']].rename(columns={'adult_total_price': 'price_at_n_hours'})
|
|
|
print(">>> price_at_n_hours计算完成,示例:")
|
|
|
print(df_last_price_at_n_hours.head(5))
|
|
|
+
|
|
|
+ # 新的计算降价方式
|
|
|
+ # 先排序
|
|
|
+ df_targets = df_targets.sort_values(
|
|
|
+ ['gid', 'hours_until_departure'],
|
|
|
+ ascending=[True, False]
|
|
|
+ )
|
|
|
+
|
|
|
+ # 在 gid 内计算价格变化
|
|
|
+ g = df_targets.groupby('gid', group_keys=False)
|
|
|
+ df_targets['price_diff'] = g['adult_total_price'].diff()
|
|
|
|
|
|
- # 计算降价信息
|
|
|
- print(">>> 计算降价信息")
|
|
|
- df_targets = df_targets.merge(df_last_price_at_n_hours, on='gid', how='left')
|
|
|
- df_targets['price_drop_amount'] = df_targets['price_at_n_hours'] - df_targets['adult_total_price']
|
|
|
- df_targets['price_dropped'] = (
|
|
|
- (df_targets['adult_total_price'] < df_targets['price_at_n_hours']) &
|
|
|
- (df_targets['price_drop_amount'] >= 5) # 降幅不能太小
|
|
|
+ VALID_DROP_MIN = 10
|
|
|
+ LOWER_HOUR = 4
|
|
|
+ UPPER_HOUR = 28
|
|
|
+
|
|
|
+ valid_drop_mask = (
|
|
|
+ (df_targets['price_diff'] <= -VALID_DROP_MIN) &
|
|
|
+ (df_targets['hours_until_departure'] >= LOWER_HOUR) &
|
|
|
+ (df_targets['hours_until_departure'] <= UPPER_HOUR)
|
|
|
)
|
|
|
- df_price_drops = df_targets[df_targets['price_dropped']].copy()
|
|
|
-
|
|
|
- price_drops_len = df_price_drops.shape[0]
|
|
|
- if price_drops_len == 0:
|
|
|
- print(f">>> n_hours = {current_n_hours} 无降价信息")
|
|
|
- # 创建包含指定列的空 DataFrame
|
|
|
- df_price_drop_info = pd.DataFrame({
|
|
|
- 'gid': pd.Series(dtype='int64'),
|
|
|
- 'first_drop_hours_until_departure': pd.Series(dtype='int64'),
|
|
|
- 'price_at_first_drop_hours': pd.Series(dtype='float64')
|
|
|
- })
|
|
|
- else:
|
|
|
- df_price_drop_info = df_price_drops.groupby('gid', observed=True).first().reset_index() # 第一次发生的降价
|
|
|
- df_price_drop_info = df_price_drop_info[['gid', 'hours_until_departure', 'adult_total_price']].rename(columns={
|
|
|
- 'hours_until_departure': 'first_drop_hours_until_departure',
|
|
|
- 'adult_total_price': 'price_at_first_drop_hours'
|
|
|
- })
|
|
|
- print(">>> 降价信息计算完成,示例:")
|
|
|
- print(df_price_drop_info.head(5))
|
|
|
+ # 有效的降价
|
|
|
+ df_valid_drops = df_targets.loc[valid_drop_mask]
|
|
|
+
|
|
|
+ # 找「第一次」降价(每个 gid)
|
|
|
+ df_first_price_drop = (
|
|
|
+ df_valid_drops
|
|
|
+ .groupby('gid', as_index=False)
|
|
|
+ .first()
|
|
|
+ )
|
|
|
+
|
|
|
+ # 简化列
|
|
|
+ df_first_price_drop = df_first_price_drop[
|
|
|
+ ['gid', 'hours_until_departure', 'adult_total_price', 'price_diff']
|
|
|
+ ].rename(columns={
|
|
|
+ 'hours_until_departure': 'time_to_price_drop',
|
|
|
+ 'adult_total_price': 'price_at_d_hours',
|
|
|
+ 'price_diff': 'amount_of_price_drop',
|
|
|
+ })
|
|
|
+
|
|
|
+ # 把降价幅度转成正数(更直观)
|
|
|
+ df_first_price_drop['amount_of_price_drop'] = (-df_first_price_drop['amount_of_price_drop']).round(2)
|
|
|
+ pass
|
|
|
+
|
|
|
+ # # 计算降价信息
|
|
|
+ # print(">>> 计算降价信息")
|
|
|
+ # df_targets = df_targets.merge(df_last_price_at_n_hours, on='gid', how='left')
|
|
|
+ # df_targets['price_drop_amount'] = df_targets['price_at_n_hours'] - df_targets['adult_total_price']
|
|
|
+ # df_targets['price_dropped'] = (
|
|
|
+ # (df_targets['adult_total_price'] < df_targets['price_at_n_hours']) &
|
|
|
+ # (df_targets['price_drop_amount'] >= 5) # 降幅不能太小
|
|
|
+ # )
|
|
|
+ # df_price_drops = df_targets[df_targets['price_dropped']].copy()
|
|
|
+
|
|
|
+ # price_drops_len = df_price_drops.shape[0]
|
|
|
+ # if price_drops_len == 0:
|
|
|
+ # print(f">>> n_hours = {current_n_hours} 无降价信息")
|
|
|
+ # # 创建包含指定列的空 DataFrame
|
|
|
+ # df_price_drop_info = pd.DataFrame({
|
|
|
+ # 'gid': pd.Series(dtype='int64'),
|
|
|
+ # 'first_drop_hours_until_departure': pd.Series(dtype='int64'),
|
|
|
+ # 'price_at_first_drop_hours': pd.Series(dtype='float64')
|
|
|
+ # })
|
|
|
+ # else:
|
|
|
+ # df_price_drop_info = df_price_drops.groupby('gid', observed=True).first().reset_index() # 第一次发生的降价
|
|
|
+ # df_price_drop_info = df_price_drop_info[['gid', 'hours_until_departure', 'adult_total_price']].rename(columns={
|
|
|
+ # 'hours_until_departure': 'first_drop_hours_until_departure',
|
|
|
+ # 'adult_total_price': 'price_at_first_drop_hours'
|
|
|
+ # })
|
|
|
+ # print(">>> 降价信息计算完成,示例:")
|
|
|
+ # print(df_price_drop_info.head(5))
|
|
|
|
|
|
- # 合并信息
|
|
|
- df_gid_info = df_last_price_at_n_hours.merge(df_price_drop_info, on='gid', how='left')
|
|
|
- df_gid_info['will_price_drop'] = df_gid_info['price_at_first_drop_hours'].notnull().astype(int)
|
|
|
- df_gid_info['amount_of_price_drop'] = df_gid_info['price_at_n_hours'] - df_gid_info['price_at_first_drop_hours']
|
|
|
- df_gid_info['amount_of_price_drop'] = df_gid_info['amount_of_price_drop'].fillna(0) # 区别
|
|
|
- df_gid_info['time_to_price_drop'] = current_n_hours - df_gid_info['first_drop_hours_until_departure']
|
|
|
- df_gid_info['time_to_price_drop'] = df_gid_info['time_to_price_drop'].fillna(0) # 区别
|
|
|
+ # # 合并信息
|
|
|
+ # df_gid_info = df_last_price_at_n_hours.merge(df_price_drop_info, on='gid', how='left')
|
|
|
+ # df_gid_info['will_price_drop'] = df_gid_info['price_at_first_drop_hours'].notnull().astype(int)
|
|
|
+ # df_gid_info['amount_of_price_drop'] = df_gid_info['price_at_n_hours'] - df_gid_info['price_at_first_drop_hours']
|
|
|
+ # df_gid_info['amount_of_price_drop'] = df_gid_info['amount_of_price_drop'].fillna(0) # 区别
|
|
|
+ # df_gid_info['time_to_price_drop'] = current_n_hours - df_gid_info['first_drop_hours_until_departure']
|
|
|
+ # df_gid_info['time_to_price_drop'] = df_gid_info['time_to_price_drop'].fillna(0) # 区别
|
|
|
+
|
|
|
+ # del df_input_object
|
|
|
+ # del df_last
|
|
|
+ # del df_last_price_at_n_hours
|
|
|
+ # del df_price_drops
|
|
|
+ # del df_price_drop_info
|
|
|
+
|
|
|
+ df_gid_info = df_last_price_at_n_hours.merge(df_first_price_drop, on='gid', how='left')
|
|
|
+ df_gid_info['will_price_drop'] = df_gid_info['time_to_price_drop'].notnull().astype(int)
|
|
|
+ df_gid_info['amount_of_price_drop'] = df_gid_info['amount_of_price_drop'].fillna(0)
|
|
|
+ df_gid_info['time_to_price_drop'] = df_gid_info['time_to_price_drop'].fillna(0)
|
|
|
+ pass
|
|
|
|
|
|
del df_input_object
|
|
|
del df_last
|
|
|
del df_last_price_at_n_hours
|
|
|
+ del df_first_price_drop
|
|
|
+ del df_valid_drops
|
|
|
del df_targets
|
|
|
- del df_price_drops
|
|
|
- del df_price_drop_info
|
|
|
gc.collect()
|
|
|
|
|
|
# 将目标变量合并到输入数据中
|