1 tháng trước cách đây · b30f2c4341
--- a/data_preprocess.py
+++ b/data_preprocess.py
@@ -10,7 +10,7 @@ from utils import insert_df_col
 
				 COUNTRY_HOLIDAYS = build_country_holidays(city_to_country)
			
 
				 
			
 
				 
			
 
				-def preprocess_data(df_input, features, categorical_features, is_training=True, current_n_hours=28):
			
 
				+def preprocess_data(df_input, features, categorical_features, is_training=True, current_n_hours=48):
			
 
				     print(">>> 开始数据预处理") 
			
 
				 
			
 
				     # 生成 城市对
			
@@ -339,58 +339,113 @@ def preprocess_data(df_input, features, categorical_features, is_training=True,
 
				         if targets_amout == 0:
			
 
				             print(f">>> n_hours = {current_n_hours} 无有效数据，跳过")
			
 
				             return pd.DataFrame()
			
 
				-
			
 
				+        
			
 
				         print(">>> 计算 price_at_n_hours")
			
 
				         df_input_object = df_input[(df_input['hours_until_departure'] >= current_n_hours) & (df_input['baggage'] == 30)].copy()
			
 
				-        df_last = df_input_object.groupby('gid', observed=True).last().reset_index()   # 一般落在起飞前28小时
			
 
				+        df_last = df_input_object.groupby('gid', observed=True).last().reset_index()   # 一般落在起飞前48小时
			
 
				         
			
 
				         # 提取并重命名 price 列
			
 
				         df_last_price_at_n_hours = df_last[['gid', 'adult_total_price']].rename(columns={'adult_total_price': 'price_at_n_hours'})
			
 
				         print(">>> price_at_n_hours计算完成，示例：")
			
 
				         print(df_last_price_at_n_hours.head(5))
			
 
				+
			
 
				+        # 新的计算降价方式
			
 
				+        # 先排序
			
 
				+        df_targets = df_targets.sort_values(
			
 
				+            ['gid', 'hours_until_departure'],
			
 
				+            ascending=[True, False]
			
 
				+        )
			
 
				+
			
 
				+        # 在 gid 内计算价格变化
			
 
				+        g = df_targets.groupby('gid', group_keys=False)
			
 
				+        df_targets['price_diff'] = g['adult_total_price'].diff()
			
 
				         
			
 
				-        # 计算降价信息
			
 
				-        print(">>> 计算降价信息")
			
 
				-        df_targets = df_targets.merge(df_last_price_at_n_hours, on='gid', how='left')
			
 
				-        df_targets['price_drop_amount'] = df_targets['price_at_n_hours'] - df_targets['adult_total_price']
			
 
				-        df_targets['price_dropped'] = (
			
 
				-            (df_targets['adult_total_price'] < df_targets['price_at_n_hours']) &
			
 
				-            (df_targets['price_drop_amount'] >= 5)  # 降幅不能太小
			
 
				+        VALID_DROP_MIN = 10
			
 
				+        LOWER_HOUR = 4
			
 
				+        UPPER_HOUR = 28
			
 
				+
			
 
				+        valid_drop_mask = (
			
 
				+            (df_targets['price_diff'] <= -VALID_DROP_MIN) &
			
 
				+            (df_targets['hours_until_departure'] >= LOWER_HOUR) &
			
 
				+            (df_targets['hours_until_departure'] <= UPPER_HOUR)
			
 
				         )
			
 
				-        df_price_drops = df_targets[df_targets['price_dropped']].copy()
			
 
				-
			
 
				-        price_drops_len = df_price_drops.shape[0]
			
 
				-        if price_drops_len == 0:
			
 
				-            print(f">>> n_hours = {current_n_hours} 无降价信息")
			
 
				-            # 创建包含指定列的空 DataFrame
			
 
				-            df_price_drop_info = pd.DataFrame({
			
 
				-                'gid': pd.Series(dtype='int64'),
			
 
				-                'first_drop_hours_until_departure': pd.Series(dtype='int64'),
			
 
				-                'price_at_first_drop_hours': pd.Series(dtype='float64')
			
 
				-            })
			
 
				-        else:
			
 
				-            df_price_drop_info = df_price_drops.groupby('gid', observed=True).first().reset_index()   # 第一次发生的降价
			
 
				-            df_price_drop_info = df_price_drop_info[['gid', 'hours_until_departure', 'adult_total_price']].rename(columns={
			
 
				-                    'hours_until_departure': 'first_drop_hours_until_departure',
			
 
				-                    'adult_total_price': 'price_at_first_drop_hours'
			
 
				-            })
			
 
				-            print(">>> 降价信息计算完成，示例：")
			
 
				-            print(df_price_drop_info.head(5))
			
 
				+        # 有效的降价
			
 
				+        df_valid_drops = df_targets.loc[valid_drop_mask]
			
 
				+
			
 
				+        # 找「第一次」降价（每个 gid）
			
 
				+        df_first_price_drop = (
			
 
				+            df_valid_drops
			
 
				+            .groupby('gid', as_index=False)
			
 
				+            .first()
			
 
				+        )
			
 
				+
			
 
				+        # 简化列
			
 
				+        df_first_price_drop = df_first_price_drop[
			
 
				+            ['gid', 'hours_until_departure', 'adult_total_price', 'price_diff']
			
 
				+        ].rename(columns={
			
 
				+            'hours_until_departure': 'time_to_price_drop',
			
 
				+            'adult_total_price': 'price_at_d_hours',
			
 
				+            'price_diff': 'amount_of_price_drop',
			
 
				+        })
			
 
				+
			
 
				+        # 把降价幅度转成正数（更直观）
			
 
				+        df_first_price_drop['amount_of_price_drop'] = (-df_first_price_drop['amount_of_price_drop']).round(2)
			
 
				+        pass
			
 
				+
			
 
				+        # # 计算降价信息
			
 
				+        # print(">>> 计算降价信息")
			
 
				+        # df_targets = df_targets.merge(df_last_price_at_n_hours, on='gid', how='left')
			
 
				+        # df_targets['price_drop_amount'] = df_targets['price_at_n_hours'] - df_targets['adult_total_price']
			
 
				+        # df_targets['price_dropped'] = (
			
 
				+        #     (df_targets['adult_total_price'] < df_targets['price_at_n_hours']) &
			
 
				+        #     (df_targets['price_drop_amount'] >= 5)  # 降幅不能太小
			
 
				+        # )
			
 
				+        # df_price_drops = df_targets[df_targets['price_dropped']].copy()
			
 
				+
			
 
				+        # price_drops_len = df_price_drops.shape[0]
			
 
				+        # if price_drops_len == 0:
			
 
				+        #     print(f">>> n_hours = {current_n_hours} 无降价信息")
			
 
				+        #     # 创建包含指定列的空 DataFrame
			
 
				+        #     df_price_drop_info = pd.DataFrame({
			
 
				+        #         'gid': pd.Series(dtype='int64'),
			
 
				+        #         'first_drop_hours_until_departure': pd.Series(dtype='int64'),
			
 
				+        #         'price_at_first_drop_hours': pd.Series(dtype='float64')
			
 
				+        #     })
			
 
				+        # else:
			
 
				+        #     df_price_drop_info = df_price_drops.groupby('gid', observed=True).first().reset_index()   # 第一次发生的降价
			
 
				+        #     df_price_drop_info = df_price_drop_info[['gid', 'hours_until_departure', 'adult_total_price']].rename(columns={
			
 
				+        #             'hours_until_departure': 'first_drop_hours_until_departure',
			
 
				+        #             'adult_total_price': 'price_at_first_drop_hours'
			
 
				+        #     })
			
 
				+        #     print(">>> 降价信息计算完成，示例：")
			
 
				+        #     print(df_price_drop_info.head(5))
			
 
				         
			
 
				-        # 合并信息
			
 
				-        df_gid_info = df_last_price_at_n_hours.merge(df_price_drop_info, on='gid', how='left')
			
 
				-        df_gid_info['will_price_drop'] = df_gid_info['price_at_first_drop_hours'].notnull().astype(int)
			
 
				-        df_gid_info['amount_of_price_drop'] = df_gid_info['price_at_n_hours'] - df_gid_info['price_at_first_drop_hours']
			
 
				-        df_gid_info['amount_of_price_drop'] = df_gid_info['amount_of_price_drop'].fillna(0)  # 区别    
			
 
				-        df_gid_info['time_to_price_drop'] = current_n_hours - df_gid_info['first_drop_hours_until_departure']
			
 
				-        df_gid_info['time_to_price_drop'] = df_gid_info['time_to_price_drop'].fillna(0)  # 区别
			
 
				+        # # 合并信息
			
 
				+        # df_gid_info = df_last_price_at_n_hours.merge(df_price_drop_info, on='gid', how='left')
			
 
				+        # df_gid_info['will_price_drop'] = df_gid_info['price_at_first_drop_hours'].notnull().astype(int)
			
 
				+        # df_gid_info['amount_of_price_drop'] = df_gid_info['price_at_n_hours'] - df_gid_info['price_at_first_drop_hours']
			
 
				+        # df_gid_info['amount_of_price_drop'] = df_gid_info['amount_of_price_drop'].fillna(0)  # 区别    
			
 
				+        # df_gid_info['time_to_price_drop'] = current_n_hours - df_gid_info['first_drop_hours_until_departure']
			
 
				+        # df_gid_info['time_to_price_drop'] = df_gid_info['time_to_price_drop'].fillna(0)  # 区别
			
 
				+
			
 
				+        # del df_input_object
			
 
				+        # del df_last
			
 
				+        # del df_last_price_at_n_hours
			
 
				+        # del df_price_drops
			
 
				+        # del df_price_drop_info
			
 
				+
			
 
				+        df_gid_info = df_last_price_at_n_hours.merge(df_first_price_drop, on='gid', how='left')
			
 
				+        df_gid_info['will_price_drop'] = df_gid_info['time_to_price_drop'].notnull().astype(int)
			
 
				+        df_gid_info['amount_of_price_drop'] = df_gid_info['amount_of_price_drop'].fillna(0)
			
 
				+        df_gid_info['time_to_price_drop'] = df_gid_info['time_to_price_drop'].fillna(0)
			
 
				+        pass
			
 
				 
			
 
				         del df_input_object
			
 
				         del df_last
			
 
				         del df_last_price_at_n_hours
			
 
				+        del df_first_price_drop
			
 
				+        del df_valid_drops
			
 
				         del df_targets
			
 
				-        del df_price_drops
			
 
				-        del df_price_drop_info
			
 
				         gc.collect()
			
 
				         
			
 
				         # 将目标变量合并到输入数据中
			
--- a/main_tr.py
+++ b/main_tr.py
@@ -295,7 +295,7 @@ def start_train():
 
				             # joblib.dump(target_scaler_list, target_scaler_path)
			
 
				             
			
 
				             # 生成序列
			
 
				-            sequences, targets, group_ids = create_fixed_length_sequences(df_train_inputs, features, target_vars, input_length=452)
			
 
				+            sequences, targets, group_ids = create_fixed_length_sequences(df_train_inputs, features, target_vars)
			
 
				             
			
 
				             # 新增有效性检查
			
 
				             if len(sequences) == 0 or len(targets) == 0 or len(group_ids) == 0:
			
--- a/utils.py
+++ b/utils.py
@@ -28,7 +28,7 @@ def insert_df_col(df, insert_col_name, base_col_name, inplace=True):
 
				     return df
			
 
				 
			
 
				 # 真正创建序列过程
			
 
				-def create_fixed_length_sequences(df, features, target_vars, input_length=452, is_train=True):
			
 
				+def create_fixed_length_sequences(df, features, target_vars, threshold=48, input_length=432, is_train=True):
			
 
				     print(">>开始创建序列")
			
 
				     start_time = time.time()
			
 
				 
			
@@ -36,8 +36,6 @@ def create_fixed_length_sequences(df, features, target_vars, input_length=452, i
 
				     targets = []
			
 
				     group_ids = []
			
 
				 
			
 
				-    threshold = 28   # 截止起飞小时数
			
 
				-
			
 
				     # gid 基于 city_pair, flight_day, flight_number_1, flight_number_2 分组 不包括 baggage
			
 
				     grouped = df.groupby(['gid'])
			
 
				     for _, df_group in grouped:
			
@@ -51,7 +49,7 @@ def create_fixed_length_sequences(df, features, target_vars, input_length=452, i
 
				         df_group_bag_30 = df_group[df_group['baggage']==30]
			
 
				         df_group_bag_20 = df_group[df_group['baggage']==20]
			
 
				 
			
 
				-        # 过滤训练时间段 (28 ~ 480)
			
 
				+        # 过滤训练时间段 (48 ~ 480)
			
 
				         df_group_bag_30_filtered = df_group_bag_30[(df_group_bag_30['Hours_Until_Departure'] >= threshold) & (df_group_bag_30['Hours_Until_Departure'] < threshold + input_length)]
			
 
				         df_group_bag_20_filtered = df_group_bag_20[(df_group_bag_20['Hours_Until_Departure'] >= threshold) & (df_group_bag_20['Hours_Until_Departure'] < threshold + input_length)]
			
 
				 
			
@@ -64,7 +62,7 @@ def create_fixed_length_sequences(df, features, target_vars, input_length=452, i
 
				             seq_features_1 = df_group_bag_30_filtered[features].to_numpy()
			
 
				             seq_features_2 = df_group_bag_20_filtered[features].to_numpy()
			
 
				             
			
 
				-            # 将几个特征序列沿着第 0 维拼接，得到形状为 (2, 452, 25)
			
 
				+            # 将几个特征序列沿着第 0 维拼接，得到形状为 (2, 432, 25)
			
 
				             combined_features = torch.stack([torch.tensor(seq_features_1, dtype=torch.float32),    
			
 
				                                              torch.tensor(seq_features_2, dtype=torch.float32)])