Browse Source

修改目标降价的判定方式

node04 1 tháng trước cách đây
mục cha
commit
b30f2c4341
3 tập tin đã thay đổi với 97 bổ sung44 xóa
  1. 93 38
      data_preprocess.py
  2. 1 1
      main_tr.py
  3. 3 5
      utils.py

+ 93 - 38
data_preprocess.py

@@ -10,7 +10,7 @@ from utils import insert_df_col
 COUNTRY_HOLIDAYS = build_country_holidays(city_to_country)
 
 
-def preprocess_data(df_input, features, categorical_features, is_training=True, current_n_hours=28):
+def preprocess_data(df_input, features, categorical_features, is_training=True, current_n_hours=48):
     print(">>> 开始数据预处理") 
 
     # 生成 城市对
@@ -339,58 +339,113 @@ def preprocess_data(df_input, features, categorical_features, is_training=True,
         if targets_amout == 0:
             print(f">>> n_hours = {current_n_hours} 无有效数据,跳过")
             return pd.DataFrame()
-
+        
         print(">>> 计算 price_at_n_hours")
         df_input_object = df_input[(df_input['hours_until_departure'] >= current_n_hours) & (df_input['baggage'] == 30)].copy()
-        df_last = df_input_object.groupby('gid', observed=True).last().reset_index()   # 一般落在起飞前28小时
+        df_last = df_input_object.groupby('gid', observed=True).last().reset_index()   # 一般落在起飞前48小时
         
         # 提取并重命名 price 列
         df_last_price_at_n_hours = df_last[['gid', 'adult_total_price']].rename(columns={'adult_total_price': 'price_at_n_hours'})
         print(">>> price_at_n_hours计算完成,示例:")
         print(df_last_price_at_n_hours.head(5))
+
+        # 新的计算降价方式
+        # 先排序
+        df_targets = df_targets.sort_values(
+            ['gid', 'hours_until_departure'],
+            ascending=[True, False]
+        )
+
+        # 在 gid 内计算价格变化
+        g = df_targets.groupby('gid', group_keys=False)
+        df_targets['price_diff'] = g['adult_total_price'].diff()
         
-        # 计算降价信息
-        print(">>> 计算降价信息")
-        df_targets = df_targets.merge(df_last_price_at_n_hours, on='gid', how='left')
-        df_targets['price_drop_amount'] = df_targets['price_at_n_hours'] - df_targets['adult_total_price']
-        df_targets['price_dropped'] = (
-            (df_targets['adult_total_price'] < df_targets['price_at_n_hours']) &
-            (df_targets['price_drop_amount'] >= 5)  # 降幅不能太小
+        VALID_DROP_MIN = 10
+        LOWER_HOUR = 4
+        UPPER_HOUR = 28
+
+        valid_drop_mask = (
+            (df_targets['price_diff'] <= -VALID_DROP_MIN) &
+            (df_targets['hours_until_departure'] >= LOWER_HOUR) &
+            (df_targets['hours_until_departure'] <= UPPER_HOUR)
         )
-        df_price_drops = df_targets[df_targets['price_dropped']].copy()
-
-        price_drops_len = df_price_drops.shape[0]
-        if price_drops_len == 0:
-            print(f">>> n_hours = {current_n_hours} 无降价信息")
-            # 创建包含指定列的空 DataFrame
-            df_price_drop_info = pd.DataFrame({
-                'gid': pd.Series(dtype='int64'),
-                'first_drop_hours_until_departure': pd.Series(dtype='int64'),
-                'price_at_first_drop_hours': pd.Series(dtype='float64')
-            })
-        else:
-            df_price_drop_info = df_price_drops.groupby('gid', observed=True).first().reset_index()   # 第一次发生的降价
-            df_price_drop_info = df_price_drop_info[['gid', 'hours_until_departure', 'adult_total_price']].rename(columns={
-                    'hours_until_departure': 'first_drop_hours_until_departure',
-                    'adult_total_price': 'price_at_first_drop_hours'
-            })
-            print(">>> 降价信息计算完成,示例:")
-            print(df_price_drop_info.head(5))
+        # 有效的降价
+        df_valid_drops = df_targets.loc[valid_drop_mask]
+
+        # 找「第一次」降价(每个 gid)
+        df_first_price_drop = (
+            df_valid_drops
+            .groupby('gid', as_index=False)
+            .first()
+        )
+
+        # 简化列
+        df_first_price_drop = df_first_price_drop[
+            ['gid', 'hours_until_departure', 'adult_total_price', 'price_diff']
+        ].rename(columns={
+            'hours_until_departure': 'time_to_price_drop',
+            'adult_total_price': 'price_at_d_hours',
+            'price_diff': 'amount_of_price_drop',
+        })
+
+        # 把降价幅度转成正数(更直观)
+        df_first_price_drop['amount_of_price_drop'] = (-df_first_price_drop['amount_of_price_drop']).round(2)
+        pass
+
+        # # 计算降价信息
+        # print(">>> 计算降价信息")
+        # df_targets = df_targets.merge(df_last_price_at_n_hours, on='gid', how='left')
+        # df_targets['price_drop_amount'] = df_targets['price_at_n_hours'] - df_targets['adult_total_price']
+        # df_targets['price_dropped'] = (
+        #     (df_targets['adult_total_price'] < df_targets['price_at_n_hours']) &
+        #     (df_targets['price_drop_amount'] >= 5)  # 降幅不能太小
+        # )
+        # df_price_drops = df_targets[df_targets['price_dropped']].copy()
+
+        # price_drops_len = df_price_drops.shape[0]
+        # if price_drops_len == 0:
+        #     print(f">>> n_hours = {current_n_hours} 无降价信息")
+        #     # 创建包含指定列的空 DataFrame
+        #     df_price_drop_info = pd.DataFrame({
+        #         'gid': pd.Series(dtype='int64'),
+        #         'first_drop_hours_until_departure': pd.Series(dtype='int64'),
+        #         'price_at_first_drop_hours': pd.Series(dtype='float64')
+        #     })
+        # else:
+        #     df_price_drop_info = df_price_drops.groupby('gid', observed=True).first().reset_index()   # 第一次发生的降价
+        #     df_price_drop_info = df_price_drop_info[['gid', 'hours_until_departure', 'adult_total_price']].rename(columns={
+        #             'hours_until_departure': 'first_drop_hours_until_departure',
+        #             'adult_total_price': 'price_at_first_drop_hours'
+        #     })
+        #     print(">>> 降价信息计算完成,示例:")
+        #     print(df_price_drop_info.head(5))
         
-        # 合并信息
-        df_gid_info = df_last_price_at_n_hours.merge(df_price_drop_info, on='gid', how='left')
-        df_gid_info['will_price_drop'] = df_gid_info['price_at_first_drop_hours'].notnull().astype(int)
-        df_gid_info['amount_of_price_drop'] = df_gid_info['price_at_n_hours'] - df_gid_info['price_at_first_drop_hours']
-        df_gid_info['amount_of_price_drop'] = df_gid_info['amount_of_price_drop'].fillna(0)  # 区别    
-        df_gid_info['time_to_price_drop'] = current_n_hours - df_gid_info['first_drop_hours_until_departure']
-        df_gid_info['time_to_price_drop'] = df_gid_info['time_to_price_drop'].fillna(0)  # 区别
+        # # 合并信息
+        # df_gid_info = df_last_price_at_n_hours.merge(df_price_drop_info, on='gid', how='left')
+        # df_gid_info['will_price_drop'] = df_gid_info['price_at_first_drop_hours'].notnull().astype(int)
+        # df_gid_info['amount_of_price_drop'] = df_gid_info['price_at_n_hours'] - df_gid_info['price_at_first_drop_hours']
+        # df_gid_info['amount_of_price_drop'] = df_gid_info['amount_of_price_drop'].fillna(0)  # 区别    
+        # df_gid_info['time_to_price_drop'] = current_n_hours - df_gid_info['first_drop_hours_until_departure']
+        # df_gid_info['time_to_price_drop'] = df_gid_info['time_to_price_drop'].fillna(0)  # 区别
+
+        # del df_input_object
+        # del df_last
+        # del df_last_price_at_n_hours
+        # del df_price_drops
+        # del df_price_drop_info
+
+        df_gid_info = df_last_price_at_n_hours.merge(df_first_price_drop, on='gid', how='left')
+        df_gid_info['will_price_drop'] = df_gid_info['time_to_price_drop'].notnull().astype(int)
+        df_gid_info['amount_of_price_drop'] = df_gid_info['amount_of_price_drop'].fillna(0)
+        df_gid_info['time_to_price_drop'] = df_gid_info['time_to_price_drop'].fillna(0)
+        pass
 
         del df_input_object
         del df_last
         del df_last_price_at_n_hours
+        del df_first_price_drop
+        del df_valid_drops
         del df_targets
-        del df_price_drops
-        del df_price_drop_info
         gc.collect()
         
         # 将目标变量合并到输入数据中

+ 1 - 1
main_tr.py

@@ -295,7 +295,7 @@ def start_train():
             # joblib.dump(target_scaler_list, target_scaler_path)
             
             # 生成序列
-            sequences, targets, group_ids = create_fixed_length_sequences(df_train_inputs, features, target_vars, input_length=452)
+            sequences, targets, group_ids = create_fixed_length_sequences(df_train_inputs, features, target_vars)
             
             # 新增有效性检查
             if len(sequences) == 0 or len(targets) == 0 or len(group_ids) == 0:

+ 3 - 5
utils.py

@@ -28,7 +28,7 @@ def insert_df_col(df, insert_col_name, base_col_name, inplace=True):
     return df
 
 # 真正创建序列过程
-def create_fixed_length_sequences(df, features, target_vars, input_length=452, is_train=True):
+def create_fixed_length_sequences(df, features, target_vars, threshold=48, input_length=432, is_train=True):
     print(">>开始创建序列")
     start_time = time.time()
 
@@ -36,8 +36,6 @@ def create_fixed_length_sequences(df, features, target_vars, input_length=452, i
     targets = []
     group_ids = []
 
-    threshold = 28   # 截止起飞小时数
-
     # gid 基于 city_pair, flight_day, flight_number_1, flight_number_2 分组 不包括 baggage
     grouped = df.groupby(['gid'])
     for _, df_group in grouped:
@@ -51,7 +49,7 @@ def create_fixed_length_sequences(df, features, target_vars, input_length=452, i
         df_group_bag_30 = df_group[df_group['baggage']==30]
         df_group_bag_20 = df_group[df_group['baggage']==20]
 
-        # 过滤训练时间段 (28 ~ 480)
+        # 过滤训练时间段 (48 ~ 480)
         df_group_bag_30_filtered = df_group_bag_30[(df_group_bag_30['Hours_Until_Departure'] >= threshold) & (df_group_bag_30['Hours_Until_Departure'] < threshold + input_length)]
         df_group_bag_20_filtered = df_group_bag_20[(df_group_bag_20['Hours_Until_Departure'] >= threshold) & (df_group_bag_20['Hours_Until_Departure'] < threshold + input_length)]
 
@@ -64,7 +62,7 @@ def create_fixed_length_sequences(df, features, target_vars, input_length=452, i
             seq_features_1 = df_group_bag_30_filtered[features].to_numpy()
             seq_features_2 = df_group_bag_20_filtered[features].to_numpy()
             
-            # 将几个特征序列沿着第 0 维拼接,得到形状为 (2, 452, 25)
+            # 将几个特征序列沿着第 0 维拼接,得到形状为 (2, 432, 25)
             combined_features = torch.stack([torch.tensor(seq_features_1, dtype=torch.float32),    
                                              torch.tensor(seq_features_2, dtype=torch.float32)])