Просмотр исходного кода

修改预测判定, 使得VJ的正向样本里近似24小时内降价

node04 2 недели назад
Родитель
Commit
2d7d39fe20
2 изменённых файла: 25 добавлено и 17 удалено
  1. 24 16
      data_preprocess.py
  2. 1 1
      main_tr_0.py

+ 24 - 16
data_preprocess.py

@@ -1173,7 +1173,7 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
     df_min_hours = df_min_hours[(df_min_hours['seats_remaining'] >= 5)].reset_index(drop=True)
     df_min_hours = df_min_hours[(df_min_hours['seats_remaining'] >= 5)].reset_index(drop=True)
 
 
     df_min_hours['simple_will_price_drop'] = 0   
     df_min_hours['simple_will_price_drop'] = 0   
-    df_min_hours['simple_drop_in_hours'] = 0
+    # df_min_hours['simple_drop_in_hours'] = 0
     df_min_hours['simple_drop_in_hours_prob'] = 0.0
     df_min_hours['simple_drop_in_hours_prob'] = 0.0
     df_min_hours['simple_drop_in_hours_dist'] = ''   # 空串 表示未知
     df_min_hours['simple_drop_in_hours_dist'] = ''   # 空串 表示未知
     df_min_hours['flag_dist'] = ''
     df_min_hours['flag_dist'] = ''
@@ -1272,6 +1272,12 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
                         df_match_chk = df_match_chk.loc[drop_hud_vals.notna()].copy()
                         df_match_chk = df_match_chk.loc[drop_hud_vals.notna()].copy()
                         df_match_chk = df_match_chk.loc[(float(hud_base) - drop_hud_vals.loc[drop_hud_vals.notna()]) >= -24].copy()
                         df_match_chk = df_match_chk.loc[(float(hud_base) - drop_hud_vals.loc[drop_hud_vals.notna()]) >= -24].copy()
 
 
+                        dur_num_chk = pd.to_numeric(df_match_chk['high_price_duration_hours'], errors='coerce')
+                        dur_delta = dur_num_chk - float(dur_base)
+                        df_match_chk = df_match_chk.assign(dur_delta=dur_delta)
+                        df_match_chk = df_match_chk.loc[df_match_chk['dur_delta'].notna()].copy()
+                        df_match_chk = df_match_chk.loc[df_match_chk['dur_delta'].abs() <= 48].copy()
+                        
                         # seats_vals = pd.to_numeric(df_match_chk['high_price_seats_remaining_change_amount'], errors='coerce')
                         # seats_vals = pd.to_numeric(df_match_chk['high_price_seats_remaining_change_amount'], errors='coerce')
                         # df_match_chk = df_match_chk.loc[seats_vals.notna()].copy()
                         # df_match_chk = df_match_chk.loc[seats_vals.notna()].copy()
                         # df_match_chk = df_match_chk.loc[seats_vals.loc[seats_vals.notna()] == float(seats_base)].copy()
                         # df_match_chk = df_match_chk.loc[seats_vals.loc[seats_vals.notna()] == float(seats_base)].copy()
@@ -1290,23 +1296,26 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
                             # if len(drop_mode_values) > 0:
                             # if len(drop_mode_values) > 0:
                             #     df_min_hours.loc[idx, 'drop_price_change_mode'] = round(float(drop_mode_values[0]), 2)
                             #     df_min_hours.loc[idx, 'drop_price_change_mode'] = round(float(drop_mode_values[0]), 2)
 
 
-                            remaining_hours = (
-                                pd.to_numeric(df_match_chk['high_price_duration_hours'], errors='coerce') - float(dur_base)
-                            ).clip(lower=0)
-                            remaining_hours = remaining_hours.round().astype(int)
+                            # remaining_hours = (
+                            #     pd.to_numeric(df_match_chk['high_price_duration_hours'], errors='coerce') - float(dur_base)
+                            # ).clip(lower=0)
+                            # remaining_hours = remaining_hours.round().astype(int)
 
 
-                            counts = remaining_hours.value_counts().sort_index()
-                            probs = (counts / counts.sum()).round(4)
+                            # counts = remaining_hours.value_counts().sort_index()
+                            # probs = (counts / counts.sum()).round(4)
 
 
-                            top_hours = int(probs.idxmax())
-                            top_prob = float(probs.max())
+                            # top_hours = int(probs.idxmax())
+                            # top_prob = float(probs.max())
 
 
-                            dist_items = list(zip(probs.index.tolist(), probs.tolist()))
-                            dist_items = dist_items[:10]
-                            dist_str = ' '.join([f"{int(h)}h->{float(p)}" for h, p in dist_items])
+                            # dist_items = list(zip(probs.index.tolist(), probs.tolist()))
+                            # dist_items = dist_items[:10]
+                            # dist_str = ' '.join([f"{int(h)}h->{float(p)}" for h, p in dist_items])
+
+                            dur_delta_list = df_match_chk['dur_delta'].tolist()
+                            dist_str = "'" + ' '.join([f"{ddl:g}" for ddl in dur_delta_list])
 
 
                             df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
                             df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
-                            df_min_hours.loc[idx, 'simple_drop_in_hours'] = top_hours
+                            # df_min_hours.loc[idx, 'simple_drop_in_hours'] = top_hours
                             df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 1
                             df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 1
                             df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = dist_str
                             df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = dist_str
                             df_min_hours.loc[idx, 'flag_dist'] = 'd0'
                             df_min_hours.loc[idx, 'flag_dist'] = 'd0'
@@ -1396,7 +1405,7 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
                             # 可以明确的判定不降价
                             # 可以明确的判定不降价
                             if length_drop == 0:
                             if length_drop == 0:
                                 df_min_hours.loc[idx, 'simple_will_price_drop'] = 0
                                 df_min_hours.loc[idx, 'simple_will_price_drop'] = 0
-                                df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
+                                # df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
                                 df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.0
                                 df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.0
                                 # df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'r0'
                                 # df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'r0'
                                 df_min_hours.loc[idx, 'flag_dist'] = 'r0'
                                 df_min_hours.loc[idx, 'flag_dist'] = 'r0'
@@ -1460,7 +1469,7 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
                   'adult_total_price', 'days_to_departure', 'hours_until_departure', 'price_change_percent', 'price_change_amount', 'price_duration_hours', 
                   'adult_total_price', 'days_to_departure', 'hours_until_departure', 'price_change_percent', 'price_change_amount', 'price_duration_hours', 
                   'update_hour', 'crawl_date',
                   'update_hour', 'crawl_date',
                   'valid_begin_hour', 'valid_end_hour',
                   'valid_begin_hour', 'valid_end_hour',
-                  'simple_will_price_drop', 'simple_drop_in_hours', 'simple_drop_in_hours_prob', 'simple_drop_in_hours_dist',
+                  'simple_will_price_drop', 'simple_drop_in_hours_prob', 'simple_drop_in_hours_dist',
                   'flag_dist',
                   'flag_dist',
                   'drop_price_change_upper', 'drop_price_change_lower', 'drop_price_sample_size',
                   'drop_price_change_upper', 'drop_price_change_lower', 'drop_price_sample_size',
                   'rise_price_change_upper', 'rise_price_change_lower', 'rise_price_sample_size',
                   'rise_price_change_upper', 'rise_price_change_lower', 'rise_price_sample_size',
@@ -1473,7 +1482,6 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
     df_predict = df_min_hours[order_cols]
     df_predict = df_min_hours[order_cols]
     df_predict = df_predict.rename(columns={
     df_predict = df_predict.rename(columns={
             'simple_will_price_drop': 'will_price_drop',
             'simple_will_price_drop': 'will_price_drop',
-            'simple_drop_in_hours': 'drop_in_hours',
             'simple_drop_in_hours_prob': 'drop_in_hours_prob',
             'simple_drop_in_hours_prob': 'drop_in_hours_prob',
             'simple_drop_in_hours_dist': 'drop_in_hours_dist',
             'simple_drop_in_hours_dist': 'drop_in_hours_dist',
         }
         }

+ 1 - 1
main_tr_0.py

@@ -50,7 +50,7 @@ def start_train():
     # date_end = datetime.today().strftime("%Y-%m-%d")
     # date_end = datetime.today().strftime("%Y-%m-%d")
     date_end = (datetime.today() - timedelta(days=1)).strftime("%Y-%m-%d")
     date_end = (datetime.today() - timedelta(days=1)).strftime("%Y-%m-%d")
     # date_begin = (datetime.today() - timedelta(days=32)).strftime("%Y-%m-%d")
     # date_begin = (datetime.today() - timedelta(days=32)).strftime("%Y-%m-%d")
-    date_begin = "2026-03-01"   # 2026-01-01  2026-04-17
+    date_begin = "2026-03-01"   # 2026-01-01  2026-04-23
 
 
     print(f"训练时间范围: {date_begin} 到 {date_end}")
     print(f"训练时间范围: {date_begin} 到 {date_end}")