ソースを参照

调整uo的过滤处理,放宽样本库时间范围

node04 5 日前
コミット
786926a9bc
3 ファイル変更、13 行追加、6 行削除
  1. data_process.py (+8 −2)
  2. main_tr.py (+4 −3)
  3. result_keep_verify.py (+1 −1)

+ 8 - 2
data_process.py

@@ -87,7 +87,7 @@ def preprocess_data_simple(df_input, is_train=False, hourly_time=None):
 
     # 训练过程
     if is_train:
-        df_target = df_input[(df_input['hours_until_departure'] >= 72) & (df_input['hours_until_departure'] <= 360)].copy()
+        df_target = df_input[(df_input['hours_until_departure'] >= 48) & (df_input['hours_until_departure'] <= 384)].copy()
         df_target = df_target.sort_values(
             by=['gid', 'baggage_weight', 'hours_until_departure'],
             ascending=[True, True, False]
@@ -140,6 +140,9 @@ def preprocess_data_simple(df_input, is_train=False, hourly_time=None):
         ]
         # 按顺序排列 保留gid
         df_drop_nodes = df_drop_nodes[flight_info_cols + drop_info_cols]
+        df_drop_nodes['start_hours_until_departure'] = (df_drop_nodes['drop_hours_until_departure'] + df_drop_nodes['high_price_duration_hours']).round().astype('Int64')
+        df_drop_nodes = df_drop_nodes[df_drop_nodes['drop_hours_until_departure'] <= 360]
+        df_drop_nodes = df_drop_nodes[df_drop_nodes['start_hours_until_departure'] >= 72] 
         
         # 反例库:所有有效节点(不限升价)中,未来24小时内未发生降价
         # seg_start_mask = df_target['price_duration_hours'].eq(1)
@@ -176,6 +179,9 @@ def preprocess_data_simple(df_input, is_train=False, hourly_time=None):
             'prev_rise_duration_hours', 'prev_rise_change_percent', 'prev_rise_change_amount', 'prev_rise_amount', 'prev_rise_cabins'
         ]
         df_rise_nodes = df_rise_nodes[flight_info_cols + rise_info_cols]
+        df_rise_nodes['start_hours_until_departure'] = (df_rise_nodes['rise_hours_until_departure'] + df_rise_nodes['prev_rise_duration_hours']).round().astype('Int64')
+        df_rise_nodes = df_rise_nodes[df_rise_nodes['rise_hours_until_departure'] <= 360]
+        df_rise_nodes = df_rise_nodes[df_rise_nodes['start_hours_until_departure'] >= 72]
         
         # 制作历史包络线
         envelope_group = ['citypair', 'flight_numbers', 'from_date', 'baggage_weight']
@@ -348,7 +354,7 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
     # df_min_hours['is_target_day'] = (df_min_hours['from_date'] == df_min_hours['target_flight_day']).astype(int)
     
     # 综合评分阈值:大于阈值的都认为值得投放
-    relative_position_threshold = 0.5
+    relative_position_threshold = 0.4
     df_min_hours['is_good_target'] = (df_min_hours['relative_position'] >= relative_position_threshold).astype(int)    
     total_cnt_before = len(df_min_hours)   # 记录下过滤前的总数
     df_min_hours = df_min_hours[(df_min_hours['is_good_target'] == 1)].reset_index(drop=True)   # 保留值得投放的 

+ 4 - 3
main_tr.py

@@ -20,9 +20,10 @@ def start_train():
     cpu_cores = os.cpu_count()  # 你的系统是72
     max_workers = min(8, cpu_cores)  # 最大不超过8个进程
 
-    from_date_end = (datetime.today() - timedelta(days=1)).strftime("%Y-%m-%d")  # 截止日改为昨天
-    # from_date_begin = "2026-03-17"  # 2026-03-17 2026-04-15
-    from_date_begin = "2026-03-17"
+    from_date_end = (datetime.today() + timedelta(days=1)).strftime("%Y-%m-%d")      # 截止日改为明天
+    from_date_begin = from_date_end
+    # from_date_begin = "2026-03-17"  # 2026-03-17 2026-04-30 2026-05-06
+    # from_date_begin = "2026-05-06"
 
     print(f"训练时间范围: {from_date_begin} 到 {from_date_end}")
 

+ 1 - 1
result_keep_verify.py

@@ -338,5 +338,5 @@ def verify_process_2(min_batch_time_str, max_batch_time_str):
 
 if __name__ == "__main__":
     # verify_process("202604151100", "202604161400")
-    verify_process_2("202604290900", "202604291500")
+    verify_process_2("202605071300", "202605080900")
     pass