Переглянути джерело

调整结构,支持训练和预测多个不同时间间隔下的模型

node04 4 тижні тому
батько
коміт
36c67cbdc2
9 змінених файлів з 92 додано та 36 видалено
  1. 1 0
      config.py
  2. 2 2
      data_loader.py
  3. 1 1
      data_preprocess.py
  4. 4 4
      evaluate.py
  5. 49 15
      main_pe.py
  6. 26 3
      main_tr.py
  7. 2 6
      predict.py
  8. 5 3
      result_validate.py
  9. 2 2
      train.py

+ 1 - 0
config.py

@@ -6,6 +6,7 @@ CLEAN_VJ_HOT_FAR_INFO_TAB = "clean_flights_vj_hot_7_30_info_tab"
 CLEAN_VJ_NOTHOT_NEAR_INFO_TAB = "clean_flights_vj_nothot_0_7_info_tab"
 CLEAN_VJ_NOTHOT_FAR_INFO_TAB = "clean_flights_vj_nothot_7_30_info_tab"
 
+INTERVAL_HOURS = 8
 
 mongodb_config = {
     "host": "192.168.20.218",

+ 2 - 2
data_loader.py

@@ -1005,10 +1005,10 @@ if __name__ == "__main__":
     os.makedirs(output_dir, exist_ok=True)
 
     # 加载热门航线数据
-    date_begin = "2025-12-07"
+    date_begin = "2026-01-08"
     date_end = datetime.today().strftime("%Y-%m-%d")
 
-    flight_route_list = vj_flight_route_list_hot[4:]  # 热门 vj_flight_route_list_hot  冷门 vj_flight_route_list_nothot
+    flight_route_list = vj_flight_route_list_hot[:]  # 热门 vj_flight_route_list_hot  冷门 vj_flight_route_list_nothot
     table_name = CLEAN_VJ_HOT_NEAR_INFO_TAB  # 热门 CLEAN_VJ_HOT_NEAR_INFO_TAB  冷门 CLEAN_VJ_NOTHOT_NEAR_INFO_TAB
     is_hot = 1   # 1 热门 0 冷门
     group_size = 1

+ 1 - 1
data_preprocess.py

@@ -496,7 +496,7 @@ def preprocess_data(df_input, features, categorical_features, is_training=True,
         
         print(">>> 计算 price_at_n_hours")
         df_input_object = df_input[(df_input['hours_until_departure'] >= current_n_hours) & (df_input['baggage'] == 30)].copy()
-        df_last = df_input_object.groupby('gid', observed=True).last().reset_index()   # 一般落在起飞前48小时
+        df_last = df_input_object.groupby('gid', observed=True).last().reset_index()   # 一般落在起飞前36\32\30小时
         
         # 提取并重命名 price 列
         df_last_price_at_n_hours = df_last[['gid', 'adult_total_price']].rename(columns={'adult_total_price': 'price_at_n_hours'})

+ 4 - 4
evaluate.py

@@ -14,7 +14,7 @@ from utils import FlightDataset
 # 分布式模型评估
 def evaluate_model_distribute(model, device, sequences, targets, group_ids, batch_size=16, test_loader=None, 
                               batch_flight_routes=None, target_scaler=None, 
-                              flag_distributed=False, rank=0, local_rank=0, world_size=1, output_dir='.', batch_idx=-1,
+                              flag_distributed=False, rank=0, local_rank=0, world_size=1, output_dir='.', photo_dir='.', batch_idx=-1,
                               csv_file='evaluate_results.csv', evalute_flag='evaluate', save_mode='a'):
     
     if test_loader is None:
@@ -95,7 +95,7 @@ def evaluate_model_distribute(model, device, sequences, targets, group_ids, batc
         y_trues_class_labels = y_trues_class.astype(int)
 
         # 打印指标
-        printScore_cc(y_trues_class_labels, y_preds_class_labels, batch_fn_str=batch_fn_str, batch_idx=batch_idx, evalute_flag=evalute_flag)
+        printScore_cc(y_trues_class_labels, y_preds_class_labels, batch_fn_str=batch_fn_str, batch_idx=batch_idx, evalute_flag=evalute_flag, photo_dir=photo_dir)
 
         # 构造 DataFrame
         results_df = pd.DataFrame({
@@ -154,7 +154,7 @@ def evaluate_model_distribute(model, device, sequences, targets, group_ids, batc
         return None
 
 
-def printScore_cc(y_trues_class_labels, y_preds_class_labels, batch_fn_str='', batch_idx=-1, evalute_flag='evaluate'):
+def printScore_cc(y_trues_class_labels, y_preds_class_labels, batch_fn_str='', batch_idx=-1, evalute_flag='evaluate', photo_dir='.'):
     
     accuracy = accuracy_score(y_trues_class_labels, y_preds_class_labels)
     precision = precision_score(y_trues_class_labels, y_preds_class_labels, zero_division=0)
@@ -188,4 +188,4 @@ def printScore_cc(y_trues_class_labels, y_preds_class_labels, batch_fn_str='', b
     plt.xlabel('预测情况', fontproperties=font_prop)
     plt.ylabel('实际结果', fontproperties=font_prop)
     plt.title('分类结果的混淆矩阵', fontproperties=font_prop)
-    plt.savefig(f"./photo/{evalute_flag}_confusion_matrix_{batch_idx}_{batch_fn_str}.png")
+    plt.savefig(f"{photo_dir}/{evalute_flag}_confusion_matrix_{batch_idx}_{batch_fn_str}.png")

+ 49 - 15
main_pe.py

@@ -1,10 +1,11 @@
 import os
 import torch
 import joblib
-import pandas as pd
-import numpy as np
+# import pandas as pd
+# import numpy as np
 import pickle
 import time
+import argparse
 from datetime import datetime, timedelta
 from config import mongodb_config, vj_flight_route_list_hot, vj_flight_route_list_nothot, CLEAN_VJ_HOT_NEAR_INFO_TAB, CLEAN_VJ_NOTHOT_NEAR_INFO_TAB
 from data_loader import mongo_con_parse, load_train_data
@@ -15,10 +16,6 @@ from predict import predict_future_distribute
 from main_tr import features, categorical_features, target_vars
 
 
-output_dir = "./data_shards"
-photo_dir = "./photo"
-
-
 def initialize_model():
     input_size = len(features)
     model = PriceDropClassifiTransModel(input_size, num_periods=2, hidden_size=64, num_layers=3, output_size=1, dropout=0.2)
@@ -34,11 +31,28 @@ def convert_date_format(date_str):
     return dt
     # return dt.strftime('%Y%m%d%H%M00')
 
-def start_predict():
+def start_predict(interval_hours):
+
+    print(f"开始预测,间隔小时数: {interval_hours}")
+
+    output_dir = "./data_shards"
+    photo_dir = "./photo"
+    predict_dir = "./predictions"
+
+    if interval_hours == 4:
+        output_dir = "./data_shards_4"
+        photo_dir = "./photo_4"
+        predict_dir = "./predictions_4"
+
+    elif interval_hours == 2:
+        output_dir = "./data_shards_2"
+        photo_dir = "./photo_2"
+        predict_dir = "./predictions_2"
 
     # 确保目录存在
     os.makedirs(output_dir, exist_ok=True) 
     os.makedirs(photo_dir, exist_ok=True)
+    os.makedirs(predict_dir, exist_ok=True)
 
     # 清空上一次预测结果
     # csv_file_list = ['future_predictions.csv']
@@ -61,9 +75,15 @@ def start_predict():
     pred_time_str = hourly_time.strftime("%Y%m%d%H%M")
     print(f"预测时间(取整): {pred_time_str}")
 
-    # 预测时间范围,满足起飞时间 在28小时后到40小时后
+    current_n_hours = 36
+    if interval_hours == 4:
+        current_n_hours = 32
+    elif interval_hours == 2:
+        current_n_hours = 30
+
+    # 预测时间范围,满足起飞时间 在28小时后到36/32/30小时后
     pred_hour_begin = hourly_time + timedelta(hours=28)
-    pred_hour_end = hourly_time + timedelta(hours=40)
+    pred_hour_end = hourly_time + timedelta(hours=current_n_hours)
 
     pred_date_end = pred_hour_end.strftime("%Y-%m-%d")
     pred_date_begin = pred_hour_begin.strftime("%Y-%m-%d")
@@ -134,7 +154,7 @@ def start_predict():
         # 创建临时字段:seg1_dep_time 的整点时间
         df_test['seg1_dep_hour'] = df_test['seg1_dep_time'].dt.floor('h')
         # 使用整点时间进行比较过滤
-        mask = (df_test['seg1_dep_hour'] >= pred_hour_begin) & (df_test['seg1_dep_hour'] <= pred_hour_end)
+        mask = (df_test['seg1_dep_hour'] >= pred_hour_begin) & (df_test['seg1_dep_hour'] < pred_hour_end)
         original_count = len(df_test)
         df_test = df_test[mask].reset_index(drop=True)
         filtered_count = len(df_test)
@@ -147,7 +167,7 @@ def start_predict():
             continue
 
         # 数据预处理
-        df_test_inputs = preprocess_data(df_test, features, categorical_features, is_training=False)
+        df_test_inputs = preprocess_data(df_test, features, categorical_features, is_training=False, current_n_hours=current_n_hours)
         
         total_rows = df_test_inputs.shape[0]
         print(f"行数: {total_rows}")
@@ -165,11 +185,21 @@ def start_predict():
 
         # 标准化与归一化处理
         df_test_inputs, feature_scaler, _ = standardization(df_test_inputs, feature_scaler, is_training=False)
-
         print("标准化后数据样本:\n", df_test_inputs.head())
 
+        threshold = current_n_hours
+        input_length = 444
+
+        # 确保 threshold 与 input_length 之和为 480
+        if threshold == 36:
+            input_length = 444
+        elif threshold == 32:
+            input_length = 448
+        elif threshold == 30:
+            input_length = 450
+
         # 生成序列
-        sequences, _, group_ids = create_fixed_length_sequences(df_test_inputs, features, target_vars, is_train=False)
+        sequences, _, group_ids = create_fixed_length_sequences(df_test_inputs, features, target_vars, threshold, input_length, is_train=False)
         print(f"序列数量:{len(sequences)}")
 
         #----- 新增:智能模型加载 -----#
@@ -192,7 +222,7 @@ def start_predict():
 
         target_scaler = None
         # 预测未来数据
-        predict_future_distribute(model, sequences, group_ids, target_scaler=target_scaler, output_dir=output_dir, pred_time_str=pred_time_str)
+        predict_future_distribute(model, sequences, group_ids, target_scaler=target_scaler, predict_dir=predict_dir, pred_time_str=pred_time_str)
 
     print("所有批次的预测结束")
     print()
@@ -213,4 +243,8 @@ def start_predict():
 
 
 if __name__ == "__main__":
-    start_predict()
+    parser = argparse.ArgumentParser(description='预测脚本')
+    parser.add_argument('--interval', type=int, choices=[2, 4, 8], 
+                        default=8, help='间隔小时数(2, 4, 8)')
+    args = parser.parse_args()
+    start_predict(args.interval)

+ 26 - 3
main_tr.py

@@ -19,7 +19,7 @@ from data_preprocess import preprocess_data, standardization
 from train import prepare_data_distribute, train_model_distribute
 from evaluate import printScore_cc
 from config import mongodb_config, vj_flight_route_list, vj_flight_route_list_hot, vj_flight_route_list_nothot, \
-    CLEAN_VJ_HOT_NEAR_INFO_TAB, CLEAN_VJ_HOT_FAR_INFO_TAB, CLEAN_VJ_NOTHOT_NEAR_INFO_TAB, CLEAN_VJ_NOTHOT_FAR_INFO_TAB
+    CLEAN_VJ_HOT_NEAR_INFO_TAB, CLEAN_VJ_HOT_FAR_INFO_TAB, CLEAN_VJ_NOTHOT_NEAR_INFO_TAB, CLEAN_VJ_NOTHOT_FAR_INFO_TAB, INTERVAL_HOURS
 
 warnings.filterwarnings('ignore')
 
@@ -110,6 +110,12 @@ def start_train():
 
     output_dir = "./data_shards"
     photo_dir = "./photo"
+    if INTERVAL_HOURS == 4:
+        output_dir = "./data_shards_4"
+        photo_dir = "./photo_4"
+    elif INTERVAL_HOURS == 2:
+        output_dir = "./data_shards_2"
+        photo_dir = "./photo_2"
 
     date_end = datetime.today().strftime("%Y-%m-%d")
     # date_begin = (datetime.today() - timedelta(days=41)).strftime("%Y-%m-%d")
@@ -274,9 +280,15 @@ def start_train():
                 print(f"训练数据为空,跳过此批次。")
                 continue_before_process(redis_client, lock_key)
                 continue
+
+            current_n_hours = 36
+            if INTERVAL_HOURS == 4:
+                current_n_hours = 32
+            elif INTERVAL_HOURS == 2:
+                current_n_hours = 30
             
             # 数据预处理
-            df_train_inputs = preprocess_data(df_train, features, categorical_features, is_training=True)
+            df_train_inputs = preprocess_data(df_train, features, categorical_features, is_training=True, current_n_hours=current_n_hours)
             print("预处理后数据样本:\n", df_train_inputs.head())
 
             total_rows = df_train_inputs.shape[0]
@@ -304,8 +316,19 @@ def start_train():
             assemble_idx = batch_idx // assemble_size  # 计算当前集群索引
             print("assemble_idx:", assemble_idx)
             
+            threshold = current_n_hours
+            input_length = 444
+
+            # 确保 threshold 与 input_length 之和为 480
+            if threshold == 36:
+                input_length = 444
+            elif threshold == 32:
+                input_length = 448
+            elif threshold == 30:
+                input_length = 450
+
             # 生成序列
-            sequences, targets, group_ids = create_fixed_length_sequences(df_train_inputs, features, target_vars)
+            sequences, targets, group_ids = create_fixed_length_sequences(df_train_inputs, features, target_vars, threshold, input_length)
             
             # 新增有效性检查
             if len(sequences) == 0 or len(targets) == 0 or len(group_ids) == 0:

+ 2 - 6
predict.py

@@ -7,7 +7,7 @@ from torch.utils.data import DataLoader
 from utils import FlightDataset
 
 
-def predict_future_distribute(model, sequences, group_ids, batch_size=16, target_scaler=None, output_dir=".", pred_time_str=""):
+def predict_future_distribute(model, sequences, group_ids, batch_size=16, target_scaler=None, predict_dir=".", pred_time_str=""):
     if not sequences:
         print("没有足够的数据进行预测。")
         return
@@ -65,11 +65,7 @@ def predict_future_distribute(model, sequences, group_ids, batch_size=16, target
     for col in numeric_columns:
         results_df[col] = results_df[col].where(results_df[col].abs() >= threshold, 0)
     
-    # 修改预测保存路径
-    output_dir = './predictions'
-    os.makedirs(output_dir, exist_ok=True)
-
-    csv_path1 = os.path.join(output_dir, f'future_predictions_{pred_time_str}.csv')
+    csv_path1 = os.path.join(predict_dir, f'future_predictions_{pred_time_str}.csv')
     results_df.to_csv(csv_path1, mode='a', index=False, header=not os.path.exists(csv_path1), encoding='utf-8-sig')
 
     print("预测结果已追加")

+ 5 - 3
result_validate.py

@@ -4,7 +4,9 @@ import pandas as pd
 from data_loader import mongo_con_parse, validate_one_line, fill_hourly_crawl_date
 
 
-def validate_process(node, date, pred_time_str):
+def validate_process(node, pred_time_str):
+
+    date = pred_time_str[4:8]
 
     output_dir = f"./validate/{node}_{date}"
     os.makedirs(output_dir, exist_ok=True)
@@ -113,5 +115,5 @@ def validate_process(node, date, pred_time_str):
 
 
 if __name__ == "__main__":
-    node, date, pred_time_str = "node0108", "0110", "202601100800"
-    validate_process(node, date, pred_time_str)
+    node, pred_time_str = "node0108", "202601121000"
+    validate_process(node, pred_time_str)

+ 2 - 2
train.py

@@ -302,7 +302,7 @@ def train_model_distribute(train_sequences, train_targets, train_group_ids, val_
             batch_flight_routes=batch_flight_routes, target_scaler=target_scaler,
             flag_distributed=flag_distributed,
             rank=rank, local_rank=local_rank, world_size=world_size, 
-            output_dir=output_dir, batch_idx=batch_idx, save_mode='a'
+            output_dir=output_dir, photo_dir=photo_dir, batch_idx=batch_idx, save_mode='a'
         )
     else:
         evaluate_model_distribute(
@@ -312,7 +312,7 @@ def train_model_distribute(train_sequences, train_targets, train_group_ids, val_
             test_loader=val_loader,  # 使用累积验证集
             batch_flight_routes=batch_flight_routes, target_scaler=target_scaler,
             flag_distributed=False,
-            output_dir=output_dir, batch_idx=batch_idx, save_mode='a'
+            output_dir=output_dir, photo_dir=photo_dir, batch_idx=batch_idx, save_mode='a'
         )
 
     return model