1 săptămână în urmă · 575bc3d789
--- a/config.py
+++ b/config.py
@@ -1,3 +1,6 @@
 
															+import holidays

														
 
															+import pandas as pd

														
 
															+

														
 
															 CLEAN_VJ_HOT_NEAR_INFO_TAB = "clean_flights_vj_hot_0_7_info_tab"

														
 
															 CLEAN_VJ_HOT_FAR_INFO_TAB = "clean_flights_vj_hot_7_30_info_tab"

														
 
															 CLEAN_VJ_NOTHOT_NEAR_INFO_TAB = "clean_flights_vj_nothot_0_7_info_tab"

														
@@ -12,6 +15,74 @@ mongodb_config = {
 
															     "pwd": ""

														
 
															 }

														
 
# Mapping from city code to country/region code.
# The values are the codes handed to holidays.country_holidays() by
# build_country_holidays().
city_to_country = {
    "CAN": "CN",  # Guangzhou, China
    "DPS": "ID",  # Bali, Indonesia
    "HAN": "VN",  # Hanoi, Vietnam
    "SGN": "VN",  # Ho Chi Minh City (Saigon), Vietnam
    "CTU": "CN",  # Chengdu, China
    "DAD": "VN",  # Da Nang, Vietnam
    "SEL": "KR",  # Seoul, South Korea
    "DEL": "IN",  # Delhi, India
    "UIH": "VN",  # Quy Nhon, Vietnam
    "HKG": "HK",  # Hong Kong, China
    "PQC": "VN",  # Phu Quoc, Vietnam
    "KUL": "MY",  # Kuala Lumpur, Malaysia
    "NGO": "JP",  # Nagoya, Japan
    "NHA": "VN",  # Nha Trang, Vietnam
    "PUS": "KR",  # Busan, South Korea
    "SHA": "CN",  # Shanghai, China
    "SIN": "SG",  # Singapore, Singapore
    "TPE": "TW",  # Taipei, Taiwan, China
    "TYO": "JP",  # Tokyo, Japan
    "BKK": "TH",  # Bangkok, Thailand
    "BLR": "IN",  # Bangalore, India
    "FUK": "JP",  # Fukuoka, Japan
    "BMV": "VN",  # Buon Ma Thuot, Vietnam
    "BNE": "AU",  # Brisbane, Australia
    "BOM": "IN",  # Mumbai, India
    "DLI": "VN",  # Da Lat, Vietnam
    "OSA": "JP",  # Osaka, Japan
    "RMQ": "TW",  # Taichung, Taiwan, China
    "HKT": "TH",  # Phuket, Thailand
    "HPH": "VN",  # Hai Phong, Vietnam
    "KHH": "TW",  # Kaohsiung, Taiwan, China
    "MEL": "AU",  # Melbourne, Australia
    "MNL": "PH",  # Manila, Philippines
    "SYD": "AU",  # Sydney, Australia
    "REP": "KH",  # Siem Reap, Cambodia
    "VTE": "LA",  # Vientiane, Laos
    "HYD": "IN",  # Hyderabad, India
    "AMD": "IN",  # Ahmedabad, India
}

														
 
															+

														
 
															+# 生成各个国家(地区)的节假日

														
 
															+def build_country_holidays(city_to_country):

														
 
															+    countries = sorted(set(city_to_country.values()))

														
 
															+    start_date = pd.Timestamp('2025-11-01')

														
 
															+    end_date = pd.Timestamp('2026-12-31')

														
 
															+

														
 
															+    country_holidays = {}

														
 
															+

														
 
															+    for country in countries:

														
 
															+        try:

														
 
															+            hdays = holidays.country_holidays(

														
 
															+                country,

														
 
															+                years=[2025, 2026]

														
 
															+            )

														
 
															+            # 转成 set[date]，方便高速查询

														
 
															+            country_holidays[country] = {

														
 
															+                d for d in hdays

														
 
															+                if start_date.date() <= d <= end_date.date()

														
 
															+            }

														
 
															+        except Exception:

														
 
															+            # 个别国家 holidays 库可能不支持

														
 
															+            country_holidays[country] = set()

														
 
															+

														
 
															+    return country_holidays

														
 
															+

														
 
															+

														
 
															 # 热门的航线

														
 
															 vj_flight_route_list_hot = [

														
 
															     "CAN-DPS", "CAN-HAN", "CAN-SGN", "CTU-HAN", "CTU-SGN",

														
@@ -46,27 +117,30 @@ vj_flight_route_list = vj_flight_route_list_hot + vj_flight_route_list_nothot
 
															 if __name__ == '__main__':

														
 
															-    from collections import Counter

														
 
															-    # 检查重复项

														
 
															-    # 统计每个航线出现的次数

														
 
															-    route_counter = Counter(vj_flight_route_list)

														
 
															-

														
 
															-    # 找出重复的航线

														
 
															-    duplicates = {route: count for route, count in route_counter.items() if count > 1}

														
 
															-

														
 
															-    # 输出结果

														
 
															-    if duplicates:

														
 
															-        print("发现重复的航线：")

														
 
															-        for route, count in duplicates.items():

														
 
															-            print(f"  {route}: 出现 {count} 次")

														
 
															-

														
 
															-        print(f"\n总共发现 {len(duplicates)} 条重复航线")

														
 
															-

														
 
															-        # 查找这些航线分别在哪个列表中

														
 
															-        print("\n重复航线分布：")

														
 
															-        for route in duplicates:

														
 
															-            hot_count = vj_flight_route_list_hot.count(route)

														
 
															-            nothot_count = vj_flight_route_list_nothot.count(route)

														
 
															-            print(f"  {route}: hot列表中出现 {hot_count} 次, nothot列表中出现 {nothot_count} 次")

														
 
															-    else:

														
 
															-        print("没有发现重复航线")

														
 
															+    # from collections import Counter

														
 
															+    # # 检查重复项

														
 
															+    # # 统计每个航线出现的次数

														
 
															+    # route_counter = Counter(vj_flight_route_list)

														
 
															+

														
 
															+    # # 找出重复的航线

														
 
															+    # duplicates = {route: count for route, count in route_counter.items() if count > 1}

														
 
															+

														
 
															+    # # 输出结果

														
 
															+    # if duplicates:

														
 
															+    #     print("发现重复的航线：")

														
 
															+    #     for route, count in duplicates.items():

														
 
															+    #         print(f"  {route}: 出现 {count} 次")

														
 
															+

														
 
															+    #     print(f"\n总共发现 {len(duplicates)} 条重复航线")

														
 
															+

														
 
															+    #     # 查找这些航线分别在哪个列表中

														
 
															+    #     print("\n重复航线分布：")

														
 
															+    #     for route in duplicates:

														
 
															+    #         hot_count = vj_flight_route_list_hot.count(route)

														
 
															+    #         nothot_count = vj_flight_route_list_nothot.count(route)

														
 
															+    #         print(f"  {route}: hot列表中出现 {hot_count} 次, nothot列表中出现 {nothot_count} 次")

														
 
															+    # else:

														
 
															+    #     print("没有发现重复航线")

														
 
															+

														
 
															+    COUNTRY_HOLIDAYS = build_country_holidays(city_to_country)

														
 
															+    print(COUNTRY_HOLIDAYS)

														
--- a/data_loader.py
+++ b/data_loader.py
@@ -568,12 +568,15 @@ def load_train_data(db, flight_route_list, table_name, date_begin, date_end, out
 
															         route = f"{from_city}-{to_city}"

														
 
															         print(f"开始处理航线: {route}")

														
 
															         all_groups = query_groups_of_city_code(db, from_city, to_city, table_name)

														
 
															+        all_groups_len = len(all_groups)

														
 
															+        print(f"该航线共有{all_groups_len}个航班号")

														
 
															         # 每一组航班号

														
 
															         for each_group in all_groups:

														
 
															             flight_nums = each_group.get("flight_numbers")

														
 
															             print(f"开始处理航班号: {flight_nums}")

														
 
															             details = each_group.get("details")

														
 
															-            # 查远期表

														
 
															+

														
 
															+            print(f"查远期表")

														
 
															             if is_hot == 1:

														
 
															                 df1 = query_flight_range_status(db, CLEAN_VJ_HOT_FAR_INFO_TAB, from_city, to_city,

														
 
															                                                 date_begin_s, date_end_s, flight_nums)

														
@@ -586,7 +589,7 @@ def load_train_data(db, flight_route_list, table_name, date_begin, date_end, out
 
															                 print(f"航班号:{flight_nums} 远期表无数据, 跳过")

														
 
															                 continue

														
 
															-            # 查近期表

														
 
															+            print(f"查近期表")

														
 
															             if is_hot == 1:

														
 
															                 df2 = query_flight_range_status(db, CLEAN_VJ_HOT_NEAR_INFO_TAB, from_city, to_city,

														
 
															                                                 date_begin_s, date_end_s, flight_nums)

														
--- a/data_preprocess.py
+++ b/data_preprocess.py
@@ -0,0 +1,185 @@
 
															+import pandas as pd
														
 
															+import numpy as np
														
 
															+import bisect
														
 
															+from datetime import datetime, timedelta
														
 
															+from sklearn.preprocessing import StandardScaler
														
 
															+from config import city_to_country, build_country_holidays
														
 
															+
														
 
															+COUNTRY_HOLIDAYS = build_country_holidays(city_to_country)
														
 
															+
														
 
															+
														
 
															+def preprocess_data(df_train, features, categorical_features, is_training=True):
														
 
															+    print(">>> 开始数据预处理") 
														
 
															+
														
 
															+    # 生成 城市对
														
 
															+    df_train['city_pair'] = (
														
 
															+        df_train['from_city_code'].astype(str) + "-" + df_train['to_city_code'].astype(str)
														
 
															+    )
														
 
															+    # 把 city_pair、from_city_code、to_city_code 放到前三列
														
 
															+    cols = df_train.columns.tolist()
														
 
															+    # 删除已存在的三列（保证顺序正确）
														
 
															+    for c in ['city_pair', 'from_city_code', 'to_city_code']:
														
 
															+        cols.remove(c)
														
 
															+    # 这三列插入到最前面
														
 
															+    df_train = df_train[['city_pair', 'from_city_code', 'to_city_code'] + cols]
														
 
															+
														
 
															+    # 转格式
														
 
															+    df_train['search_dep_time'] = pd.to_datetime(
														
 
															+        df_train['search_dep_time'],
														
 
															+        format='%Y%m%d',
														
 
															+        errors='coerce'
														
 
															+    ).dt.strftime('%Y-%m-%d')
														
 
															+    # 重命名起飞日期
														
 
															+    df_train.rename(columns={'search_dep_time': 'flight_day'}, inplace=True)
														
 
															+    
														
 
															+    # 重命名航班号
														
 
															+    df_train.rename(
														
 
															+        columns={
														
 
															+            'seg1_flight_number': 'flight_number_1',
														
 
															+            'seg2_flight_number': 'flight_number_2'
														
 
															+        },
														
 
															+        inplace=True
														
 
															+    )
														
 
															+    # 分开填充
														
 
															+    df_train['flight_number_1'] = df_train['flight_number_1'].fillna('VJ')
														
 
															+    df_train['flight_number_2'] = df_train['flight_number_2'].fillna('VJ')
														
 
															+
														
 
															+    # 生成第一机场对
														
 
															+    df_train['airport_pair_1'] = (
														
 
															+        df_train['seg1_dep_air_port'].astype(str) + "-" + df_train['seg1_arr_air_port'].astype(str)
														
 
															+    )
														
 
															+    # 删除原始第一机场码
														
 
															+    df_train.drop(columns=['seg1_dep_air_port', 'seg1_arr_air_port'], inplace=True)
														
 
															+    # 第一机场对 放到 seg1_dep_time 列的前面
														
 
															+    insert_idx = df_train.columns.get_loc('seg1_dep_time')
														
 
															+    airport_pair_1 = df_train.pop('airport_pair_1')
														
 
															+    df_train.insert(insert_idx, 'airport_pair_1', airport_pair_1)
														
 
															+
														
 
															+    # 生成第二机场对（带缺失兜底）
														
 
															+    df_train['airport_pair_2'] = np.where(
														
 
															+        df_train['seg2_dep_air_port'].isna() | df_train['seg2_arr_air_port'].isna(),
														
 
															+        'NA',
														
 
															+        df_train['seg2_dep_air_port'].astype(str) + "-" +
														
 
															+        df_train['seg2_arr_air_port'].astype(str)
														
 
															+    )
														
 
															+    # 删除原始第二机场码
														
 
															+    df_train.drop(columns=['seg2_dep_air_port', 'seg2_arr_air_port'], inplace=True)
														
 
															+    # 第二机场对 放到 seg2_dep_time 列的前面
														
 
															+    insert_idx = df_train.columns.get_loc('seg2_dep_time')
														
 
															+    airport_pair_2 = df_train.pop('airport_pair_2')
														
 
															+    df_train.insert(insert_idx, 'airport_pair_2', airport_pair_2)
														
 
															+    
														
 
															+    # 是否转乘
														
 
															+    df_train['is_transfer'] = np.where(df_train['flight_number_2'] == 'VJ', 0, 1)
														
 
															+    insert_idx = df_train.columns.get_loc('flight_number_2')
														
 
															+    is_transfer = df_train.pop('is_transfer')
														
 
															+    df_train.insert(insert_idx, 'is_transfer', is_transfer)
														
 
															+
														
 
															+    # 重命名起飞时刻与到达时刻
														
 
															+    df_train.rename(
														
 
															+        columns={
														
 
															+            'seg1_dep_time': 'dep_time_1',
														
 
															+            'seg1_arr_time': 'arr_time_1',
														
 
															+            'seg2_dep_time': 'dep_time_2',
														
 
															+            'seg2_arr_time': 'arr_time_2',
														
 
															+        },
														
 
															+        inplace=True
														
 
															+    )
														
 
															+    
														
 
															+    # 第一段飞行时长
														
 
															+    df_train['fly_duration_1'] = (
														
 
															+        (df_train['arr_time_1'] - df_train['dep_time_1'])
														
 
															+        .dt.total_seconds() / 3600
														
 
															+    ).round(2)
														
 
															+
														
 
															+    # 第二段飞行时长（无转乘为 0）
														
 
															+    df_train['fly_duration_2'] = (
														
 
															+        (df_train['arr_time_2'] - df_train['dep_time_2'])
														
 
															+        .dt.total_seconds() / 3600
														
 
															+    ).fillna(0).round(2)
														
 
															+
														
 
															+    # 总飞行时长
														
 
															+    df_train['fly_duration'] = (
														
 
															+        df_train['fly_duration_1'] + df_train['fly_duration_2']
														
 
															+    ).round(2)
														
 
															+
														
 
															+    # 中转停留时长（无转乘为 0）
														
 
															+    df_train['stop_duration'] = (
														
 
															+        (df_train['dep_time_2'] - df_train['arr_time_1'])
														
 
															+        .dt.total_seconds() / 3600
														
 
															+    ).fillna(0).round(2)
														
 
															+
														
 
															+    # 裁剪,防止负数
														
 
															+    # for c in ['fly_duration_1', 'fly_duration_2', 'fly_duration', 'stop_duration']:
														
 
															+    #     df_train[c] = df_train[c].clip(lower=0)
														
 
															+
														
 
															+    # 和 is_transfer 逻辑保持一致
														
 
															+    # df_train.loc[df_train['is_transfer'] == 0, ['fly_duration_2', 'stop_duration']] = 0
														
 
															+    
														
 
															+    # 一次性插到 is_filled 前面
														
 
															+    insert_before = 'is_filled'
														
 
															+    new_cols = [
														
 
															+        'fly_duration_1',
														
 
															+        'fly_duration_2',
														
 
															+        'fly_duration',
														
 
															+        'stop_duration'
														
 
															+    ]
														
 
															+    cols = df_train.columns.tolist()
														
 
															+    idx = cols.index(insert_before)
														
 
															+    # 删除旧位置
														
 
															+    cols = [c for c in cols if c not in new_cols]
														
 
															+    # 插入新位置（顺序保持）
														
 
															+    cols[idx:idx] = new_cols    # python独有空切片插入法
														
 
															+    df_train = df_train[cols]
														
 
															+
														
 
															+    # 一次生成多个字段
														
 
															+    dep_t1 = df_train['dep_time_1']
														
 
															+    # 几点起飞（0–23）
														
 
															+    df_train['flight_by_hour'] = dep_t1.dt.hour
														
 
															+    # 起飞日期几号（1–31）
														
 
															+    df_train['flight_by_day'] = dep_t1.dt.day
														
 
															+    # 起飞日期几月（1–12）
														
 
															+    df_train['flight_day_of_month'] = dep_t1.dt.month
														
 
															+    # 起飞日期周几（0=周一, 6=周日）
														
 
															+    df_train['flight_day_of_week'] = dep_t1.dt.weekday
														
 
															+    # 起飞日期季度（1–4）
														
 
															+    df_train['flight_day_of_quarter'] = dep_t1.dt.quarter
														
 
															+    # 是否周末（周六 / 周日）
														
 
															+    df_train['flight_day_is_weekend'] = dep_t1.dt.weekday.isin([5, 6]).astype(int)
														
 
															+
														
 
															+    # 找到对应的国家码
														
 
															+    df_train['dep_country'] = df_train['from_city_code'].map(city_to_country)
														
 
															+    df_train['arr_country'] = df_train['to_city_code'].map(city_to_country) 
														
 
															+
														
 
															+    # 整体出发时间 就是 dep_time_1
														
 
															+    df_train['global_dep_time'] = df_train['dep_time_1']
														
 
															+    # 整体到达时间：有转乘用 arr_time_2，否则用 arr_time_1
														
 
															+    df_train['global_arr_time'] = df_train['arr_time_2'].fillna(df_train['arr_time_1'])
														
 
															+
														
 
															+    # 出发日期在出发国家是否节假日
														
 
															+    df_train['dep_country_is_holiday'] = df_train.apply(
														
 
															+        lambda r: r['global_dep_time'].date()
														
 
															+        in COUNTRY_HOLIDAYS.get(r['dep_country'], set()),
														
 
															+        axis=1
														
 
															+    ).astype(int)
														
 
															+
														
 
															+    # 到达日期在到达国家是否节假日
														
 
															+    df_train['arr_country_is_holiday'] = df_train.apply(
														
 
															+        lambda r: r['global_arr_time'].date()
														
 
															+        in COUNTRY_HOLIDAYS.get(r['arr_country'], set()),
														
 
															+        axis=1
														
 
															+    ).astype(int)
														
 
															+
														
 
															+    # 在任一侧是否节假日
														
 
															+    df_train['flight_day_is_holiday'] = (
														
 
															+        df_train[['dep_country_is_holiday', 'arr_country_is_holiday']]
														
 
															+        .max(axis=1)
														
 
															+    )
														
 
															+
														
 
															+    # 是否跨国航线
														
 
															+    df_train['is_cross_country'] = (
														
 
															+        df_train['dep_country'] != df_train['arr_country']
														
 
															+    ).astype(int)
														
 
															+
														
 
															+    pass
														
 
															+
														
--- a/main_tr.py
+++ b/main_tr.py
@@ -0,0 +1,204 @@
 
															+import warnings
														
 
															+import os
														
 
															+import torch
														
 
															+import torch.distributed as dist
														
 
															+from torch.nn.parallel import DistributedDataParallel as DDP
														
 
															+import joblib
														
 
															+import gc
														
 
															+import pandas as pd
														
 
															+import numpy as np
														
 
															+import redis
														
 
															+import time
														
 
															+import pickle
														
 
															+import shutil
														
 
															+from datetime import datetime, timedelta
														
 
															+from data_loader import chunk_list, mongo_con_parse, load_train_data
														
 
															+from data_preprocess import preprocess_data
														
 
															+from config import mongodb_config, vj_flight_route_list, vj_flight_route_list_hot, vj_flight_route_list_nothot, \
														
 
															+    CLEAN_VJ_HOT_NEAR_INFO_TAB, CLEAN_VJ_HOT_FAR_INFO_TAB, CLEAN_VJ_NOTHOT_NEAR_INFO_TAB, CLEAN_VJ_NOTHOT_FAR_INFO_TAB
														
 
															+
														
 
															+warnings.filterwarnings('ignore')
														
 
															+
														
 
															+
														
 
															+# 根据环境变量的存在设置分布式开关
														
 
															+if 'LOCAL_RANK' in os.environ:
														
 
															+    FLAG_Distributed = True
														
 
															+else:
														
 
															+    FLAG_Distributed = False
														
 
															+
														
 
															+
														
 
															+# 定义特征和参数
														
 
															+categorical_features = ['city_pair', 'flight_number_1', 'flight_number_2']
														
 
															+other_features = []
														
 
															+features = []
														
 
															+
														
 
															+target_vars = ['target_min_to_price']   # 最低会降到的价格
														
 
															+
														
 
															+# 分布式环境初始化
														
 
															+def init_distributed_backend():
														
 
															+    if FLAG_Distributed:
														
 
															+        local_rank = int(os.environ['LOCAL_RANK'])
														
 
															+        # 关键：绑定设备必须在初始化进程组之前
														
 
															+        torch.cuda.set_device(local_rank)            # 显式设置当前进程使用的 GPU
														
 
															+        try:
														
 
															+            dist.init_process_group(
														
 
															+                backend='nccl',
														
 
															+                init_method='env://',
														
 
															+                world_size=int(os.environ['WORLD_SIZE']),
														
 
															+                rank=int(os.environ['RANK']),
														
 
															+                timeout=timedelta(minutes=30)   
														
 
															+            )
														
 
															+            print(f"Process group initialized for rank {dist.get_rank()}")  # 添加日志
														
 
															+        except Exception as e:
														
 
															+            print(f"Failed to initialize process group: {e}")  # 捕获异常
														
 
															+            raise
														
 
															+        device = torch.device("cuda", local_rank)
														
 
															+    else:
														
 
															+        # 如果不在分布式环境中, 使用默认设备
														
 
															+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
														
 
															+        print("use common environment")
														
 
															+    return device
														
 
															+
														
 
															+# 初始化模型和相关参数
														
 
															+def initialize_model(device):
														
 
															+    return None
														
 
															+
														
 
															+def continue_before_process(redis_client, lock_key):
														
 
															+    # rank0 跳出循环前的处理
														
 
															+    redis_client.set(lock_key, 2)               # 设置 Redis 锁 key 的值为 2
														
 
															+    print("rank0 已将 Redis 锁 key 值设置为 2")
														
 
															+    time.sleep(5)
														
 
															+    print("rank0 5秒等待结束")
														
 
															+
														
 
															+def start_train():
														
 
															+    device = init_distributed_backend()
														
 
															+
														
 
															+    model = initialize_model(device)
														
 
															+
														
 
															+    if FLAG_Distributed:
														
 
															+        rank = dist.get_rank()
														
 
															+        local_rank = int(os.environ.get('LOCAL_RANK'))
														
 
															+        world_size = dist.get_world_size()
														
 
															+    else:
														
 
															+        rank = 0
														
 
															+        local_rank = 0
														
 
															+        world_size = 1
														
 
															+
														
 
															+    output_dir = "./data_shards" 
														
 
															+    photo_dir = "./photo"
														
 
															+
														
 
															+    date_end = datetime.today().strftime("%Y-%m-%d")
														
 
															+    date_begin = (datetime.today() - timedelta(days=10)).strftime("%Y-%m-%d")
														
 
															+
														
 
															+    # 仅在 rank == 0 时要做的
														
 
															+    if rank == 0:
														
 
															+        # 如果处理中断, 注释掉以下代码
														
 
															+        batch_dir = os.path.join(output_dir, "batches")
														
 
															+        try:
														
 
															+            shutil.rmtree(batch_dir)
														
 
															+        except FileNotFoundError:
														
 
															+            print(f"rank:{rank}, {batch_dir} not found")
														
 
															+
														
 
															+        # 如果处理中断, 注释掉以下代码
														
 
															+        csv_file_list = ['evaluate_results.csv']
														
 
															+        for csv_file in csv_file_list:
														
 
															+            try:
														
 
															+                csv_path = os.path.join(output_dir, csv_file)
														
 
															+                os.remove(csv_path)
														
 
															+            except Exception as e:
														
 
															+                print(f"remove {csv_path}: {str(e)}")
														
 
															+
														
 
															+        # 确保目录存在
														
 
															+        os.makedirs(output_dir, exist_ok=True) 
														
 
															+        os.makedirs(photo_dir, exist_ok=True)
														
 
															+
														
 
															+        print(f"最终特征列表：{features}")
														
 
															+
														
 
															+    # 定义优化器和损失函数(只回归)
														
 
															+    # criterion = RegressionLoss(loss_func_flag="Quantile", quantile=0.5)
														
 
															+    # optimizer = torch.optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-5)
														
 
															+
														
 
															+    group_size = 1
														
 
															+    num_epochs_per_batch = 200  # 每个批次训练的轮数，可以根据需要调整
														
 
															+
														
 
															+    # 初始化 Redis 客户端（请根据实际情况修改 host、port、db）
														
 
															+    redis_client = redis.Redis(host='192.168.20.222', port=6379, db=0)
														
 
															+    lock_key = "data_loading_lock_11"
														
 
															+    barrier_key = 'distributed_barrier_11'
														
 
															+
														
 
															+    batch_idx = -1
														
 
															+
														
 
															+    # 主干代码
														
 
															+    flight_route_list = vj_flight_route_list_hot + vj_flight_route_list_nothot
														
 
															+    flight_route_list_len = len(flight_route_list)
														
 
															+    route_len_hot = len(vj_flight_route_list_hot)
														
 
															+    route_len_nothot = len(vj_flight_route_list_nothot)
														
 
															+
														
 
															+    # 调试代码
														
 
															+    # s = 38   # 菲律宾2025-12-08是节假日 s=38 选到马尼拉 
														
 
															+    # flight_route_list = vj_flight_route_list_hot[:0] + vj_flight_route_list_nothot[s:]
														
 
															+    # flight_route_list_len = len(flight_route_list)
														
 
															+    # route_len_hot = len(vj_flight_route_list_hot[:0])
														
 
															+    # route_len_nothot = len(vj_flight_route_list_nothot[s:])
														
 
															+    
														
 
															+    if local_rank == 0:
														
 
															+        print(f"flight_route_list_len:{flight_route_list_len}")
														
 
															+        print(f"route_len_hot:{route_len_hot}")
														
 
															+        print(f"route_len_nothot:{route_len_nothot}")
														
 
															+    
														
 
															+    chunks = chunk_list(flight_route_list, group_size)
														
 
															+
														
 
															+    for idx, group_route_list in enumerate(chunks, start=0):
														
 
															+        # 特殊处理，跳过不好的批次
														
 
															+        pass
														
 
															+        redis_client.set(lock_key, 0)
														
 
															+        redis_client.set(barrier_key, 0)
														
 
															+        # 所有 Rank 同步的标志变量
														
 
															+        valid_batch = torch.tensor([1], dtype=torch.int, device=device)  # 1表示有效批次
														
 
															+
														
 
															+        # 仅在 rank == 0 时要做的
														
 
															+        if rank == 0:
														
 
															+            # Rank0 设置 Redis 锁 key 的初始值为 0，表示数据加载尚未完成
														
 
															+            redis_client.set(lock_key, 0)
														
 
															+            print("rank0 开始数据加载...")
														
 
															+            # 使用默认配置
														
 
															+            client, db = mongo_con_parse()
														
 
															+            print(f"第 {idx} 组 :", group_route_list)
														
 
															+
														
 
															+            # 根据索引位置决定是 热门 还是 冷门
														
 
															+            if 0 <= idx < route_len_hot:
														
 
															+                is_hot = 1
														
 
															+                table_name = CLEAN_VJ_HOT_NEAR_INFO_TAB
														
 
															+            elif route_len_hot <= idx < route_len_hot + route_len_nothot:
														
 
															+                is_hot = 0
														
 
															+                table_name = CLEAN_VJ_NOTHOT_NEAR_INFO_TAB
														
 
															+            else:
														
 
															+                print(f"无法确定热门还是冷门, 跳过此批次。")
														
 
															+                continue_before_process(redis_client, lock_key)
														
 
															+                continue
														
 
															+            
														
 
															+            # 加载训练数据
														
 
															+            start_time = time.time()
														
 
															+            df_train = load_train_data(db, group_route_list, table_name, date_begin, date_end, output_dir, is_hot)
														
 
															+            end_time = time.time()
														
 
															+            run_time = round(end_time - start_time, 3)
														
 
															+            print(f"用时: {run_time} 秒")
														
 
															+
														
 
															+            client.close()
														
 
															+
														
 
															+            if df_train.empty:
														
 
															+                print(f"训练数据为空，跳过此批次。")
														
 
															+                continue_before_process(redis_client, lock_key)
														
 
															+                continue
														
 
															+            
														
 
															+            # 数据预处理
														
 
															+            df_train_inputs = preprocess_data(df_train, features, categorical_features, is_training=True)
														
 
															+            pass
														
 
															+
														
 
															+        else:
														
 
															+            pass
														
 
															+
														
 
															+
														
 
															+
														
 
															+if __name__ == "__main__":  # Entry guard: the call below runs only when this file is executed as a script, not when it is imported.
														
 
															+    start_train()  # Invoke start_train() with no arguments.