há 1 mês atrás · 977b8a0867
--- a/data_loader.py
+++ b/data_loader.py
@@ -6,6 +6,8 @@ from pymongo.errors import PyMongoError, ServerSelectionTimeoutError
 
															 import pandas as pd

														
 
															 import os

														
 
															 import random

														
 
															+import threading

														
 
															+from concurrent.futures import ThreadPoolExecutor, as_completed

														
 
															 import numpy as np

														
 
															 import matplotlib.pyplot as plt

														
 
															 from matplotlib import font_manager

														
@@ -67,7 +69,7 @@ def test_mongo_connection(db):
 
															 def query_flight_range_status(db, table_name, from_city, to_city, dep_date_begin, dep_date_end, flight_nums, 

														
 
															-                              limit=0, max_retries=3, base_sleep=1.0):

														
 
															+                              limit=0, max_retries=3, base_sleep=1.0, thread_id=0):

														
 
															     """

														
 
															     从指定表(4类)查询数据(指定起飞天的范围) （失败自动重试）

														
 
															     """

														
@@ -132,7 +134,7 @@ def query_flight_range_status(db, table_name, from_city, to_city, dep_date_begin
 
															                 # 1️⃣ 展开 segments

														
 
															                 print(f"📊 开始扩展segments 稍等...")

														
 
															                 t1 = time.time()

														
 
															-                df = expand_segments_columns(df)

														
 
															+                df = expand_segments_columns_optimized(df)  # 改为调用优化版

														
 
															                 t2 = time.time()

														
 
															                 rt = round(t2 - t1, 3)

														
 
															                 print(f"用时: {rt} 秒")

														
@@ -157,73 +159,148 @@ def query_flight_range_status(db, table_name, from_city, to_city, dep_date_begin
 
															             time.sleep(sleep_time)

														
 
															-def expand_segments_columns(df):

														
 
															-    """展开 segments"""

														
 
															+# def expand_segments_columns(df):

														
 
															+#     """展开 segments"""

														
 
															+#     df = df.copy()

														
 
															+

														
 
															+#     # 定义要展开的列

														
 
															+#     seg1_cols = ['flight_number', 'dep_air_port', 'dep_time', 'arr_air_port', 'arr_time', 'cabin', 'baggage']

														
 
															+#     seg2_cols = ['flight_number', 'dep_air_port', 'dep_time', 'arr_air_port', 'arr_time']

														
 
															+

														
 
															+#     # 定义 apply 函数一次返回字典

														
 
															+#     def extract_segments(row):

														
 
															+#         segments = row.get('segments')

														
 
															+#         result = {}

														
 
															+#         # 默认缺失使用 pd.NA（对字符串友好）

														
 
															+#         missing = pd.NA

														
 
															+#         if isinstance(segments, list):

														
 
															+#             # 第一段

														
 
															+#             if len(segments) >= 1 and isinstance(segments[0], dict):

														
 
															+#                 for col in seg1_cols:

														
 
															+#                     result[f'seg1_{col}'] = segments[0].get(col)

														
 
															+#             else:

														
 
															+#                 for col in seg1_cols:

														
 
															+#                     result[f'seg1_{col}'] = missing

														
 
															+#             # 第二段

														
 
															+#             if len(segments) >= 2 and isinstance(segments[1], dict):

														
 
															+#                 for col in seg2_cols:

														
 
															+#                     result[f'seg2_{col}'] = segments[1].get(col)

														
 
															+#             else:

														
 
															+#                 for col in seg2_cols:

														
 
															+#                     result[f'seg2_{col}'] = missing

														
 
															+#         else:

														
 
															+#             # segments 不是 list，全都置空

														
 
															+#             for col in seg1_cols:

														
 
															+#                 result[f'seg1_{col}'] = missing

														
 
															+#             for col in seg2_cols:

														
 
															+#                 result[f'seg2_{col}'] = missing

														
 
															+

														
 
															+#         return pd.Series(result)

														
 
															+

														
 
															+#     # 一次 apply

														
 
															+#     df_segments = df.apply(extract_segments, axis=1)

														
 
															+

														
 
															+#     # 拼回原 df

														
 
															+#     df = pd.concat([df.drop(columns=['segments'], errors='ignore'), df_segments], axis=1)

														
 
															+

														
 
															+#     # 统一转换时间字段为 datetime

														
 
															+#     time_cols = [

														
 
															+#         'seg1_dep_time', 'seg1_arr_time',

														
 
															+#         'seg2_dep_time', 'seg2_arr_time'

														
 
															+#     ]

														
 
															+#     for col in time_cols:

														
 
															+#         if col in df.columns:

														
 
															+#             df[col] = pd.to_datetime(

														
 
															+#                 df[col],

														
 
															+#                 format='%Y%m%d%H%M%S',

														
 
															+#                 errors='coerce'

														
 
															+#             )

														
 
															+

														
 
															+#     # 站点来源 -> 是否近期

														
 
															+#     df['source_website'] = np.where(

														
 
															+#         df['source_website'].str.contains('7_30'),

														
 
															+#         0,  # 远期 -> 0

														
 
															+#         np.where(df['source_website'].str.contains('0_7'),

														
 
															+#                  1,  # 近期 -> 1

														
 
															+#                  df['source_website'])  # 其他情况保持原值

														
 
															+#     )

														
 
															+

														
 
															+#     # 行李配额字符 -> 数字

														
 
															+#     conditions = [

														
 
															+#         df['seg1_baggage'] == '-;-;-;-',

														
 
															+#         df['seg1_baggage'] == '1-20',

														
 
															+#         df['seg1_baggage'] == '1-30',

														
 
															+#         df['seg1_baggage'] == '1-40',

														
 
															+#     ]

														
 
															+#     choices = [0, 20, 30, 40]

														
 
															+#     df['seg1_baggage'] = np.select(conditions, choices, default=df['seg1_baggage'])

														
 
															+

														
 
															+#     # 重命名字段

														
 
															+#     df = df.rename(columns={

														
 
															+#         'seg1_cabin': 'cabin',

														
 
															+#         'seg1_baggage': 'baggage',

														
 
															+#         'source_website': 'is_near',

														
 
															+#     })

														
 
															+

														
 
															+#     return df

														
 
															+

														
 
															+def expand_segments_columns_optimized(df):

														
 
															+    """优化版的展开segments函数（避免逐行apply）"""

														
 
															+    if df.empty:

														
 
															+        return df

														
 
															+    

														
 
															     df = df.copy()

														
 
															-    # 定义要展开的列

														
 
															-    seg1_cols = ['flight_number', 'dep_air_port', 'dep_time', 'arr_air_port', 'arr_time', 'cabin', 'baggage']

														
 
															-    seg2_cols = ['flight_number', 'dep_air_port', 'dep_time', 'arr_air_port', 'arr_time']

														
 
															-

														
 
															-    # 定义 apply 函数一次返回字典

														
 
															-    def extract_segments(row):

														
 
															-        segments = row.get('segments')

														
 
															-        result = {}

														
 
															-        # 默认缺失使用 pd.NA（对字符串友好）

														
 
															-        missing = pd.NA

														
 
															-        if isinstance(segments, list):

														
 
															-            # 第一段

														
 
															-            if len(segments) >= 1 and isinstance(segments[0], dict):

														
 
															+    # 直接操作segments列表，避免逐行apply

														
 
															+    if 'segments' in df.columns:

														
 
															+        # 提取第一段信息

														
 
															+        seg1_cols = ['flight_number', 'dep_air_port', 'dep_time', 'arr_air_port', 'arr_time', 'cabin', 'baggage']

														
 
															+        # 提取第二段信息

														
 
															+        seg2_cols = ['flight_number', 'dep_air_port', 'dep_time', 'arr_air_port', 'arr_time']

														
 
															+    

														
 
															+        # 使用列表推导式替代apply，大幅提升性能

														
 
															+        seg1_data = []

														
 
															+        seg2_data = []

														
 
															+

														
 
															+        for segments in df['segments']:

														
 
															+            seg1_dict = {}

														
 
															+            seg2_dict = {}

														
 
															+

														
 
															+            if isinstance(segments, list) and len(segments) >= 1 and isinstance(segments[0], dict):

														
 
															                 for col in seg1_cols:

														
 
															-                    result[f'seg1_{col}'] = segments[0].get(col)

														
 
															+                    seg1_dict[f'seg1_{col}'] = segments[0].get(col)

														
 
															             else:

														
 
															                 for col in seg1_cols:

														
 
															-                    result[f'seg1_{col}'] = missing

														
 
															-            # 第二段

														
 
															-            if len(segments) >= 2 and isinstance(segments[1], dict):

														
 
															+                    seg1_dict[f'seg1_{col}'] = pd.NA

														
 
															+            

														
 
															+            if isinstance(segments, list) and len(segments) >= 2 and isinstance(segments[1], dict):

														
 
															                 for col in seg2_cols:

														
 
															-                    result[f'seg2_{col}'] = segments[1].get(col)

														
 
															+                    seg2_dict[f'seg2_{col}'] = segments[1].get(col)

														
 
															             else:

														
 
															                 for col in seg2_cols:

														
 
															-                    result[f'seg2_{col}'] = missing

														
 
															-        else:

														
 
															-            # segments 不是 list，全都置空

														
 
															-            for col in seg1_cols:

														
 
															-                result[f'seg1_{col}'] = missing

														
 
															-            for col in seg2_cols:

														
 
															-                result[f'seg2_{col}'] = missing

														
 
															-

														
 
															-        return pd.Series(result)

														
 
															+                    seg2_dict[f'seg2_{col}'] = pd.NA

														
 
															+                    

														
 
															+            seg1_data.append(seg1_dict)

														
 
															+            seg2_data.append(seg2_dict)

														
 
															-    # 一次 apply

														
 
															-    df_segments = df.apply(extract_segments, axis=1)

														
 
															+        # 创建DataFrame

														
 
															+        df_seg1 = pd.DataFrame(seg1_data, index=df.index)

														
 
															+        df_seg2 = pd.DataFrame(seg2_data, index=df.index)

														
 
															-    # 拼回原 df

														
 
															-    df = pd.concat([df.drop(columns=['segments'], errors='ignore'), df_segments], axis=1)

														
 
															+        # 合并到原DataFrame

														
 
															+        df = pd.concat([df.drop(columns=['segments'], errors='ignore'), df_seg1, df_seg2], axis=1)

														
 
															-    # 统一转换时间字段为 datetime

														
 
															-    time_cols = [

														
 
															-        'seg1_dep_time', 'seg1_arr_time',

														
 
															-        'seg2_dep_time', 'seg2_arr_time'

														
 
															-    ]

														
 
															+    # 后续处理保持不变

														
 
															+    time_cols = ['seg1_dep_time', 'seg1_arr_time', 'seg2_dep_time', 'seg2_arr_time']

														
 
															     for col in time_cols:

														
 
															         if col in df.columns:

														
 
															-            df[col] = pd.to_datetime(

														
 
															-                df[col],

														
 
															-                format='%Y%m%d%H%M%S',

														
 
															-                errors='coerce'

														
 
															-            )

														
 
															-

														
 
															-    # 站点来源 -> 是否近期

														
 
															+            df[col] = pd.to_datetime(df[col], format='%Y%m%d%H%M%S', errors='coerce')

														
 
															+    

														
 
															     df['source_website'] = np.where(

														
 
															-        df['source_website'].str.contains('7_30'),

														
 
															-        0,  # 远期 -> 0

														
 
															-        np.where(df['source_website'].str.contains('0_7'),

														
 
															-                 1,  # 近期 -> 1

														
 
															-                 df['source_website'])  # 其他情况保持原值

														
 
															+        df['source_website'].str.contains('7_30'), 0,

														
 
															+        np.where(df['source_website'].str.contains('0_7'), 1, df['source_website'])

														
 
															     )

														
 
															-    # 行李配额字符 -> 数字

														
 
															     conditions = [

														
 
															         df['seg1_baggage'] == '-;-;-;-',

														
 
															         df['seg1_baggage'] == '1-20',

														
@@ -233,13 +310,12 @@ def expand_segments_columns(df):
 
															     choices = [0, 20, 30, 40]

														
 
															     df['seg1_baggage'] = np.select(conditions, choices, default=df['seg1_baggage'])

														
 
															-    # 重命名字段

														
 
															     df = df.rename(columns={

														
 
															         'seg1_cabin': 'cabin',

														
 
															         'seg1_baggage': 'baggage',

														
 
															         'source_website': 'is_near',

														
 
															     })

														
 
															-

														
 
															+    

														
 
															     return df

														
@@ -564,152 +640,217 @@ def plot_c12_trend(df, output_dir="."):
 
															     plt.close(fig)

														
 
															-def load_train_data(db, flight_route_list, table_name, date_begin, date_end, output_dir='.', is_hot=1):

														
 
															-    """加载训练数据"""

														
 
															+def process_flight_group(args):

														
 
															+    """处理单个航班号的线程函数（独立数据库连接）"""

														
 
															+    thread_id, db_config, each_group, from_city, to_city, date_begin_s, date_end_s, is_hot, plot_flag, output_dir = args

														
 
															+    flight_nums = each_group.get("flight_numbers")

														
 
															+    details = each_group.get("details")

														
 
															+

														
 
															+    print(f"[线程{thread_id}] 开始处理航班号: {flight_nums}")

														
 
															+

														
 
															+    # 为每个线程创建独立的数据库连接

														
 
															+    try:

														
 
															+        client, db = mongo_con_parse(db_config)

														
 
															+        print(f"[线程{thread_id}] ✅ 数据库连接创建成功")

														
 
															+    except Exception as e:

														
 
															+        print(f"[线程{thread_id}] ❌ 数据库连接创建失败: {e}")

														
 
															+        return pd.DataFrame()

														
 
															+

														
 
															+    try:

														
 
															+        # 查询远期表

														
 
															+        if is_hot == 1:

														
 
															+            df1 = query_flight_range_status(db, CLEAN_VJ_HOT_FAR_INFO_TAB, from_city, to_city,

														
 
															+                                            date_begin_s, date_end_s, flight_nums)

														
 
															+        else:

														
 
															+            df1 = query_flight_range_status(db, CLEAN_VJ_NOTHOT_FAR_INFO_TAB, from_city, to_city,

														
 
															+                                            date_begin_s, date_end_s, flight_nums)

														
 
															+        

														
 
															+        # 保证远期表里有数据

														
 
															+        if df1.empty:

														
 
															+            print(f"[线程{thread_id}] 航班号:{flight_nums} 远期表无数据, 跳过")

														
 
															+            return pd.DataFrame()

														
 
															+        

														
 
															+        # 查询近期表

														
 
															+        if is_hot == 1:

														
 
															+            df2 = query_flight_range_status(db, CLEAN_VJ_HOT_NEAR_INFO_TAB, from_city, to_city,

														
 
															+                                            date_begin_s, date_end_s, flight_nums)

														
 
															+        else:

														
 
															+            df2 = query_flight_range_status(db, CLEAN_VJ_NOTHOT_NEAR_INFO_TAB, from_city, to_city,

														
 
															+                                            date_begin_s, date_end_s, flight_nums)

														
 
															+            

														
 
															+        # 保证近期表里有数据

														
 
															+        if df2.empty:

														
 
															+            print(f"[线程{thread_id}] 航班号:{flight_nums} 近期表无数据, 跳过")

														
 
															+            return pd.DataFrame()

														
 
															+        

														
 
															+        # 起飞天数、行李配额以近期表的为主

														
 
															+        if df2.empty:

														
 
															+            common_dep_dates = []

														
 
															+            common_baggages = []

														
 
															+        else:

														
 
															+            common_dep_dates = df2['search_dep_time'].unique()

														
 
															+            common_baggages = df2['baggage'].unique()

														
 
															+

														
 
															+        list_mid = []

														
 
															+        for dep_date in common_dep_dates:

														
 
															+            # 起飞日期筛选

														
 
															+            df_d1 = df1[df1["search_dep_time"] == dep_date].copy()

														
 
															+            if not df_d1.empty:

														
 
															+                for col in ["seg1_dep_time", "seg1_arr_time", "seg2_dep_time", "seg2_arr_time"]:

														
 
															+                    mode_series_1 = df_d1[col].mode()

														
 
															+                    if mode_series_1.empty:

														
 
															+                        zong_1 = pd.NaT

														
 
															+                    else:

														
 
															+                        zong_1 = mode_series_1.iloc[0]

														
 
															+                    df_d1[col] = zong_1

														
 
															+

														
 
															+            df_d2 = df2[df2["search_dep_time"] == dep_date].copy()

														
 
															+            if not df_d2.empty:

														
 
															+                for col in ["seg1_dep_time", "seg1_arr_time", "seg2_dep_time", "seg2_arr_time"]:

														
 
															+                    mode_series_2 = df_d2[col].mode()

														
 
															+                    if mode_series_2.empty:

														
 
															+                        zong_2 = pd.NaT

														
 
															+                    else:

														
 
															+                        zong_2 = mode_series_2.iloc[0]

														
 
															+                    df_d2[col] = zong_2

														
 
															+

														
 
															+            list_12 = []

														
 
															+            for baggage in common_baggages:

														
 
															+                # 行李配额筛选

														
 
															+                df_b1 = df_d1[df_d1["baggage"] == baggage].copy()

														
 
															+                df_b2 = df_d2[df_d2["baggage"] == baggage].copy()

														
 
															+

														
 
															+                # 合并前检查是否都有数据

														
 
															+                if df_b1.empty and df_b2.empty:

														
 
															+                    print(f"[线程{thread_id}] ⚠️ dep_date:{dep_date}, baggage:{baggage} 远期表和近期表都为空，跳过")

														
 
															+                    continue

														
 
															+

														
 
															+                cols = ["seg1_flight_number", "seg1_dep_air_port", "seg1_arr_air_port",

														
 
															+                        "seg2_flight_number", "seg2_dep_air_port", "seg2_arr_air_port"]

														
 
															+                df_b1[cols] = df_b1[cols].astype("string")

														
 
															+                df_b2[cols] = df_b2[cols].astype("string")

														
 
															+

														
 
															+                df_b12 = pd.concat([df_b1, df_b2]).reset_index(drop=True)

														
 
															+                # print(f"📊 dep_date:{dep_date}, baggage:{baggage} 已将远期表和近期表合并，形状: {df_b12.shape}")

														
 
															+                df_b12 = fill_hourly_crawl_date(df_b12, rear_fill=2)

														
 
															+                # print(f"📊 dep_date:{dep_date}, baggage:{baggage} 已合并且补齐为完整小时序列，形状: {df_b12.shape}")

														
 
															+                list_12.append(df_b12)

														
 
															+

														
 
															+                del df_b12

														
 
															+                del df_b2

														
 
															+                del df_b1

														
 
															+

														
 
															+            if list_12:

														
 
															+                df_c12 = pd.concat(list_12, ignore_index=True)

														
 
															+                if plot_flag:

														
 
															+                    print(f"[线程{thread_id}] ✅ dep_date:{dep_date}, 所有 baggage 数据合并完成，总形状: {df_c12.shape}")

														
 
															+                    plot_c12_trend(df_c12, output_dir)

														
 
															+                    print(f"[线程{thread_id}] ✅ dep_date:{dep_date}, 所有 baggage 数据绘图完成")

														
 
															+            else:

														
 
															+                df_c12 = pd.DataFrame()

														
 
															+                if plot_flag:

														
 
															+                    print(f"[线程{thread_id}] ⚠️ dep_date:{dep_date}, 所有 baggage 数据合并为空")

														
 
															+

														
 
															+            del list_12

														
 
															+            list_mid.append(df_c12)

														
 
															+

														
 
															+            del df_c12

														
 
															+            del df_d1

														
 
															+            del df_d2

														
 
															+            # print(f"结束处理起飞日期: {dep_date}")

														
 
															+

														
 
															+        if list_mid:

														
 
															+            df_mid = pd.concat(list_mid, ignore_index=True)

														
 
															+            print(f"[线程{thread_id}] ✅ 航班号:{flight_nums} 所有 起飞日期 数据合并完成，总形状: {df_mid.shape}")

														
 
															+        else:

														
 
															+            df_mid = pd.DataFrame()

														
 
															+            print(f"[线程{thread_id}] ⚠️ 航班号:{flight_nums} 所有 起飞日期 数据合并为空")

														
 
															+        

														
 
															+        del list_mid

														
 
															+        del df1

														
 
															+        del df2

														
 
															+        gc.collect()

														
 
															+        print(f"[线程{thread_id}] 结束处理航班号: {flight_nums}")

														
 
															+        return df_mid

														
 
															+    

														
 
															+    except Exception as e:

														
 
															+        print(f"[线程{thread_id}] ❌ 处理航班号:{flight_nums} 时发生异常: {e}")

														
 
															+        return pd.DataFrame()

														
 
															+    finally:

														
 
															+        # 确保关闭数据库连接

														
 
															+        try:

														
 
															+            client.close()

														
 
															+            print(f"[线程{thread_id}] ✅ 数据库连接已关闭")

														
 
															+        except:

														
 
															+            pass

														
 
															+

														
 
															+

														
 
															+def load_train_data(db_config, flight_route_list, table_name, date_begin, date_end, output_dir='.', is_hot=1, plot_flag=False,

														
 
															+                    use_multithread=False, max_workers=None):

														
 
															+    """加载训练数据（支持多线程）"""

														
 
															     timestamp_str = datetime.now().strftime("%Y%m%d%H%M%S")

														
 
															     date_begin_s = datetime.strptime(date_begin, "%Y-%m-%d").strftime("%Y%m%d")  # 查询时的格式

														
 
															     date_end_s = datetime.strptime(date_end, "%Y-%m-%d").strftime("%Y%m%d")

														
 
															     list_all = []

														
 
															+

														
 
															     # 每一航线对

														
 
															     for flight_route in flight_route_list:

														
 
															         from_city = flight_route.split('-')[0]

														
 
															         to_city = flight_route.split('-')[1]

														
 
															         route = f"{from_city}-{to_city}"

														
 
															         print(f"开始处理航线: {route}")

														
 
															-        all_groups = query_groups_of_city_code(db, from_city, to_city, table_name)

														
 
															-        all_groups_len = len(all_groups)

														
 
															-        print(f"该航线共有{all_groups_len}个航班号")

														
 
															-        # 每一组航班号

														
 
															-        for each_group in all_groups:

														
 
															-            flight_nums = each_group.get("flight_numbers")

														
 
															-            print(f"开始处理航班号: {flight_nums}")

														
 
															-            details = each_group.get("details")

														
 
															-

														
 
															-            print(f"查远期表")

														
 
															-            if is_hot == 1:

														
 
															-                df1 = query_flight_range_status(db, CLEAN_VJ_HOT_FAR_INFO_TAB, from_city, to_city,

														
 
															-                                                date_begin_s, date_end_s, flight_nums)

														
 
															-            else:

														
 
															-                df1 = query_flight_range_status(db, CLEAN_VJ_NOTHOT_FAR_INFO_TAB, from_city, to_city,

														
 
															-                                                date_begin_s, date_end_s, flight_nums)

														
 
															-

														
 
															-            # 保证远期表里有数据

														
 
															-            if df1.empty:

														
 
															-                print(f"航班号:{flight_nums} 远期表无数据, 跳过")

														
 
															-                continue

														
 
															-

														
 
															-            print(f"查近期表")

														
 
															-            if is_hot == 1:

														
 
															-                df2 = query_flight_range_status(db, CLEAN_VJ_HOT_NEAR_INFO_TAB, from_city, to_city,

														
 
															-                                                date_begin_s, date_end_s, flight_nums)

														
 
															-            else:

														
 
															-                df2 = query_flight_range_status(db, CLEAN_VJ_NOTHOT_NEAR_INFO_TAB, from_city, to_city,

														
 
															-                                                date_begin_s, date_end_s, flight_nums)

														
 
															-

														
 
															-            # 保证近期表里有数据

														
 
															-            if df2.empty:

														
 
															-                print(f"航班号:{flight_nums} 近期表无数据, 跳过")

														
 
															-                continue

														
 
															-

														
 
															-            # 起飞天数、行李配额以近期表的为主

														
 
															-            if df2.empty:

														
 
															-                common_dep_dates = []

														
 
															-                common_baggages = []

														
 
															-            else:

														
 
															-                common_dep_dates = df2['search_dep_time'].unique()

														
 
															-                common_baggages = df2['baggage'].unique()

														
 
															-

														
 
															-            list_mid = []

														
 
															-            for dep_date in common_dep_dates:

														
 
															-                # 起飞日期筛选

														
 
															-                df_d1 = df1[df1["search_dep_time"] == dep_date].copy()

														
 
															-                if not df_d1.empty:

														
 
															-                    for col in ["seg1_dep_time", "seg1_arr_time", "seg2_dep_time", "seg2_arr_time"]:

														
 
															-                        mode_series_1 = df_d1[col].mode()

														
 
															-                        if mode_series_1.empty:

														
 
															-                            # 如果整个列都是 NaT，则众数为空，直接赋 NaT

														
 
															-                            zong_1 = pd.NaT

														
 
															-                        else:

														
 
															-                            zong_1 = mode_series_1.iloc[0]

														
 
															-                        df_d1[col] = zong_1

														
 
															-

														
 
															-                df_d2 = df2[df2["search_dep_time"] == dep_date].copy()

														
 
															-                if not df_d2.empty:

														
 
															-                    for col in ["seg1_dep_time", "seg1_arr_time", "seg2_dep_time", "seg2_arr_time"]:

														
 
															-                        mode_series_2 = df_d2[col].mode()

														
 
															-                        if mode_series_2.empty:

														
 
															-                            # 如果整个列都是 NaT，则众数为空，直接赋 NaT

														
 
															-                            zong_2 = pd.NaT

														
 
															-                        else:

														
 
															-                            zong_2 = mode_series_2.iloc[0]

														
 
															-                        df_d2[col] = zong_2

														
 
															-

														
 
															-                list_12 = []

														
 
															-                for baggage in common_baggages:

														
 
															-                    # 行李配额筛选

														
 
															-                    df_b1 = df_d1[df_d1["baggage"] == baggage].copy()

														
 
															-                    df_b2 = df_d2[df_d2["baggage"] == baggage].copy()

														
 
															-

														
 
															-                    # 合并前检查是否都有数据

														
 
															-                    if df_b1.empty and df_b2.empty:

														
 
															-                        print(f"⚠️ dep_date:{dep_date}, baggage:{baggage} 远期表和近期表都为空，跳过")

														
 
															-                        continue

														
 
															-

														
 
															-                    cols = ["seg1_flight_number", "seg1_dep_air_port", "seg1_arr_air_port",

														
 
															-                            "seg2_flight_number", "seg2_dep_air_port", "seg2_arr_air_port"]

														
 
															-                    # df_b1 = df_b1.copy()

														
 
															-                    # df_b2 = df_b2.copy()

														
 
															-                    df_b1[cols] = df_b1[cols].astype("string")

														
 
															-                    df_b2[cols] = df_b2[cols].astype("string")

														
 
															-

														
 
															-                    df_b12 = pd.concat([df_b1, df_b2]).reset_index(drop=True)

														
 
															-                    # print(f"📊 dep_date:{dep_date}, baggage:{baggage} 已将远期表和近期表合并，形状: {df_b12.shape}")

														
 
															-                    df_b12 = fill_hourly_crawl_date(df_b12, rear_fill=2)

														
 
															-                    # print(f"📊 dep_date:{dep_date}, baggage:{baggage} 已合并且补齐为完整小时序列，形状: {df_b12.shape}")

														
 
															-                    # print(df_b12.dtypes)

														
 
															-                    list_12.append(df_b12)

														
 
															-                    del df_b12

														
 
															-                    del df_b2

														
 
															-                    del df_b1

														
 
															-

														
 
															-                if list_12:

														
 
															-                    df_c12 = pd.concat(list_12, ignore_index=True)

														
 
															-                    # print(f"✅ dep_date:{dep_date}, 所有 baggage 数据合并完成，总形状: {df_c12.shape}")

														
 
															-                    # plot_c12_trend(df_c12, output_dir)

														
 
															-                    # print(f"✅ dep_date:{dep_date}, 所有 baggage 数据绘图完成")

														
 
															-                else:

														
 
															-                    df_c12 = pd.DataFrame()

														
 
															-                    # print(f"⚠️ dep_date:{dep_date}, 所有 baggage 数据合并为空")

														
 
															-

														
 
															-                del list_12

														
 
															-                list_mid.append(df_c12)

														
 
															-

														
 
															-                del df_c12

														
 
															-                del df_d1

														
 
															-                del df_d2

														
 
															-

														
 
															-                # print(f"结束处理起飞日期: {dep_date}")

														
 
															-

														
 
															-            if list_mid:

														
 
															-                df_mid = pd.concat(list_mid, ignore_index=True)

														
 
															-                print(f"✅ 航班号:{flight_nums} 所有 起飞日期 数据合并完成，总形状: {df_mid.shape}")

														
 
															-            else:

														
 
															-                df_mid = pd.DataFrame()

														
 
															-                print(f"⚠️ 航班号:{flight_nums} 所有 起飞日期 数据合并为空")

														
 
															-            del list_mid

														
 
															-            list_all.append(df_mid)

														
 
															+        # 在主线程中查询航班号分组（避免多线程重复查询）

														
 
															+        main_client, main_db = mongo_con_parse(db_config)

														
 
															+        all_groups = query_groups_of_city_code(main_db, from_city, to_city, table_name)

														
 
															+        main_client.close()

														
 
															-            del df1

														
 
															-            del df2

														
 
															-            

														
 
															-            # output_path = os.path.join(output_dir, f"./{route}_{timestamp_str}.csv")

														
 
															-            # df_mid.to_csv(output_path, index=False, encoding="utf-8-sig", mode="a", header=not os.path.exists(output_path))

														
 
															+        all_groups_len = len(all_groups)

														
 
															+        print(f"该航线共有{all_groups_len}个航班号")

														
 
															+        

														
 
															+        if use_multithread and all_groups_len > 1:

														
 
															+            print(f"启用多线程处理，最大线程数: {max_workers}")

														
 
															+            # 多线程处理

														
 
															+            thread_args = []

														
 
															+            thread_id = 0

														
 
															+            for each_group in all_groups:

														
 
															+                thread_id += 1

														
 
															+                args = (thread_id, db_config, each_group, from_city, to_city, date_begin_s, date_end_s, is_hot, plot_flag, output_dir)

														
 
															+                thread_args.append(args)

														
 
															-            del df_mid

														
 
															-            gc.collect()

														
 
															-            print(f"结束处理航班号: {flight_nums}")

														
 
															+            with ThreadPoolExecutor(max_workers=max_workers) as executor:

														
 
															+                future_to_group = {executor.submit(process_flight_group, args): each_group for args, each_group in zip(thread_args, all_groups)}

														
 
															+                

														
 
															+                for future in as_completed(future_to_group):

														
 
															+                    each_group = future_to_group[future]

														
 
															+                    flight_nums = each_group.get("flight_numbers", "未知")

														
 
															+                    try:

														
 
															+                        df_mid = future.result()

														
 
															+                        if not df_mid.empty:

														
 
															+                            list_all.append(df_mid)

														
 
															+                            print(f"✅ 航班号:{flight_nums} 处理完成")

														
 
															+                        else:

														
 
															+                            print(f"⚠️ 航班号:{flight_nums} 处理结果为空")

														
 
															+                    except Exception as e:

														
 
															+                        print(f"❌ 航班号:{flight_nums} 处理异常: {e}")

														
 
															+        else:

														
 
															+            # 单线程处理（线程编号为0）

														
 
															+            print("使用单线程处理")

														
 
															+            thread_id = 0

														
 
															+            for each_group in all_groups:

														
 
															+                args = (thread_id, db_config, each_group, from_city, to_city, date_begin_s, date_end_s, is_hot, plot_flag, output_dir)

														
 
															+                flight_nums = each_group.get("flight_numbers", "未知")

														
 
															+                try:

														
 
															+                    df_mid = process_flight_group(args)

														
 
															+                    if not df_mid.empty:

														
 
															+                        list_all.append(df_mid)

														
 
															+                        print(f"✅ 航班号:{flight_nums} 处理完成")

														
 
															+                    else:

														
 
															+                        print(f"⚠️ 航班号:{flight_nums} 处理结果为空")

														
 
															+                except Exception as e:

														
 
															+                    print(f"❌ 航班号:{flight_nums} 处理异常: {e}")

														
 
															+                

														
 
															         print(f"结束处理航线: {from_city}-{to_city}")

														
 
															     if list_all:

														
@@ -828,7 +969,7 @@ def validate_one_line(db, city_pair, flight_day, flight_number_1, flight_number_
 
															                 # 1️⃣ 展开 segments

														
 
															                 print(f"📊 开始扩展segments 稍等...")

														
 
															                 t1 = time.time()

														
 
															-                df = expand_segments_columns(df)

														
 
															+                df = expand_segments_columns_optimized(df)

														
 
															                 t2 = time.time()

														
 
															                 rt = round(t2 - t1, 3)

														
 
															                 print(f"用时: {rt} 秒")

														
@@ -856,46 +997,51 @@ def validate_one_line(db, city_pair, flight_day, flight_number_1, flight_number_
 
															 if __name__ == "__main__":

														
 
															     # test_mongo_connection(db)

														
 
															-

														
 
															-    # output_dir = f"./output"

														
 
															-    # os.makedirs(output_dir, exist_ok=True)

														
 
															-

														
 
															-    # # 加载热门航线数据

														
 
															-    # date_begin = "2025-11-20"

														
 
															-    # date_end = datetime.today().strftime("%Y-%m-%d")

														
 
															-

														
 
															-    # flight_route_list = vj_flight_route_list_hot[0:]  # 热门 vj_flight_route_list_hot  冷门 vj_flight_route_list_nothot

														
 
															-    # table_name = CLEAN_VJ_HOT_NEAR_INFO_TAB  # 热门 CLEAN_VJ_HOT_NEAR_INFO_TAB  冷门 CLEAN_VJ_NOTHOT_NEAR_INFO_TAB

														
 
															-    # is_hot = 1   # 1 热门 0 冷门

														
 
															-    # group_size = 1

														
 
															-    # chunks = chunk_list(flight_route_list, group_size)

														
 
															-

														
 
															-    # for idx, group_route_list in enumerate(chunks, 1):

														
 
															-    #     # 使用默认配置

														
 
															-    #     client, db = mongo_con_parse()

														
 
															-    #     print(f"第 {idx} 组 :", group_route_list)

														
 
															-    #     start_time = time.time()

														
 
															-    #     load_train_data(db, group_route_list, table_name, date_begin, date_end, output_dir, is_hot)

														
 
															-    #     end_time = time.time()

														
 
															-    #     run_time = round(end_time - start_time, 3)

														
 
															-    #     print(f"用时: {run_time} 秒")

														
 
															-

														
 
															-    #     client.close()

														
 
															-    #     time.sleep(3)

														
 
															-

														
 
															-    # print("整体结束")

														
 
															-

														
 
															-    client, db = mongo_con_parse()

														
 
															-    list_flight_number_1 = query_all_flight_number(db, CLEAN_VJ_HOT_NEAR_INFO_TAB)

														
 
															-    list_flight_number_2 = query_all_flight_number(db, CLEAN_VJ_NOTHOT_NEAR_INFO_TAB)

														
 
															-

														
 
															-    list_flight_number_all = list_flight_number_1 + list_flight_number_2

														
 
															-    list_flight_number_all = list(set(list_flight_number_all))

														
 
															-    list_flight_number_all.sort()

														
 
															+    from utils import chunk_list_with_index

														
 
															+

														
 
															+    cpu_cores = os.cpu_count()  # 你的系统是72

														
 
															+    max_workers = min(16, cpu_cores)  # 最大不超过16个线程

														
 
															+

														
 
															+    output_dir = f"./output"

														
 
															+    os.makedirs(output_dir, exist_ok=True)

														
 
															+

														
 
															+    # 加载热门航线数据

														
 
															+    date_begin = "2025-12-07"

														
 
															+    date_end = datetime.today().strftime("%Y-%m-%d")

														
 
															+

														
 
															+    flight_route_list = vj_flight_route_list_hot[0:]  # 热门 vj_flight_route_list_hot  冷门 vj_flight_route_list_nothot

														
 
															+    table_name = CLEAN_VJ_HOT_NEAR_INFO_TAB  # 热门 CLEAN_VJ_HOT_NEAR_INFO_TAB  冷门 CLEAN_VJ_NOTHOT_NEAR_INFO_TAB

														
 
															+    is_hot = 1   # 1 热门 0 冷门

														
 
															+    group_size = 1

														
 
															+    chunks = chunk_list_with_index(flight_route_list, group_size)

														
 
															+

														
 
															+    for idx, (_, group_route_list) in enumerate(chunks, 1):

														
 
															+        # 使用默认配置

														
 
															+        # client, db = mongo_con_parse()

														
 
															+        print(f"第 {idx} 组 :", group_route_list)

														
 
															+        start_time = time.time()

														
 
															+        load_train_data(mongodb_config, group_route_list, table_name, date_begin, date_end, output_dir, is_hot, plot_flag=False,

														
 
															+                        use_multithread=False, max_workers=max_workers)

														
 
															+        end_time = time.time()

														
 
															+        run_time = round(end_time - start_time, 3)

														
 
															+        print(f"用时: {run_time} 秒")

														
 
															+

														
 
															+        # client.close()

														
 
															+        time.sleep(3)

														
 
															+

														
 
															+    print("整体结束")

														
 
															+

														
 
															+    # client, db = mongo_con_parse()

														
 
															+    # list_flight_number_1 = query_all_flight_number(db, CLEAN_VJ_HOT_NEAR_INFO_TAB)

														
 
															+    # list_flight_number_2 = query_all_flight_number(db, CLEAN_VJ_NOTHOT_NEAR_INFO_TAB)

														
 
															+

														
 
															+    # list_flight_number_all = list_flight_number_1 + list_flight_number_2

														
 
															+    # list_flight_number_all = list(set(list_flight_number_all))

														
 
															+    # list_flight_number_all.sort()

														
 
															-    print(list_flight_number_all)

														
 
															-    print(len(list_flight_number_all))

														
 
															+    # print(list_flight_number_all)

														
 
															+    # print(len(list_flight_number_all))

														
 
															-    flight_map = {v: i for i, v in enumerate(list_flight_number_all, start=1)}

														
 
															-    print(flight_map)

														
 
															+    # flight_map = {v: i for i, v in enumerate(list_flight_number_all, start=1)}

														
 
															+    # print(flight_map)

														
--- a/data_preprocess.py
+++ b/data_preprocess.py
@@ -10,7 +10,7 @@ from utils import insert_df_col
 
															 COUNTRY_HOLIDAYS = build_country_holidays(city_to_country)
														
 
															-def preprocess_data(df_input, features, categorical_features, is_training=True, current_n_hours=48):
														
 
															+def preprocess_data(df_input, features, categorical_features, is_training=True, current_n_hours=36):
														
 
															     print(">>> 开始数据预处理") 
														
 
															     # 生成 城市对
														
@@ -110,10 +110,10 @@ def preprocess_data(df_input, features, categorical_features, is_training=True,
 
															         .ngroup()
														
 
															     )
														
 
															-    # 做一下时间段裁剪, 保留起飞前480小时之内的
														
 
															-    df_input = df_input[df_input['hours_until_departure'] < 480].reset_index(drop=True)
														
 
															-    pass
														
 
															-
														
 
															+    # 做一下时间段裁剪, 保留起飞前480小时之内且大于等于4小时的
														
 
															+    df_input = df_input[(df_input['hours_until_departure'] < 480) & 
														
 
															+                        (df_input['hours_until_departure'] >= 4)].reset_index(drop=True)
														
 
															+    
														
 
															     # 在 gid 与 baggage 内按时间降序
														
 
															     df_input = df_input.sort_values(
														
 
															         by=['gid', 'baggage', 'hours_until_departure'],
														
@@ -161,6 +161,160 @@ def preprocess_data(df_input, features, categorical_features, is_training=True,
 
															     df_input = df_input[new_order]
														
 
															     pass
														
 
															+    print(">>> 计算价格区间特征")
														
 
															+    #  1. 基于绝对价格水平的价格区间划分
														
 
															+    # 先计算每个(gid, baggage)的价格统计特征
														
 
															+    # g = df_input.groupby(['gid', 'baggage'])
														
 
															+    price_stats = df_input.groupby(['gid', 'baggage'])['adult_total_price'].agg(
														
 
															+        min_price='min',
														
 
															+        max_price='max',
														
 
															+        mean_price='mean',
														
 
															+        std_price='std'
														
 
															+    ).reset_index()
														
 
															+
														
 
															+    # 合并统计特征到原数据
														
 
															+    df_input = df_input.merge(price_stats, on=['gid', 'baggage'], how='left')
														
 
															+
														
 
															+    # 2. 基于绝对价格的价格区间划分 （可以删除，因为后面有更精细的基于频率加权的分类）
														
 
															+    # # 高价区间：超过均值+1倍标准差
														
 
															+    # df_input['price_absolute_high'] = (df_input['adult_total_price'] > 
														
 
															+    #                                   (df_input['mean_price'] + df_input['std_price'])).astype(int)
														
 
															+
														
 
															+    # # 中高价区间：均值到均值+1倍标准差
														
 
															+    # df_input['price_absolute_mid_high'] = ((df_input['adult_total_price'] > df_input['mean_price']) & 
														
 
															+    #                                        (df_input['adult_total_price'] <= (df_input['mean_price'] + df_input['std_price']))).astype(int)
														
 
															+
														
 
															+    # # 中低价区间：均值-1倍标准差到均值
														
 
															+    # df_input['price_absolute_mid_low'] = ((df_input['adult_total_price'] > (df_input['mean_price'] - df_input['std_price'])) & 
														
 
															+    #                                       (df_input['adult_total_price'] <= df_input['mean_price'])).astype(int)
														
 
															+
														
 
															+    # # 低价区间：低于均值-1倍标准差
														
 
															+    # df_input['price_absolute_low'] = (df_input['adult_total_price'] <= (df_input['mean_price'] - df_input['std_price'])).astype(int)
														
 
															+
														
 
															+    # 3. 基于频率加权的价格百分位数（改进版）
														
 
															+    # 计算每个价格出现的频率
														
 
															+    price_freq = df_input.groupby(['gid', 'baggage', 'adult_total_price']).size().reset_index(name='price_frequency')
														
 
															+    df_input = df_input.merge(price_freq, on=['gid', 'baggage', 'adult_total_price'], how='left')
														
 
															+
														
 
															+    # 计算频率加权的百分位数
														
 
															+    def weighted_percentile(group):
														
 
															+        if len(group) == 0:
														
 
															+            return pd.Series([np.nan] * 4, index=['price_weighted_percentile_25', 
														
 
															+                                                'price_weighted_percentile_50', 
														
 
															+                                                'price_weighted_percentile_75', 
														
 
															+                                                'price_weighted_percentile_90'])
														
 
															+        
														
 
															+        # 按价格排序，计算累积频率
														
 
															+        group = group.sort_values('adult_total_price')
														
 
															+        group['cum_freq'] = group['price_frequency'].cumsum()
														
 
															+        total_freq = group['price_frequency'].sum()
														
 
															+        
														
 
															+        # 计算加权百分位数
														
 
															+        percentiles = []
														
 
															+        for p in [0.25, 0.5, 0.75, 0.9]:
														
 
															+            threshold = total_freq * p
														
 
															+            # 找到第一个累积频率超过阈值的价格
														
 
															+            mask = group['cum_freq'] >= threshold
														
 
															+            if mask.any():
														
 
															+                percentile_value = group.loc[mask.idxmax(), 'adult_total_price']
														
 
															+            else:
														
 
															+                percentile_value = group['adult_total_price'].max()
														
 
															+            percentiles.append(percentile_value)
														
 
															+        
														
 
															+        return pd.Series(percentiles, index=['price_weighted_percentile_25', 
														
 
															+                                             'price_weighted_percentile_50', 
														
 
															+                                             'price_weighted_percentile_75', 
														
 
															+                                             'price_weighted_percentile_90'])
														
 
															+        
														
 
															+    # 按gid和baggage分组计算加权百分位数
														
 
															+    weighted_percentiles = df_input.groupby(['gid', 'baggage']).apply(weighted_percentile).reset_index()
														
 
															+    df_input = df_input.merge(weighted_percentiles, on=['gid', 'baggage'], how='left')
														
 
															+
														
 
															+    # 4. 结合绝对价格和频率的综合判断(改进版)
														
 
															+    freq_median = df_input.groupby(['gid', 'baggage'])['price_frequency'].transform('median')
														
 
															+
														
 
															+    # 计算价格相对于90%百分位数的倍数，用于区分不同级别的高价
														
 
															+    df_input['price_relative_to_90p'] = df_input['adult_total_price'] / df_input['price_weighted_percentile_90']
														
 
															+
														
 
															+    # 添加价格容忍度：避免相近价格被分到不同区间
														
 
															+    # 计算价格差异容忍度（使用各百分位数的1%作为容忍度阈值）
														
 
															+    # tolerance_90p = df_input['price_weighted_percentile_90'] * 0.01
														
 
															+    tolerance_75p = df_input['price_weighted_percentile_75'] * 0.01
														
 
															+    tolerance_50p = df_input['price_weighted_percentile_50'] * 0.01
														
 
															+    tolerance_25p = df_input['price_weighted_percentile_25'] * 0.01
														
 
															+
														
 
															+    # 重新设计价格区间分类（确保无重叠）：
														
 
															+    # 首先定义各个区间的mask
														
 
															+
														
 
															+    # 4.1 异常高价：价格远高于90%百分位数（超过1.5倍）且频率极低（低于中位数的1/3）
														
 
															+    price_abnormal_high_mask = ((df_input['price_relative_to_90p'] > 1.5) & 
														
 
															+                                (df_input['price_frequency'] < freq_median * 0.33))
														
 
															+
														
 
															+    # 4.2 真正高位：严格满足条件（价格 > 90%分位数 且 频率 < 中位数）
														
 
															+    price_real_high_mask = ((df_input['adult_total_price'] > df_input['price_weighted_percentile_90']) & 
														
 
															+                            (df_input['price_frequency'] < freq_median) &
														
 
															+                             ~price_abnormal_high_mask)
														
 
															+
														
 
															+    # 4.3 正常高位：使用容忍度（价格接近75%分位数）
														
 
															+    price_normal_high_mask = ((df_input['adult_total_price'] > df_input['price_weighted_percentile_75'] - tolerance_75p) & 
														
 
															+                               ~price_real_high_mask & ~price_abnormal_high_mask)
														
 
															+
														
 
															+    # 4.4 中高价：使用容忍度（价格在50%-75%分位数之间）
														
 
															+    price_mid_high_mask = ((df_input['adult_total_price'] > df_input['price_weighted_percentile_50'] - tolerance_50p) & 
														
 
															+                           (df_input['adult_total_price'] <= df_input['price_weighted_percentile_75'] + tolerance_75p) &
														
 
															+                            ~price_normal_high_mask & ~price_real_high_mask & ~price_abnormal_high_mask)
														
 
															+
														
 
															+    # 4.5 中低价：使用容忍度（价格在25%-50%分位数之间）
														
 
															+    price_mid_low_mask = ((df_input['adult_total_price'] > df_input['price_weighted_percentile_25'] - tolerance_25p) & 
														
 
															+                          (df_input['adult_total_price'] <= df_input['price_weighted_percentile_50'] + tolerance_50p) &
														
 
															+                           ~price_mid_high_mask & ~price_normal_high_mask & ~price_real_high_mask & ~price_abnormal_high_mask)
														
 
															+    
														
 
															+    # 4.6 低价：严格满足条件（价格 ≤ 25%分位数） 
														
 
															+    price_low_mask = ((df_input['adult_total_price'] <= df_input['price_weighted_percentile_25']) &
														
 
															+                 ~price_mid_low_mask & ~price_mid_high_mask & ~price_normal_high_mask & ~price_real_high_mask & ~price_abnormal_high_mask)
														
 
															+
														
 
															+    # 使用np.select确保互斥性
														
 
															+    price_zone_masks = [
														
 
															+        price_abnormal_high_mask,  # 异常高价区（5级）
														
 
															+        price_real_high_mask,      # 真正高价区（4级）
														
 
															+        price_normal_high_mask,    # 正常高价区（3级）
														
 
															+        price_mid_high_mask,       # 中高价区（2级）
														
 
															+        price_mid_low_mask,        # 中低价区（1级）
														
 
															+        price_low_mask,            # 低价区（0级）
														
 
															+    ]  
														
 
															+    price_zone_values = [5, 4, 3, 2, 1, 0]  # 5:异常高价, 4:真正高价, 3:正常高价, 2:中高价, 1:中低价, 0:低价 
														
 
															+
														
 
															+    # 使用np.select确保每个价格只被分到一个区间
														
 
															+    price_zone_result = np.select(price_zone_masks, price_zone_values, default=2)  # 默认中高价
														
 
															+    # 4.8 价格区间综合标记
														
 
															+    df_input['price_zone_comprehensive'] = price_zone_result
														
 
															+
														
 
															+    # 5. 价格异常度检测
														
 
															+    # 价格相对于均值的标准化偏差
														
 
															+    df_input['price_z_score'] = (df_input['adult_total_price'] - df_input['mean_price']) / df_input['std_price']
														
 
															+    
														
 
															+    # 价格异常度：基于Z-score的绝对值
														
 
															+    df_input['price_anomaly_score'] = np.abs(df_input['price_z_score'])
														
 
															+    
														
 
															+    # 6. 价格稳定性特征
														
 
															+    # 计算价格波动系数（标准差/均值）
														
 
															+    df_input['price_coefficient_variation'] = df_input['std_price'] / df_input['mean_price']
														
 
															+
														
 
															+    # 7. 价格趋势特征
														
 
															+    # 计算当前价格相对于历史价格的位置
														
 
															+    df_input['price_relative_position'] = (df_input['adult_total_price'] - df_input['min_price']) / (df_input['max_price'] - df_input['min_price'])
														
 
															+    df_input['price_relative_position'] = df_input['price_relative_position'].fillna(0.5)  # 兜底
														
 
															+
														
 
															+    # 删除中间计算列
														
 
															+    df_input.drop(columns=['price_frequency', 'price_z_score', 'price_relative_to_90p'], inplace=True, errors='ignore')
														
 
															+
														
 
															+    del price_freq
														
 
															+    del price_stats
														
 
															+    del weighted_percentiles
														
 
															+    del freq_median
														
 
															+
														
 
															+    print(">>> 改进版价格区间特征计算完成")
														
 
															+
														
 
															     # 生成第一机场对
														
 
															     df_input['airport_pair_1'] = (
														
 
															         df_input['seg1_dep_air_port'].astype(str) + "-" + df_input['seg1_arr_air_port'].astype(str)
														
@@ -492,6 +646,8 @@ def preprocess_data(df_input, features, categorical_features, is_training=True,
 
															         "flight_number_2", "flight_2_num", "airport_pair_2", "dep_time_2", "arr_time_2", "fly_duration_2", "fly_duration", "stop_duration", 
														
 
															         "global_dep_time", "dep_country", "dep_country_is_holiday", "is_cross_country",
														
 
															         "global_arr_time", "arr_country", "arr_country_is_holiday", "any_country_is_holiday",
														
 
															+        "price_weighted_percentile_25", "price_weighted_percentile_50", "price_weighted_percentile_75", "price_weighted_percentile_90",
														
 
															+        "price_zone_comprehensive", "price_relative_position",
														
 
															     ]
														
 
															     df_input = df_input[order_columns]
														
@@ -502,7 +658,9 @@ def standardization(df, feature_scaler, target_scaler=None, is_training=True, is
 
															     print(">>> 开始标准化处理")
														
 
															     # 准备走标准化的特征
														
 
															-    scaler_features = ['adult_total_price', 'fly_duration', 'stop_duration']
														
 
															+    scaler_features = ['adult_total_price', 'fly_duration', 'stop_duration', 
														
 
															+                       'price_weighted_percentile_25', 'price_weighted_percentile_50', 
														
 
															+                       'price_weighted_percentile_75', 'price_weighted_percentile_90']
														
 
															     if is_training:
														
 
															         print(">>> 特征数据标准化开始")
														
@@ -527,7 +685,8 @@ def standardization(df, feature_scaler, target_scaler=None, is_training=True, is
 
															         'flight_2_num': (0, 341),
														
 
															         'seats_remaining': (1, 5),
														
 
															         'price_change_times_total': (0, 30),     # 假设价格变更次数不会超过30次
														
 
															-        'price_last_change_hours': (0, 480),     
														
 
															+        'price_last_change_hours': (0, 480), 
														
 
															+        'price_zone_comprehensive': (0, 5),    
														
 
															         'days_to_departure': (0, 30),
														
 
															         'days_to_holiday': (0, 120),             # 最长的越南节假日间隔120天
														
 
															         'flight_by_hour': (0, 23),
														
--- a/main_pe.py
+++ b/main_pe.py
@@ -6,7 +6,7 @@ import numpy as np
 
															 import pickle
														
 
															 import time
														
 
															 from datetime import datetime, timedelta
														
 
															-from config import vj_flight_route_list_hot, vj_flight_route_list_nothot, CLEAN_VJ_HOT_NEAR_INFO_TAB, CLEAN_VJ_NOTHOT_NEAR_INFO_TAB
														
 
															+from config import mongodb_config, vj_flight_route_list_hot, vj_flight_route_list_nothot, CLEAN_VJ_HOT_NEAR_INFO_TAB, CLEAN_VJ_NOTHOT_NEAR_INFO_TAB
														
 
															 from data_loader import mongo_con_parse, load_train_data
														
 
															 from data_preprocess import preprocess_data, standardization
														
 
															 from utils import chunk_list_with_index, create_fixed_length_sequences
														
@@ -85,7 +85,7 @@ def start_predict():
 
															     # 测试阶段
														
 
															     for i, (_, group_route_list) in enumerate(chunks, start=resume_chunk_idx):
														
 
															         # 特殊处理，跳过不好的批次
														
 
															-        client, db = mongo_con_parse()
														
 
															+        # client, db = mongo_con_parse()
														
 
															         print(f"第 {i} 组 :", group_route_list)
														
 
															         # batch_flight_routes = group_route_list
														
@@ -102,12 +102,12 @@ def start_predict():
 
															         # 加载测试数据 (仅仅是时间段取到后天)
														
 
															         start_time = time.time()
														
 
															-        df_test = load_train_data(db, group_route_list, table_name, date_begin, date_end, output_dir, is_hot)
														
 
															+        df_test = load_train_data(mongodb_config, group_route_list, table_name, date_begin, date_end, output_dir, is_hot)
														
 
															         end_time = time.time()
														
 
															         run_time = round(end_time - start_time, 3)
														
 
															         print(f"用时: {run_time} 秒")
														
 
															-        client.close()
														
 
															+        # client.close()
														
 
															         if df_test.empty:
														
 
															             print(f"测试数据为空，跳过此批次。")
														
--- a/main_tr.py
+++ b/main_tr.py
@@ -38,9 +38,11 @@ common_features = ['hours_until_departure', 'days_to_departure', 'seats_remainin
 
															                    'flight_by_hour', 'flight_by_day', 'flight_day_of_month', 'flight_day_of_week', 'flight_day_of_quarter', 'flight_day_is_weekend',
														
 
															                    'dep_country_is_holiday', 'arr_country_is_holiday', 'any_country_is_holiday', 'days_to_holiday',
														
 
															                   ]
														
 
															+price_info_features = ['price_weighted_percentile_25', 'price_weighted_percentile_50', 'price_weighted_percentile_75', 'price_weighted_percentile_90',
														
 
															+                       'price_zone_comprehensive', 'price_relative_position']
														
 
															 price_features = ['adult_total_price', 'price_change_times_total', 'price_last_change_hours']
														
 
															 encoded_columns = ['from_city_num', 'to_city_num', 'flight_1_num', 'flight_2_num', 'baggage_level']
														
 
															-features = encoded_columns + price_features + common_features
														
 
															+features = encoded_columns + price_info_features + price_features + common_features
														
 
															 target_vars = ['target_will_price_drop']   # 是否降价
														
@@ -111,7 +113,7 @@ def start_train():
 
															     date_end = datetime.today().strftime("%Y-%m-%d")
														
 
															     # date_begin = (datetime.today() - timedelta(days=41)).strftime("%Y-%m-%d")
														
 
															-    date_begin = "2025-11-20"
														
 
															+    date_begin = "2025-12-01"
														
 
															     # 仅在 rank == 0 时要做的
														
 
															     if rank == 0:
														
@@ -239,7 +241,7 @@ def start_train():
 
															             redis_client.set(lock_key, 0)
														
 
															             print("rank0 开始数据加载...")
														
 
															             # 使用默认配置
														
 
															-            client, db = mongo_con_parse()
														
 
															+            # client, db = mongo_con_parse()
														
 
															             print(f"第 {i} 组 :", group_route_list)
														
 
															             batch_flight_routes = group_route_list
														
@@ -257,12 +259,12 @@ def start_train():
 
															             # 加载训练数据
														
 
															             start_time = time.time()
														
 
															-            df_train = load_train_data(db, group_route_list, table_name, date_begin, date_end, output_dir, is_hot)
														
 
															+            df_train = load_train_data(mongodb_config, group_route_list, table_name, date_begin, date_end, output_dir, is_hot)
														
 
															             end_time = time.time()
														
 
															             run_time = round(end_time - start_time, 3)
														
 
															             print(f"用时: {run_time} 秒")
														
 
															-            client.close()
														
 
															+            # client.close()
														
 
															             if df_train.empty:
														
 
															                 print(f"训练数据为空，跳过此批次。")
														
--- a/result_validate.py
+++ b/result_validate.py
@@ -36,6 +36,7 @@ def validate_process(node, date):
 
															         baggage = row['baggage']
														
 
															         valid_begin_hour = row['valid_begin_hour'] 
														
 
															         df_val= validate_one_line(db, city_pair, flight_day, flight_number_1, flight_number_2, baggage, valid_begin_hour)
														
 
															+        # 有可能在当前验证时刻，数据库里没有在valid_begin_hour之后的数据
														
 
															         if not df_val.empty:
														
 
															             df_val_f = fill_hourly_crawl_date(df_val, rear_fill=2)
														
 
															             df_val_f = df_val_f[df_val_f['is_filled']==0]    # 只要原始数据，不要补齐的
														
@@ -76,6 +77,7 @@ def validate_process(node, date):
 
															                 list_change_price = df_price_changes['adult_total_price'].tolist()
														
 
															                 list_change_hours = df_price_changes['hours_until_departure'].tolist()
														
 
															+        
														
 
															         else:
														
 
															             drop_flag = 0
														
 
															             first_drop_amount = pd.NA
														
--- a/utils.py
+++ b/utils.py
@@ -28,7 +28,7 @@ def insert_df_col(df, insert_col_name, base_col_name, inplace=True):
 
															     return df
														
 
															 # 真正创建序列过程
														
 
															-def create_fixed_length_sequences(df, features, target_vars, threshold=48, input_length=432, is_train=True):
														
 
															+def create_fixed_length_sequences(df, features, target_vars, threshold=36, input_length=444, is_train=True):
														
 
															     print(">>开始创建序列")
														
 
															     start_time = time.time()
														
@@ -49,7 +49,7 @@ def create_fixed_length_sequences(df, features, target_vars, threshold=48, input
 
															         df_group_bag_30 = df_group[df_group['baggage']==30]
														
 
															         df_group_bag_20 = df_group[df_group['baggage']==20]
														
 
															-        # 过滤训练时间段 (48 ~ 480)
														
 
															+        # 过滤训练时间段 (36 ~ 480)
														
 
															         df_group_bag_30_filtered = df_group_bag_30[(df_group_bag_30['Hours_Until_Departure'] >= threshold) & (df_group_bag_30['Hours_Until_Departure'] < threshold + input_length)]
														
 
															         df_group_bag_20_filtered = df_group_bag_20[(df_group_bag_20['Hours_Until_Departure'] >= threshold) & (df_group_bag_20['Hours_Until_Departure'] < threshold + input_length)]
														
@@ -62,7 +62,7 @@ def create_fixed_length_sequences(df, features, target_vars, threshold=48, input
 
															             seq_features_1 = df_group_bag_30_filtered[features].to_numpy()
														
 
															             seq_features_2 = df_group_bag_20_filtered[features].to_numpy()
														
 
															-            # 将几个特征序列沿着第 0 维拼接，得到形状为 (2, 432, 25)
														
 
															+            # 将几个特征序列沿着第 0 维拼接，得到形状为 (2, 444, 31)
														
 
															             combined_features = torch.stack([torch.tensor(seq_features_1, dtype=torch.float32),    
														
 
															                                              torch.tensor(seq_features_2, dtype=torch.float32)])