yz
/
jiangcang_vj


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745
							import gc
import time
from datetime import datetime
import pymongo
from pymongo.errors import PyMongoError, ServerSelectionTimeoutError
import pandas as pd
import os
import random
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import font_manager
import matplotlib.dates as mdates
from config import mongodb_config, vj_flight_route_list, vj_flight_route_list_hot, vj_flight_route_list_nothot, \
    CLEAN_VJ_HOT_NEAR_INFO_TAB, CLEAN_VJ_HOT_FAR_INFO_TAB, CLEAN_VJ_NOTHOT_NEAR_INFO_TAB, CLEAN_VJ_NOTHOT_FAR_INFO_TAB


font_path = "./simhei.ttf"
font_prop = font_manager.FontProperties(fname=font_path)

def mongo_con_parse(config=None):
    if config is None:
        config = mongodb_config.copy()

    try:
        if config.get("URI", ""):
            motor_uri = config["URI"]
            client = pymongo.MongoClient(motor_uri, maxPoolSize=100)
            db = client[config['db']]
            print("motor_uri: ", motor_uri)
        else:
            client = pymongo.MongoClient(
                config['host'],
                config['port'],
                serverSelectionTimeoutMS=6000,  # 6秒
                connectTimeoutMS=6000,  # 6秒
                socketTimeoutMS=6000,  # 6秒,
                retryReads=True,    # 开启重试
                maxPoolSize=50
            )
            db = client[config['db']]
            if config.get('user'):
                db.authenticate(config['user'], config['pwd'])

        print(f"✅ MongoDB 连接对象创建成功")

    except Exception as e:
        print(f"❌ 创建 MongoDB 连接对象时发生错误: {e}")
        raise
    return client, db


def test_mongo_connection(db):
    try:
        # 获取客户端对象
        client = db.client

        # 方法1：使用 server_info() 测试连接
        info = client.server_info()
        print(f"✅ MongoDB 连接测试成功!")
        print(f"  服务器版本: {info.get('version')}")
        print(f"  数据库: {db.name}")
        return True

    except Exception as e:
        print(f"❌ 数据库连接测试失败: {e}")
        return False


def query_flight_range_status(db, table_name, from_city, to_city, dep_date_begin, dep_date_end, flight_nums, 
                              limit=0, max_retries=3, base_sleep=1.0):
    """
    从指定表(4类)查询数据(指定起飞天的范围) （失败自动重试）
    """
    for attempt in range(1, max_retries + 1):
        try:
            print(f"🔁 第 {attempt}/{max_retries} 次尝试查询")
            # 构建查询条件
            query_condition = {
                "from_city_code": from_city,
                "to_city_code": to_city,
                "search_dep_time": {
                    "$gte": dep_date_begin,
                    "$lte": dep_date_end,
                },
                "segments.baggage": {"$in": ["1-20", "1-30"]}  # 只查20公斤和30公斤行李的
            }
            # 动态添加航班号条件
            for i, flight_num in enumerate(flight_nums):
                query_condition[f"segments.{i}.flight_number"] = flight_num
    
            print(f"   查询条件: {query_condition}")
            # 定义要查询的字段
            projection = {
                # "_id": 1,
                "from_city_code": 1,
                "search_dep_time": 1,
                "to_city_code": 1,
                "currency": 1,
                "adult_price": 1,
                "adult_tax": 1,
                "adult_total_price": 1,
                "seats_remaining": 1,
                "segments": 1,
                "source_website": 1,
                "crawl_date": 1
            }
            # 执行查询
            cursor = db.get_collection(table_name).find(
                query_condition,
                projection=projection  # 添加投影参数
            ).sort(
                [
                    ("search_dep_time", 1),  # 多级排序要用列表+元组的格式
                    ("segments.0.baggage", 1),
                    ("crawl_date", 1)
                ]
            )
            if limit > 0:
                cursor = cursor.limit(limit)

            # 将结果转换为列表
            results = list(cursor)
            print(f"✅ 查询成功，找到 {len(results)} 条记录")

            if results:
                df = pd.DataFrame(results)
                # 处理特殊的 ObjectId 类型
                if '_id' in df.columns:
                    df = df.drop(columns=['_id'])
                print(f"📊 已转换为 DataFrame，形状: {df.shape}")

                # 1️⃣ 展开 segments
                print(f"📊 开始扩展segments 稍等...")
                t1 = time.time()
                df = expand_segments_columns(df)
                t2 = time.time()
                rt = round(t2 - t1, 3)
                print(f"用时: {rt} 秒")
                print(f"📊 已将segments扩展成字段，形状: {df.shape}")

                # 不用排序，因为mongo语句已经排好
                return df

            else:
                print("⚠️  查询结果为空")
                return pd.DataFrame()
            
        except (ServerSelectionTimeoutError, PyMongoError) as e:
            print(f"⚠️ Mongo 查询失败: {e}")
            if attempt == max_retries:
                print("❌ 达到最大重试次数，放弃")
                return pd.DataFrame()
            
            # 指数退避 + 随机抖动
            sleep_time = base_sleep * (2 ** (attempt - 1)) + random.random()
            print(f"⏳ {sleep_time:.2f}s 后重试...")
            time.sleep(sleep_time)


def expand_segments_columns(df):
    """展开 segments"""
    df = df.copy()

    # 定义要展开的列
    seg1_cols = ['flight_number', 'dep_air_port', 'dep_time', 'arr_air_port', 'arr_time', 'cabin', 'baggage']
    seg2_cols = ['flight_number', 'dep_air_port', 'dep_time', 'arr_air_port', 'arr_time']

    # 定义 apply 函数一次返回字典
    def extract_segments(row):
        segments = row.get('segments')
        result = {}
        # 默认缺失使用 pd.NA（对字符串友好）
        missing = pd.NA
        if isinstance(segments, list):
            # 第一段
            if len(segments) >= 1 and isinstance(segments[0], dict):
                for col in seg1_cols:
                    result[f'seg1_{col}'] = segments[0].get(col)
            else:
                for col in seg1_cols:
                    result[f'seg1_{col}'] = missing
            # 第二段
            if len(segments) >= 2 and isinstance(segments[1], dict):
                for col in seg2_cols:
                    result[f'seg2_{col}'] = segments[1].get(col)
            else:
                for col in seg2_cols:
                    result[f'seg2_{col}'] = missing
        else:
            # segments 不是 list，全都置空
            for col in seg1_cols:
                result[f'seg1_{col}'] = missing
            for col in seg2_cols:
                result[f'seg2_{col}'] = missing

        return pd.Series(result)

    # 一次 apply
    df_segments = df.apply(extract_segments, axis=1)

    # 拼回原 df
    df = pd.concat([df.drop(columns=['segments'], errors='ignore'), df_segments], axis=1)

    # 统一转换时间字段为 datetime
    time_cols = [
        'seg1_dep_time', 'seg1_arr_time',
        'seg2_dep_time', 'seg2_arr_time'
    ]
    for col in time_cols:
        if col in df.columns:
            df[col] = pd.to_datetime(
                df[col],
                format='%Y%m%d%H%M%S',
                errors='coerce'
            )

    # 站点来源 -> 是否近期
    df['source_website'] = np.where(
        df['source_website'].str.contains('7_30'),
        0,  # 远期 -> 0
        np.where(df['source_website'].str.contains('0_7'),
                 1,  # 近期 -> 1
                 df['source_website'])  # 其他情况保持原值
    )

    # 行李配额字符 -> 数字
    conditions = [
        df['seg1_baggage'] == '-;-;-;-',
        df['seg1_baggage'] == '1-20',
        df['seg1_baggage'] == '1-30',
        df['seg1_baggage'] == '1-40',
    ]
    choices = [0, 20, 30, 40]
    df['seg1_baggage'] = np.select(conditions, choices, default=df['seg1_baggage'])

    # 重命名字段
    df = df.rename(columns={
        'seg1_cabin': 'cabin',
        'seg1_baggage': 'baggage',
        'source_website': 'is_near',
    })

    return df


def fill_hourly_crawl_date(df, head_fill=0, rear_fill=0):
    """补齐成小时粒度数据"""
    df = df.copy()

    # 1. 转 datetime
    df['crawl_date'] = pd.to_datetime(df['crawl_date'])

    # 添加一个用于分组的小时字段
    df['update_hour'] = df['crawl_date'].dt.floor('h')

    # 2. 排序规则：同一小时内，按原始时间戳排序
    # 假设你想保留最早的一条
    df = df.sort_values(['update_hour', 'crawl_date'])

    # 3. 按小时去重，保留该小时内最早的一条
    df = df.drop_duplicates(subset=['update_hour'], keep='first')

    # 删除原始时间戳列
    # df = df.drop(columns=['crawl_date'])
    # df = df.drop(columns=['_id'])

    # 4. 标记原始数据
    df['is_filled'] = 0

    # 5. 排序 + 设索引
    df = df.sort_values('update_hour').set_index('update_hour')

    # 6. 构造完整小时轴
    start_of_day = df.index.min()  # 默认 第一天 最早 开始
    if head_fill == 1:
        start_of_day = df.index.min().normalize()  # 强制 第一天 00:00 开始

    end_of_day = df.index.max()  # 默认 最后一天 最晚 结束
    if rear_fill == 1:
        end_of_day = df.index.max().normalize() + pd.Timedelta(hours=23)  # 强制 最后一天 23:00 结束
    elif rear_fill == 2:
        if 'seg1_dep_time' in df.columns:
            last_dep_time = df['seg1_dep_time'].iloc[-1]
            if pd.notna(last_dep_time):
                # 对齐到整点小时（向下取整）
                end_of_day = last_dep_time.floor('h')

    full_index = pd.date_range(
        start=start_of_day,
        end=end_of_day,
        freq='1h'
    )

    # 7. 按小时补齐
    df = df.reindex(full_index)

    # 先恢复 dtype（关键！）
    df = df.infer_objects(copy=False)

    # 8. 新增出来的行标记为 1
    df['is_filled'] = df['is_filled'].fillna(1)

    # 9. 前向填充
    df = df.ffill()

    # 10. 还原整型字段
    int_cols = [
        'seats_remaining',
        'is_near',
        'baggage',
        'is_filled',
    ]
    for col in int_cols:
        if col in df.columns:
            df[col] = df[col].astype('int64')

    # 10.5 价格字段统一保留两位小数
    price_cols = [
        'adult_price',
        'adult_tax',
        'adult_total_price'
    ]
    for col in price_cols:
        if col in df.columns:
            df[col] = df[col].astype('float64').round(2)

    # 10.6 新增：距离起飞还有多少小时
    if 'seg1_dep_time' in df.columns:
        # 创建临时字段（整点）
        df['seg1_dep_hour'] = df['seg1_dep_time'].dt.floor('h')

        # 计算小时差 df.index 此时就是 update_hour
        df['hours_until_departure'] = (
                (df['seg1_dep_hour'] - df.index) / pd.Timedelta(hours=1)
        ).astype('int64')

        # 删除临时字段
        df = df.drop(columns=['seg1_dep_hour'])

    # 11. 写回 update_hour
    df['update_hour'] = df.index

    # 12. 恢复普通索引
    df = df.reset_index(drop=True)

    return df


def query_groups_of_city_code(db, from_city, to_city, table_name, min_days=15, max_retries=3, base_sleep=1.0):
    """
    从一组城市对中查找所有分组(航班号与起飞时间)的组合
    按：第一段航班号 → 第二段航班号 → 起飞时间 排序
    （失败自动重试）
    """
    print(f"{from_city}-{to_city} 查找所有分组")
    pipeline = [
        # 1️⃣ 先筛选城市对
        {
            "$match": {
                "from_city_code": from_city,
                "to_city_code": to_city
            }
        },
        # 2️⃣ 投影字段 + 拆第一、第二段航班号用于排序
        {
            "$project": {
                "flight_numbers": "$segments.flight_number",
                "search_dep_time": 1,
                "fn1": {"$arrayElemAt": ["$segments.flight_number", 0]},
                "fn2": {"$arrayElemAt": ["$segments.flight_number", 1]}
            }
        },
        # 3️⃣ 第一级分组：组合 + 每一天
        {
            "$group": {
                "_id": {
                    "flight_numbers": "$flight_numbers",
                    "search_dep_time": "$search_dep_time",
                    "fn1": "$fn1",
                    "fn2": "$fn2"
                },
                "count": {"$sum": 1}
            }
        },
        # 关键修复点：这里先按【时间】排好序！
        {
            "$sort": {
                "_id.fn1": 1,
                "_id.fn2": 1,
                "_id.search_dep_time": 1  # 确保 push 进去时是按天递增
            }
        },
        # 4️⃣ 第二级分组：只按【航班组合】聚合 → 统计“有多少天”
        {
            "$group": {
                "_id": {
                    "flight_numbers": "$_id.flight_numbers",
                    "fn1": "$_id.fn1",
                    "fn2": "$_id.fn2"
                },
                "days": {"$sum": 1},  # 不同起飞天数
                "details": {
                    "$push": {
                        "search_dep_time": "$_id.search_dep_time",
                        "count": "$count"
                    }
                }
            }
        },
        # 5️⃣ 关键：按“天数阈值”过滤
        {
            "$match": {
                "days": {"$gte": min_days}
            }
        },
        # 6️⃣ ✅ 按“第一段 → 第二段”排序
        {
            "$sort": {
                "_id.fn1": 1,
                "_id.fn2": 1,
            }
        }
    ]
    for attempt in range(1, max_retries + 1):
        try:
            print(f"  第 {attempt}/{max_retries} 次尝试查询")

            # 执行聚合查询
            collection = db[table_name]
            results = list(collection.aggregate(pipeline))

            # 格式化结果，将 _id 中的字段提取到外层
            formatted_results = []
            for item in results:
                formatted_item = {
                    "flight_numbers": item["_id"]["flight_numbers"],
                    "days": item["days"],       # 这个组合一共有多少天
                    "details": item["details"]  # 每一天的 count 明细
                }
                formatted_results.append(formatted_item)

            return formatted_results

        except (ServerSelectionTimeoutError, PyMongoError) as e:
            print(f"⚠️ Mongo 查询失败: {e}")
            if attempt == max_retries:
                print("❌ 达到最大重试次数，放弃")
                return []

            # 指数退避 + 随机抖动
            sleep_time = base_sleep * (2 ** (attempt - 1)) + random.random()
            print(f"⏳ {sleep_time:.2f}s 后重试...")
            time.sleep(sleep_time)


def plot_c12_trend(df, output_dir="."):
    """
    根据传入的 dataframe 绘制 adult_total_price 随 update_hour 的趋势图，
    并按照 baggage 分类进行分组绘制。
    """
    # output_dir_photo = output_dir

    # 颜色与线型配置（按顺序循环使用）
    colors = ['green', 'blue', 'red', 'brown']
    linestyles = ['--', '--', '--', '--']

    # 确保时间字段为 datetime 类型
    if not hasattr(df['update_hour'], 'dt'):
        df['update_hour'] = pd.to_datetime(df['update_hour'])

    from_city = df['from_city_code'].mode().iloc[0]
    to_city = df['to_city_code'].mode().iloc[0]
    flight_number_1 = df['seg1_flight_number'].mode().iloc[0]
    flight_number_2 = df['seg2_flight_number'].mode().get(0, "")
    dep_time = df['seg1_dep_time'].mode().iloc[0]

    route = f"{from_city}-{to_city}"
    flight_number = f"{flight_number_1},{flight_number_2}" if flight_number_2 else f"{flight_number_1}"

    output_dir_photo = os.path.join(output_dir, route)
    os.makedirs(output_dir_photo, exist_ok=True)

    # 创建图表对象
    fig = plt.figure(figsize=(14, 8))

    # 按 baggage 分类绘制
    for i, (baggage_value, group) in enumerate(df.groupby('baggage')):
        # 按时间排序
        g = group.sort_values('update_hour').reset_index(drop=True)

        # 找价格变化点：与前一行不同的价格即为变化点
        # keep first row + change rows + last row
        change_points = g.loc[
            (g['adult_total_price'] != g['adult_total_price'].shift(1)) |
            (g.index == 0) |
            (g.index == len(g) - 1)  # 终点
        ].drop_duplicates(subset=['update_hour'])

        # 绘制点和线条
        plt.plot(
            change_points['update_hour'],
            change_points['adult_total_price'],
            marker='o',
            color=colors[i % len(colors)],
            linestyle=linestyles[i % len(linestyles)],
            linewidth=2, markersize=6,
            markerfacecolor='white', markeredgewidth=2,
            label=f"Baggage {baggage_value}"
        )

        # 添加注释 (小时数, 价格)
        for _, row in change_points.iterrows():
            text = f"({row['hours_until_departure']}, {row['adult_total_price']})"
            plt.annotate(
                text,
                xy=(row['update_hour'], row['adult_total_price']),
                xytext=(0, 0),  # 向右偏移
                textcoords="offset points",
                ha='left',
                va='center',
                fontsize=5,  # 字体稍小
                color='gray',
                alpha=0.8,
                rotation=25,
            )

    # 自动优化日期显示
    plt.gcf().autofmt_xdate()
    plt.xlabel('时刻', fontsize=12, fontproperties=font_prop)
    plt.ylabel('价格', fontsize=12, fontproperties=font_prop)
    plt.title(f'价格变化趋势 - 航线: {route} 航班号: {flight_number}\n起飞时间: {dep_time}',
              fontsize=14, fontweight='bold', fontproperties=font_prop)

    # 设置 x 轴刻度为每天
    ax = plt.gca()
    ax.xaxis.set_major_locator(mdates.DayLocator(interval=1))    # 每天一个主刻度
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%m-%d'))  # 显示月-日
    ax.xaxis.set_minor_locator(mdates.HourLocator(byhour=[12]))  # 指定在12:00显示副刻度
    ax.xaxis.set_minor_formatter(mdates.DateFormatter(''))       # 输出空字符串
    # ax.tick_params(axis='x', which='minor', labelsize=8, rotation=30)

    # 添加图例
    plt.legend(bbox_to_anchor=(1.05, 1), loc='best', prop=font_prop)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()

    safe_flight = flight_number.replace(",", "_")
    safe_dep_time = dep_time.strftime("%Y-%m-%d %H%M%S")
    save_file = f"{route} {safe_flight} {safe_dep_time}.png"
    output_path = os.path.join(output_dir_photo, save_file)
    # 保存图片（在显示之前）
    plt.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')

    # 关闭图形释放内存
    plt.close(fig)


def load_train_data(db, flight_route_list, table_name, date_begin, date_end, output_dir='.', is_hot=1):
    """加载训练数据"""
    timestamp_str = datetime.now().strftime("%Y%m%d%H%M%S")
    date_begin_s = datetime.strptime(date_begin, "%Y-%m-%d").strftime("%Y%m%d")  # 查询时的格式
    date_end_s = datetime.strptime(date_end, "%Y-%m-%d").strftime("%Y%m%d")
    list_all = []
    # 每一航线对
    for flight_route in flight_route_list:
        from_city = flight_route.split('-')[0]
        to_city = flight_route.split('-')[1]
        route = f"{from_city}-{to_city}"
        print(f"开始处理航线: {route}")
        all_groups = query_groups_of_city_code(db, from_city, to_city, table_name)
        # 每一组航班号
        for each_group in all_groups:
            flight_nums = each_group.get("flight_numbers")
            print(f"开始处理航班号: {flight_nums}")
            details = each_group.get("details")
            # 查远期表
            if is_hot == 1:
                df1 = query_flight_range_status(db, CLEAN_VJ_HOT_FAR_INFO_TAB, from_city, to_city,
                                                date_begin_s, date_end_s, flight_nums)
            else:
                df1 = query_flight_range_status(db, CLEAN_VJ_NOTHOT_FAR_INFO_TAB, from_city, to_city,
                                                date_begin_s, date_end_s, flight_nums)

            # 保证远期表里有数据
            if df1.empty:
                print(f"航班号:{flight_nums} 远期表无数据, 跳过")
                continue

            # 查近期表
            if is_hot == 1:
                df2 = query_flight_range_status(db, CLEAN_VJ_HOT_NEAR_INFO_TAB, from_city, to_city,
                                                date_begin_s, date_end_s, flight_nums)
            else:
                df2 = query_flight_range_status(db, CLEAN_VJ_NOTHOT_NEAR_INFO_TAB, from_city, to_city,
                                                date_begin_s, date_end_s, flight_nums)

            # 保证近期表里有数据
            if df2.empty:
                print(f"航班号:{flight_nums} 近期表无数据, 跳过")
                continue

            # 起飞天数、行李配额以近期表的为主
            if df2.empty:
                common_dep_dates = []
                common_baggages = []
            else:
                common_dep_dates = df2['search_dep_time'].unique()
                common_baggages = df2['baggage'].unique()

            list_mid = []
            for dep_date in common_dep_dates:
                # 起飞日期筛选
                df_d1 = df1[df1["search_dep_time"] == dep_date].copy()
                if not df_d1.empty:
                    for col in ["seg1_dep_time", "seg1_arr_time", "seg2_dep_time", "seg2_arr_time"]:
                        mode_series_1 = df_d1[col].mode()
                        if mode_series_1.empty:
                            # 如果整个列都是 NaT，则众数为空，直接赋 NaT
                            zong_1 = pd.NaT
                        else:
                            zong_1 = mode_series_1.iloc[0]
                        df_d1[col] = zong_1

                df_d2 = df2[df2["search_dep_time"] == dep_date].copy()
                if not df_d2.empty:
                    for col in ["seg1_dep_time", "seg1_arr_time", "seg2_dep_time", "seg2_arr_time"]:
                        mode_series_2 = df_d2[col].mode()
                        if mode_series_2.empty:
                            # 如果整个列都是 NaT，则众数为空，直接赋 NaT
                            zong_2 = pd.NaT
                        else:
                            zong_2 = mode_series_2.iloc[0]
                        df_d2[col] = zong_2

                list_12 = []
                for baggage in common_baggages:
                    # 行李配额筛选
                    df_b1 = df_d1[df_d1["baggage"] == baggage].copy()
                    df_b2 = df_d2[df_d2["baggage"] == baggage].copy()

                    # 合并前检查是否都有数据
                    if df_b1.empty and df_b2.empty:
                        print(f"⚠️ dep_date:{dep_date}, baggage:{baggage} 远期表和近期表都为空，跳过")
                        continue

                    cols = ["seg1_flight_number", "seg1_dep_air_port", "seg1_arr_air_port",
                            "seg2_flight_number", "seg2_dep_air_port", "seg2_arr_air_port"]
                    # df_b1 = df_b1.copy()
                    # df_b2 = df_b2.copy()
                    df_b1[cols] = df_b1[cols].astype("string")
                    df_b2[cols] = df_b2[cols].astype("string")

                    df_b12 = pd.concat([df_b1, df_b2]).reset_index(drop=True)
                    print(f"📊 dep_date:{dep_date}, baggage:{baggage} 已将远期表和近期表合并，形状: {df_b12.shape}")
                    df_b12 = fill_hourly_crawl_date(df_b12, rear_fill=2)
                    print(f"📊 dep_date:{dep_date}, baggage:{baggage} 已合并且补齐为完整小时序列，形状: {df_b12.shape}")
                    # print(df_b12.dtypes)
                    list_12.append(df_b12)
                    del df_b12
                    del df_b2
                    del df_b1

                if list_12:
                    df_c12 = pd.concat(list_12, ignore_index=True)
                    print(f"✅ dep_date:{dep_date}, 所有 baggage 数据合并完成，总形状: {df_c12.shape}")
                    # plot_c12_trend(df_c12, output_dir)
                    # print(f"✅ dep_date:{dep_date}, 所有 baggage 数据绘图完成")
                else:
                    df_c12 = pd.DataFrame()
                    print(f"⚠️ dep_date:{dep_date}, 所有 baggage 数据合并为空")

                del list_12
                list_mid.append(df_c12)

                del df_c12
                del df_d1
                del df_d2

                print(f"结束处理起飞日期: {dep_date}")

            if list_mid:
                df_mid = pd.concat(list_mid, ignore_index=True)
                print(f"✅ 航班号:{flight_nums} 所有 起飞日期 数据合并完成，总形状: {df_mid.shape}")
            else:
                df_mid = pd.DataFrame()
                print(f"⚠️ 航班号:{flight_nums} 所有 起飞日期 数据合并为空")

            del list_mid
            list_all.append(df_mid)

            del df1
            del df2
            
            # output_path = os.path.join(output_dir, f"./{route}_{timestamp_str}.csv")
            # df_mid.to_csv(output_path, index=False, encoding="utf-8-sig", mode="a", header=not os.path.exists(output_path))
            
            del df_mid
            gc.collect()
            print(f"结束处理航班号: {flight_nums}")

        print(f"结束处理航线: {from_city}-{to_city}")

    df_all = pd.concat(list_all, ignore_index=True)
    print(f"本批次数据加载完毕, 总形状: {df_all.shape}")
    del list_all
    gc.collect()

    return df_all


def chunk_list(lst, group_size):
    return [lst[i:i + group_size] for i in range(0, len(lst), group_size)]


if __name__ == "__main__":

    # test_mongo_connection(db)

    output_dir = f"./output"
    os.makedirs(output_dir, exist_ok=True)

    # 加载热门航线数据
    date_begin = "2025-11-20"
    date_end = datetime.today().strftime("%Y-%m-%d")

    flight_route_list = vj_flight_route_list_hot[0:]  # 热门 vj_flight_route_list_hot  冷门 vj_flight_route_list_nothot
    table_name = CLEAN_VJ_HOT_NEAR_INFO_TAB  # 热门 CLEAN_VJ_HOT_NEAR_INFO_TAB  冷门 CLEAN_VJ_NOTHOT_NEAR_INFO_TAB
    is_hot = 1   # 1 热门 0 冷门
    group_size = 1
    chunks = chunk_list(flight_route_list, group_size)

    for idx, group_route_list in enumerate(chunks, 1):
        # 使用默认配置
        client, db = mongo_con_parse()
        print(f"第 {idx} 组 :", group_route_list)
        start_time = time.time()
        load_train_data(db, group_route_list, table_name, date_begin, date_end, output_dir, is_hot)
        end_time = time.time()
        run_time = round(end_time - start_time, 3)
        print(f"用时: {run_time} 秒")

        client.close()
        time.sleep(3)

    print("整体结束")