import io
import os

import pandas as pd


def _read_history(csv_path):
    """Load the stored CSV, or return None when nothing usable is on disk."""
    if not os.path.exists(csv_path):
        return None
    try:
        return pd.read_csv(csv_path, encoding='utf-8-sig')
    except pd.errors.EmptyDataError:
        # A zero-byte file carries no header and no rows: same as no history.
        return None


def _keys_as_stored(df, key_cols):
    """Return the key columns of *df* as they look after a CSV round trip."""
    text = df[key_cols].to_csv(index=False)
    # skip_blank_lines=False keeps rows whose only key is whitespace, so the
    # result stays row-aligned with *df*.
    stored = pd.read_csv(io.StringIO(text), skip_blank_lines=False)
    stored.columns = key_cols  # restore the original (possibly non-str) labels
    return stored


def _is_plain_number(dtype):
    """Tell whether *dtype* is numeric but not boolean."""
    types = pd.api.types
    return types.is_numeric_dtype(dtype) and not types.is_bool_dtype(dtype)


def _as_text(series):
    """Cast *series* to str while keeping missing values missing."""
    return series.astype(str).where(series.notna())


def _select_unseen_rows(df_old, df_new, key_cols):
    """Split off the rows of *df_new* whose key is absent from *df_old*.

    Returns ``(history, rows_to_add)``. Key columns of both frames are made
    comparable first: the new keys are converted to their CSV-read-back form
    (the history already went through that conversion), and any key column
    whose dtypes still disagree is compared as text on both sides.
    """
    history = df_old.copy()
    incoming = df_new.reset_index(drop=True)
    stored = _keys_as_stored(incoming, key_cols)

    for col in key_cols:
        incoming[col] = stored[col]
        old_dtype = history[col].dtype
        new_dtype = incoming[col].dtype
        if old_dtype == new_dtype:
            continue
        if _is_plain_number(old_dtype) and _is_plain_number(new_dtype):
            continue  # int vs float compare fine as they are
        # e.g. history holds "CA123" (text) while the new batch holds 123:
        # compare, and later sort, both sides as text.
        history[col] = _as_text(history[col])
        incoming[col] = _as_text(incoming[col])

    known = history[key_cols].drop_duplicates()
    # Anti-join on the key columns only, so the indicator column can never
    # collide with a data column of df_new.
    flagged = incoming[key_cols].merge(
        known, on=key_cols, how='left', indicator='__origin'
    )
    unseen = (flagged['__origin'] == 'left_only').to_numpy()
    return history, incoming[unseen]


def merge_and_overwrite_csv(df_new, csv_path, dedup_cols):
    """Append the not-yet-stored rows of *df_new* to the CSV at *csv_path*.

    A row of *df_new* is appended only when its combination of key values
    (the entries of *dedup_cols* that exist in *df_new*) does not occur in
    the stored history; keys already present are treated as processed and
    none of their rows are added again. On the first run (no file, or a
    zero-byte file) *df_new* is written as is, without de-duplication.

    The result is sorted by the *dedup_cols* present in it and written back
    to *csv_path* (UTF-8 with BOM, no index column).

    Args:
        df_new: DataFrame holding the rows produced by the current run.
        csv_path: Path of the cumulative CSV file; it is overwritten.
        dedup_cols: Column names forming the de-duplication / sort key.

    Returns:
        None. The only effect is the rewritten file.
    """
    # dict.fromkeys drops repeated names while keeping their order.
    key_cols = list(dict.fromkeys(c for c in dedup_cols if c in df_new.columns))
    df_old = _read_history(csv_path)

    if df_old is None:
        # First run: keep everything, no de-duplication.
        df_merged = df_new.copy()
    else:
        can_compare = (
            bool(key_cols)
            and not df_new.empty
            and not df_old.empty
            and all(c in df_old.columns for c in key_cols)
        )
        if can_compare:
            df_old, df_add = _select_unseen_rows(df_old, df_new, key_cols)
        else:
            # Nothing to compare against: append every new row.
            df_add = df_new.copy()
        df_merged = pd.concat([df_old, df_add], ignore_index=True)

    sort_cols = [c for c in dedup_cols if c in df_merged.columns]
    if sort_cols:
        # Regroup rows so that equal keys sit together.
        df_merged = df_merged.sort_values(by=sort_cols).reset_index(drop=True)

    df_merged.to_csv(csv_path, index=False, encoding='utf-8-sig')