| 1234567891011121314151617181920212223242526 |
- import os
- import pandas as pd
def merge_and_overwrite_csv(df_new, csv_path, dedup_cols):
    """Append not-yet-seen rows of ``df_new`` to a history CSV and rewrite it.

    If ``csv_path`` already holds data, every row of ``df_new`` whose
    combination of key-column values already occurs in the file is dropped:
    that key is considered already processed, so none of its rows are
    appended again.  Only rows with key combinations absent from the history
    are added.  On the first run (no file, or an empty file) ``df_new`` is
    written as-is, without any de-duplication.

    The result is sorted by the key columns (stable, so rows sharing a key
    keep their relative order) and written back to ``csv_path``.

    Args:
        df_new: DataFrame with the rows produced by the current run.
        csv_path: Path of the CSV file to read from and overwrite.
        dedup_cols: Iterable of column names identifying an already-processed
            group.  Names missing from ``df_new`` are ignored.

    Returns:
        None.  The file at ``csv_path`` is overwritten (utf-8-sig, no index).
    """
    key_cols = [c for c in dedup_cols if c in df_new.columns]

    df_old = None
    if os.path.exists(csv_path):
        try:
            df_old = pd.read_csv(csv_path, encoding='utf-8-sig')
        except pd.errors.EmptyDataError:
            # A zero-byte file carries no history; handle it like a first run
            # instead of crashing.
            df_old = None

    if df_old is None:
        # First run: keep everything, no de-duplication.
        df_merged = df_new
    else:
        if key_cols and all(c in df_old.columns for c in key_cols):
            # Later runs: anti-join df_new against the distinct keys already
            # stored, keeping only rows whose key is new.
            # NOTE(review): key columns read back from CSV may have a
            # different dtype than in df_new (e.g. numeric-looking strings);
            # verify callers pass keys that round-trip through CSV unchanged.
            df_old_keys = df_old[key_cols].drop_duplicates()
            # indicator=True adds a '_merge' column telling where each row
            # was found; 'left_only' means it exists only in df_new.
            df_add = df_new.merge(
                df_old_keys, on=key_cols, how='left', indicator=True
            )
            df_add = df_add[df_add['_merge'] == 'left_only'].drop(
                columns=['_merge']
            )
        else:
            # Keys are unusable (none given, or missing from the history
            # file): append everything.
            df_add = df_new
        df_merged = pd.concat([df_old, df_add], ignore_index=True)

    sort_cols = [c for c in dedup_cols if c in df_merged.columns]
    if sort_cols:
        # Regroup rows by key.  mergesort is stable, so rows that share a key
        # keep their original relative order even for a single sort column.
        df_merged = df_merged.sort_values(
            by=sort_cols, kind='mergesort'
        ).reset_index(drop=True)
    df_merged.to_csv(csv_path, index=False, encoding='utf-8-sig')
|