utils.py 1.4 KB

1234567891011121314151617181920212223242526
  1. import os
  2. import pandas as pd
  3. def merge_and_overwrite_csv(df_new, csv_path, dedup_cols):
  4. key_cols = [c for c in dedup_cols if c in df_new.columns]
  5. # 若干天后的训练:如果本次 df_new 里某些 flight_day(连同航班键)在历史 CSV df_old 里已经出现过,就认为这一天已经处理过了,
  6. # 本次不再追加这一天的任何节点;只追加“历史里不存在的 flight_day(同航班键)”的数据
  7. if os.path.exists(csv_path):
  8. df_old = pd.read_csv(csv_path, encoding='utf-8-sig')
  9. if key_cols and all(c in df_old.columns for c in key_cols):
  10. df_old_keys = df_old[key_cols].drop_duplicates()
  11. df_add = df_new.merge(df_old_keys, on=key_cols, how='left', indicator=True) # indicator=True 会在结果df中添加一个_merge列
  12. df_add = df_add[df_add['_merge'] == 'left_only'].drop(columns=['_merge']) # left_only 只在左表(df_new)中存在
  13. else:
  14. df_add = df_new.copy()
  15. df_merged = pd.concat([df_old, df_add], ignore_index=True)
  16. # 第一次训练:直接保留,不做去重
  17. else:
  18. df_merged = df_new.copy()
  19. sort_cols = [c for c in dedup_cols if c in df_merged.columns]
  20. if sort_cols:
  21. df_merged = df_merged.sort_values(by=sort_cols).reset_index(drop=True) # 重新分组排序
  22. df_merged.to_csv(csv_path, index=False, encoding='utf-8-sig')