# follow_up.py
import datetime
import os

import pandas as pd

from config import mongodb_config
  5. def follow_up_handle():
  6. '''后续处理'''
  7. object_dir = "./predictions_0"
  8. # 检查目录是否存在
  9. if not os.path.exists(object_dir):
  10. print(f"目录不存在: {object_dir}")
  11. return
  12. # 获取所有以 future_predictions_ 开头的 CSV 文件
  13. csv_files = []
  14. for file in os.listdir(object_dir):
  15. if file.startswith("future_predictions_") and file.endswith(".csv"):
  16. csv_files.append(file)
  17. if not csv_files:
  18. print(f"在 {object_dir} 中没有找到 future_predictions_ 开头的 CSV 文件")
  19. return
  20. csv_files.sort()
  21. # 调试分支
  22. # target_time = '202602251300'
  23. # matching_files = [f for f in csv_files if target_time in f]
  24. # if matching_files:
  25. # last_csv_file = matching_files[0]
  26. # print(f"指定时间的文件: {last_csv_file}")
  27. # else:
  28. # print(f"未找到时间 {target_time} 的预测文件")
  29. # 正式分支
  30. last_csv_file = csv_files[-1] # 只看最新预测的文件
  31. print(f"最新预测文件: {last_csv_file}")
  32. # 读取最新预测文件
  33. last_csv_path = os.path.join(object_dir, last_csv_file)
  34. df_last_predict = pd.read_csv(last_csv_path)
  35. df_last_predict_will_drop = df_last_predict[df_last_predict["will_price_drop"] == 1].reset_index(drop=True)
  36. print(f"最新预测文件中,预测降价的航班有 {len(df_last_predict_will_drop)} 条")
  37. # 建一张 维护表 keep_info.csv
  38. keep_info_path = os.path.join(object_dir, "keep_info.csv")
  39. key_cols = ["city_pair", "flight_day", "flight_number_1", "flight_number_2"]
  40. df_last_predict_will_drop = df_last_predict_will_drop.drop_duplicates(
  41. subset=key_cols, keep="last"
  42. ).reset_index(drop=True)
  43. # 读取维护表
  44. if os.path.exists(keep_info_path):
  45. try:
  46. df_keep_info = pd.read_csv(keep_info_path)
  47. except Exception as e:
  48. print(f"读取维护表失败: {keep_info_path}, error: {str(e)}")
  49. df_keep_info = pd.DataFrame()
  50. else:
  51. df_keep_info = pd.DataFrame()
  52. # 初始化维护表
  53. if df_keep_info.empty:
  54. df_keep_info = df_last_predict_will_drop.copy()
  55. df_keep_info["keep_flag"] = 1
  56. df_keep_info.to_csv(keep_info_path, index=False, encoding="utf-8-sig")
  57. print(f"维护表已初始化: {keep_info_path} (rows={len(df_keep_info)})")
  58. # 已存在维护表
  59. else:
  60. if "keep_flag" not in df_keep_info.columns:
  61. df_keep_info["keep_flag"] = 0
  62. df_keep_info["keep_flag"] = (
  63. pd.to_numeric(df_keep_info["keep_flag"], errors="coerce")
  64. .fillna(0)
  65. .astype(int)
  66. )
  67. missing_cols = [c for c in key_cols if c not in df_keep_info.columns]
  68. if missing_cols:
  69. print(f"维护表缺少字段: {missing_cols}, path={keep_info_path}")
  70. return
  71. for c in key_cols:
  72. df_last_predict_will_drop[c] = df_last_predict_will_drop[c].astype(str)
  73. df_keep_info[c] = df_keep_info[c].astype(str)
  74. df_keep_info = df_keep_info.drop_duplicates(subset=key_cols, keep="last").reset_index(drop=True)
  75. # 提取两者的标志位
  76. df_last_keys = df_last_predict_will_drop[key_cols].drop_duplicates().reset_index(drop=True)
  77. df_keep_keys = df_keep_info[key_cols].drop_duplicates().reset_index(drop=True)
  78. df_last_with_merge = df_last_predict_will_drop.merge(
  79. df_keep_keys, on=key_cols, how="left", indicator=True
  80. )
  81. # 场景一: 如果某一行数据在 df_last_predict_will_drop 出现,没有在 df_keep_info 里
  82. df_to_add = (
  83. df_last_with_merge.loc[df_last_with_merge["_merge"] == "left_only"]
  84. .drop(columns=["_merge"])
  85. .copy()
  86. )
  87. # keep_flag 设为 1
  88. if not df_to_add.empty:
  89. df_to_add["keep_flag"] = 1
  90. df_keep_with_merge = df_keep_info.reset_index().merge(
  91. df_last_keys, on=key_cols, how="left", indicator=True
  92. )
  93. # 场景二: 如果某一行数据在 df_last_predict_will_drop 和 df_keep_info 里都出现
  94. matched_idx = df_keep_with_merge.loc[df_keep_with_merge["_merge"] == "both", "index"].tolist()
  95. # 场景三: 如果某一行数据在 df_last_predict_will_drop 没有出现,却在 df_keep_info 里都出现
  96. keep_only_idx = df_keep_with_merge.loc[df_keep_with_merge["_merge"] == "left_only", "index"].tolist()
  97. # 符合场景二的索引 (在 df_keep_with_merge 中)
  98. if matched_idx:
  99. df_matched_keys = df_keep_info.loc[matched_idx, key_cols]
  100. df_latest_matched = df_matched_keys.merge(
  101. df_last_predict_will_drop, on=key_cols, how="left"
  102. )
  103. # 将 df_keep_info 的 df_matched_keys 的内容更新为 df_last_predict_will_drop 里对应的内容
  104. update_cols = [c for c in df_last_predict_will_drop.columns if c not in key_cols]
  105. for c in update_cols:
  106. if c == "keep_flag":
  107. continue
  108. if c not in df_keep_info.columns:
  109. df_keep_info[c] = pd.NA
  110. df_keep_info.loc[matched_idx, c] = df_latest_matched[c].values
  111. # 重新标记 原来是1 -> 0 原来是0 -> 0 原来是-1 -> 1
  112. old_flags = df_keep_info.loc[matched_idx, "keep_flag"]
  113. df_keep_info.loc[matched_idx, "keep_flag"] = old_flags.apply(
  114. lambda x: 0 if x in (0, 1) else (1 if x == -1 else 1)
  115. )
  116. # 符合场景三的索引 (在 df_keep_with_merge 中)
  117. if keep_only_idx:
  118. # 如果 df_keep_info 的 keep_flag 为-1,此时标记为-2
  119. mask_keep_only = df_keep_info.index.isin(keep_only_idx) # 布尔索引序列
  120. mask_to_remove = mask_keep_only & (df_keep_info["keep_flag"] == -1)
  121. df_keep_info.loc[mask_to_remove, "keep_flag"] = -2
  122. # 如果 df_keep_info 的 keep_flag 大于等于0
  123. mask_need_observe = mask_keep_only & (df_keep_info["keep_flag"] >= 0) # 布尔索引序列
  124. if mask_need_observe.any():
  125. if "hours_until_departure" not in df_keep_info.columns:
  126. df_keep_info.loc[mask_need_observe, "keep_flag"] = -1
  127. else:
  128. hud = pd.to_numeric(
  129. df_keep_info.loc[mask_need_observe, "hours_until_departure"],
  130. errors="coerce",
  131. )
  132. # hours_until_departure自动减1
  133. new_hud = hud - 1
  134. df_keep_info.loc[mask_need_observe, "hours_until_departure"] = new_hud
  135. idx_eq13 = mask_need_observe.copy()
  136. idx_eq13.loc[idx_eq13] = hud.eq(13) # 原hours_until_departure等于13
  137. idx_gt13 = mask_need_observe.copy()
  138. idx_gt13.loc[idx_gt13] = hud.gt(13) # 原hours_until_departure大于13
  139. idx_other = mask_need_observe & ~(idx_eq13 | idx_gt13) # 原hours_until_departure小于13
  140. idx_eq13_gt4 = idx_eq13 & new_hud.gt(4)
  141. idx_eq13_eq4 = idx_eq13 & new_hud.eq(4)
  142. idx_eq13_lt4 = idx_eq13 & new_hud.lt(4)
  143. df_keep_info.loc[idx_eq13_gt4, "keep_flag"] = 0
  144. df_keep_info.loc[idx_eq13_eq4, "keep_flag"] = -1
  145. df_keep_info.loc[idx_eq13_lt4, "keep_flag"] = -2
  146. df_keep_info.loc[idx_gt13, "keep_flag"] = -1
  147. idx_other_gt4 = idx_other & new_hud.gt(4)
  148. idx_other_eq4 = idx_other & new_hud.eq(4)
  149. idx_other_lt4 = idx_other & new_hud.lt(4)
  150. df_keep_info.loc[idx_other_gt4, "keep_flag"] = 0
  151. df_keep_info.loc[idx_other_eq4, "keep_flag"] = -1
  152. df_keep_info.loc[idx_other_lt4, "keep_flag"] = -2
  153. # 将 df_to_add 添加到 df_keep_info 之后
  154. add_rows = len(df_to_add) if "df_to_add" in locals() else 0
  155. if add_rows:
  156. df_keep_info = pd.concat([df_keep_info, df_to_add], ignore_index=True)
  157. # 移除 keep_flag 为 -2 的行
  158. before_rm = len(df_keep_info)
  159. df_keep_info = df_keep_info.loc[df_keep_info["keep_flag"] != -2].reset_index(drop=True)
  160. rm_rows = before_rm - len(df_keep_info)
  161. # 保存更新后的 df_keep_info 到csv文件
  162. df_keep_info.to_csv(keep_info_path, index=False, encoding="utf-8-sig")
  163. print(
  164. f"维护表已更新: {keep_info_path} (rows={len(df_keep_info)} add={add_rows} rm={rm_rows})"
  165. )
  166. # ================================================================
  167. # for idx, row in df_last_predict_will_drop.iterrows():
  168. # city_pair = row['city_pair']
  169. # flight_day = row['flight_day']
  170. # flight_number_1 = row['flight_number_1']
  171. # flight_number_2 = row['flight_number_2']
  172. # baggage = row['baggage']
  173. # from_city_code = city_pair.split('-')[0]
  174. # to_city_code = city_pair.split('-')[1]
  175. # from_day = datetime.datetime.strptime(flight_day, '%Y-%m-%d').strftime('%Y%m%d')
  176. # baggage_str = f"1-{baggage}"
  177. # pass
  178. # adult_total_price = row['adult_total_price']
  179. # hours_until_departure = row['hours_until_departure']
  180. pass
  181. if __name__ == "__main__":
  182. follow_up_handle()