follow_up.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355
  1. import os
  2. import datetime
  3. import time
  4. import pandas as pd
  5. from config import mongodb_config
  6. def follow_up_handle():
  7. '''后续处理'''
  8. object_dir = "./predictions_0"
  9. output_dir = "./keep_0"
  10. # 创建输出目录
  11. os.makedirs(output_dir, exist_ok=True)
  12. # 检查目录是否存在
  13. if not os.path.exists(object_dir):
  14. print(f"目录不存在: {object_dir}")
  15. return
  16. # 获取所有以 future_predictions_ 开头的 CSV 文件
  17. csv_files = []
  18. for file in os.listdir(object_dir):
  19. if file.startswith("future_predictions_") and file.endswith(".csv"):
  20. csv_files.append(file)
  21. if not csv_files:
  22. print(f"在 {object_dir} 中没有找到 future_predictions_ 开头的 CSV 文件")
  23. return
  24. csv_files.sort()
  25. # 调试分支
  26. # target_time = "202603131400"
  27. # matching_files = [f for f in csv_files if target_time in f]
  28. # if matching_files:
  29. # last_csv_file = matching_files[0]
  30. # print(f"指定时间的文件: {last_csv_file}")
  31. # else:
  32. # print(f"未找到时间 {target_time} 的预测文件")
  33. # return
  34. # 正式分支
  35. last_csv_file = csv_files[-1] # 只看最新预测的文件
  36. print(f"最新预测文件: {last_csv_file}")
  37. if last_csv_file.startswith("future_predictions_") and last_csv_file.endswith(".csv"):
  38. target_time = last_csv_file.replace("future_predictions_", "").replace(".csv", "")
  39. else:
  40. target_time = datetime.datetime.now().strftime("%Y%m%d%H%M")
  41. # 读取最新预测文件
  42. last_csv_path = os.path.join(object_dir, last_csv_file)
  43. df_last_predict = pd.read_csv(last_csv_path)
  44. df_last_predict_will_drop = df_last_predict[df_last_predict["will_price_drop"] == 1].reset_index(drop=True)
  45. df_last_predict_not_drop = df_last_predict[df_last_predict["will_price_drop"] == 0].reset_index(drop=True)
  46. print(f"最新预测文件中,预测降价的航班有 {len(df_last_predict_will_drop)} 条,预测不降价的航班有 {len(df_last_predict_not_drop)} 条")
  47. # 建一张 维护表 keep_info.csv 附加一个维护表快照 keep_info_{target_time}.csv
  48. keep_info_path = os.path.join(output_dir, "keep_info.csv")
  49. keep_info_snapshot_path = os.path.join(output_dir, f"keep_info_{target_time}.csv")
  50. key_cols = ["city_pair", "flight_day", "flight_number_1", "flight_number_2"]
  51. df_last_predict_will_drop = df_last_predict_will_drop.drop_duplicates(
  52. subset=key_cols, keep="last"
  53. ).reset_index(drop=True)
  54. # df_last_predict_not_drop = df_last_predict_not_drop.drop_duplicates(
  55. # subset=key_cols, keep="last"
  56. # ).reset_index(drop=True)
  57. # 读取维护表
  58. if os.path.exists(keep_info_path):
  59. try:
  60. df_keep_info = pd.read_csv(keep_info_path)
  61. except Exception as e:
  62. print(f"读取维护表失败: {keep_info_path}, error: {str(e)}")
  63. df_keep_info = pd.DataFrame()
  64. else:
  65. df_keep_info = pd.DataFrame()
  66. def _parse_dt(yyyymmddhhmm):
  67. try:
  68. return datetime.datetime.strptime(str(yyyymmddhhmm), "%Y%m%d%H%M")
  69. except Exception:
  70. return None
  71. current_dt = _parse_dt(target_time)
  72. prev_dt = None
  73. hud_decrement = 1
  74. # if not df_keep_info.empty and "last_predict_time" in df_keep_info.columns:
  75. # prev_candidates = (
  76. # df_keep_info["last_predict_time"].dropna().astype(str).tolist()
  77. # )
  78. # if prev_candidates:
  79. # prev_dt = _parse_dt(max(prev_candidates))
  80. if prev_dt is None:
  81. snapshot_times = []
  82. for f in os.listdir(output_dir):
  83. if (
  84. f.startswith("keep_info_")
  85. and f.endswith(".csv")
  86. and f != f"keep_info_{target_time}.csv"
  87. ):
  88. ts = f.replace("keep_info_", "").replace(".csv", "")
  89. dt = _parse_dt(ts)
  90. if dt is not None:
  91. snapshot_times.append(dt)
  92. if snapshot_times:
  93. prev_dt = max(snapshot_times)
  94. if current_dt is not None and prev_dt is not None:
  95. delta_seconds = (current_dt - prev_dt).total_seconds()
  96. if delta_seconds >= 0:
  97. hud_decrement = max(0, int(delta_seconds // 3600))
  98. else:
  99. hud_decrement = 0
  100. # 初始化维护表
  101. if df_keep_info.empty:
  102. df_keep_info = df_last_predict_will_drop.copy()
  103. df_keep_info["into_update_hour"] = df_keep_info['update_hour']
  104. # df_keep_info["into_price"] = df_keep_info['adult_total_price']
  105. df_keep_info["keep_flag"] = 1
  106. # df_keep_info["last_predict_time"] = target_time
  107. # 将长时间没更新的航班标记为-1
  108. dt_update_hour = pd.to_datetime(df_keep_info["update_hour"], errors="coerce")
  109. dt_crawl_date = pd.to_datetime(df_keep_info["crawl_date"], errors="coerce")
  110. mask_abnormal_time = (dt_update_hour - dt_crawl_date) > pd.Timedelta(hours=8)
  111. if mask_abnormal_time.any():
  112. df_keep_info.loc[mask_abnormal_time.fillna(False), "keep_flag"] = -1
  113. df_keep_info.to_csv(keep_info_snapshot_path, index=False, encoding="utf-8-sig")
  114. print(f"维护表快照已保存: {keep_info_snapshot_path} (rows={len(df_keep_info)})")
  115. # 移除 keep_flag 为 -1 的行
  116. # before_rm = len(df_keep_info)
  117. df_keep_info = df_keep_info.loc[df_keep_info["keep_flag"] != -1].reset_index(drop=True)
  118. # rm_rows = before_rm - len(df_keep_info)
  119. df_keep_info.to_csv(keep_info_path, index=False, encoding="utf-8-sig")
  120. print(f"维护表已初始化: {keep_info_path} (rows={len(df_keep_info)})")
  121. # 已存在维护表
  122. else:
  123. if "keep_flag" not in df_keep_info.columns:
  124. df_keep_info["keep_flag"] = 0
  125. df_keep_info["keep_flag"] = (
  126. pd.to_numeric(df_keep_info["keep_flag"], errors="coerce")
  127. .fillna(0)
  128. .astype(int)
  129. )
  130. missing_cols = [c for c in key_cols if c not in df_keep_info.columns]
  131. if missing_cols:
  132. print(f"维护表缺少字段: {missing_cols}, path={keep_info_path}")
  133. return
  134. for c in key_cols:
  135. df_last_predict_will_drop[c] = df_last_predict_will_drop[c].astype(str)
  136. # df_last_predict_not_drop[c] = df_last_predict_not_drop[c].astype(str)
  137. df_keep_info[c] = df_keep_info[c].astype(str)
  138. df_keep_info = df_keep_info.drop_duplicates(subset=key_cols, keep="last").reset_index(drop=True)
  139. # 提取两者的标志位
  140. df_last_keys = df_last_predict_will_drop[key_cols].drop_duplicates().reset_index(drop=True)
  141. df_keep_keys = df_keep_info[key_cols].drop_duplicates().reset_index(drop=True)
  142. df_last_with_merge = df_last_predict_will_drop.merge(
  143. df_keep_keys, on=key_cols, how="left", indicator=True
  144. )
  145. # 场景一: 如果某一行数据在 df_last_predict_will_drop 出现,没有在 df_keep_info 里
  146. df_to_add = (
  147. df_last_with_merge.loc[df_last_with_merge["_merge"] == "left_only"]
  148. .drop(columns=["_merge"])
  149. .copy()
  150. )
  151. # keep_flag 设为 1
  152. if not df_to_add.empty:
  153. df_to_add['into_update_hour'] = df_to_add['update_hour']
  154. # df_to_add['into_price'] = df_to_add['adult_total_price']
  155. df_to_add["keep_flag"] = 1
  156. df_keep_with_merge = df_keep_info.reset_index().merge(
  157. df_last_keys, on=key_cols, how="left", indicator=True
  158. )
  159. # 场景二: 如果某一行数据在 df_last_predict_will_drop 和 df_keep_info 里都出现
  160. matched_idx = df_keep_with_merge.loc[df_keep_with_merge["_merge"] == "both", "index"].tolist()
  161. # 场景三: 如果某一行数据在 df_last_predict_will_drop 没有出现,却在 df_keep_info 里都出现
  162. keep_only_idx = df_keep_with_merge.loc[df_keep_with_merge["_merge"] == "left_only", "index"].tolist()
  163. # 符合场景二的索引 (在 df_keep_with_merge 中)
  164. if matched_idx:
  165. df_matched_keys = df_keep_info.loc[matched_idx, key_cols]
  166. df_latest_matched = df_matched_keys.merge(
  167. df_last_predict_will_drop, on=key_cols, how="left"
  168. )
  169. # 将 df_keep_info 的 df_matched_keys 的内容更新为 df_last_predict_will_drop 里对应的内容
  170. update_cols = [c for c in df_last_predict_will_drop.columns if c not in key_cols]
  171. for c in update_cols:
  172. if c == "keep_flag":
  173. continue
  174. if c not in df_keep_info.columns:
  175. df_keep_info[c] = pd.NA
  176. df_keep_info.loc[matched_idx, c] = df_latest_matched[c].values
  177. # 重新标记 原来是1 -> 0 原来是0 -> 0 原来是2 -> 0, 原来是-1 -> 1
  178. old_flags = df_keep_info.loc[matched_idx, "keep_flag"]
  179. df_keep_info.loc[matched_idx, "keep_flag"] = old_flags.apply(
  180. lambda x: 0 if x in (0, 1, 2) else (1 if x == -1 else 1)
  181. )
  182. # 符合场景三的索引 (在 df_keep_with_merge 中)
  183. if keep_only_idx:
  184. mask_keep_only = df_keep_info.index.isin(keep_only_idx) # 布尔索引序列
  185. # 如果 df_keep_info 的 keep_flag 为-1,此时标记为-2
  186. # mask_to_remove = mask_keep_only & (df_keep_info["keep_flag"] == -1)
  187. # df_keep_info.loc[mask_to_remove, "keep_flag"] = -2
  188. # 如果 df_keep_info 的 keep_flag 大于等于0
  189. mask_need_observe = mask_keep_only & (df_keep_info["keep_flag"] >= 0) # 布尔索引序列
  190. if mask_need_observe.any():
  191. if "hours_until_departure" not in df_keep_info.columns:
  192. df_keep_info.loc[mask_need_observe, "keep_flag"] = -1
  193. else:
  194. hud = pd.to_numeric(
  195. df_keep_info.loc[mask_need_observe, "hours_until_departure"],
  196. errors="coerce",
  197. )
  198. # hours_until_departure自动减1
  199. # new_hud = hud - 1
  200. new_hud = hud - hud_decrement
  201. df_keep_info.loc[mask_need_observe, "hours_until_departure"] = new_hud
  202. df_keep_info.loc[mask_need_observe, "keep_flag"] = -1 # 删除标志
  203. # df_keep_only_keys = df_keep_info.loc[mask_keep_only, key_cols].copy()
  204. # df_keep_only_keys["_row_idx"] = df_keep_only_keys.index
  205. # # 检查 df_keep_only_keys 是否在 df_last_predict_not_drop 中
  206. # df_keep_only_keys = df_keep_only_keys.merge(
  207. # df_last_predict_not_drop[key_cols].drop_duplicates(),
  208. # on=key_cols,
  209. # how="left",
  210. # indicator=True,
  211. # )
  212. # idx_in_not_drop = df_keep_only_keys.loc[
  213. # df_keep_only_keys["_merge"] == "both", "_row_idx"
  214. # ].tolist()
  215. # mask_in_not_drop = df_keep_info.index.isin(idx_in_not_drop) # 在 df_last_predict_not_drop 中出现 只是will_price_drop为0 未达边界
  216. # mask_not_drop_observe = mask_need_observe & mask_in_not_drop # 判断为不降价的布尔索引数组
  217. # mask_boundary_observe = mask_need_observe & ~mask_in_not_drop # 判断为到达边界的布尔索引数组
  218. # df_keep_info.loc[mask_not_drop_observe, "keep_flag"] = -1 # 删除标志
  219. # if mask_boundary_observe.any():
  220. # new_hud_full = pd.to_numeric(
  221. # df_keep_info["hours_until_departure"], errors="coerce"
  222. # )
  223. # df_keep_info.loc[mask_boundary_observe, "keep_flag"] = -1 # 默认删除标志
  224. # df_keep_info.loc[
  225. # mask_boundary_observe & new_hud_full.gt(4), "keep_flag" # 如果达到边界且hours_until_departure大于4 则给保留标志
  226. # ] = 2
  227. pass
  228. # 对于这些边界保持状态(keep_flag为2) 检查其是否在最新一次验价后的文件里存在, 如果不存在 则标记为-1
  229. # df_temp_2 = df_keep_info.loc[df_keep_info["keep_flag"] == 2, key_cols].copy()
  230. # if not df_temp_2.empty:
  231. # end_dir = "/home/node04/descending_cabin_files"
  232. # end_candidates = []
  233. # if os.path.isdir(end_dir):
  234. # for f in os.listdir(end_dir):
  235. # if f.startswith("keep_info_end_") and f.endswith(".csv"):
  236. # ts = f.replace("keep_info_end_", "").replace(".csv", "")
  237. # if ts.isdigit():
  238. # end_candidates.append((ts, f)) #(时间戳,文件名)
  239. # if end_candidates:
  240. # end_candidates.sort(key=lambda x: x[0])
  241. # end_last_path = os.path.join(end_dir, end_candidates[-1][1]) # 最新一次验价后的文件
  242. # try:
  243. # df_end_last = pd.read_csv(end_last_path)
  244. # except Exception:
  245. # df_end_last = pd.DataFrame()
  246. # if not df_end_last.empty and all(c in df_end_last.columns for c in key_cols): # key_cols作为比对条件
  247. # df_temp_2["_row_idx"] = df_temp_2.index
  248. # df_end_keys = df_end_last[key_cols].drop_duplicates().copy()
  249. # for c in key_cols:
  250. # df_temp_2[c] = df_temp_2[c].astype(str)
  251. # df_end_keys[c] = df_end_keys[c].astype(str)
  252. # df_temp_2_with_merge = df_temp_2.merge(
  253. # df_end_keys, on=key_cols, how="left", indicator=True
  254. # )
  255. # # 对于只在 df_temp_2 出现,而不在 df_end_keys 出现的索引,在 df_keep_info 中标记为-1
  256. # idx_to_rm_2 = df_temp_2_with_merge.loc[
  257. # df_temp_2_with_merge["_merge"] == "left_only", "_row_idx"
  258. # ].tolist()
  259. # if idx_to_rm_2:
  260. # df_keep_info.loc[idx_to_rm_2, "keep_flag"] = -1
  261. # 将 df_to_add 添加到 df_keep_info 之后
  262. add_rows = len(df_to_add) if "df_to_add" in locals() else 0
  263. if add_rows:
  264. df_keep_info = pd.concat([df_keep_info, df_to_add], ignore_index=True)
  265. # 将长时间没更新的航班标记为-1
  266. dt_update_hour = pd.to_datetime(df_keep_info["update_hour"], errors="coerce")
  267. dt_crawl_date = pd.to_datetime(df_keep_info["crawl_date"], errors="coerce")
  268. mask_abnormal_time = (dt_update_hour - dt_crawl_date) > pd.Timedelta(hours=8)
  269. if mask_abnormal_time.any():
  270. df_keep_info.loc[mask_abnormal_time.fillna(False), "keep_flag"] = -1
  271. df_keep_info_snapshot = df_keep_info.copy()
  272. df_keep_info_snapshot.to_csv(keep_info_snapshot_path, index=False, encoding="utf-8-sig")
  273. print(
  274. f"维护表快照已保存: {keep_info_snapshot_path} (rows={len(df_keep_info_snapshot)})"
  275. )
  276. # 移除 keep_flag 为 -1 的行
  277. before_rm = len(df_keep_info)
  278. df_keep_info = df_keep_info.loc[df_keep_info["keep_flag"] != -1].reset_index(drop=True)
  279. rm_rows = before_rm - len(df_keep_info)
  280. # 保存更新后的 df_keep_info 到维护表csv文件
  281. df_keep_info.to_csv(keep_info_path, index=False, encoding="utf-8-sig")
  282. print(
  283. f"维护表已更新: {keep_info_path} (rows={len(df_keep_info)} add={add_rows} rm={rm_rows})"
  284. )
  285. # ================================================================
  286. # for idx, row in df_last_predict_will_drop.iterrows():
  287. # city_pair = row['city_pair']
  288. # flight_day = row['flight_day']
  289. # flight_number_1 = row['flight_number_1']
  290. # flight_number_2 = row['flight_number_2']
  291. # baggage = row['baggage']
  292. # from_city_code = city_pair.split('-')[0]
  293. # to_city_code = city_pair.split('-')[1]
  294. # from_day = datetime.datetime.strptime(flight_day, '%Y-%m-%d').strftime('%Y%m%d')
  295. # baggage_str = f"1-{baggage}"
  296. # pass
  297. # adult_total_price = row['adult_total_price']
  298. # hours_until_departure = row['hours_until_departure']
  299. pass
  300. if __name__ == "__main__":
  301. time.sleep(5)
  302. follow_up_handle()
  303. time.sleep(5)
  304. from descending_cabin_task import main as descending_cabin_task_main
  305. descending_cabin_task_main()