# main_pe_0.py (4.5 KB)

  1. import os
  2. import time
  3. from datetime import datetime, timedelta
  4. from config import mongodb_config, vj_flight_route_list_hot, vj_flight_route_list_nothot, CLEAN_VJ_HOT_NEAR_INFO_TAB, CLEAN_VJ_NOTHOT_NEAR_INFO_TAB
  5. from data_loader import load_train_data
  6. from data_preprocess import preprocess_data_simple
  7. from utils import chunk_list_with_index
  8. def start_predict():
  9. print(f"开始预测")
  10. output_dir = "./data_shards_0"
  11. photo_dir = "./photo_0"
  12. predict_dir = "./predictions_0"
  13. # 确保目录存在
  14. os.makedirs(output_dir, exist_ok=True)
  15. os.makedirs(photo_dir, exist_ok=True)
  16. os.makedirs(predict_dir, exist_ok=True)
  17. cpu_cores = os.cpu_count() # 你的系统是72
  18. max_workers = min(4, cpu_cores) # 最大不超过4个进程
  19. # 当前时间,取整时
  20. current_time = datetime.now()
  21. current_time_str = current_time.strftime("%Y%m%d%H%M")
  22. hourly_time = current_time.replace(minute=0, second=0, microsecond=0)
  23. hourly_time_str = hourly_time.strftime("%Y%m%d%H%M")
  24. print(f"预测时间:{current_time_str}, (取整): {hourly_time_str}")
  25. # 预测时间范围,满足起飞时间 在18小时后到54小时后
  26. pred_hour_begin = hourly_time + timedelta(hours=18)
  27. pred_hour_end = hourly_time + timedelta(hours=54)
  28. pred_date_end = pred_hour_end.strftime("%Y-%m-%d")
  29. pred_date_begin = pred_hour_begin.strftime("%Y-%m-%d")
  30. print(f"预测起飞时间范围: {pred_date_begin} 到 {pred_date_end}")
  31. # 主干代码 (排除冷门航线)
  32. flight_route_list = vj_flight_route_list_hot + vj_flight_route_list_nothot[:0]
  33. flight_route_list_len = len(flight_route_list)
  34. route_len_hot = len(vj_flight_route_list_hot)
  35. route_len_nothot = len(vj_flight_route_list_nothot[:0])
  36. group_size = 1 # 每几组作为一个批次
  37. chunks = chunk_list_with_index(flight_route_list, group_size)
  38. # 如果从中途某个批次预测, 修改起始索引
  39. resume_chunk_idx = 0
  40. chunks = chunks[resume_chunk_idx:]
  41. batch_starts = [start_idx for start_idx, _ in chunks]
  42. print(f"预测阶段起始索引顺序:{batch_starts}")
  43. # 预测阶段
  44. for i, (_, group_route_list) in enumerate(chunks, start=resume_chunk_idx):
  45. # 特殊处理,跳过不好的批次
  46. # client, db = mongo_con_parse()
  47. print(f"第 {i} 组 :", group_route_list)
  48. # batch_flight_routes = group_route_list
  49. # 根据索引位置决定是 热门 还是 冷门
  50. if 0 <= i < route_len_hot:
  51. is_hot = 1
  52. table_name = CLEAN_VJ_HOT_NEAR_INFO_TAB
  53. elif route_len_hot <= i < route_len_hot + route_len_nothot:
  54. is_hot = 0
  55. table_name = CLEAN_VJ_NOTHOT_NEAR_INFO_TAB
  56. else:
  57. print(f"无法确定热门还是冷门, 跳过此批次。")
  58. continue
  59. # 加载测试数据 (仅仅是时间段取到后天)
  60. start_time = time.time()
  61. df_test = load_train_data(mongodb_config, group_route_list, table_name, pred_date_begin, pred_date_end, output_dir, is_hot,
  62. use_multiprocess=True, max_workers=max_workers)
  63. end_time = time.time()
  64. run_time = round(end_time - start_time, 3)
  65. print(f"用时: {run_time} 秒")
  66. if df_test.empty:
  67. print(f"测试数据为空,跳过此批次。")
  68. continue
  69. # 按起飞时间过滤
  70. # 创建临时字段:seg1_dep_time 的整点时间
  71. df_test['seg1_dep_hour'] = df_test['seg1_dep_time'].dt.floor('h')
  72. # 使用整点时间进行比较过滤
  73. mask = (df_test['seg1_dep_hour'] >= pred_hour_begin) & (df_test['seg1_dep_hour'] < pred_hour_end)
  74. original_count = len(df_test)
  75. df_test = df_test[mask].reset_index(drop=True)
  76. filtered_count = len(df_test)
  77. # 删除临时字段
  78. df_test = df_test.drop(columns=['seg1_dep_hour'])
  79. print(f"按起飞时间过滤:过滤前 {original_count} 条,过滤后 {filtered_count} 条")
  80. if filtered_count == 0:
  81. print(f"起飞时间在 {pred_hour_begin} 到 {pred_hour_end} 之间没有航班,跳过此批次。")
  82. continue
  83. df_test_inputs = preprocess_data_simple(df_test)
  84. # 保存临时文件
  85. csv_path = os.path.join(output_dir, f'temp.csv')
  86. df_test_inputs.to_csv(csv_path, mode='a', index=False, header=not os.path.exists(csv_path), encoding='utf-8-sig')
  87. del df_test_inputs
  88. pass
  89. pass
  90. if __name__ == "__main__":
  91. start_predict()