|
@@ -152,6 +152,131 @@ def verify_process(min_batch_time_str, max_batch_time_str):
|
|
|
print()
|
|
print()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def verify_process_2(min_batch_time_str, max_batch_time_str):
    """Validate keep_info_end_ batch CSVs against Mongo ``uo`` records.

    Scans ``object_dir`` for files named ``keep_info_end_<YYYYmmddHHMMSS>.csv``,
    keeps those whose batch hour falls inside the inclusive window
    [min_batch_time_str, max_batch_time_str] (both ``%Y%m%d%H%M``, truncated to
    the hour), concatenates them, assigns a group id per flight-key combination
    and issues one Mongo lookup per group via ``validate_keep_one_line``.

    Args:
        min_batch_time_str: inclusive lower bound of the batch window, "%Y%m%d%H%M".
        max_batch_time_str: inclusive upper bound of the batch window, "%Y%m%d%H%M".

    Returns:
        None. Progress and problems are reported via ``print``; writing the
        result CSV to ``output_path`` is not implemented yet (see TODOs).
    """
    object_dir = "/home/node04/descending_cabin_files_uo"

    output_dir = "./validate/keep"  # plain string; original was an f-string with no placeholders
    os.makedirs(output_dir, exist_ok=True)

    timestamp_str = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    save_csv = f"result_keep_verify_{timestamp_str}.csv"
    # TODO(review): output_path is prepared but the result CSV is never written.
    output_path = os.path.join(output_dir, save_csv)

    # Bail out early when the source directory is missing.
    if not os.path.exists(object_dir):
        print(f"目录不存在: {object_dir}")
        return

    # All CSV files whose name starts with keep_info_end_.
    csv_files = [
        file
        for file in os.listdir(object_dir)
        if file.startswith("keep_info_end_") and file.endswith(".csv")
    ]
    if not csv_files:
        print(f"在 {object_dir} 中没有找到 keep_info_end_ 开头的 CSV 文件")
        return
    csv_files.sort()

    # Parse the window bounds and truncate both to the hour.
    min_batch_dt = datetime.datetime.strptime(min_batch_time_str, "%Y%m%d%H%M")
    min_batch_dt = min_batch_dt.replace(minute=0, second=0, microsecond=0)
    max_batch_dt = datetime.datetime.strptime(max_batch_time_str, "%Y%m%d%H%M")
    max_batch_dt = max_batch_dt.replace(minute=0, second=0, microsecond=0)

    # strptime() raises on bad input and never returns None, so only the
    # ordering of the bounds needs checking here.
    if min_batch_dt > max_batch_dt:
        print(f"时间范围非法: min_batch_time_str({min_batch_time_str}) > max_batch_time_str({max_batch_time_str}),退出")
        return

    list_df = []

    # Load every keep_info_end_ file whose batch hour lies inside the window.
    for csv_file in csv_files:
        batch_time_str = csv_file.replace("keep_info_end_", "").replace(".csv", "")
        batch_dt = datetime.datetime.strptime(batch_time_str, "%Y%m%d%H%M%S")
        batch_hour_dt = batch_dt.replace(minute=0, second=0, microsecond=0)

        if batch_hour_dt < min_batch_dt or batch_hour_dt > max_batch_dt:
            continue

        # Read the CSV file; a broken file is reported and skipped.
        csv_path = os.path.join(object_dir, csv_file)
        try:
            df_keep_info = pd.read_csv(csv_path)
        except Exception as e:
            print(f"read {csv_path} error: {str(e)}")
            continue

        if df_keep_info.empty:
            print(f"keep_info数据为空: {csv_file}")
            continue

        # Tag every row with its (hour-truncated) batch time.
        df_keep_info["batch_time_str"] = batch_hour_dt.strftime("%Y%m%d%H%M")
        list_df.append(df_keep_info)
        del df_keep_info

    if not list_df:
        print("时间范围内没有可用 keep_info_end_ 数据")
        return

    df_keep_all = pd.concat(list_df, ignore_index=True)
    del list_df

    # Stable sort + ngroup assigns each flight-key combination a 1-based gid.
    sort_cols = ["citypair", "flight_numbers", "baggage_weight", "from_date", "into_update_hour"]
    df_keep_all = df_keep_all.sort_values(sort_cols, kind="mergesort").reset_index(drop=True)
    df_keep_all["gid"] = df_keep_all.groupby(sort_cols, sort=False).ngroup().astype("int64") + 1

    client, db = mongo_con_parse(mongo_config)
    try:
        list_base_row = []  # TODO(review): populated nowhere yet — result collection is WIP

        for gid, df_gid in df_keep_all.groupby("gid", sort=False):
            city_pair = df_gid["citypair"].iloc[0]
            flight_numbers = df_gid["flight_numbers"].iloc[0]
            baggage_weight = df_gid["baggage_weight"].iloc[0]
            from_date = df_gid["from_date"].iloc[0]
            into_update_hour = df_gid["into_update_hour"].iloc[0]
            valid_end_hour = df_gid["valid_end_hour"].iloc[0]

            into_update_dt = pd.to_datetime(
                df_gid.get("into_update_hour"), format="%Y-%m-%d %H:%M:%S", errors="coerce"
            ).min()
            batch_dt_series = pd.to_datetime(
                df_gid.get("batch_time_str"), format="%Y%m%d%H%M", errors="coerce"
            )
            batch_dt = batch_dt_series.max()

            # Guard BEFORE any use of the parsed times (the original computed
            # `flag` from a possibly-NaT batch_dt first).
            if pd.isna(into_update_dt) or pd.isna(batch_dt):
                print(f"gid={gid} 时间字段解析失败,跳过")
                continue

            # Price taken from the row of the most recent batch.
            entry_price = float("nan")
            if batch_dt_series.notna().any():
                idx_latest = batch_dt_series.idxmax()
                entry_price = pd.to_numeric(df_gid.loc[idx_latest].get("price_total"), errors="coerce")

            valid_end_dt = pd.to_datetime(valid_end_hour, format="%Y-%m-%d %H:%M:%S", errors="coerce")

            flag = 0  # waiting (popped) marker
            if batch_dt >= valid_end_dt:
                flag = 2  # timed-out marker
            # TODO(review): `flag` is computed but not consumed anywhere below yet.

            # TODO(review): create_time_begin is currently unused.
            create_time_begin = (batch_dt + pd.Timedelta(hours=0)).strftime("%Y-%m-%d %H:%M:%S")
            create_time_end = (batch_dt + pd.Timedelta(hours=8)).strftime("%Y-%m-%d %H:%M:%S")

            df_query = validate_keep_one_line(db, mongo_table_uo, city_pair, flight_numbers, baggage_weight, from_date, entry_price, into_update_hour, create_time_end)

            df_g1 = df_gid.copy()
            df_g2 = df_query.copy()

            # TODO(review): comparison of df_g1 vs df_g2 and appending to
            # list_base_row / writing output_path is not implemented yet.
    finally:
        # NOTE(review): assumes mongo_con_parse returns a client exposing
        # close() (e.g. pymongo MongoClient) — confirm. Original leaked it.
        client.close()
|
|
|
if __name__ == "__main__":
    # Original paste contained duplicated guard/call/pass lines, leaving the
    # first `if` without a body (IndentationError). Deduplicated to one call.
    verify_process("202604071700", "202604081400")
|