|
@@ -0,0 +1,157 @@
|
|
|
|
|
+import os
|
|
|
|
|
+import datetime
|
|
|
|
|
+import pandas as pd
|
|
|
|
|
+from data_loader import mongo_con_parse, validate_keep_one_line
|
|
|
|
|
+from config import mongo_config, mongo_table_uo
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def _first_price_drop(df_query, entry_price, into_update_dt):
    """Find the earliest queried record that is cheaper than ``entry_price``.

    Args:
        df_query: Non-empty query result. It is only usable when it has both
            a ``price_total`` and a ``create_time`` column.
        entry_price: Numeric price the queried records are compared against.
        into_update_dt: Timestamp from which the elapsed hours are measured.

    Returns:
        A ``(price_diff, time_diff_hours)`` tuple, both rounded to two
        decimals, or ``None`` when a required column is missing or no record
        is priced below ``entry_price``.
    """
    if "price_total" not in df_query.columns or "create_time" not in df_query.columns:
        return None

    # Work on a derived frame so the caller's query result is left untouched.
    records = df_query.assign(
        price_total=pd.to_numeric(df_query["price_total"], errors="coerce"),
        create_dt=pd.to_datetime(df_query["create_time"], errors="coerce"),
    )
    records = (
        records.dropna(subset=["price_total", "create_dt"])
        .sort_values("create_dt")
        .reset_index(drop=True)
    )

    cheaper = records.loc[records["price_total"] < entry_price]
    if cheaper.empty:
        return None

    first_row = cheaper.iloc[0]
    price_diff = entry_price - first_row["price_total"]
    time_diff_hours = (first_row["create_dt"] - into_update_dt) / pd.Timedelta(hours=1)
    return round(float(price_diff), 2), round(float(time_diff_hours), 2)


def _validate_keep_info_df(df_keep_info_part):
    """Fill ``price_diff`` and ``time_diff_hours`` for every row of the frame.

    Each row's ``citypair``, ``flight_numbers``, ``baggage_weight``,
    ``from_date``, entry price, ``into_update_hour`` and the batch time
    (reformatted to ``%Y-%m-%d %H:%M:%S``) are passed to
    ``validate_keep_one_line``. When the result contains a record cheaper than
    the row's ``price_total``, the earliest such record (ordered by
    ``create_time``) provides the price drop and the hours elapsed since
    ``into_update_hour``. Otherwise both values stay ``0.0``.

    Args:
        df_keep_info_part: DataFrame with the columns ``citypair``,
            ``flight_numbers``, ``baggage_weight``, ``from_date``,
            ``into_update_hour`` (``%Y-%m-%d %H:%M:%S``) and
            ``del_batch_time_str`` (``%Y%m%d%H%M``). ``price_total`` is read
            with ``row.get`` and may be absent. The frame is modified in place.

    Returns:
        The same DataFrame with both result columns populated.
    """
    client, db = mongo_con_parse(mongo_config)
    try:
        # Every row is reset to zero anyway, so (re)create both columns up
        # front. They are floats from the start because the computed values
        # are floats, and writing a float into an integer column through
        # ``.at`` is a deprecated silent upcast in recent pandas.
        df_keep_info_part["price_diff"] = 0.0
        df_keep_info_part["time_diff_hours"] = 0.0

        for count, (idx, row) in enumerate(df_keep_info_part.iterrows(), start=1):
            into_update_hour = row["into_update_hour"]
            into_update_dt = pd.to_datetime(into_update_hour, format="%Y-%m-%d %H:%M:%S")
            del_batch_dt = pd.to_datetime(row["del_batch_time_str"], format="%Y%m%d%H%M")
            entry_price = pd.to_numeric(row.get("price_total"), errors="coerce")

            df_query = validate_keep_one_line(
                db,
                mongo_table_uo,
                row["citypair"],
                row["flight_numbers"],
                row["baggage_weight"],
                row["from_date"],
                entry_price,
                into_update_hour,
                del_batch_dt.strftime("%Y-%m-%d %H:%M:%S"),
            )

            drop = None
            # NOTE(review): the None guard is defensive; confirm whether
            # validate_keep_one_line can actually return None.
            if df_query is not None and not df_query.empty and pd.notna(entry_price):
                drop = _first_price_drop(df_query, entry_price, into_update_dt)
            if drop is not None:
                df_keep_info_part.at[idx, "price_diff"] = drop[0]
                df_keep_info_part.at[idx, "time_diff_hours"] = drop[1]

            # Release the per-row query result before the next round trip.
            del df_query

            if count % 5 == 0:
                print(f"cal count: {count}")

        print("计算结束")
    finally:
        # Close the connection even when a row fails part-way through.
        client.close()

    return df_keep_info_part
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def _parse_batch_hour(batch_time_str):
    """Parse a ``%Y%m%d%H%M`` string and truncate it to the whole hour.

    ``None`` is passed through unchanged so callers can treat it as
    "no bound". Raises ``ValueError`` when the string does not match the
    format.
    """
    if batch_time_str is None:
        return None
    parsed = datetime.datetime.strptime(batch_time_str, "%Y%m%d%H%M")
    return parsed.replace(minute=0, second=0, microsecond=0)


def _compute_status_flag(df_keep_info_del):
    """Build the ``status_flag`` series for a validated frame.

    The flag is ``1`` where ``price_diff`` is positive, ``2`` where
    ``price_diff`` is zero and the batch time is at or after
    ``valid_end_hour``, and ``0`` everywhere else. The frame must carry
    ``price_diff`` and ``del_batch_time_str``; ``valid_end_hour`` is optional.
    """
    price_diff_num = pd.to_numeric(df_keep_info_del["price_diff"], errors="coerce").fillna(0)
    del_batch_dt = pd.to_datetime(
        df_keep_info_del["del_batch_time_str"], format="%Y%m%d%H%M", errors="coerce"
    )
    if "valid_end_hour" in df_keep_info_del.columns:
        valid_end_dt = pd.to_datetime(
            df_keep_info_del["valid_end_hour"], format="%Y-%m-%d %H:%M:%S", errors="coerce"
        )
    else:
        # Without the column no row can reach the timeout state. An all-NaT
        # series keeps the mask arithmetic below valid, whereas
        # pd.to_datetime(None) would return None and break ``.notna()``.
        valid_end_dt = pd.Series(pd.NaT, index=df_keep_info_del.index)

    status_flag = pd.Series(0, index=df_keep_info_del.index, dtype="int64")  # other cases
    status_flag.loc[price_diff_num > 0] = 1  # price dropped
    mask_timed_out = (
        (price_diff_num == 0)
        & del_batch_dt.notna()
        & valid_end_dt.notna()
        & (del_batch_dt >= valid_end_dt)
    )
    status_flag.loc[mask_timed_out] = 2  # timed out
    return status_flag


def verify_process(min_batch_time_str, max_batch_time_str):
    """Validate the removed rows of every ``keep_info_*.csv`` batch in range.

    Files named ``keep_info_<%Y%m%d%H%M>.csv`` are read from ``./keep``. For
    each batch whose hour lies within the given bounds (inclusive), the rows
    with ``keep_flag == -1`` are passed to ``_validate_keep_info_df``, tagged
    with a ``status_flag`` and appended to
    ``./validate/keep/result_keep_verify_<timestamp>.csv``.

    Args:
        min_batch_time_str: Lower bound as ``%Y%m%d%H%M`` (truncated to the
            hour), or ``None`` for no lower bound.
        max_batch_time_str: Upper bound as ``%Y%m%d%H%M`` (truncated to the
            hour), or ``None`` for no upper bound.

    Returns:
        None. Progress and skip reasons are printed.

    Raises:
        ValueError: If a bound string does not match ``%Y%m%d%H%M``.
    """
    object_dir = "./keep"
    output_dir = "./validate/keep"
    os.makedirs(output_dir, exist_ok=True)

    timestamp_str = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_path = os.path.join(output_dir, f"result_keep_verify_{timestamp_str}.csv")

    # The input location must be a directory, not merely an existing path.
    if not os.path.isdir(object_dir):
        print(f"目录不存在: {object_dir}")
        return

    # Collect every CSV file whose name starts with keep_info_.
    prefix, suffix = "keep_info_", ".csv"
    csv_files = sorted(
        name
        for name in os.listdir(object_dir)
        if name.startswith(prefix) and name.endswith(suffix)
    )
    if not csv_files:
        print(f"在 {object_dir} 中没有找到 keep_info_ 开头的 CSV 文件")
        return

    min_batch_dt = _parse_batch_hour(min_batch_time_str)
    max_batch_dt = _parse_batch_hour(max_batch_time_str)
    if min_batch_dt is not None and max_batch_dt is not None and min_batch_dt > max_batch_dt:
        print(f"时间范围非法: min_batch_time_str({min_batch_time_str}) > max_batch_time_str({max_batch_time_str}),退出")
        return

    for csv_file in csv_files:
        # Strip exactly one prefix and one suffix; str.replace would also
        # remove matching text from the middle of the name.
        batch_time_str = csv_file[len(prefix):-len(suffix)]
        try:
            batch_hour_dt = _parse_batch_hour(batch_time_str)
        except ValueError:
            # A stray file such as keep_info_backup.csv must not abort the run.
            print(f"skip {csv_file}: batch time is not in %Y%m%d%H%M format")
            continue

        if min_batch_dt is not None and batch_hour_dt < min_batch_dt:
            continue
        if max_batch_dt is not None and batch_hour_dt > max_batch_dt:
            continue

        # Read the batch. An unreadable or empty file is reported and skipped.
        csv_path = os.path.join(object_dir, csv_file)
        try:
            df_keep_info = pd.read_csv(csv_path)
        except (OSError, ValueError) as e:
            # pandas' EmptyDataError and ParserError, and UnicodeDecodeError,
            # are all ValueError subclasses.
            print(f"read {csv_path} error: {str(e)}")
            df_keep_info = pd.DataFrame()

        if df_keep_info.empty:
            print(f"keep_info数据为空: {csv_file}")
            continue

        if "keep_flag" not in df_keep_info.columns:
            print(f"skip {csv_file}: missing keep_flag column")
            continue

        df_keep_info_del = df_keep_info[df_keep_info["keep_flag"] == -1].reset_index(drop=True)
        df_keep_info_del["del_batch_time_str"] = batch_time_str
        df_keep_info_del = _validate_keep_info_df(df_keep_info_del)

        # Derive status_flag from the price change and from how the removal
        # time compares with the end of the validation window.
        df_keep_info_del["status_flag"] = _compute_status_flag(df_keep_info_del)

        # Only the first batch written to this run's file emits the header.
        write_header = not os.path.exists(output_path)
        df_keep_info_del.to_csv(
            output_path, mode="a", header=write_header, index=False, encoding="utf-8-sig"
        )
        del df_keep_info_del
        print(f"批次:{batch_time_str} 检验结束")

    print("检验结束")
    print()
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
# Script entry point: verify the keep batches inside a fixed time window.
if __name__ == "__main__":
    window_start, window_end = "202604071700", "202604081400"
    verify_process(window_start, window_end)
|