|
@@ -27,6 +27,29 @@ def _validate_predict_df(df_predict):
|
|
|
update_dt
|
|
update_dt
|
|
|
).strftime('%Y-%m-%d %H:%M:%S')
|
|
).strftime('%Y-%m-%d %H:%M:%S')
|
|
|
df_val= validate_one_line(db, city_pair, flight_day, flight_number_1, flight_number_2, baggage, valid_begin_hour_modify)
|
|
df_val= validate_one_line(db, city_pair, flight_day, flight_number_1, flight_number_2, baggage, valid_begin_hour_modify)
|
|
|
|
|
+
|
|
|
|
|
+ entry_price = pd.to_numeric(row.get('adult_total_price'), errors='coerce')
|
|
|
|
|
+ crawl_dt = pd.to_datetime(row.get('crawl_date'), errors='coerce')
|
|
|
|
|
+ batch_dt = pd.to_datetime(row.get('batch_time'), format="%Y%m%d%H%M", errors='coerce')
|
|
|
|
|
+ wait_start_dt = pd.NaT
|
|
|
|
|
+ wait_end_dt = pd.NaT
|
|
|
|
|
+ dep_hour_dt = pd.to_datetime(row.get('from_time'), errors='coerce')
|
|
|
|
|
+ if pd.notna(batch_dt):
|
|
|
|
|
+ wait_start_dt = batch_dt.floor('h')
|
|
|
|
|
+ if pd.notna(crawl_dt):
|
|
|
|
|
+ crawl_floor = crawl_dt.floor('h')
|
|
|
|
|
+ if pd.isna(wait_start_dt):
|
|
|
|
|
+ wait_start_dt = crawl_floor
|
|
|
|
|
+ else:
|
|
|
|
|
+ wait_start_dt = max(wait_start_dt, crawl_floor) # 等待近端接近预测批次时间
|
|
|
|
|
+ if pd.notna(wait_start_dt):
|
|
|
|
|
+ wait_end_dt = wait_start_dt + pd.Timedelta(hours=48) # 等待窗口48小时
|
|
|
|
|
+ if pd.notna(dep_hour_dt):
|
|
|
|
|
+ dep_hour_dt = dep_hour_dt.floor('h')
|
|
|
|
|
+ cutoff_dt = dep_hour_dt - pd.Timedelta(hours=4)
|
|
|
|
|
+ if pd.notna(wait_end_dt):
|
|
|
|
|
+ wait_end_dt = min(wait_end_dt, cutoff_dt) # 等待远端不能越过起飞前4小时
|
|
|
|
|
+
|
|
|
# 有可能在当前验证时刻,数据库里没有在valid_begin_hour之后的数据
|
|
# 有可能在当前验证时刻,数据库里没有在valid_begin_hour之后的数据
|
|
|
if not df_val.empty:
|
|
if not df_val.empty:
|
|
|
df_val_f = fill_hourly_crawl_date(df_val, rear_fill=2)
|
|
df_val_f = fill_hourly_crawl_date(df_val, rear_fill=2)
|
|
@@ -42,12 +65,96 @@ def _validate_predict_df(df_predict):
|
|
|
last_update_hour = pd.NA
|
|
last_update_hour = pd.NA
|
|
|
list_change_price = []
|
|
list_change_price = []
|
|
|
list_change_hours = []
|
|
list_change_hours = []
|
|
|
|
|
+ drop_flag_window = 0
|
|
|
|
|
+ first_lower_price = pd.NA
|
|
|
|
|
+ first_lower_update_hour = pd.NA
|
|
|
|
|
+ boundary_final_price = pd.NA
|
|
|
|
|
+ boundary_final_update_hour = pd.NA
|
|
|
|
|
+ trigger_type = pd.NA
|
|
|
|
|
+ trigger_price = pd.NA
|
|
|
|
|
+ trigger_update_hour = pd.NA
|
|
|
|
|
+ pnl = pd.NA
|
|
|
|
|
+ pnl_pct = pd.NA
|
|
|
else:
|
|
else:
|
|
|
# 有效数据的最后一行
|
|
# 有效数据的最后一行
|
|
|
last_row = df_val_f.iloc[-1]
|
|
last_row = df_val_f.iloc[-1]
|
|
|
last_hours_util = last_row['hours_until_departure']
|
|
last_hours_util = last_row['hours_until_departure']
|
|
|
last_update_hour = last_row['update_hour']
|
|
last_update_hour = last_row['update_hour']
|
|
|
|
|
|
|
|
|
|
+ df_val_f['update_hour'] = pd.to_datetime(df_val_f['update_hour'], errors='coerce')
|
|
|
|
|
+
|
|
|
|
|
+ # 使用 batch_time 对齐的实际价格作为 entry_price
|
|
|
|
|
+ if pd.notna(batch_dt):
|
|
|
|
|
+ df_entry = df_val_f[df_val_f['update_hour'] <= batch_dt].copy()
|
|
|
|
|
+ if not df_entry.empty:
|
|
|
|
|
+ entry_price = df_entry.iloc[-1]['adult_total_price']
|
|
|
|
|
+
|
|
|
|
|
+ df_window = df_val_f
|
|
|
|
|
+ if pd.notna(wait_start_dt) and pd.notna(wait_end_dt):
|
|
|
|
|
+ df_window = df_val_f[
|
|
|
|
|
+ (df_val_f['update_hour'] >= wait_start_dt) &
|
|
|
|
|
+ (df_val_f['update_hour'] <= wait_end_dt)
|
|
|
|
|
+ ].copy() # 构建观测窗口
|
|
|
|
|
+ else:
|
|
|
|
|
+ df_window = df_val_f.iloc[0:0].copy() # 空切片
|
|
|
|
|
+
|
|
|
|
|
+ if not df_window.empty:
|
|
|
|
|
+ df_window = df_window.sort_values('update_hour')
|
|
|
|
|
+ df_window_price_changes = df_window.loc[
|
|
|
|
|
+ df_window["adult_total_price"].shift() != df_window["adult_total_price"]
|
|
|
|
|
+ ].copy()
|
|
|
|
|
+ df_window_price_changes['change_amount'] = (
|
|
|
|
|
+ df_window_price_changes['adult_total_price'].diff().fillna(0)
|
|
|
|
|
+ )
|
|
|
|
|
+ df_first_window_negative = df_window_price_changes[
|
|
|
|
|
+ df_window_price_changes['change_amount'] < -5
|
|
|
|
|
+ ].head(1)
|
|
|
|
|
+ drop_flag_window = 1 if not df_first_window_negative.empty else 0 # 在观测窗口中的发生降价判定
|
|
|
|
|
+ else:
|
|
|
|
|
+ drop_flag_window = 0
|
|
|
|
|
+
|
|
|
|
|
+ first_lower_price = pd.NA
|
|
|
|
|
+ first_lower_update_hour = pd.NA
|
|
|
|
|
+ if not df_window.empty and pd.notna(entry_price) and pd.notna(wait_start_dt):
|
|
|
|
|
+ df_lower = df_window[
|
|
|
|
|
+ (df_window['update_hour'] > wait_start_dt) &
|
|
|
|
|
+ (df_window['adult_total_price'] <= entry_price - 5)
|
|
|
|
|
+ ].head(1)
|
|
|
|
|
+ if not df_lower.empty: # 首次出现低于 entry_price - 5 的价格与时间
|
|
|
|
|
+ first_lower_price = df_lower['adult_total_price'].iloc[0].round(2)
|
|
|
|
|
+ first_lower_update_hour = df_lower['update_hour'].iloc[0]
|
|
|
|
|
+
|
|
|
|
|
+ boundary_final_price = pd.NA
|
|
|
|
|
+ boundary_final_update_hour = pd.NA
|
|
|
|
|
+ if not df_window.empty: # 观测窗口远端边界的价格与时间
|
|
|
|
|
+ boundary_row = df_window.iloc[-1]
|
|
|
|
|
+ boundary_final_price = boundary_row['adult_total_price']
|
|
|
|
|
+ boundary_final_update_hour = boundary_row['update_hour']
|
|
|
|
|
+
|
|
|
|
|
+ trigger_type = pd.NA
|
|
|
|
|
+ trigger_price = pd.NA
|
|
|
|
|
+ trigger_update_hour = pd.NA
|
|
|
|
|
+ if pd.notna(first_lower_price):
|
|
|
|
|
+ trigger_type = "first_lower" # 发生降价
|
|
|
|
|
+ trigger_price = first_lower_price
|
|
|
|
|
+ trigger_update_hour = first_lower_update_hour
|
|
|
|
|
+ elif pd.notna(boundary_final_price):
|
|
|
|
|
+ trigger_type = "boundary" # 到达边界
|
|
|
|
|
+ trigger_price = boundary_final_price
|
|
|
|
|
+ trigger_update_hour = boundary_final_update_hour
|
|
|
|
|
+ else:
|
|
|
|
|
+ trigger_type = "no_data"
|
|
|
|
|
+
|
|
|
|
|
+ if pd.notna(entry_price) and pd.notna(trigger_price):
|
|
|
|
|
+ pnl = round(float(entry_price - trigger_price), 2) # 盈利(亏损)额度,基于entry_price
|
|
|
|
|
+ if entry_price != 0:
|
|
|
|
|
+ pnl_pct = round(float(pnl) / float(entry_price) * 100, 2) # 盈利(亏损)百分比,基于entry_price
|
|
|
|
|
+ else:
|
|
|
|
|
+ pnl_pct = pd.NA
|
|
|
|
|
+ else:
|
|
|
|
|
+ pnl = pd.NA
|
|
|
|
|
+ pnl_pct = pd.NA
|
|
|
|
|
+
|
|
|
# 价格变化过滤
|
|
# 价格变化过滤
|
|
|
df_price_changes = df_val_f.loc[
|
|
df_price_changes = df_val_f.loc[
|
|
|
df_val_f["adult_total_price"].shift() != df_val_f["adult_total_price"]
|
|
df_val_f["adult_total_price"].shift() != df_val_f["adult_total_price"]
|
|
@@ -86,6 +193,16 @@ def _validate_predict_df(df_predict):
|
|
|
last_update_hour = pd.NA
|
|
last_update_hour = pd.NA
|
|
|
list_change_price = []
|
|
list_change_price = []
|
|
|
list_change_hours = []
|
|
list_change_hours = []
|
|
|
|
|
+ drop_flag_window = 0
|
|
|
|
|
+ first_lower_price = pd.NA
|
|
|
|
|
+ first_lower_update_hour = pd.NA
|
|
|
|
|
+ boundary_final_price = pd.NA
|
|
|
|
|
+ boundary_final_update_hour = pd.NA
|
|
|
|
|
+ trigger_type = pd.NA
|
|
|
|
|
+ trigger_price = pd.NA
|
|
|
|
|
+ trigger_update_hour = pd.NA
|
|
|
|
|
+ pnl = pd.NA
|
|
|
|
|
+ pnl_pct = pd.NA
|
|
|
|
|
|
|
|
safe_sep = "; "
|
|
safe_sep = "; "
|
|
|
|
|
|
|
@@ -98,6 +215,18 @@ def _validate_predict_df(df_predict):
|
|
|
df_predict.at[idx, 'first_drop_hours_until_departure'] = first_drop_hours_until_departure
|
|
df_predict.at[idx, 'first_drop_hours_until_departure'] = first_drop_hours_until_departure
|
|
|
df_predict.at[idx, 'first_drop_update_hour'] = first_drop_update_hour
|
|
df_predict.at[idx, 'first_drop_update_hour'] = first_drop_update_hour
|
|
|
df_predict.at[idx, 'drop_flag'] = drop_flag
|
|
df_predict.at[idx, 'drop_flag'] = drop_flag
|
|
|
|
|
+ df_predict.at[idx, 'wait_start_hour'] = wait_start_dt
|
|
|
|
|
+ df_predict.at[idx, 'wait_end_hour'] = wait_end_dt
|
|
|
|
|
+ df_predict.at[idx, 'drop_flag_window'] = drop_flag_window
|
|
|
|
|
+ df_predict.at[idx, 'first_lower_price'] = first_lower_price
|
|
|
|
|
+ df_predict.at[idx, 'first_lower_update_hour'] = first_lower_update_hour
|
|
|
|
|
+ df_predict.at[idx, 'boundary_final_price'] = boundary_final_price
|
|
|
|
|
+ df_predict.at[idx, 'boundary_final_update_hour'] = boundary_final_update_hour
|
|
|
|
|
+ df_predict.at[idx, 'trigger_type'] = trigger_type
|
|
|
|
|
+ df_predict.at[idx, 'trigger_price'] = trigger_price
|
|
|
|
|
+ df_predict.at[idx, 'trigger_update_hour'] = trigger_update_hour
|
|
|
|
|
+ df_predict.at[idx, 'pnl'] = pnl
|
|
|
|
|
+ df_predict.at[idx, 'pnl_pct'] = pnl_pct
|
|
|
|
|
|
|
|
count += 1
|
|
count += 1
|
|
|
if count % 5 == 0:
|
|
if count % 5 == 0:
|
|
@@ -452,12 +581,12 @@ if __name__ == "__main__":
|
|
|
# validate_process(node, interval_hours, pred_time_str)
|
|
# validate_process(node, interval_hours, pred_time_str)
|
|
|
# node = "node0127"
|
|
# node = "node0127"
|
|
|
# validate_process_zong(node) # 无条件汇总
|
|
# validate_process_zong(node) # 无条件汇总
|
|
|
- node = "node0127"
|
|
|
|
|
- validate_process_zong(node, True, None, "202602051400") # 有条件汇总
|
|
|
|
|
|
|
+ # node = "node0127"
|
|
|
|
|
+ # validate_process_zong(node, True, None, "202602051400") # 有条件汇总
|
|
|
# node = "node0203"
|
|
# node = "node0203"
|
|
|
# validate_process_zong(node, True, "202602041100", "202602051400") # 有条件汇总
|
|
# validate_process_zong(node, True, "202602041100", "202602051400") # 有条件汇总
|
|
|
- # node = "node0205"
|
|
|
|
|
- # validate_process_zong(node, True, "202602061000") # 有条件汇总
|
|
|
|
|
|
|
+ node = "node0205"
|
|
|
|
|
+ validate_process_zong(node, True, "202602061000", "202602091000") # 有条件汇总
|
|
|
# 1 自动验证
|
|
# 1 自动验证
|
|
|
else:
|
|
else:
|
|
|
node = "node0127"
|
|
node = "node0127"
|