|
@@ -926,7 +926,7 @@ def preprocess_data_simple(df_input, is_train=False):
|
|
|
|
|
|
|
|
# 训练过程
|
|
# 训练过程
|
|
|
if is_train:
|
|
if is_train:
|
|
|
- df_target = df_input[(df_input['hours_until_departure'] >= 8) & (df_input['hours_until_departure'] <= 240)].copy() # 扩展至240小时(10天)
|
|
|
|
|
|
|
+ df_target = df_input[(df_input['hours_until_departure'] >= 72) & (df_input['hours_until_departure'] <= 240)].copy() # 扩展至240小时(10天)
|
|
|
df_target = df_target.sort_values(
|
|
df_target = df_target.sort_values(
|
|
|
by=['gid', 'hours_until_departure'],
|
|
by=['gid', 'hours_until_departure'],
|
|
|
ascending=[True, False]
|
|
ascending=[True, False]
|
|
@@ -1073,7 +1073,7 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
).reset_index(drop=True)
|
|
).reset_index(drop=True)
|
|
|
|
|
|
|
|
df_sorted = df_sorted[
|
|
df_sorted = df_sorted[
|
|
|
- df_sorted['hours_until_departure'].between(8, 240)
|
|
|
|
|
|
|
+ df_sorted['hours_until_departure'].between(72, 240)
|
|
|
].reset_index(drop=True)
|
|
].reset_index(drop=True)
|
|
|
|
|
|
|
|
# 每个 gid 取 hours_until_departure 最小的一条
|
|
# 每个 gid 取 hours_until_departure 最小的一条
|
|
@@ -1082,9 +1082,9 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
.reset_index(drop=True)
|
|
.reset_index(drop=True)
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
- # 确保 hours_until_departure 在 [8, 240] 的 范围内
|
|
|
|
|
|
|
+ # 确保 hours_until_departure 在 [72, 240] 的范围内
|
|
|
# df_min_hours = df_min_hours[
|
|
# df_min_hours = df_min_hours[
|
|
|
- # df_min_hours['hours_until_departure'].between(8, 240)
|
|
|
|
|
|
|
+ # df_min_hours['hours_until_departure'].between(72, 240)
|
|
|
# ].reset_index(drop=True)
|
|
# ].reset_index(drop=True)
|
|
|
|
|
|
|
|
drop_info_csv_path = os.path.join(output_dir, f'{group_route_str}_drop_info.csv')
|
|
drop_info_csv_path = os.path.join(output_dir, f'{group_route_str}_drop_info.csv')
|
|
@@ -1193,14 +1193,14 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
|
|
|
|
|
# ==================== 综合评分:包络高位 × 降价潜力 ====================
|
|
# ==================== 综合评分:包络高位 × 降价潜力 ====================
|
|
|
# target_score = 包络位置(越高越好)× 降价潜力(越高越好)
|
|
# target_score = 包络位置(越高越好)× 降价潜力(越高越好)
|
|
|
- thres_ep = 0.7
|
|
|
|
|
- thres_dp = 0.3
|
|
|
|
|
|
|
+ thres_ep = 0.6
|
|
|
|
|
+ thres_dp = 0.4
|
|
|
df_min_hours['target_score'] = (
|
|
df_min_hours['target_score'] = (
|
|
|
df_min_hours['envelope_position'] * thres_ep + df_min_hours['drop_potential'] * thres_dp
|
|
df_min_hours['envelope_position'] * thres_ep + df_min_hours['drop_potential'] * thres_dp
|
|
|
).round(4)
|
|
).round(4)
|
|
|
|
|
|
|
|
# 综合评分阈值:大于阈值的都认为值得投放
|
|
# 综合评分阈值:大于阈值的都认为值得投放
|
|
|
- target_score_threshold = 0.7
|
|
|
|
|
|
|
+ target_score_threshold = 0.75
|
|
|
# df_min_hours['target_score_threshold'] = target_score_threshold
|
|
# df_min_hours['target_score_threshold'] = target_score_threshold
|
|
|
df_min_hours['is_good_target'] = (df_min_hours['target_score'] >= target_score_threshold).astype(int)
|
|
df_min_hours['is_good_target'] = (df_min_hours['target_score'] >= target_score_threshold).astype(int)
|
|
|
|
|
|
|
@@ -1489,7 +1489,7 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
df_min_hours["update_hour"] = _pred_dt.strftime("%Y-%m-%d %H:%M:%S")
|
|
df_min_hours["update_hour"] = _pred_dt.strftime("%Y-%m-%d %H:%M:%S")
|
|
|
_dep_hour = pd.to_datetime(df_min_hours["from_time"], errors="coerce").dt.floor("h")
|
|
_dep_hour = pd.to_datetime(df_min_hours["from_time"], errors="coerce").dt.floor("h")
|
|
|
df_min_hours["valid_begin_hour"] = (_dep_hour - pd.to_timedelta(240, unit="h")).dt.strftime("%Y-%m-%d %H:%M:%S")
|
|
df_min_hours["valid_begin_hour"] = (_dep_hour - pd.to_timedelta(240, unit="h")).dt.strftime("%Y-%m-%d %H:%M:%S")
|
|
|
- df_min_hours["valid_end_hour"] = (_dep_hour - pd.to_timedelta(8, unit="h")).dt.strftime("%Y-%m-%d %H:%M:%S")
|
|
|
|
|
|
|
+ df_min_hours["valid_end_hour"] = (_dep_hour - pd.to_timedelta(72, unit="h")).dt.strftime("%Y-%m-%d %H:%M:%S")
|
|
|
|
|
|
|
|
# 要展示在预测表里的字段
|
|
# 要展示在预测表里的字段
|
|
|
order_cols = ['city_pair', 'flight_day', 'flight_number_1', 'flight_number_2', 'from_time',
|
|
order_cols = ['city_pair', 'flight_day', 'flight_number_1', 'flight_number_2', 'from_time',
|
|
@@ -1523,12 +1523,12 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
|
|
|
na_position='last',
|
|
na_position='last',
|
|
|
).reset_index(drop=True)
|
|
).reset_index(drop=True)
|
|
|
|
|
|
|
|
- # 时间段过滤 过滤掉异常时间(update_hour 早于 crawl_date, 以及超过8小时不更新的数据)
|
|
|
|
|
|
|
+ # 时间段过滤 过滤掉异常时间(update_hour 早于 crawl_date, 以及超过12小时不更新的数据)
|
|
|
update_dt = pd.to_datetime(df_predict["update_hour"], errors="coerce")
|
|
update_dt = pd.to_datetime(df_predict["update_hour"], errors="coerce")
|
|
|
crawl_dt = pd.to_datetime(df_predict["crawl_date"], errors="coerce")
|
|
crawl_dt = pd.to_datetime(df_predict["crawl_date"], errors="coerce")
|
|
|
dt_diff = update_dt - crawl_dt
|
|
dt_diff = update_dt - crawl_dt
|
|
|
df_predict = df_predict.loc[
|
|
df_predict = df_predict.loc[
|
|
|
- (dt_diff >= pd.Timedelta(0)) & (dt_diff <= pd.Timedelta(hours=8))
|
|
|
|
|
|
|
+ (dt_diff >= pd.Timedelta(0)) & (dt_diff <= pd.Timedelta(hours=12))
|
|
|
# (dt_diff >= pd.Timedelta(0))
|
|
# (dt_diff >= pd.Timedelta(0))
|
|
|
].reset_index(drop=True)
|
|
].reset_index(drop=True)
|
|
|
print("更新时间过滤完成")
|
|
print("更新时间过滤完成")
|