
Adjust predictions and other tweaks

node04 6 days ago
parent
commit
7f017b2fac
5 changed files with 90 additions and 76 deletions
  1. .gitignore            + 14 - 0
  2. data_loader.py         + 3 - 3
  3. data_preprocess.py    + 68 - 68
  4. main_tr_0.py           + 1 - 1
  5. result_validate_0.py   + 4 - 4

+ 14 - 0
.gitignore

@@ -0,0 +1,14 @@
+output/
+photo/
+photo_2/
+photo_4/
+data_shards/
+data_shards_0/
+data_shards_2/
+data_shards_4/
+predictions/
+predictions_0/
+predictions_2/
+predictions_4/
+validate/
+__pycache__/

+ 3 - 3
data_loader.py

@@ -545,8 +545,8 @@ def plot_c12_trend(df, output_dir="."):
     # output_dir_photo = output_dir
 
     # Color and linestyle configuration (cycled through in order)
-    colors = ['green', 'blue', 'red', 'brown']
-    linestyles = ['--', '--', '--', '--']
+    colors = ['blue', 'red', 'brown']
+    linestyles = ['--', '--', '--']
 
     # Make sure the time field is of datetime type
     if not hasattr(df['update_hour'], 'dt'):
@@ -1005,7 +1005,7 @@ if __name__ == "__main__":
     os.makedirs(output_dir, exist_ok=True)
 
     # Load popular-route data
-    date_begin = "2026-01-15"
+    date_begin = "2026-01-20"
     date_end = datetime.today().strftime("%Y-%m-%d")
 
     flight_route_list = vj_flight_route_list_hot[:]  # popular: vj_flight_route_list_hot   unpopular: vj_flight_route_list_nothot
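
Note on the palette change above: the comment says the colors/linestyles lists are cycled through in order. If that cycling is done by index arithmetic, itertools.cycle is a compact alternative that keeps the list length independent of the number of plotted series. A minimal sketch with toy data, assuming matplotlib; this is not the repository's plotting code:

from itertools import cycle
import matplotlib.pyplot as plt

# Toy series; the real function plots per-route price trends over update_hour.
series = {"route_a": [3, 2, 4], "route_b": [2, 3, 1], "route_c": [4, 1, 2], "route_d": [1, 4, 3]}

colors = cycle(['blue', 'red', 'brown'])   # palette after this commit
linestyles = cycle(['--'])                 # single dashed style, reused for every line

fig, ax = plt.subplots()
for name, values in series.items():
    # next() wraps around automatically, so 4 series reuse the 3-colour palette
    ax.plot(values, color=next(colors), linestyle=next(linestyles), label=name)
ax.legend()
plt.show()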

+ 68 - 68
data_preprocess.py

@@ -1081,7 +1081,7 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
             # Before a price drop: match the increase threshold against historical high-price durations to estimate when the drop is likely to happen
             if not df_drop_nodes_part.empty and pd.notna(price_change_percent):   
                 # Discard rows whose price increase is too small
-                # df_drop_nodes_part = df_drop_nodes_part[df_drop_nodes_part['high_price_change_percent'] >= 0.1]
+                df_drop_nodes_part = df_drop_nodes_part[df_drop_nodes_part['high_price_change_percent'] >= 0.01]
                 # pct_diff = (df_drop_nodes_part['high_price_change_percent'] - float(price_change_percent)).abs()
                 # df_match = df_drop_nodes_part.loc[pct_diff <= pct_threshold, ['high_price_duration_hours', 'high_price_change_percent']].copy()
                 
@@ -1101,9 +1101,9 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
                 if not df_match.empty:
                     dur_base = pd.to_numeric(price_duration_hours, errors='coerce')
                     hud_base = pd.to_numeric(hours_until_departure, errors='coerce')
-                    seats_base = pd.to_numeric(seats_remaining_change_amount, errors='coerce')
+                    # seats_base = pd.to_numeric(seats_remaining_change_amount, errors='coerce')
 
-                    if pd.notna(dur_base) and pd.notna(hud_base) and pd.notna(seats_base):
+                    if pd.notna(dur_base) and pd.notna(hud_base):  # and pd.notna(seats_base)
                         df_match_chk = df_match.copy()
                         dur_vals = pd.to_numeric(df_match_chk['high_price_duration_hours'], errors='coerce')
                         df_match_chk = df_match_chk.loc[dur_vals.notna()].copy()
@@ -1113,9 +1113,9 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
                         df_match_chk = df_match_chk.loc[drop_hud_vals.notna()].copy()
                         df_match_chk = df_match_chk.loc[(drop_hud_vals.loc[drop_hud_vals.notna()] - float(hud_base)).abs() <= 12].copy()
 
-                        seats_vals = pd.to_numeric(df_match_chk['high_price_seats_remaining_change_amount'], errors='coerce')
-                        df_match_chk = df_match_chk.loc[seats_vals.notna()].copy()
-                        df_match_chk = df_match_chk.loc[seats_vals.loc[seats_vals.notna()] == float(seats_base)].copy()
+                        # seats_vals = pd.to_numeric(df_match_chk['high_price_seats_remaining_change_amount'], errors='coerce')
+                        # df_match_chk = df_match_chk.loc[seats_vals.notna()].copy()
+                        # df_match_chk = df_match_chk.loc[seats_vals.loc[seats_vals.notna()] == float(seats_base)].copy()
 
                         # Duration, hours until departure, and seat change all matched
                         if not df_match_chk.empty:
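
For context, the hunks above leave the historical match gated only on the price-duration and hours-until-departure tolerances, with the seat-change equality check commented out. A self-contained sketch of that filter shape; the DataFrame, base values, duration tolerance, and the hist_hours_until_departure column name are illustrative (the real column behind drop_hud_vals is not visible in this diff), while high_price_duration_hours and the ±12 h hours-until-departure tolerance come from the code above:

import pandas as pd

# Illustrative candidates; hist_hours_until_departure is a stand-in name,
# high_price_duration_hours matches the column used above.
df_match = pd.DataFrame({
    "high_price_duration_hours": [5, 30, "n/a", 11],
    "hist_hours_until_departure": [40, 90, 36, 45],
    "high_price_change_percent": [0.02, 0.05, 0.03, 0.10],
})
dur_base, hud_base = 8.0, 42.0   # current duration / hours until departure (illustrative)

def within(series, base, tol):
    # Coerce to numeric, drop unparseable values, keep rows within +/- tol of base.
    vals = pd.to_numeric(series, errors="coerce")
    return vals.notna() & ((vals - base).abs() <= tol)

mask = (within(df_match["high_price_duration_hours"], dur_base, 12)
        & within(df_match["hist_hours_until_departure"], hud_base, 12))
# Seat-change equality filtering is intentionally skipped, mirroring the commented-out lines.
df_match_chk = df_match.loc[mask].copy()
print(df_match_chk)
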
@@ -1202,63 +1202,63 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
                 #     continue
                 
                # Special-case scenario
-                if price_change_percent < 0:
-
-                    df_tmp = df_keep_nodes_part.copy()
-                    # Ensure correct ordering within each group (can be skipped if already sorted earlier)
-                    df_tmp = df_tmp.sort_values(
-                        by=["flight_day", "keep_hours_until_departure"],
-                        ascending=[True, False]
-                    )
-                    # Whether the value is negative
-                    df_tmp["is_negative"] = df_tmp["keep_price_change_percent"] < 0
+                # if price_change_percent < 0:
+
+                #     df_tmp = df_keep_nodes_part.copy()
+                #     # Ensure correct ordering within each group (can be skipped if already sorted earlier)
+                #     df_tmp = df_tmp.sort_values(
+                #         by=["flight_day", "keep_hours_until_departure"],
+                #         ascending=[True, False]
+                #     )
+                #     # Whether the value is negative
+                #     df_tmp["is_negative"] = df_tmp["keep_price_change_percent"] < 0
                     
-                    if df_tmp["is_negative"].any():
-                        # Mark the start of a "negative block"
-                        # When is_negative is True and the previous row is not negative, treat it as a new block
-                        df_tmp["neg_block_id"] = (
-                            df_tmp["is_negative"]
-                            & ~df_tmp.groupby("flight_day")["is_negative"].shift(fill_value=False)
-                        ).groupby(df_tmp["flight_day"]).cumsum()
-                        # Count within each negative block (which negative value this is)
-                        df_tmp["neg_rank_in_block"] = (
-                            df_tmp.groupby(["flight_day", "neg_block_id"])
-                            .cumcount() + 1
-                        )
-                        # Length of each consecutive negative block
-                        df_tmp["neg_block_size"] = (
-                            df_tmp.groupby(["flight_day", "neg_block_id"])["is_negative"]
-                            .transform("sum")
-                        )
-                        # Keep only rows that:
-                        # 1) are negative
-                        # 2) are not the last one in their consecutive negative block
-                        df_continuous_price_drop = df_tmp[
-                            (df_tmp["is_negative"]) &
-                            (df_tmp["neg_rank_in_block"] < df_tmp["neg_block_size"])
-                        ].drop(
-                            columns=[
-                                "is_negative",
-                                "neg_block_id",
-                                "neg_rank_in_block",
-                                "neg_block_size",
-                            ]
-                        )
-                        pct_diff_c = (df_continuous_price_drop['keep_price_change_percent'] - float(price_change_percent)).abs()
-                        df_match_c = df_continuous_price_drop.loc[pct_diff_c <= pct_threshold_c, ['flight_day', 'keep_hours_until_departure', 'keep_price_duration_hours', 'keep_price_change_percent']].copy()
-
-                        # Meets the consecutive price-drop condition
-                        if not df_match_c.empty and pd.notna(price_duration_hours):
-                            vals_c = df_match_c['keep_price_duration_hours'].replace([np.inf, -np.inf], np.nan).dropna()
-                            if not vals_c.empty:
-                                min_val_c = vals_c.min()
-                                if min_val_c <= float(price_duration_hours):
-                                    df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
-                                    df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
-                                    df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.5
-                                    df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'c1'
-                                    length_drop = df_match_c.shape[0]        
-                                    # continue   # already classified as a drop; skip the remaining checks
+                #     if df_tmp["is_negative"].any():
+                #         # Mark the start of a "negative block"
+                #         # When is_negative is True and the previous row is not negative, treat it as a new block
+                #         df_tmp["neg_block_id"] = (
+                #             df_tmp["is_negative"]
+                #             & ~df_tmp.groupby("flight_day")["is_negative"].shift(fill_value=False)
+                #         ).groupby(df_tmp["flight_day"]).cumsum()
+                #         # Count within each negative block (which negative value this is)
+                #         df_tmp["neg_rank_in_block"] = (
+                #             df_tmp.groupby(["flight_day", "neg_block_id"])
+                #             .cumcount() + 1
+                #         )
+                #         # Length of each consecutive negative block
+                #         df_tmp["neg_block_size"] = (
+                #             df_tmp.groupby(["flight_day", "neg_block_id"])["is_negative"]
+                #             .transform("sum")
+                #         )
+                #         # Keep only rows that:
+                #         # 1) are negative
+                #         # 2) are not the last one in their consecutive negative block
+                #         df_continuous_price_drop = df_tmp[
+                #             (df_tmp["is_negative"]) &
+                #             (df_tmp["neg_rank_in_block"] < df_tmp["neg_block_size"])
+                #         ].drop(
+                #             columns=[
+                #                 "is_negative",
+                #                 "neg_block_id",
+                #                 "neg_rank_in_block",
+                #                 "neg_block_size",
+                #             ]
+                #         )
+                #         pct_diff_c = (df_continuous_price_drop['keep_price_change_percent'] - float(price_change_percent)).abs()
+                #         df_match_c = df_continuous_price_drop.loc[pct_diff_c <= pct_threshold_c, ['flight_day', 'keep_hours_until_departure', 'keep_price_duration_hours', 'keep_price_change_percent']].copy()
+
+                #         # Meets the consecutive price-drop condition
+                #         if not df_match_c.empty and pd.notna(price_duration_hours):
+                #             vals_c = df_match_c['keep_price_duration_hours'].replace([np.inf, -np.inf], np.nan).dropna()
+                #             if not vals_c.empty:
+                #                 min_val_c = vals_c.min()
+                #                 if min_val_c <= float(price_duration_hours):
+                #                     df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
+                #                     df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
+                #                     df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.5
+                #                     df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'c1'
+                #                     length_drop = df_match_c.shape[0]        
+                #                     # continue   # already classified as a drop; skip the remaining checks
 
                # General scenario
                 pct_base_1 = float(price_change_percent)
@@ -1281,21 +1281,21 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
 
                     dur_base_1 = pd.to_numeric(price_duration_hours, errors='coerce')
                     # hud_base_1 = pd.to_numeric(hours_until_departure, errors='coerce')
-                    seats_base_1 = pd.to_numeric(seats_remaining_change_amount, errors='coerce')
+                    # seats_base_1 = pd.to_numeric(seats_remaining_change_amount, errors='coerce')
 
-                    if pd.notna(dur_base_1) and pd.notna(seats_base_1):
+                    if pd.notna(dur_base_1):   #  and pd.notna(seats_base_1)
                         df_match_chk_1 = df_match_1.copy()
                         dur_vals_1 = pd.to_numeric(df_match_chk_1['modify_keep_price_duration_hours'], errors='coerce')
                         df_match_chk_1 = df_match_chk_1.loc[dur_vals_1.notna()].copy()
-                        df_match_chk_1 = df_match_chk_1.loc[(dur_vals_1.loc[dur_vals_1.notna()] - float(dur_base_1)).abs() <= 6].copy()
+                        df_match_chk_1 = df_match_chk_1.loc[(dur_vals_1.loc[dur_vals_1.notna()] - float(dur_base_1)).abs() <= 12].copy()
 
                         # drop_hud_vals_1 = pd.to_numeric(df_match_chk_1['keep_hours_until_departure'], errors='coerce')
                         # df_match_chk_1 = df_match_chk_1.loc[drop_hud_vals_1.notna()].copy()
                         # df_match_chk_1 = df_match_chk_1.loc[(drop_hud_vals_1.loc[drop_hud_vals_1.notna()] - float(hud_base_1)).abs() <= 12].copy()
 
-                        seats_vals_1 = pd.to_numeric(df_match_chk_1['keep_seats_remaining_change_amount'], errors='coerce')
-                        df_match_chk_1 = df_match_chk_1.loc[seats_vals_1.notna()].copy()
-                        df_match_chk_1 = df_match_chk_1.loc[seats_vals_1.loc[seats_vals_1.notna()] == float(seats_base_1)].copy()
+                        # seats_vals_1 = pd.to_numeric(df_match_chk_1['keep_seats_remaining_change_amount'], errors='coerce')
+                        # df_match_chk_1 = df_match_chk_1.loc[seats_vals_1.notna()].copy()
+                        # df_match_chk_1 = df_match_chk_1.loc[seats_vals_1.loc[seats_vals_1.notna()] == float(seats_base_1)].copy()
 
                         # Duration, hours until departure, and seat change all matched
                         if not df_match_chk_1.empty:
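
The special-case branch is commented out above rather than deleted; its core trick, detecting runs of two or more consecutive price drops per flight_day via a boolean shift plus a cumsum block id, is worth keeping on record. A self-contained sketch with toy data (column names follow the disabled code; the data is illustrative only):

import pandas as pd

# Toy per-flight price-change history, ordered within each flight_day from
# far-from-departure to close-to-departure.
df_tmp = pd.DataFrame({
    "flight_day": ["d1"] * 5 + ["d2"] * 3,
    "keep_hours_until_departure": [50, 40, 30, 20, 10, 30, 20, 10],
    "keep_price_change_percent": [-0.05, -0.02, 0.03, -0.04, -0.01, 0.02, -0.03, 0.01],
})
df_tmp = df_tmp.sort_values(["flight_day", "keep_hours_until_departure"],
                            ascending=[True, False])

df_tmp["is_negative"] = df_tmp["keep_price_change_percent"] < 0

# A new block starts where is_negative is True and the previous row within the
# same flight_day was not negative; cumsum turns the block starts into ids.
df_tmp["neg_block_id"] = (
    df_tmp["is_negative"]
    & ~df_tmp.groupby("flight_day")["is_negative"].shift(fill_value=False)
).groupby(df_tmp["flight_day"]).cumsum()

# Position within each block and the block's total number of negative rows.
df_tmp["neg_rank_in_block"] = df_tmp.groupby(["flight_day", "neg_block_id"]).cumcount() + 1
df_tmp["neg_block_size"] = (
    df_tmp.groupby(["flight_day", "neg_block_id"])["is_negative"].transform("sum")
)

# Keep negative rows that are not the last entry of their run, i.e. drops that
# were followed by at least one further consecutive drop.
df_continuous_price_drop = df_tmp[
    df_tmp["is_negative"] & (df_tmp["neg_rank_in_block"] < df_tmp["neg_block_size"])
]
print(df_continuous_price_drop)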

+ 1 - 1
main_tr_0.py

@@ -49,7 +49,7 @@ def start_train():
     # date_end = datetime.today().strftime("%Y-%m-%d")
     date_end = (datetime.today() - timedelta(days=1)).strftime("%Y-%m-%d")
     # date_begin = (datetime.today() - timedelta(days=32)).strftime("%Y-%m-%d")
-    date_begin = "2026-01-27"   # 2025-12-01  2026-01-27 
+    date_begin = "2026-02-03"   # 2025-12-01  2026-01-27  2026-02-03 
 
     print(f"Training time range: {date_begin} to {date_end}")
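
The training start date keeps being bumped by hand (2025-12-01 → 2026-01-27 → 2026-02-03). The commented-out line above already sketches a rolling alternative; a minimal standalone version of it (the 32-day window is taken from that commented-out line, not from the committed behaviour):

from datetime import datetime, timedelta

# Rolling training window: end yesterday, start a fixed number of days earlier,
# so date_begin does not need a manual edit before every retrain.
WINDOW_DAYS = 32  # from the commented-out line in start_train()

date_end = (datetime.today() - timedelta(days=1)).strftime("%Y-%m-%d")
date_begin = (datetime.today() - timedelta(days=WINDOW_DAYS)).strftime("%Y-%m-%d")
print(f"Training time range: {date_begin} to {date_end}")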
 

+ 4 - 4
result_validate_0.py

@@ -441,10 +441,10 @@ if __name__ == "__main__":
     if interval_hours == 0:
         # node, pred_time_str = "node0127", "202601301500"
         # validate_process(node, interval_hours, pred_time_str)
-        node = "node0127"
-        validate_process_zong(node)  # unconditional aggregation
-        # node = "node0203"
-        # validate_process_zong(node, True, "202602031100")  # conditional aggregation
+        # node = "node0127"
+        # validate_process_zong(node)  # unconditional aggregation
+        node = "node0203"
+        validate_process_zong(node, True, "202602041100")  # conditional aggregation
    # 1: automatic validation
     else:
         node = "node0127"
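
The pred_time_str values here ("202601301500", "202602031100", "202602041100") look like YYYYMMDDHHMM timestamps; that layout is inferred from the values, not documented in this diff. A small sketch for building such a cutoff string programmatically instead of editing it by hand before each run:

from datetime import datetime

FMT = "%Y%m%d%H%M"  # assumed layout, inferred from values like "202602041100"

# Cutoff at today 11:00, mirroring the hour used in the hard-coded strings.
cutoff = datetime.today().replace(hour=11, minute=0, second=0, microsecond=0)
pred_time_str = cutoff.strftime(FMT)

# Round-trip check: parsing with the same format must reproduce the string.
assert datetime.strptime(pred_time_str, FMT).strftime(FMT) == pred_time_str
print(pred_time_str)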