Procházet zdrojové kódy

修改预测时的判定逻辑

node04 před 2 týdny
rodič
revize
531a02f50c
Změněny 2 soubory: 209 přidání a 123 odebrání
  1. + 206 - 120
      data_preprocess.py
  2. + 3 - 3
      main_pe_0.py

+ 206 - 120
data_preprocess.py

@@ -1011,7 +1011,7 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
     ).reset_index(drop=True)
 
     df_sorted = df_sorted[
-        df_sorted['hours_until_departure'].between(18, 54)
+        df_sorted['hours_until_departure'].between(12, 60)
     ].reset_index(drop=True)
 
     # 每个 gid 取 hours_until_departure 最小的一条
@@ -1020,9 +1020,9 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
         .reset_index(drop=True)
     )
 
-    # 确保 hours_until_departure 在 [18, 54] 的 范围内
+    # 确保 hours_until_departure 在 [12, 60] 的 范围内
     # df_min_hours = df_min_hours[
-    #     df_min_hours['hours_until_departure'].between(18, 54)
+    #     df_min_hours['hours_until_departure'].between(12, 60)
     # ].reset_index(drop=True)
 
     drop_info_csv_path = os.path.join(output_dir, f'{group_route_str}_drop_info.csv')
@@ -1043,18 +1043,26 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
     df_min_hours['simple_drop_in_hours_dist'] = ''   # 空串 表示未知
     
     # 这个阈值取多少?
-    pct_threshold = 0.01
+    pct_threshold = 0.001
     # pct_threshold = 2
-    pct_threshold_1 = 0.01
+    pct_threshold_1 = 0.001
     pct_threshold_c = 0.001
 
     for idx, row in df_min_hours.iterrows(): 
         city_pair = row['city_pair']
         flight_number_1 = row['flight_number_1']
         flight_number_2 = row['flight_number_2']
+        if flight_number_1 == 'VJ878':  # 调试时用
+            pass
         price_change_percent = row['price_change_percent']
+        price_change_amount = row['price_change_amount']
         price_duration_hours = row['price_duration_hours']
         hours_until_departure = row['hours_until_departure']
+        seats_remaining_change_amount = row['seats_remaining_change_amount']
+        
+        length_drop = 0
+        length_keep = 0
+
         # 针对历史上发生的 高价->低价
         if not df_drop_nodes.empty:
             # 对准航班号, 不同起飞日期
@@ -1073,72 +1081,98 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
             # 降价前 增幅阈值的匹配 与 高价历史持续时间 得出降价时间的概率
             if not df_drop_nodes_part.empty and pd.notna(price_change_percent):   
                 # 增幅太小的去掉
-                df_drop_nodes_part = df_drop_nodes_part[df_drop_nodes_part['high_price_change_percent'] >= 0.1]
-                # pct_vals = df_drop_nodes_part['high_price_change_percent'].replace([np.inf, -np.inf], np.nan).dropna()
-                # # 保留百分位 10% ~ 90% 之间的 数据
-                # if not pct_vals.empty:
-                #     q10 = float(pct_vals.quantile(0.10))
-                #     q90 = float(pct_vals.quantile(0.90))
-                #     df_drop_nodes_part = df_drop_nodes_part[
-                #         df_drop_nodes_part['high_price_change_percent'].between(q10, q90)
-                #     ]
-                # if df_drop_nodes_part.empty:
-                #     continue
-                pct_diff = (df_drop_nodes_part['high_price_change_percent'] - float(price_change_percent)).abs()
-                df_match = df_drop_nodes_part.loc[pct_diff <= pct_threshold, ['high_price_duration_hours', 'high_price_change_percent']].copy()
-
-                # 历史上出现的降价幅度
-                if not df_match.empty and pd.notna(price_duration_hours):
-                    remaining_hours = (df_match['high_price_duration_hours'] - float(price_duration_hours)).clip(lower=0)
-                    remaining_hours = remaining_hours.round().astype(int)
-
-                    counts = remaining_hours.value_counts().sort_index()
-                    probs = (counts / counts.sum()).round(4)
-
-                    top_hours = int(probs.idxmax())
-                    top_prob = float(probs.max())
-
-                    dist_items = list(zip(probs.index.tolist(), probs.tolist()))
-                    dist_items = dist_items[:10]
-                    dist_str = ' '.join([f"{int(h)}h->{float(p)}" for h, p in dist_items])
-
-                    df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
-                    df_min_hours.loc[idx, 'simple_drop_in_hours'] = top_hours
-                    df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = top_prob
-                    df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = dist_str
-
-                    continue   # 已经判定降价 后面不再做
+                # df_drop_nodes_part = df_drop_nodes_part[df_drop_nodes_part['high_price_change_percent'] >= 0.1]
+                # pct_diff = (df_drop_nodes_part['high_price_change_percent'] - float(price_change_percent)).abs()
+                # df_match = df_drop_nodes_part.loc[pct_diff <= pct_threshold, ['high_price_duration_hours', 'high_price_change_percent']].copy()
+                
+                pct_base = float(price_change_percent)
+                pct_vals = pd.to_numeric(df_drop_nodes_part['high_price_change_percent'], errors='coerce')
+                df_drop_gap = df_drop_nodes_part.loc[
+                    pct_vals.notna(),
+                    ['drop_hours_until_departure', 'high_price_duration_hours', 'high_price_change_percent', 
+                     'high_price_change_amount', 'high_price_seats_remaining_change_amount']
+                ].copy()
+                df_drop_gap['pct_gap'] = (pct_vals.loc[pct_vals.notna()] - pct_base)
+                df_drop_gap['pct_abs_gap'] = df_drop_gap['pct_gap'].abs()
+                df_drop_gap = df_drop_gap.sort_values(['pct_abs_gap'], ascending=True)
+                df_match = df_drop_gap[df_drop_gap['pct_abs_gap'] <= pct_threshold]
+
+                # 历史上出现的极近似的增长幅度后的降价场景
+                if not df_match.empty:
+                    dur_base = pd.to_numeric(price_duration_hours, errors='coerce')
+                    hud_base = pd.to_numeric(hours_until_departure, errors='coerce')
+                    seats_base = pd.to_numeric(seats_remaining_change_amount, errors='coerce')
+
+                    if pd.notna(dur_base) and pd.notna(hud_base) and pd.notna(seats_base):
+                        df_match_chk = df_match.copy()
+                        dur_vals = pd.to_numeric(df_match_chk['high_price_duration_hours'], errors='coerce')
+                        df_match_chk = df_match_chk.loc[dur_vals.notna()].copy()
+                        df_match_chk = df_match_chk.loc[dur_vals.loc[dur_vals.notna()] - 12 <= float(dur_base)].copy()
+
+                        drop_hud_vals = pd.to_numeric(df_match_chk['drop_hours_until_departure'], errors='coerce')
+                        df_match_chk = df_match_chk.loc[drop_hud_vals.notna()].copy()
+                        df_match_chk = df_match_chk.loc[(drop_hud_vals.loc[drop_hud_vals.notna()] - float(hud_base)).abs() <= 12].copy()
+
+                        seats_vals = pd.to_numeric(df_match_chk['high_price_seats_remaining_change_amount'], errors='coerce')
+                        df_match_chk = df_match_chk.loc[seats_vals.notna()].copy()
+                        df_match_chk = df_match_chk.loc[seats_vals.loc[seats_vals.notna()] == float(seats_base)].copy()
+
+                        # 持续时间、距离起飞时间、座位变化都匹配上
+                        if not df_match_chk.empty:
+                            remaining_hours = (
+                                pd.to_numeric(df_match_chk['high_price_duration_hours'], errors='coerce') - float(dur_base)
+                            ).clip(lower=0)
+                            remaining_hours = remaining_hours.round().astype(int)
+
+                            counts = remaining_hours.value_counts().sort_index()
+                            probs = (counts / counts.sum()).round(4)
+
+                            top_hours = int(probs.idxmax())
+                            top_prob = float(probs.max())
+
+                            dist_items = list(zip(probs.index.tolist(), probs.tolist()))
+                            dist_items = dist_items[:10]
+                            dist_str = ' '.join([f"{int(h)}h->{float(p)}" for h, p in dist_items])
+
+                            df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
+                            df_min_hours.loc[idx, 'simple_drop_in_hours'] = top_hours
+                            df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = top_prob
+                            df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = dist_str
+
+                            length_drop = df_match_chk.shape[0]
+                            # continue   # 已经判定降价 后面不再做
                 
-                # 历史上未出现的降价幅度
+                # 历史上未出现的极近似的增长幅度后的降价场景
                 else:
-                    if pd.notna(price_duration_hours) and price_change_percent >= 0.1:
-                        pct_vals = pd.to_numeric(
-                            df_drop_nodes_part['high_price_change_percent'],
-                            errors='coerce'
-                        ).replace([np.inf, -np.inf], np.nan).dropna()
-                        dur_vals = pd.to_numeric(
-                            df_drop_nodes_part['high_price_duration_hours'],
-                            errors='coerce'
-                        ).replace([np.inf, -np.inf], np.nan).dropna()
-
-                        if not pct_vals.empty and not dur_vals.empty:
-                            pct_min = float(pct_vals.min())
-                            pct_max = float(pct_vals.max())
-                            dur_min = float(dur_vals.min())
-                            dur_max = float(dur_vals.max())
-
-                            if (pct_min <= float(price_change_percent) <= pct_max) and (dur_min <= float(price_duration_hours) <= dur_max):
-                                df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
-                                df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
-                                df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.5
-                                df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = '0h->0.5'
-                                continue  # 已经判定降价 后面不再做
-                            elif (pct_min <= float(price_change_percent)) and (dur_min <= float(price_duration_hours)):
-                                df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
-                                df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
-                                df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.3
-                                df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = '0h->0.3'
-                                continue  # 已经判定降价 后面不再做
+                    pass
+                    # if pd.notna(price_duration_hours) and price_change_percent >= 0.1:
+                    #     pct_vals = pd.to_numeric(
+                    #         df_drop_nodes_part['high_price_change_percent'],
+                    #         errors='coerce'
+                    #     ).replace([np.inf, -np.inf], np.nan).dropna()
+                    #     dur_vals = pd.to_numeric(
+                    #         df_drop_nodes_part['high_price_duration_hours'],
+                    #         errors='coerce'
+                    #     ).replace([np.inf, -np.inf], np.nan).dropna()
+
+                    #     if not pct_vals.empty and not dur_vals.empty:
+                    #         pct_min = float(pct_vals.min())
+                    #         pct_max = float(pct_vals.max())
+                    #         dur_min = float(dur_vals.min())
+                    #         dur_max = float(dur_vals.max())
+
+                    #         if (pct_min <= float(price_change_percent) <= pct_max) and (dur_min <= float(price_duration_hours) <= dur_max):
+                    #             df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
+                    #             df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
+                    #             df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.5
+                    #             df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = '0h->0.5'
+                    #             continue  # 已经判定降价 后面不再做
+                    #         elif (pct_min <= float(price_change_percent)) and (dur_min <= float(price_duration_hours)):
+                    #             df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
+                    #             df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
+                    #             df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.3
+                    #             df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = '0h->0.3'
+                    #             continue  # 已经判定降价 后面不再做
                             
         # 针对历史上发生 一直低价、一直高价、低价->高价、连续低价 等 
         if not df_keep_nodes.empty:
@@ -1223,71 +1257,116 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
                                     df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
                                     df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.5
                                     df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'c1'
-                                    continue   # 已经判定降价 后面不再做
+                                    length_drop = df_match_c.shape[0]        
+                                    # continue   # 已经判定降价 后面不再做
 
                 # 一般判定场景
-                pct_diff_1 = (df_keep_nodes_part['keep_price_change_percent'] - float(price_change_percent)).abs()
-                df_match_1 = df_keep_nodes_part.loc[pct_diff_1 <= pct_threshold_1, ['flight_day', 'keep_hours_until_departure', 'keep_price_duration_hours', 'keep_price_change_percent']].copy()
-
-                # 历史上出现过的保持低价场景
-                if not df_match_1.empty and pd.notna(price_duration_hours):
-                    df_min_hours.loc[idx, 'simple_will_price_drop'] = 0
-                    df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
-                    df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.0
-                    df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'k0'    
-                    
+                pct_base_1 = float(price_change_percent)
+                pct_vals_1 = pd.to_numeric(df_keep_nodes_part['keep_price_change_percent'], errors='coerce')
+                df_drop_gap_1 = df_keep_nodes_part.loc[
+                    pct_vals_1.notna(),
+                    ['keep_hours_until_departure', 'keep_price_duration_hours', 'keep_price_change_percent', 
+                     'keep_price_change_amount', 'keep_seats_remaining_change_amount']
+                ].copy()
+                df_drop_gap_1['pct_gap'] = (pct_vals_1.loc[pct_vals_1.notna()] - pct_base_1)
+                df_drop_gap_1['pct_abs_gap'] = df_drop_gap_1['pct_gap'].abs()
+                df_drop_gap_1 = df_drop_gap_1.sort_values(['pct_abs_gap'], ascending=True)
+                df_match_1 = df_drop_gap_1.loc[df_drop_gap_1['pct_abs_gap'] <= pct_threshold_1].copy()
+
+                # 历史上出现过近似变化幅度后保持低价场景
+                if not df_match_1.empty:
                     df_match_1['hours_delta'] = hours_until_departure - df_match_1['keep_hours_until_departure']
                     df_match_1['modify_keep_price_duration_hours'] = df_match_1['keep_price_duration_hours'] - df_match_1['hours_delta']
                     # df_match_1 = df_match_1[df_match_1['modify_keep_price_duration_hours'] > 0]
 
-                    # 比较 price_duration_hours 在 modify_keep_price_duration_hours 的百分位                    
-                    vals = df_match_1['modify_keep_price_duration_hours'].replace([np.inf, -np.inf], np.nan).dropna()
-                    if not vals.empty:
-                        # q10_11 = float(vals.quantile(0.10))
-                        min_val = vals.min()
-                        if min_val <= float(price_duration_hours):
-                            df_min_hours.loc[idx, 'simple_will_price_drop'] = 0
-                            df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
-                            df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.0
-                            df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'k1'
-
-                # 历史上没有出现过的保持低价场景
-                else:
-                    df_min_hours.loc[idx, 'simple_will_price_drop'] = 0
-                    df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
-                    df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.0
-                    df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'n0'
-
-                    if pd.notna(price_duration_hours) and price_change_percent <= 0.1:
-                        df_keep_nodes_part_1 = df_keep_nodes_part[df_keep_nodes_part['keep_price_change_percent'] <= 0.1]
-                        pct_vals_1 = pd.to_numeric(
-                            df_keep_nodes_part_1['keep_price_change_percent'],
-                            errors='coerce'
-                        ).replace([np.inf, -np.inf], np.nan).dropna()
-                        dur_vals_1 = pd.to_numeric(
-                            df_keep_nodes_part_1['keep_price_duration_hours'],
-                            errors='coerce'
-                        ).replace([np.inf, -np.inf], np.nan).dropna()
-
-                        if not pct_vals_1.empty and not dur_vals_1.empty:
-                            pct_min_1 = float(pct_vals_1.min())
-                            pct_max_1 = float(pct_vals_1.max())
-                            dur_min_1 = float(dur_vals_1.min())
-                            dur_max_1 = float(dur_vals_1.max())
-
-                            if (pct_min_1 <= float(price_change_percent) <= pct_max_1) and (dur_min_1 <= float(price_duration_hours) <= dur_max_1):
+                    dur_base_1 = pd.to_numeric(price_duration_hours, errors='coerce')
+                    # hud_base_1 = pd.to_numeric(hours_until_departure, errors='coerce')
+                    seats_base_1 = pd.to_numeric(seats_remaining_change_amount, errors='coerce')
+
+                    if pd.notna(dur_base_1) and pd.notna(seats_base_1):
+                        df_match_chk_1 = df_match_1.copy()
+                        dur_vals_1 = pd.to_numeric(df_match_chk_1['modify_keep_price_duration_hours'], errors='coerce')
+                        df_match_chk_1 = df_match_chk_1.loc[dur_vals_1.notna()].copy()
+                        df_match_chk_1 = df_match_chk_1.loc[(dur_vals_1.loc[dur_vals_1.notna()] - float(dur_base_1)).abs() <= 6].copy()
+
+                        # drop_hud_vals_1 = pd.to_numeric(df_match_chk_1['keep_hours_until_departure'], errors='coerce')
+                        # df_match_chk_1 = df_match_chk_1.loc[drop_hud_vals_1.notna()].copy()
+                        # df_match_chk_1 = df_match_chk_1.loc[(drop_hud_vals_1.loc[drop_hud_vals_1.notna()] - float(hud_base_1)).abs() <= 12].copy()
+
+                        seats_vals_1 = pd.to_numeric(df_match_chk_1['keep_seats_remaining_change_amount'], errors='coerce')
+                        df_match_chk_1 = df_match_chk_1.loc[seats_vals_1.notna()].copy()
+                        df_match_chk_1 = df_match_chk_1.loc[seats_vals_1.loc[seats_vals_1.notna()] == float(seats_base_1)].copy()
+
+                        # 持续时间、距离起飞时间、座位变化都匹配上
+                        if not df_match_chk_1.empty:
+                            length_keep = df_match_chk_1.shape[0]
+                            if length_keep > length_drop:      # 不降价的多数压倒降价的少数
+                    
                                 df_min_hours.loc[idx, 'simple_will_price_drop'] = 0
                                 df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
                                 df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.0
-                                df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'n1'
+                                df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'k0'
+                            
+                            elif length_keep == length_drop:   # 不降价与降价相同, 取0.5概率
+
+                                df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
+                                df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
+                                df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.5
+                                df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'k1'
+                        
+                                # df_match_1['hours_delta'] = hours_until_departure - df_match_1['keep_hours_until_departure']
+                                # df_match_1['modify_keep_price_duration_hours'] = df_match_1['keep_price_duration_hours'] - df_match_1['hours_delta']
+                                # df_match_1 = df_match_1[df_match_1['modify_keep_price_duration_hours'] > 0]
+
+                                # 比较 price_duration_hours 在 modify_keep_price_duration_hours 的百分位                    
+                                # vals = df_match_1['modify_keep_price_duration_hours'].replace([np.inf, -np.inf], np.nan).dropna()
+                                # if not vals.empty:
+                                #     # q10_11 = float(vals.quantile(0.10))
+                                #     min_val = vals.min()
+                                #     if min_val <= float(price_duration_hours):
+                                #         df_min_hours.loc[idx, 'simple_will_price_drop'] = 0
+                                #         df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
+                                #         df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.0
+                                #         df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'k1'
+
+                # 历史上未出现过近似变化幅度后保持低价场景
+                else:
+                    pass
+                    # df_min_hours.loc[idx, 'simple_will_price_drop'] = 0
+                    # df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
+                    # df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.0
+                    # df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'n0'
+
+                    # if pd.notna(price_duration_hours) and price_change_percent <= 0.1:
+                    #     df_keep_nodes_part_1 = df_keep_nodes_part[df_keep_nodes_part['keep_price_change_percent'] <= 0.1]
+                    #     pct_vals_1 = pd.to_numeric(
+                    #         df_keep_nodes_part_1['keep_price_change_percent'],
+                    #         errors='coerce'
+                    #     ).replace([np.inf, -np.inf], np.nan).dropna()
+                    #     dur_vals_1 = pd.to_numeric(
+                    #         df_keep_nodes_part_1['keep_price_duration_hours'],
+                    #         errors='coerce'
+                    #     ).replace([np.inf, -np.inf], np.nan).dropna()
+
+                    #     if not pct_vals_1.empty and not dur_vals_1.empty:
+                    #         pct_min_1 = float(pct_vals_1.min())
+                    #         pct_max_1 = float(pct_vals_1.max())
+                    #         dur_min_1 = float(dur_vals_1.min())
+                    #         dur_max_1 = float(dur_vals_1.max())
+
+                    #         if (pct_min_1 <= float(price_change_percent) <= pct_max_1) and (dur_min_1 <= float(price_duration_hours) <= dur_max_1):
+                    #             df_min_hours.loc[idx, 'simple_will_price_drop'] = 0
+                    #             df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
+                    #             df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.0
+                    #             df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'n1'
                 pass
 
     df_min_hours = df_min_hours.rename(columns={'seg1_dep_time': 'from_time'})
     _pred_dt = pd.to_datetime(str(pred_time_str), format="%Y%m%d%H%M", errors="coerce")
-    df_min_hours["update_hour"] = _pred_dt
+    df_min_hours["update_hour"] = _pred_dt.strftime("%Y-%m-%d %H:%M:%S")
     _dep_hour = pd.to_datetime(df_min_hours["from_time"], errors="coerce").dt.floor("h")
-    df_min_hours["valid_begin_hour"] = _dep_hour - pd.to_timedelta(54, unit="h")
-    df_min_hours["valid_end_hour"] = _dep_hour - pd.to_timedelta(18, unit="h")
+    df_min_hours["valid_begin_hour"] = (_dep_hour - pd.to_timedelta(60, unit="h")).dt.strftime("%Y-%m-%d %H:%M:%S")
+    df_min_hours["valid_end_hour"] = (_dep_hour - pd.to_timedelta(12, unit="h")).dt.strftime("%Y-%m-%d %H:%M:%S")
 
     order_cols = ['city_pair', 'flight_day', 'flight_number_1', 'flight_number_2', 'from_time', 'baggage', 'currency', 
                   'adult_total_price', 'hours_until_departure', 'price_change_percent', 'price_duration_hours',
@@ -1304,6 +1383,13 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
         }
     )
 
+    # 排序
+    df_predict = df_predict.sort_values(
+        by=['city_pair', 'flight_number_1', 'flight_number_2', 'flight_day'],
+        kind='mergesort',
+        na_position='last',
+    ).reset_index(drop=True)
+
     csv_path1 = os.path.join(predict_dir, f'future_predictions_{pred_time_str}.csv')
     df_predict.to_csv(csv_path1, mode='a', index=False, header=not os.path.exists(csv_path1), encoding='utf-8-sig')
 

+ 3 - 3
main_pe_0.py

@@ -38,9 +38,9 @@ def start_predict():
         except Exception as e:
             print(f"remove {csv_path} info: {str(e)}")
 
-    # 预测时间范围,满足起飞时间 在18小时后到54小时后
-    pred_hour_begin = hourly_time + timedelta(hours=18)
-    pred_hour_end = hourly_time + timedelta(hours=54)
+    # 预测时间范围,满足起飞时间 在12小时后到60小时后
+    pred_hour_begin = hourly_time + timedelta(hours=12)
+    pred_hour_end = hourly_time + timedelta(hours=60)
 
     pred_date_end = pred_hour_end.strftime("%Y-%m-%d")
     pred_date_begin = pred_hour_begin.strftime("%Y-%m-%d")