Kaynağa Gözat

修改预测判定逻辑

node04 2 hafta önce
ebeveyn
işleme
5b56e8a45b
2 değiştirilmiş dosya ile 86 ekleme ve 16 silme
  1. 84 16
      data_preprocess.py
  2. 2 0
      main_pe_0.py

+ 84 - 16
data_preprocess.py

@@ -1016,15 +1016,15 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
     else:
         df_keep_nodes = pd.DataFrame()
 
-    df_min_hours['simple_will_price_drop'] = -1   # -1 表示未知
+    df_min_hours['simple_will_price_drop'] = 0   
     df_min_hours['simple_drop_in_hours'] = 0
     df_min_hours['simple_drop_in_hours_prob'] = 0.0
-    df_min_hours['simple_drop_in_hours_dist'] = ''
+    df_min_hours['simple_drop_in_hours_dist'] = ''   # 空串 表示未知
     
     # 这个阈值取多少?
     pct_threshold = 0.01
     # pct_threshold = 2
-    pct_threshold_1 = 0.001
+    pct_threshold_1 = 0.01
     pct_threshold_c = 0.001
 
     for idx, row in df_min_hours.iterrows(): 
@@ -1066,6 +1066,7 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
                 pct_diff = (df_drop_nodes_part['high_price_change_percent'] - float(price_change_percent)).abs()
                 df_match = df_drop_nodes_part.loc[pct_diff <= pct_threshold, ['high_price_duration_hours', 'high_price_change_percent']].copy()
 
+                # 历史上出现的降价幅度
                 if not df_match.empty and pd.notna(price_duration_hours):
                     remaining_hours = (df_match['high_price_duration_hours'] - float(price_duration_hours)).clip(lower=0)
                     remaining_hours = remaining_hours.round().astype(int)
@@ -1078,15 +1079,46 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
 
                     dist_items = list(zip(probs.index.tolist(), probs.tolist()))
                     dist_items = dist_items[:10]
-                    dist_str = ' | '.join([f"{int(h)}:{float(p)}" for h, p in dist_items])
+                    dist_str = ' '.join([f"{int(h)}h->{float(p)}" for h, p in dist_items])
 
                     df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
                     df_min_hours.loc[idx, 'simple_drop_in_hours'] = top_hours
                     df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = top_prob
                     df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = dist_str
 
-                    continue   # 已经判定降价 后面不再做  
-
+                    continue   # 已经判定降价 后面不再做
+                
+                # 历史上未出现的降价幅度
+                else:
+                    if pd.notna(price_duration_hours) and price_change_percent >= 0.1:
+                        pct_vals = pd.to_numeric(
+                            df_drop_nodes_part['high_price_change_percent'],
+                            errors='coerce'
+                        ).replace([np.inf, -np.inf], np.nan).dropna()
+                        dur_vals = pd.to_numeric(
+                            df_drop_nodes_part['high_price_duration_hours'],
+                            errors='coerce'
+                        ).replace([np.inf, -np.inf], np.nan).dropna()
+
+                        if not pct_vals.empty and not dur_vals.empty:
+                            pct_min = float(pct_vals.min())
+                            pct_max = float(pct_vals.max())
+                            dur_min = float(dur_vals.min())
+                            dur_max = float(dur_vals.max())
+
+                            if (pct_min <= float(price_change_percent) <= pct_max) and (dur_min <= float(price_duration_hours) <= dur_max):
+                                df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
+                                df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
+                                df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.5
+                                df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = '0h->0.5'
+                                continue  # 已经判定降价 后面不再做
+                            elif (pct_min <= float(price_change_percent)) and (dur_min <= float(price_duration_hours)):
+                                df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
+                                df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
+                                df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.3
+                                df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = '0h->0.3'
+                                continue  # 已经判定降价 后面不再做
+                            
         # 针对历史上发生 一直低价、一直高价、低价->高价、连续低价 等 
         if not df_keep_nodes.empty:
             # 对准航班号, 不同起飞日期
@@ -1164,35 +1196,71 @@ def predict_data_simple(df_input, group_route_str, output_dir, predict_dir=".",
                         if not df_match_c.empty and pd.notna(price_duration_hours):
                             vals_c = df_match_c['keep_price_duration_hours'].replace([np.inf, -np.inf], np.nan).dropna()
                             if not vals_c.empty:
-                                min_val = vals_c.min()
-                                if min_val <= float(price_duration_hours):
+                                min_val_c = vals_c.min()
+                                if min_val_c <= float(price_duration_hours):
                                     df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
                                     df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
                                     df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.5
-                                    df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = ''
-                                    continue
+                                    df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'c1'
+                                    continue   # 已经判定降价 后面不再做
 
                 # 一般判定场景
                 pct_diff_1 = (df_keep_nodes_part['keep_price_change_percent'] - float(price_change_percent)).abs()
                 df_match_1 = df_keep_nodes_part.loc[pct_diff_1 <= pct_threshold_1, ['flight_day', 'keep_hours_until_departure', 'keep_price_duration_hours', 'keep_price_change_percent']].copy()
 
+                # 历史上出现过的保持低价场景
                 if not df_match_1.empty and pd.notna(price_duration_hours):
+                    df_min_hours.loc[idx, 'simple_will_price_drop'] = 0
+                    df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
+                    df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.0
+                    df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'k0'    
                     
                     df_match_1['hours_delta'] = hours_until_departure - df_match_1['keep_hours_until_departure']
                     df_match_1['modify_keep_price_duration_hours'] = df_match_1['keep_price_duration_hours'] - df_match_1['hours_delta']
-                    df_match_1 = df_match_1[df_match_1['modify_keep_price_duration_hours'] > 0]
+                    # df_match_1 = df_match_1[df_match_1['modify_keep_price_duration_hours'] > 0]
 
                     # 比较 price_duration_hours 在 modify_keep_price_duration_hours 的百分位                    
                     vals = df_match_1['modify_keep_price_duration_hours'].replace([np.inf, -np.inf], np.nan).dropna()
                     if not vals.empty:
-                        q10_11 = float(vals.quantile(0.10))
-                        # q90_11 = float(vals.quantile(0.90))
-                        if q10_11 <= float(price_duration_hours):
+                        # q10_11 = float(vals.quantile(0.10))
+                        min_val = vals.min()
+                        if min_val <= float(price_duration_hours):
                             df_min_hours.loc[idx, 'simple_will_price_drop'] = 0
                             df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
                             df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.0
-                            df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = ''
-    
+                            df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'k1'
+
+                # 历史上没有出现过的保持低价场景
+                else:
+                    df_min_hours.loc[idx, 'simple_will_price_drop'] = 0
+                    df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
+                    df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.0
+                    df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'n0'
+
+                    if pd.notna(price_duration_hours) and price_change_percent <= 0.1:
+                        df_keep_nodes_part_1 = df_keep_nodes_part[df_keep_nodes_part['keep_price_change_percent'] <= 0.1]
+                        pct_vals_1 = pd.to_numeric(
+                            df_keep_nodes_part_1['keep_price_change_percent'],
+                            errors='coerce'
+                        ).replace([np.inf, -np.inf], np.nan).dropna()
+                        dur_vals_1 = pd.to_numeric(
+                            df_keep_nodes_part_1['keep_price_duration_hours'],
+                            errors='coerce'
+                        ).replace([np.inf, -np.inf], np.nan).dropna()
+
+                        if not pct_vals_1.empty and not dur_vals_1.empty:
+                            pct_min_1 = float(pct_vals_1.min())
+                            pct_max_1 = float(pct_vals_1.max())
+                            dur_min_1 = float(dur_vals_1.min())
+                            dur_max_1 = float(dur_vals_1.max())
+
+                            if (pct_min_1 <= float(price_change_percent) <= pct_max_1) and (dur_min_1 <= float(price_duration_hours) <= dur_max_1):
+                                df_min_hours.loc[idx, 'simple_will_price_drop'] = 0
+                                df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
+                                df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.0
+                                df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'n1'
+                pass
+
     df_min_hours = df_min_hours.rename(columns={'seg1_dep_time': 'from_time'})
     _pred_dt = pd.to_datetime(str(pred_time_str), format="%Y%m%d%H%M", errors="coerce")
     df_min_hours["update_hour"] = _pred_dt

+ 2 - 0
main_pe_0.py

@@ -117,6 +117,8 @@ def start_predict():
         
         del df_test_inputs
         del df_predict
+        print(f"第 {i} 组 预测完成")
+        print()
         time.sleep(1)
 
     print("所有批次的预测结束")