Selaa lähdekoodia

调整UO的预测判定方式,过滤掉低样本数的投放

node04 3 viikkoa sitten
vanhempi
commit
0e8e16c3e5
2 muutettua tiedostoa jossa 30 lisäystä ja 23 poistoa
  1. +25 -22
      data_process.py
  2. +5 -1
      descending_cabin_task.py

+ 25 - 22
data_process.py

@@ -29,7 +29,7 @@ def preprocess_data_simple(df_input, is_train=False, hourly_time=None):
     ).reset_index(drop=True)
 
     df_input = df_input[df_input['hours_until_departure'] <= 480]
-    df_input = df_input[df_input['baggage_weight'] == 20]   # 先保留20公斤行李的
+    df_input = df_input[df_input['baggage_weight'] == 0]   # 先保留0公斤行李的
 
     # 在hours_until_departure 的末尾 保留到当前时刻的数据
     if not is_train:
@@ -161,8 +161,8 @@ def preprocess_data_simple(df_input, is_train=False, hourly_time=None):
         
         # 制作历史包络线
         envelope_group = ['citypair', 'flight_numbers', 'from_date', 'baggage_weight']
-        idx_peak = df_input.groupby(envelope_group)['price_total'].idxmax()
-        df_envelope = df_input.loc[idx_peak, envelope_group + [
+        idx_peak = df_target.groupby(envelope_group)['price_total'].idxmax()
+        df_envelope = df_target.loc[idx_peak, envelope_group + [
             'from_time', 'price_total', 'hours_until_departure', 'days_to_departure', 'update_hour', 'update_week',
         ]].rename(columns={
             'price_total': 'peak_price',
@@ -288,13 +288,16 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
     df_min_hours['rise_price_sample_size'] = 0
 
     # 这个阈值取多少?
-    # pct_threshold = 0.01
-    # pct_threshold_1 = 0.01
+    pct_threshold = 0.01
+    pct_threshold_1 = 0.01
 
     for idx, row in df_min_hours.iterrows(): 
         city_pair = row['citypair']
         flight_numbers = row['flight_numbers']
         baggage_weight = row['baggage_weight']
+        from_date = row['from_date']
+        if flight_numbers == "UO235" and from_date == "2026-04-25":   # 调试时用
+            pass
         days_to_departure = row['days_to_departure']
         hours_until_departure = row['hours_until_departure']
         price_change_percent = row['price_change_percent']
@@ -314,26 +317,26 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
                 (df_drop_nodes['baggage_weight'] == baggage_weight)
             ]
             # 降价前 增量阈值、当前阈值 的匹配
-            if not df_drop_nodes_part.empty and pd.notna(price_change_amount):   
+            if not df_drop_nodes_part.empty and pd.notna(price_change_percent):   
                 
-                pca_base = float(price_change_amount)
-                pca_vals = pd.to_numeric(df_drop_nodes_part['high_price_change_amount'], errors='coerce')
+                pct_base = float(price_change_percent)
+                pct_vals = pd.to_numeric(df_drop_nodes_part['high_price_change_percent'], errors='coerce')
                 df_drop_gap = df_drop_nodes_part.loc[
-                    pca_vals.notna(),
+                    pct_vals.notna(),
                     ['drop_days_to_departure', 'drop_hours_until_departure', 'drop_price_change_percent', 'drop_price_change_amount', 
                      'high_price_duration_hours', 'high_price_change_percent', 'high_price_change_amount', 'high_price_amount', 'relative_position'
                      ]
                 ].copy()
-                df_drop_gap['pca_gap'] = (pca_vals.loc[pca_vals.notna()] - pca_base)
-                df_drop_gap['pca_abs_gap'] = df_drop_gap['pca_gap'].abs()
+                df_drop_gap['pct_gap'] = (pct_vals.loc[pct_vals.notna()] - pct_base)
+                df_drop_gap['pct_abs_gap'] = df_drop_gap['pct_gap'].abs()
 
                 price_base = pd.to_numeric(price_amount, errors='coerce')
                 high_price_vals = pd.to_numeric(df_drop_gap['high_price_amount'], errors='coerce')
                 df_drop_gap['price_gap'] = high_price_vals - price_base
                 df_drop_gap['price_abs_gap'] = df_drop_gap['price_gap'].abs()
 
-                df_drop_gap = df_drop_gap.sort_values(['price_abs_gap', 'pca_abs_gap'], ascending=[True, True])
-                df_match = df_drop_gap[(df_drop_gap['price_abs_gap'] <= 5.0) & (df_drop_gap['pca_abs_gap'] <= 10.0)].copy()
+                df_drop_gap = df_drop_gap.sort_values(['price_abs_gap', 'pct_abs_gap'], ascending=[True, True])
+                df_match = df_drop_gap[(df_drop_gap['pct_abs_gap'] <= pct_threshold) & (df_drop_gap['price_abs_gap'] <= 3.0)].copy()
 
                 # 历史上出现的极近似的增长(下降)幅度后的降价场景
                 if not df_match.empty:
@@ -390,24 +393,24 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
                 (df_rise_nodes['baggage_weight'] == baggage_weight)
             ]
             # 升价前 增量阈值、当前阈值 的匹配
-            if not df_rise_nodes_part.empty and pd.notna(price_change_amount):
-                pca_base_1 = float(price_change_amount)
-                pca_vals_1 = pd.to_numeric(df_rise_nodes_part['prev_rise_change_amount'], errors='coerce')
+            if not df_rise_nodes_part.empty and pd.notna(price_change_percent):
+                pct_base_1 = float(price_change_percent)
+                pct_vals_1 = pd.to_numeric(df_rise_nodes_part['prev_rise_change_percent'], errors='coerce')
                 df_rise_gap_1 = df_rise_nodes_part.loc[
-                    pca_vals_1.notna(),
+                    pct_vals_1.notna(),
                     ['rise_days_to_departure', 'rise_hours_until_departure', 'rise_price_change_percent', 'rise_price_change_amount',
                      'prev_rise_duration_hours', 'prev_rise_change_percent', 'prev_rise_change_amount', 'prev_rise_amount', 'relative_position']
                 ].copy()
-                df_rise_gap_1['pca_gap'] = (pca_vals_1.loc[pca_vals_1.notna()] - pca_base_1)
-                df_rise_gap_1['pca_abs_gap'] = df_rise_gap_1['pca_gap'].abs()
+                df_rise_gap_1['pct_gap'] = (pct_vals_1.loc[pct_vals_1.notna()] - pct_base_1)
+                df_rise_gap_1['pct_abs_gap'] = df_rise_gap_1['pct_gap'].abs()
 
                 price_base_1 = pd.to_numeric(price_amount, errors='coerce')
                 rise_price_vals_1 = pd.to_numeric(df_rise_gap_1['prev_rise_amount'], errors='coerce')
                 df_rise_gap_1['price_gap'] = rise_price_vals_1 - price_base_1
                 df_rise_gap_1['price_abs_gap'] = df_rise_gap_1['price_gap'].abs()
 
-                df_rise_gap_1 = df_rise_gap_1.sort_values(['price_abs_gap', 'pca_abs_gap'], ascending=[True, True])
-                df_match_1 = df_rise_gap_1.loc[(df_rise_gap_1['price_abs_gap'] <= 5.0) & (df_rise_gap_1['pca_abs_gap'] <= 10.0)].copy()
+                df_rise_gap_1 = df_rise_gap_1.sort_values(['price_abs_gap', 'pct_abs_gap'], ascending=[True, True])
+                df_match_1 = df_rise_gap_1.loc[(df_rise_gap_1['pct_abs_gap'] <= pct_threshold_1) & (df_rise_gap_1['price_abs_gap'] <= 3.0)].copy()
 
                 # 历史上出现的极近似的增长(下降)幅度后的升价场景
                 if not df_match_1.empty:
@@ -443,7 +446,7 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
                             else:
                                 drop_prob = round(length_drop / (length_rise + length_drop), 2)
                                 # 依旧保持之前的降价判定,概率修改
-                                if drop_prob >= 0.4:
+                                if drop_prob >= 0.6:
                                     df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
                                     # df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'd1'
                                     df_min_hours.loc[idx, 'flag_dist'] = 'd1'

+ 5 - 1
descending_cabin_task.py

@@ -73,7 +73,11 @@ def _process_one_task(row):
     drop_price_change_upper = float(task.get("drop_price_change_upper") or 0)   # 最小的降价幅度
     max_threshold = round(drop_price_change_upper * 1.0)
 
-    if abs(max_threshold) < 10:
+    if abs(max_threshold) < 10:  # 丢弃小于10人民币的降价幅度
+        return None
+    
+    drop_price_sample_size = int(task.get("drop_price_sample_size", "0"))
+    if drop_price_sample_size < 2:  # 丢弃历史降价样本数过少(小于2)的
         return None
 
     end_segments = []