Explorar el Código

再次修改训练与预测判定过程

node04 hace 1 día
padre
commit
8d4aa7d3ec
Se han modificado 2 ficheros con 52 adiciones y 32 borrados
  1. data_process.py (+51 -31)
  2. result_keep_verify.py (+1 -1)

+ 51 - 31
data_process.py

@@ -102,13 +102,13 @@ def preprocess_data_simple(df_input, is_train=False, hourly_time=None):
 
         # 对于先升后降(先降再降)的分析
         seg_start_mask = df_target['price_duration_hours'].eq(1)   # 开始变价节点
-        # 正例库仅保留24小时内发生的降价:上一价格段持续时长需<=24h
+        # 正例库
         prev_pct_num = pd.to_numeric(prev_pct, errors='coerce')
         drop_mask = (
             seg_start_mask
             & prev_pct_num.notna()
             & (df_target['price_change_percent'] < 0)
-            & prev_dur.le(24)
+            # & prev_dur.le(24)  # 仅保留24小时内发生的降价:上一价格段持续时长需<=24h
         )
 
         df_drop_nodes = df_target.loc[drop_mask, ['gid', 'baggage_weight', 'hours_until_departure', 'days_to_departure', 'update_hour', 'update_week', 'cabins']].copy()
@@ -142,19 +142,21 @@ def preprocess_data_simple(df_input, is_train=False, hourly_time=None):
         df_drop_nodes = df_drop_nodes[flight_info_cols + drop_info_cols]
         df_drop_nodes['start_hours_until_departure'] = (df_drop_nodes['drop_hours_until_departure'] + df_drop_nodes['high_price_duration_hours']).round().astype('Int64')
         df_drop_nodes = df_drop_nodes[df_drop_nodes['drop_hours_until_departure'] <= 360]
-        df_drop_nodes = df_drop_nodes[df_drop_nodes['start_hours_until_departure'] >= 72] 
+        df_drop_nodes = df_drop_nodes[df_drop_nodes['start_hours_until_departure'] >= 72]
+        df_drop_nodes = df_drop_nodes[df_drop_nodes['high_price_duration_hours'] > 2.0]       # 维持时间太短的不计
+        df_drop_nodes = df_drop_nodes[df_drop_nodes['drop_price_change_amount'].abs() > 1]    # 1¥之内的降价不计 
         
-        # 反例库:所有有效节点(不限升价)中,未来24小时内未发生降价
+        # 反例库:所有升价节点
         # seg_start_mask = df_target['price_duration_hours'].eq(1)
         # rise_mask = seg_start_mask & ((prev_pct > 0) | (prev_pct < 0)) & (df_target['price_change_percent'] > 0)
         prev_pct_num = pd.to_numeric(prev_pct, errors='coerce')
         valid_mask = seg_start_mask & prev_pct_num.notna()
 
         curr_pct = pd.to_numeric(df_target['price_change_percent'], errors='coerce')
-        prev_dur_num = pd.to_numeric(prev_dur, errors='coerce')
+        # prev_dur_num = pd.to_numeric(prev_dur, errors='coerce')
         pos_case_mask = curr_pct.ge(0)
-        neg_case_mask = curr_pct.lt(0) & prev_dur_num.gt(24)
-        rise_mask = valid_mask & (pos_case_mask | neg_case_mask)
+        # neg_case_mask = curr_pct.lt(0) & prev_dur_num.gt(24)
+        rise_mask = valid_mask & pos_case_mask  # (pos_case_mask | neg_case_mask)
 
         df_rise_nodes = df_target.loc[rise_mask, ['gid', 'baggage_weight', 'hours_until_departure', 'days_to_departure', 'update_hour', 'update_week', 'cabins']].copy()
         df_rise_nodes.rename(columns={'hours_until_departure': 'rise_hours_until_departure'}, inplace=True)
@@ -182,6 +184,8 @@ def preprocess_data_simple(df_input, is_train=False, hourly_time=None):
         df_rise_nodes['start_hours_until_departure'] = (df_rise_nodes['rise_hours_until_departure'] + df_rise_nodes['prev_rise_duration_hours']).round().astype('Int64')
         df_rise_nodes = df_rise_nodes[df_rise_nodes['rise_hours_until_departure'] <= 360]
         df_rise_nodes = df_rise_nodes[df_rise_nodes['start_hours_until_departure'] >= 72]
+        df_rise_nodes = df_rise_nodes[df_rise_nodes['prev_rise_duration_hours'] > 2.0]        # 维持时间太短的不计
+        df_rise_nodes = df_rise_nodes[df_rise_nodes['rise_price_change_amount'].abs() > 1]    # 1¥之内的改变不计
         
         # 制作历史包络线
         envelope_group = ['citypair', 'flight_numbers', 'from_date', 'baggage_weight']
@@ -408,8 +412,8 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
     df_min_hours['rise_price_sample_size'] = 0
 
     # 这个阈值取多少?
-    pct_threshold = 0.1
-    pct_threshold_1 = 0.1
+    pct_threshold = 0.2
+    pct_threshold_1 = 0.2
 
     for idx, row in df_min_hours.iterrows(): 
         city_pair = row['citypair']
@@ -444,8 +448,10 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
                 pct_vals = pd.to_numeric(df_drop_nodes_part['high_price_change_percent'], errors='coerce')
                 df_drop_gap = df_drop_nodes_part.loc[
                     pct_vals.notna(),
-                    ['drop_days_to_departure', 'drop_hours_until_departure', 'drop_price_change_percent', 'drop_price_change_amount', 
-                     'high_price_duration_hours', 'high_price_change_percent', 'high_price_change_amount', 'high_price_amount', 'high_price_cabins', 'relative_position'
+                    ['from_date',
+                     'drop_days_to_departure', 'drop_hours_until_departure', 'drop_price_change_percent', 'drop_price_change_amount', 
+                     'high_price_duration_hours', 'high_price_change_percent', 'high_price_change_amount', 'high_price_amount', 'relative_position', 
+                     'high_price_cabins', 'start_hours_until_departure',
                      ]
                 ].copy()
                 df_drop_gap['pct_gap'] = (pct_vals.loc[pct_vals.notna()] - pct_base)
@@ -463,34 +469,40 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
                 )
                 df_match = df_drop_gap[
                     (df_drop_gap['pct_abs_gap'] <= pct_threshold) 
-                    & (df_drop_gap['price_abs_gap'] <= 5.0)
+                    & (df_drop_gap['price_abs_gap'] <= 10.0)
                     & same_sign_mask
                     & (df_drop_gap['high_price_cabins'] == cabins)
+                    & (df_drop_gap['high_price_duration_hours'] <= 48)
                 ].copy()
 
                 # 历史上出现的极近似的增长(下降)幅度后的降价场景
                 if not df_match.empty:
                     dur_base = pd.to_numeric(price_duration_hours, errors='coerce')
                     hud_base = pd.to_numeric(hours_until_departure, errors='coerce')
-                    dtd_base = pd.to_numeric(days_to_departure, errors='coerce')
+                    # dtd_base = pd.to_numeric(days_to_departure, errors='coerce')
 
-                    if pd.notna(dur_base) and pd.notna(dtd_base) and pd.notna(hud_base): 
+                    if pd.notna(dur_base) and pd.notna(hud_base): 
                         df_match_chk = df_match.copy()
 
                         # drop_dtd_vals = pd.to_numeric(df_match_chk['drop_days_to_departure'], errors='coerce')
                         # df_match_chk = df_match_chk.loc[drop_dtd_vals.notna()].copy()
                         # df_match_chk = df_match_chk.loc[(drop_dtd_vals.loc[drop_dtd_vals.notna()] - float(dtd_base)).abs() <= 3].copy()
 
-                        # drop_hud_vals = pd.to_numeric(df_match_chk['drop_hours_until_departure'], errors='coerce')
-                        # df_match_chk = df_match_chk.loc[drop_hud_vals.notna()].copy()
-                        # df_match_chk = df_match_chk.loc[(float(hud_base) - drop_hud_vals.loc[drop_hud_vals.notna()]) >= -24].copy()
+                        # 正例收紧 (距离起飞的小时数)
+                        drop_hud_vals = pd.to_numeric(df_match_chk['drop_hours_until_departure'], errors='coerce')
+                        df_match_chk = df_match_chk.loc[drop_hud_vals.notna()].copy()
+                        df_match_chk = df_match_chk.loc[(float(hud_base) - drop_hud_vals.loc[drop_hud_vals.notna()]) >= 0].copy()
 
-                        # 正例收紧
+                        start_hud_vals = pd.to_numeric(df_match_chk['start_hours_until_departure'], errors='coerce')
+                        df_match_chk = df_match_chk.loc[start_hud_vals.notna()].copy()
+                        df_match_chk = df_match_chk.loc[(float(hud_base) - start_hud_vals.loc[start_hud_vals.notna()]) <= 0].copy()
+
+                        # 正例收紧 (持续小时数)
                         dur_num_chk = pd.to_numeric(df_match_chk['high_price_duration_hours'], errors='coerce')
                         dur_delta = dur_num_chk - float(dur_base)
                         df_match_chk = df_match_chk.assign(dur_delta=dur_delta)
                         df_match_chk = df_match_chk.loc[df_match_chk['dur_delta'].notna()].copy()
-                        df_match_chk = df_match_chk.loc[df_match_chk['dur_delta'].abs() <= 72].copy()
+                        # df_match_chk = df_match_chk.loc[df_match_chk['dur_delta'].abs() <= 72].copy()
 
                         # 所有条件都对的上
                         if not df_match_chk.empty:
@@ -542,8 +554,11 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
                 pct_vals_1 = pd.to_numeric(df_rise_nodes_part['prev_rise_change_percent'], errors='coerce')
                 df_rise_gap_1 = df_rise_nodes_part.loc[
                     pct_vals_1.notna(),
-                    ['rise_days_to_departure', 'rise_hours_until_departure', 'rise_price_change_percent', 'rise_price_change_amount',
-                     'prev_rise_duration_hours', 'prev_rise_change_percent', 'prev_rise_change_amount', 'prev_rise_amount', 'prev_rise_cabins', 'relative_position']
+                    ['from_date',
+                     'rise_days_to_departure', 'rise_hours_until_departure', 'rise_price_change_percent', 'rise_price_change_amount',
+                     'prev_rise_duration_hours', 'prev_rise_change_percent', 'prev_rise_change_amount', 'prev_rise_amount', 'relative_position', 
+                     'prev_rise_cabins', 'start_hours_until_departure',
+                    ]
                 ].copy()
                 df_rise_gap_1['pct_gap'] = (pct_vals_1.loc[pct_vals_1.notna()] - pct_base_1)
                 df_rise_gap_1['pct_abs_gap'] = df_rise_gap_1['pct_gap'].abs()
@@ -560,7 +575,7 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
                 )
                 df_match_1 = df_rise_gap_1.loc[
                     (df_rise_gap_1['pct_abs_gap'] <= pct_threshold_1) 
-                    & (df_rise_gap_1['price_abs_gap'] <= 5.0)
+                    & (df_rise_gap_1['price_abs_gap'] <= 10.0)
                     & same_sign_mask_1
                     & (df_rise_gap_1['prev_rise_cabins'] == cabins)
                 ].copy()
@@ -569,24 +584,29 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
                 if not df_match_1.empty:
                     dur_base_1 = pd.to_numeric(price_duration_hours, errors='coerce')
                     hud_base_1 = pd.to_numeric(hours_until_departure, errors='coerce')
-                    dtd_base_1 = pd.to_numeric(days_to_departure, errors='coerce')
+                    # dtd_base_1 = pd.to_numeric(days_to_departure, errors='coerce')
 
-                    if pd.notna(dur_base_1) and pd.notna(dtd_base_1) and pd.notna(hud_base_1): 
+                    if pd.notna(dur_base_1) and pd.notna(hud_base_1): 
                         df_match_chk_1 = df_match_1.copy()
                         
                         # drop_dtd_vals_1 = pd.to_numeric(df_match_chk_1['rise_days_to_departure'], errors='coerce')
                         # df_match_chk_1 = df_match_chk_1.loc[drop_dtd_vals_1.notna()].copy()
                         # df_match_chk_1 = df_match_chk_1.loc[(drop_dtd_vals_1.loc[drop_dtd_vals_1.notna()] - float(dtd_base_1)).abs() <= 3].copy()
 
-                        # rise_hud_vals_1 = pd.to_numeric(df_match_chk_1['rise_hours_until_departure'], errors='coerce')
-                        # df_match_chk_1 = df_match_chk_1.loc[rise_hud_vals_1.notna()].copy()
-                        # df_match_chk_1 = df_match_chk_1.loc[(float(hud_base_1) - rise_hud_vals_1.loc[rise_hud_vals_1.notna()]) >= -24].copy()
+                        # 反例收紧 (距离起飞的小时数)
+                        rise_hud_vals_1 = pd.to_numeric(df_match_chk_1['rise_hours_until_departure'], errors='coerce')
+                        df_match_chk_1 = df_match_chk_1.loc[rise_hud_vals_1.notna()].copy()
+                        df_match_chk_1 = df_match_chk_1.loc[(float(hud_base_1) - rise_hud_vals_1.loc[rise_hud_vals_1.notna()]) >= 0].copy()
+
+                        start_hud_vals_1 = pd.to_numeric(df_match_chk_1['start_hours_until_departure'], errors='coerce')
+                        df_match_chk_1 = df_match_chk_1.loc[start_hud_vals_1.notna()].copy()
+                        df_match_chk_1 = df_match_chk_1.loc[(float(hud_base_1) - start_hud_vals_1.loc[start_hud_vals_1.notna()]) <= 0].copy()
 
                         # 反例收紧:48小时内发生降价的不算显著反例
-                        _rise_pct_chk = pd.to_numeric(df_match_chk_1['rise_price_change_percent'], errors='coerce')
-                        _prev_dur_chk = pd.to_numeric(df_match_chk_1['prev_rise_duration_hours'], errors='coerce')
-                        _exclude_mask = _rise_pct_chk.lt(0) & _prev_dur_chk.lt(48)
-                        df_match_chk_1 = df_match_chk_1.loc[~_exclude_mask.fillna(False)].copy()
+                        # _rise_pct_chk = pd.to_numeric(df_match_chk_1['rise_price_change_percent'], errors='coerce')
+                        # _prev_dur_chk = pd.to_numeric(df_match_chk_1['prev_rise_duration_hours'], errors='coerce')
+                        # _exclude_mask = _rise_pct_chk.lt(0) & _prev_dur_chk.lt(48)
+                        # df_match_chk_1 = df_match_chk_1.loc[~_exclude_mask.fillna(False)].copy()
 
                         # 所有条件都对的上
                         if not df_match_chk_1.empty:

+ 1 - 1
result_keep_verify.py

@@ -338,5 +338,5 @@ def verify_process_2(min_batch_time_str, max_batch_time_str):
 
 if __name__ == "__main__":
     # verify_process("202604151100", "202604161400")
-    verify_process_2("202605071300", "202605080900")
+    verify_process_2("202605121300", "202605121300")
     pass