Procházet zdrojové kódy

修改预测判定，使得UO的正向样本里近似24小时内降价

node04 před 2 týdny
rodič
revize
75c4ffae7a
Změněn 1 soubor: 24 přidání a 16 odebrání
  1. 24 16
      data_process.py

+ 24 - 16
data_process.py

@@ -278,7 +278,7 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
     # =====================================================================
 
     df_min_hours['simple_will_price_drop'] = 0
-    df_min_hours['simple_drop_in_hours'] = 0
+    # df_min_hours['simple_drop_in_hours'] = 0
     df_min_hours['simple_drop_in_hours_prob'] = 0.0
     df_min_hours['simple_drop_in_hours_dist'] = ''   # 空串 表示未知
     df_min_hours['flag_dist'] = ''
@@ -357,6 +357,12 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
                         df_match_chk = df_match_chk.loc[drop_hud_vals.notna()].copy()
                         df_match_chk = df_match_chk.loc[(float(hud_base) - drop_hud_vals.loc[drop_hud_vals.notna()]) >= -24].copy()
 
+                        dur_num_chk = pd.to_numeric(df_match_chk['high_price_duration_hours'], errors='coerce')
+                        dur_delta = dur_num_chk - float(dur_base)
+                        df_match_chk = df_match_chk.assign(dur_delta=dur_delta)
+                        df_match_chk = df_match_chk.loc[df_match_chk['dur_delta'].notna()].copy()
+                        df_match_chk = df_match_chk.loc[df_match_chk['dur_delta'].abs() <= 48].copy()
+
                         # 距离起飞天数也对的上
                         if not df_match_chk.empty:
                             length_drop = df_match_chk.shape[0]
@@ -367,23 +373,26 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
                             df_min_hours.loc[idx, 'drop_price_change_upper'] = round(drop_price_change_upper, 2)
                             df_min_hours.loc[idx, 'drop_price_change_lower'] = round(drop_price_change_lower, 2)
 
-                            remaining_hours = (
-                                pd.to_numeric(df_match_chk['high_price_duration_hours'], errors='coerce') - float(dur_base)
-                            ).clip(lower=0)
-                            remaining_hours = remaining_hours.round().astype(int)
+                            # remaining_hours = (
+                            #     pd.to_numeric(df_match_chk['high_price_duration_hours'], errors='coerce') - float(dur_base)
+                            # ).clip(lower=0)
+                            # remaining_hours = remaining_hours.round().astype(int)
+
+                            # counts = remaining_hours.value_counts().sort_index()
+                            # probs = (counts / counts.sum()).round(4)
 
-                            counts = remaining_hours.value_counts().sort_index()
-                            probs = (counts / counts.sum()).round(4)
+                            # top_hours = int(probs.idxmax())
+                            # top_prob = float(probs.max())
 
-                            top_hours = int(probs.idxmax())
-                            top_prob = float(probs.max())
+                            # dist_items = list(zip(probs.index.tolist(), probs.tolist()))
+                            # dist_items = dist_items[:10]
+                            # dist_str = ' '.join([f"{int(h)}h->{float(p)}" for h, p in dist_items])
 
-                            dist_items = list(zip(probs.index.tolist(), probs.tolist()))
-                            dist_items = dist_items[:10]
-                            dist_str = ' '.join([f"{int(h)}h->{float(p)}" for h, p in dist_items])
+                            dur_delta_list = df_match_chk['dur_delta'].tolist()
+                            dist_str = "'" + ' '.join([f"{ddl:g}" for ddl in dur_delta_list])
 
                             df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
-                            df_min_hours.loc[idx, 'simple_drop_in_hours'] = top_hours
+                            # df_min_hours.loc[idx, 'simple_drop_in_hours'] = top_hours
                             df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 1
                             df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = dist_str
                             df_min_hours.loc[idx, 'flag_dist'] = 'd0'
@@ -448,7 +457,7 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
                             # 可以明确的判定不降价
                             if length_drop == 0:
                                 df_min_hours.loc[idx, 'simple_will_price_drop'] = 0
-                                df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
+                                # df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
                                 df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.0
                                 # df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'r0'
                                 df_min_hours.loc[idx, 'flag_dist'] = 'r0'
@@ -482,7 +491,7 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
         'price_change_amount', 'price_change_percent', 'price_duration_hours', 
         "update_hour", "create_time",
         'valid_begin_hour', 'valid_end_hour',
-        'simple_will_price_drop', 'simple_drop_in_hours', 'simple_drop_in_hours_prob', 'simple_drop_in_hours_dist',
+        'simple_will_price_drop', 'simple_drop_in_hours_prob', 'simple_drop_in_hours_dist',
         'flag_dist',
         'drop_price_change_upper', 'drop_price_change_lower', 'drop_price_sample_size',
         'rise_price_change_upper', 'rise_price_change_lower', 'rise_price_sample_size',
@@ -490,7 +499,6 @@ def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_t
     df_predict = df_min_hours[order_cols]
     df_predict = df_predict.rename(columns={
             'simple_will_price_drop': 'will_price_drop',
-            'simple_drop_in_hours': 'drop_in_hours',
             'simple_drop_in_hours_prob': 'drop_in_hours_prob',
             'simple_drop_in_hours_dist': 'drop_in_hours_dist',
         }