Browse Source

增加训练特征, 调整航线

node04 2 days ago
parent
commit
39bd9136e4
3 changed files with 27 additions and 13 deletions
  1. 4 4
      config.py
  2. 20 6
      data_process.py
  3. 3 3
      uo_atlas_import.py

+ 4 - 4
config.py

@@ -51,8 +51,8 @@ uo_city_pairs_old = [
 # UO近期有数据的城市对
 uo_city_pairs_new =[
     "BJSHKG", "BKIHKG", "BKKHKG", "BKKNGB", "CJUHKG",
-    "CJUNGB", "CNXHKG", "CRKHKG", "CZXHAN", "CZXHKG",
-    "DADHKG", "FUKHKG", "HANHKG", "HANMNL", "HIJHKG",
+    "CNXHKG", "CRKHKG", "CZXHKG",
+    "DADHKG", "FUKHKG", "HANHKG", "HIJHKG",
     "HKGBJS", "HKGBKI", "HKGBKK", "HKGCJU", "HKGCNX",
     "HKGCRK", "HKGCZX", "HKGDAD", "HKGFUK", "HKGHAN",
     "HKGHIJ", "HKGHKT", "HKGKHH", "HKGKUL", "HKGKWE",
@@ -60,8 +60,8 @@ uo_city_pairs_new =[
     "HKGPEN", "HKGPQC", "HKGPUS", "HKGRMQ", "HKGSDJ",
     "HKGSEL", "HKGSYX", "HKGTAE", "HKGTAK", "HKGTPE",
     "HKGTYO", "HKTHKG", "KHHHKG", "KULHKG", "KWEHKG",
-    "KWEPEN", "MNLHKG", "NGBBKK", "NGBHKG", "NGOHKG",
+    "MNLHKG", "NGBBKK", "NGBHKG", "NGOHKG",
     "OKAHKG", "OSAHKG", "PENHKG", "PQCHKG", "PUSHKG",
     "RMQHKG", "SDJHKG", "SELHKG", "SYXHKG", "TAEHKG",
-    "TAKHKG", "TPEBKK", "TPEHKG", "TYOHKG", "YIWHKG",
+    "TAKHKG", "TPEHKG", "TYOHKG", "YIWHKG",
 ]

+ 20 - 6
data_process.py

@@ -7,7 +7,10 @@ import os
 def preprocess_data_simple(df_input, is_train=False):
 
     print(">>> 开始数据预处理")
-    # 城市码映射成数字    
+    # 城市码映射成数字(不用)
+
+    # 更新日期是周几
+    df_input['update_week'] = df_input['update_hour'].dt.dayofweek + 1
     
     # gid:基于指定字段的分组标记(整数)
     df_input['gid'] = (
@@ -93,8 +96,11 @@ def preprocess_data_simple(df_input, is_train=False):
         prev_price = df_target.groupby(['gid', 'baggage_weight'], group_keys=False)['price_total'].shift(1)
         drop_mask = (prev_pct > 0) & (df_target['price_change_percent'] < 0)
 
-        df_drop_nodes = df_target.loc[drop_mask, ['gid', 'baggage_weight', 'hours_until_departure']].copy()
+        df_drop_nodes = df_target.loc[drop_mask, ['gid', 'baggage_weight', 'hours_until_departure', 'days_to_departure', 'update_hour', 'update_week']].copy()
         df_drop_nodes.rename(columns={'hours_until_departure': 'drop_hours_until_departure'}, inplace=True)
+        df_drop_nodes.rename(columns={'days_to_departure': 'drop_days_to_departure'}, inplace=True)
+        df_drop_nodes.rename(columns={'update_hour': 'drop_update_hour'}, inplace=True)
+        df_drop_nodes.rename(columns={'update_week': 'drop_update_week'}, inplace=True)
         df_drop_nodes['drop_price_change_percent'] = df_target.loc[drop_mask, 'price_change_percent'].astype(float).round(4).to_numpy()
         df_drop_nodes['drop_price_change_amount'] = df_target.loc[drop_mask, 'price_change_amount'].astype(float).round(2).to_numpy()
         df_drop_nodes['high_price_duration_hours'] = prev_dur.loc[drop_mask].astype(float).to_numpy()
@@ -111,7 +117,8 @@ def preprocess_data_simple(df_input, is_train=False):
         df_drop_nodes = df_drop_nodes.merge(df_gid_info, on=['gid', 'baggage_weight'], how='left')
 
         drop_info_cols = [
-            'drop_hours_until_departure', 'drop_price_change_percent', 'drop_price_change_amount',
+            'drop_update_hour', 'drop_update_week',
+            'drop_days_to_departure', 'drop_hours_until_departure', 'drop_price_change_percent', 'drop_price_change_amount',
             'high_price_duration_hours', 'high_price_change_percent', 'high_price_change_amount', 'high_price_amount', 
         ]
         # 按顺序排列 去掉gid
@@ -121,8 +128,11 @@ def preprocess_data_simple(df_input, is_train=False):
         seg_start_mask = df_target['price_duration_hours'].eq(1)
         rise_mask = seg_start_mask & (prev_pct > 0) & (df_target['price_change_percent'] > 0)
 
-        df_rise_nodes = df_target.loc[rise_mask, ['gid', 'baggage_weight', 'hours_until_departure']].copy()
+        df_rise_nodes = df_target.loc[rise_mask, ['gid', 'baggage_weight', 'hours_until_departure', 'days_to_departure', 'update_hour', 'update_week']].copy()
         df_rise_nodes.rename(columns={'hours_until_departure': 'rise_hours_until_departure'}, inplace=True)
+        df_rise_nodes.rename(columns={'days_to_departure': 'rise_days_to_departure'}, inplace=True)
+        df_rise_nodes.rename(columns={'update_hour': 'rise_update_hour'}, inplace=True)
+        df_rise_nodes.rename(columns={'update_week': 'rise_update_week'}, inplace=True)
         df_rise_nodes['rise_price_change_percent'] = df_target.loc[rise_mask, 'price_change_percent'].astype(float).round(4).to_numpy()
         df_rise_nodes['rise_price_change_amount'] = df_target.loc[rise_mask, 'price_change_amount'].astype(float).round(2).to_numpy()
         df_rise_nodes['prev_rise_duration_hours'] = prev_dur.loc[rise_mask].astype(float).to_numpy()
@@ -134,7 +144,8 @@ def preprocess_data_simple(df_input, is_train=False):
         df_rise_nodes = df_rise_nodes.merge(df_gid_info, on=['gid', 'baggage_weight'], how='left')
         
         rise_info_cols = [
-            'rise_hours_until_departure', 'rise_price_change_percent', 'rise_price_change_amount',
+            'rise_update_hour', 'rise_update_week',
+            'rise_days_to_departure', 'rise_hours_until_departure', 'rise_price_change_percent', 'rise_price_change_amount',
             'prev_rise_duration_hours', 'prev_rise_change_percent', 'prev_rise_change_amount', 'prev_rise_amount',
         ]
         df_rise_nodes = df_rise_nodes[flight_info_cols + ['baggage_weight'] + rise_info_cols]
@@ -143,10 +154,13 @@ def preprocess_data_simple(df_input, is_train=False):
         envelope_group = ['citypair', 'flight_numbers', 'from_date', 'baggage_weight']
         idx_peak = df_input.groupby(envelope_group)['price_total'].idxmax()
         df_envelope = df_input.loc[idx_peak, envelope_group + [
-            'price_total', 'hours_until_departure'
+            'from_time', 'price_total', 'hours_until_departure', 'days_to_departure', 'update_hour', 'update_week',
         ]].rename(columns={
             'price_total': 'peak_price',
             'hours_until_departure': 'peak_hours',
+            'days_to_departure': 'peak_days',
+            'update_hour': 'peak_time',
+            'update_week': 'peak_week',
         }).reset_index(drop=True)
         
         del df_gid_info

+ 3 - 3
uo_atlas_import.py

@@ -215,7 +215,7 @@ def main_import_process(create_at_begin, create_at_end):
     print(f"create_at_begin: {create_at_begin}, timestamp: {create_at_begin_stamp}")
     print(f"create_at_end: {create_at_end}, timestamp: {create_at_end_stamp}")
 
-    uo_city_pairs = uo_city_pairs_old.copy()
+    uo_city_pairs = uo_city_pairs_new.copy()
 
     for idx, city_pair in enumerate(uo_city_pairs):
         atlas_client, atlas_db = mongo_con_parse(atlas_config)
@@ -230,8 +230,8 @@ def main_import_process(create_at_begin, create_at_end):
     print()
 
 if __name__ == "__main__":
-    create_at_begin = "2026-03-21 00:00:00"
-    create_at_end = "2026-03-24 23:59:59"
+    create_at_begin = "2026-03-25 00:00:00"
+    create_at_end = "2026-03-25 23:59:59"
     main_import_process(create_at_begin, create_at_end)
     
     # try: