|
|
@@ -0,0 +1,115 @@
|
|
|
+import os
|
|
|
+import datetime
|
|
|
+import pandas as pd
|
|
|
+from data_loader import mongo_con_parse, validate_one_line, fill_hourly_crawl_date
|
|
|
+
|
|
|
+
|
|
|
def _summarize_price_changes(df_observed, drop_threshold=-10):
    """Summarise the price trajectory contained in ``df_observed``.

    Args:
        df_observed: DataFrame (or ``None``) that provides the columns
            ``adult_total_price``, ``hours_until_departure`` and
            ``update_hour``. Rows are used in their existing order; the
            last row is treated as the most recent observation.
        drop_threshold: a price change counts as a "drop" when it is
            strictly below this value (default ``-10``).

    Returns:
        dict with the keys ``drop_flag``, ``first_drop_amount``,
        ``first_drop_hours``, ``last_hours_util``, ``last_update_hour``,
        ``list_change_price`` and ``list_change_hours``. When there are no
        rows, ``drop_flag`` is 0, the scalar fields are ``pd.NA`` and both
        lists are empty.
    """
    summary = {
        'drop_flag': 0,
        'first_drop_amount': pd.NA,
        'first_drop_hours': pd.NA,
        'last_hours_util': pd.NA,
        'last_update_hour': pd.NA,
        'list_change_price': [],
        'list_change_hours': [],
    }
    if df_observed is None or df_observed.empty:
        return summary

    # Last row of the valid data.
    last_row = df_observed.iloc[-1]
    summary['last_hours_util'] = last_row['hours_until_departure']
    summary['last_update_hour'] = last_row['update_hour']

    # Keep only the rows where the price differs from the previous row.
    # The very first row is always kept because shift() yields NaN there.
    prices = df_observed["adult_total_price"]
    df_price_changes = df_observed.loc[prices.shift() != prices].copy()

    # Size of each change relative to the previous distinct price
    # (0 for the first row, which has no predecessor).
    df_price_changes['change_amount'] = (
        df_price_changes['adult_total_price'].diff().fillna(0)
    )

    # First change that falls below the drop threshold, if any.
    first_drop = df_price_changes[
        df_price_changes['change_amount'] < drop_threshold
    ].head(1)
    if not first_drop.empty:
        summary['drop_flag'] = 1
        summary['first_drop_amount'] = first_drop['change_amount'].iloc[0].round(2)
        summary['first_drop_hours'] = first_drop['hours_until_departure'].iloc[0]

    summary['list_change_price'] = df_price_changes['adult_total_price'].tolist()
    summary['list_change_hours'] = df_price_changes['hours_until_departure'].tolist()
    return summary


def validate_process(node, date):
    """Enrich every prediction row with the price changes actually observed.

    Reads ``./data_shards/future_predictions.csv``. For each row it calls
    ``validate_one_line`` and ``fill_hourly_crawl_date`` (both imported from
    ``data_loader``), keeps only rows with ``is_filled == 0``, summarises the
    price trajectory and stores the result in new columns of the prediction
    table. The enriched table is written to
    ``./validate/{node}_{date}/result_validate_{node}_{date}_{timestamp}.csv``.

    If the predictions file cannot be read or is empty, a message is printed
    and the function returns before any connection is opened.

    Args:
        node: label used only to build the output directory and file name.
        date: label used only to build the output directory and file name.

    Returns:
        None.
    """
    output_dir = f"./validate/{node}_{date}"
    os.makedirs(output_dir, exist_ok=True)

    object_dir = "./data_shards"
    csv_file = 'future_predictions.csv'
    csv_path = os.path.join(object_dir, csv_file)

    # Best effort: an unreadable predictions file is treated as "no data".
    try:
        df_predict = pd.read_csv(csv_path)
    except Exception as e:
        print(f"read {csv_path} error: {e}")
        df_predict = pd.DataFrame()

    if df_predict.empty:
        print("预测数据为空")
        return

    # NOTE(review): presumably a MongoDB client/database pair - confirm in
    # data_loader. Only client.close() is relied upon here.
    client, db = mongo_con_parse()
    safe_sep = "; "

    # try/finally so the client is closed even when a row fails midway.
    try:
        count = 0
        for idx, row in df_predict.iterrows():
            df_val = validate_one_line(
                db,
                row['city_pair'],
                row['flight_day'],
                row['flight_number_1'],
                row['flight_number_2'],
                row['baggage'],
                row['valid_begin_hour'],
            )

            if df_val.empty:
                summary = _summarize_price_changes(None)
            else:
                df_val_f = fill_hourly_crawl_date(df_val, rear_fill=2)
                # Keep the original rows only, not the filled-in ones.
                df_val_f = df_val_f[df_val_f['is_filled'] == 0]
                summary = _summarize_price_changes(df_val_f)

            # Column creation order is kept as-is so the CSV layout is stable.
            df_predict.at[idx, 'change_prices'] = safe_sep.join(
                map(str, summary['list_change_price'])
            )
            df_predict.at[idx, 'change_hours'] = safe_sep.join(
                map(str, summary['list_change_hours'])
            )
            df_predict.at[idx, 'last_hours_util'] = summary['last_hours_util']
            df_predict.at[idx, 'last_update_hour'] = summary['last_update_hour']
            # Store the drop as a positive number; pd.NA propagates unchanged.
            df_predict.at[idx, 'first_drop_amount'] = summary['first_drop_amount'] * -1
            df_predict.at[idx, 'first_drop_hours'] = summary['first_drop_hours']
            df_predict.at[idx, 'drop_flag'] = summary['drop_flag']

            count += 1
            if count % 5 == 0:
                print(f"cal count: {count}")

        print("计算结束")
    finally:
        client.close()

    timestamp_str = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    save_csv = f"result_validate_{node}_{date}_{timestamp_str}.csv"

    output_path = os.path.join(output_dir, save_csv)
    df_predict.to_csv(output_path, index=False, encoding="utf-8-sig")
    print(f"保存完成: {output_path}")
|
|
|
+
|
|
|
+
|
|
|
if __name__ == "__main__":
    # Script entry point: run the validation for one fixed node/date pair.
    validate_process(node="node0105", date="0107")
|