# data_process.py

  1. import pandas as pd
  2. import numpy as np
  3. import gc
  4. import os
  5. def preprocess_data_simple(df_input, is_train=False, hourly_time=None):
  6. print(">>> 开始数据预处理")
  7. # 城市码映射成数字(不用)
  8. # 更新日期是周几
  9. df_input['update_week'] = df_input['update_hour'].dt.dayofweek + 1
  10. # gid:基于指定字段的分组标记(整数)
  11. df_input['gid'] = (
  12. df_input
  13. .groupby(
  14. ['citypair', 'flight_numbers', 'from_date'], # 'baggage_weight' 先不进分组
  15. sort=False
  16. )
  17. .ngroup()
  18. )
  19. # 在 gid 与 baggage_weight 内按时间降序
  20. df_input = df_input.sort_values(
  21. by=['gid', 'baggage_weight', 'hours_until_departure'],
  22. ascending=[True, True, False]
  23. ).reset_index(drop=True)
  24. df_input = df_input[df_input['hours_until_departure'] <= 480]
  25. df_input = df_input[df_input['baggage_weight'] == 0] # 先保留0公斤行李的
  26. # 在hours_until_departure 的末尾 保留到当前时刻的数据
  27. if not is_train:
  28. df_input = df_input[df_input['update_hour'] <= hourly_time].copy()
  29. else:
  30. df_input = df_input.copy() # 训练集也 copy 一下保持一致性
  31. df_input = df_input.reset_index(drop=True)
  32. # 价格变化最小量阈值
  33. price_change_amount_threshold = 1
  34. df_input['_raw_price_diff'] = df_input.groupby(['gid', 'baggage_weight'], group_keys=False)['price_total'].diff()
  35. # 计算价格变化量
  36. df_input['price_change_amount'] = (
  37. df_input['_raw_price_diff']
  38. .mask(df_input['_raw_price_diff'].abs() < price_change_amount_threshold, 0)
  39. .replace(0, np.nan)
  40. .groupby([df_input['gid'], df_input['baggage_weight']], group_keys=False)
  41. .ffill()
  42. .fillna(0)
  43. .round(2)
  44. )
  45. # 计算价格变化百分比(相对于上一时间点的变化率)
  46. df_input['price_change_percent'] = (
  47. df_input.groupby(['gid', 'baggage_weight'], group_keys=False)['price_total']
  48. .pct_change()
  49. .mask(df_input['_raw_price_diff'].abs() < price_change_amount_threshold, 0)
  50. .replace(0, np.nan)
  51. .groupby([df_input['gid'], df_input['baggage_weight']], group_keys=False)
  52. .ffill()
  53. .fillna(0)
  54. .round(4)
  55. )
  56. # 第一步:标记价格变化段
  57. df_input['price_change_segment'] = (
  58. df_input.groupby(['gid', 'baggage_weight'], group_keys=False)['price_change_amount']
  59. .apply(lambda s: (s != s.shift()).cumsum())
  60. )
  61. # 第二步:计算每个变化段内的持续时间
  62. df_input['price_duration_hours'] = (
  63. df_input.groupby(['gid', 'baggage_weight', 'price_change_segment'], group_keys=False)
  64. .cumcount()
  65. .add(1)
  66. )
  67. # 可选:删除临时列
  68. df_input = df_input.drop(columns=['price_change_segment', '_raw_price_diff'])
  69. # 训练过程
  70. if is_train:
  71. df_target = df_input[(df_input['hours_until_departure'] >= 72) & (df_input['hours_until_departure'] <= 360)].copy()
  72. df_target = df_target.sort_values(
  73. by=['gid', 'baggage_weight', 'hours_until_departure'],
  74. ascending=[True, True, False]
  75. ).reset_index(drop=True)
  76. # 每条对应的前一条记录
  77. prev_pct = df_target.groupby(['gid', 'baggage_weight'], group_keys=False)['price_change_percent'].shift(1)
  78. prev_amo = df_target.groupby(['gid', 'baggage_weight'], group_keys=False)['price_change_amount'].shift(1)
  79. prev_dur = df_target.groupby(['gid', 'baggage_weight'], group_keys=False)['price_duration_hours'].shift(1)
  80. prev_price = df_target.groupby(['gid', 'baggage_weight'], group_keys=False)['price_total'].shift(1)
  81. prev_cabin = df_target.groupby(['gid', 'baggage_weight'], group_keys=False)['cabins'].shift(1)
  82. # 对于先升后降(先降再降)的分析
  83. seg_start_mask = df_target['price_duration_hours'].eq(1) # 开始变价节点
  84. drop_mask = seg_start_mask & ((prev_pct > 0) | (prev_pct < 0)) & (df_target['price_change_percent'] < 0)
  85. df_drop_nodes = df_target.loc[drop_mask, ['gid', 'baggage_weight', 'hours_until_departure', 'days_to_departure', 'update_hour', 'update_week', 'cabins']].copy()
  86. df_drop_nodes.rename(columns={'hours_until_departure': 'drop_hours_until_departure'}, inplace=True)
  87. df_drop_nodes.rename(columns={'days_to_departure': 'drop_days_to_departure'}, inplace=True)
  88. df_drop_nodes.rename(columns={'update_hour': 'drop_update_hour'}, inplace=True)
  89. df_drop_nodes.rename(columns={'update_week': 'drop_update_week'}, inplace=True)
  90. df_drop_nodes.rename(columns={'cabins': 'drop_cabins'}, inplace=True)
  91. df_drop_nodes['drop_price_change_percent'] = df_target.loc[drop_mask, 'price_change_percent'].astype(float).round(4).to_numpy()
  92. df_drop_nodes['drop_price_change_amount'] = df_target.loc[drop_mask, 'price_change_amount'].astype(float).round(2).to_numpy()
  93. df_drop_nodes['high_price_duration_hours'] = prev_dur.loc[drop_mask].astype(float).to_numpy()
  94. df_drop_nodes['high_price_change_percent'] = prev_pct.loc[drop_mask].astype(float).round(4).to_numpy()
  95. df_drop_nodes['high_price_change_amount'] = prev_amo.loc[drop_mask].astype(float).round(2).to_numpy()
  96. df_drop_nodes['high_price_amount'] = prev_price.loc[drop_mask].astype(float).round(2).to_numpy()
  97. df_drop_nodes['high_price_cabins'] = prev_cabin.loc[drop_mask].astype(str)
  98. df_drop_nodes = df_drop_nodes.reset_index(drop=True)
  99. flight_info_cols = [
  100. 'citypair', 'flight_numbers', 'from_time', 'from_date', 'currency',
  101. ]
  102. flight_info_cols = [c for c in flight_info_cols if c in df_target.columns]
  103. df_gid_info = df_target[['gid', 'baggage_weight'] + flight_info_cols].drop_duplicates(subset=['gid', 'baggage_weight']).reset_index(drop=True)
  104. df_drop_nodes = df_drop_nodes.merge(df_gid_info, on=['gid', 'baggage_weight'], how='left')
  105. drop_info_cols = [
  106. 'drop_update_hour', 'drop_update_week', 'drop_cabins',
  107. 'drop_days_to_departure', 'drop_hours_until_departure', 'drop_price_change_percent', 'drop_price_change_amount',
  108. 'high_price_duration_hours', 'high_price_change_percent', 'high_price_change_amount', 'high_price_amount', 'high_price_cabins',
  109. ]
  110. # 按顺序排列 去掉gid
  111. df_drop_nodes = df_drop_nodes[flight_info_cols + ['baggage_weight'] + drop_info_cols]
  112. # 对于先升再升(先降再升)的分析
  113. # seg_start_mask = df_target['price_duration_hours'].eq(1)
  114. rise_mask = seg_start_mask & ((prev_pct > 0) | (prev_pct < 0)) & (df_target['price_change_percent'] > 0)
  115. df_rise_nodes = df_target.loc[rise_mask, ['gid', 'baggage_weight', 'hours_until_departure', 'days_to_departure', 'update_hour', 'update_week', 'cabins']].copy()
  116. df_rise_nodes.rename(columns={'hours_until_departure': 'rise_hours_until_departure'}, inplace=True)
  117. df_rise_nodes.rename(columns={'days_to_departure': 'rise_days_to_departure'}, inplace=True)
  118. df_rise_nodes.rename(columns={'update_hour': 'rise_update_hour'}, inplace=True)
  119. df_rise_nodes.rename(columns={'update_week': 'rise_update_week'}, inplace=True)
  120. df_rise_nodes.rename(columns={'cabins': 'rise_cabins'}, inplace=True)
  121. df_rise_nodes['rise_price_change_percent'] = df_target.loc[rise_mask, 'price_change_percent'].astype(float).round(4).to_numpy()
  122. df_rise_nodes['rise_price_change_amount'] = df_target.loc[rise_mask, 'price_change_amount'].astype(float).round(2).to_numpy()
  123. df_rise_nodes['prev_rise_duration_hours'] = prev_dur.loc[rise_mask].astype(float).to_numpy()
  124. df_rise_nodes['prev_rise_change_percent'] = prev_pct.loc[rise_mask].astype(float).round(4).to_numpy()
  125. df_rise_nodes['prev_rise_change_amount'] = prev_amo.loc[rise_mask].astype(float).round(2).to_numpy()
  126. df_rise_nodes['prev_rise_amount'] = prev_price.loc[rise_mask].astype(float).round(2).to_numpy()
  127. df_rise_nodes['prev_rise_cabins'] = prev_cabin.loc[rise_mask].astype(str)
  128. df_rise_nodes = df_rise_nodes.reset_index(drop=True)
  129. df_rise_nodes = df_rise_nodes.merge(df_gid_info, on=['gid', 'baggage_weight'], how='left')
  130. rise_info_cols = [
  131. 'rise_update_hour', 'rise_update_week', 'rise_cabins',
  132. 'rise_days_to_departure', 'rise_hours_until_departure', 'rise_price_change_percent', 'rise_price_change_amount',
  133. 'prev_rise_duration_hours', 'prev_rise_change_percent', 'prev_rise_change_amount', 'prev_rise_amount', 'prev_rise_cabins'
  134. ]
  135. df_rise_nodes = df_rise_nodes[flight_info_cols + ['baggage_weight'] + rise_info_cols]
  136. # 制作历史包络线
  137. envelope_group = ['citypair', 'flight_numbers', 'from_date', 'baggage_weight']
  138. idx_peak = df_target.groupby(envelope_group)['price_total'].idxmax()
  139. df_envelope = df_target.loc[idx_peak, envelope_group + [
  140. 'from_time', 'price_total', 'hours_until_departure', 'days_to_departure', 'update_hour', 'update_week',
  141. ]].rename(columns={
  142. 'price_total': 'peak_price',
  143. 'hours_until_departure': 'peak_hours',
  144. 'days_to_departure': 'peak_days',
  145. 'update_hour': 'peak_time',
  146. 'update_week': 'peak_week',
  147. }).reset_index(drop=True)
  148. del df_gid_info
  149. del df_target
  150. return df_input, df_drop_nodes, df_rise_nodes, df_envelope
  151. return df_input, None, None, None
def predict_data_simple(df_input, city_pair, object_dir, predict_dir=".", pred_time_str=""):
    """Score the latest snapshot of each flight against mined history.

    For every (gid, baggage_weight), the most recent row inside the
    72h-360h pre-departure window is matched against the historical
    drop ('<city_pair>_drop_info.csv') and rise ('<city_pair>_rise_info.csv')
    scenario tables read from object_dir.  The per-flight verdict
    columns are appended to 'future_predictions_<pred_time_str>.csv'
    in predict_dir, and the prediction DataFrame is returned.

    Parameters:
        df_input: preprocessed snapshots as produced upstream; must
            carry gid / price-change / duration columns.
        city_pair: route key used to build the history CSV file names.
            NOTE(review): the row loop below rebinds this name, so the
            parameter value is lost after the first iteration — confirm
            that is intended.
        object_dir: directory holding the mined history CSVs.
        predict_dir: output directory for the appended prediction CSV.
        pred_time_str: timestamp suffix for the output file name.

    Returns:
        The prediction DataFrame (an empty DataFrame when df_input is
        None or empty).
    """
    if df_input is None or df_input.empty:
        return pd.DataFrame()
    # Chronological order inside each (gid, baggage_weight) series.
    df_sorted = df_input.sort_values(
        by=['gid', 'baggage_weight', 'hours_until_departure'],
        ascending=[True, True, False],
    ).reset_index(drop=True)
    df_sorted = df_sorted[
        df_sorted['hours_until_departure'].between(72, 360)
    ].reset_index(drop=True)
    # One row per (gid, baggage_weight): the smallest
    # hours_until_departure, i.e. the current hour's snapshot.
    df_min_hours = (
        df_sorted.drop_duplicates(subset=['gid', 'baggage_weight'], keep='last')
        .reset_index(drop=True)
    )
    # Load historical drop scenarios (file may be absent on first run).
    drop_info_csv_path = os.path.join(object_dir, f'{city_pair}_drop_info.csv')
    if os.path.exists(drop_info_csv_path):
        df_drop_nodes = pd.read_csv(drop_info_csv_path)
    else:
        df_drop_nodes = pd.DataFrame()
    # Load historical rise scenarios.
    rise_info_csv_path = os.path.join(object_dir, f'{city_pair}_rise_info.csv')
    if os.path.exists(rise_info_csv_path):
        df_rise_nodes = pd.read_csv(rise_info_csv_path)
    else:
        df_rise_nodes = pd.DataFrame()
    # Joint price distribution: rank current + historical prices together
    # and write the percentile back as 'relative_position'.
    df_min_hours['relative_position'] = np.nan
    if not df_drop_nodes.empty:
        df_drop_nodes['relative_position'] = np.nan
    if not df_rise_nodes.empty:
        df_rise_nodes['relative_position'] = np.nan
    parts = []
    # Current rows awaiting prediction.
    if not df_min_hours.empty and 'price_total' in df_min_hours.columns:
        cur = df_min_hours[['price_total']].copy()
        cur['price'] = pd.to_numeric(cur['price_total'], errors='coerce')
        cur['source'] = 'min'
        cur['row_id'] = cur.index
        parts.append(cur[['price', 'source', 'row_id']])
    # Historical drops: the price level right before the drop.
    if not df_drop_nodes.empty and 'high_price_amount' in df_drop_nodes.columns:
        drop = df_drop_nodes[['high_price_amount']].copy()
        drop['price'] = pd.to_numeric(drop['high_price_amount'], errors='coerce')
        drop['source'] = 'drop'
        drop['row_id'] = drop.index
        parts.append(drop[['price', 'source', 'row_id']])
    # Historical rises: the price level right before the rise.
    if not df_rise_nodes.empty and 'prev_rise_amount' in df_rise_nodes.columns:
        rise = df_rise_nodes[['prev_rise_amount']].copy()
        rise['price'] = pd.to_numeric(rise['prev_rise_amount'], errors='coerce')
        rise['source'] = 'rise'
        rise['row_id'] = rise.index
        parts.append(rise[['price', 'source', 'row_id']])
    if parts:
        all_prices = pd.concat(parts, ignore_index=True)
        all_prices = all_prices.dropna(subset=['price']).reset_index(drop=True)
        # Dense-rank prices into a 0..1 relative position.
        dense_rank = all_prices['price'].rank(method='dense')
        max_rank = dense_rank.max()
        if pd.notna(max_rank) and max_rank > 1:
            all_prices['relative_position'] = (dense_rank - 1) / (max_rank - 1)
        else:
            all_prices['relative_position'] = 1.0
        all_prices['relative_position'] = all_prices['relative_position'].round(4)
        # Write the percentile back to the three source tables.
        m = all_prices['source'] == 'min'
        df_min_hours.loc[all_prices.loc[m, 'row_id'], 'relative_position'] = all_prices.loc[m, 'relative_position'].values
        if not df_drop_nodes.empty:
            m = all_prices['source'] == 'drop'
            df_drop_nodes.loc[all_prices.loc[m, 'row_id'], 'relative_position'] = all_prices.loc[m, 'relative_position'].values
        if not df_rise_nodes.empty:
            m = all_prices['source'] == 'rise'
            df_rise_nodes.loc[all_prices.loc[m, 'row_id'], 'relative_position'] = all_prices.loc[m, 'relative_position'].values
        pass
    # =====================================================================
    # Default verdict columns; overwritten per row in the loop below.
    df_min_hours['simple_will_price_drop'] = 0
    df_min_hours['simple_drop_in_hours'] = 0
    df_min_hours['simple_drop_in_hours_prob'] = 0.0
    df_min_hours['simple_drop_in_hours_dist'] = ''  # empty string means unknown
    df_min_hours['flag_dist'] = ''
    df_min_hours['drop_price_change_upper'] = 0.0
    df_min_hours['drop_price_change_lower'] = 0.0
    df_min_hours['drop_price_sample_size'] = 0
    df_min_hours['rise_price_change_upper'] = 0.0
    df_min_hours['rise_price_change_lower'] = 0.0
    df_min_hours['rise_price_sample_size'] = 0
    # Matching tolerance on the price-change percentage — TODO confirm value.
    pct_threshold = 0.01
    pct_threshold_1 = 0.01
    for idx, row in df_min_hours.iterrows():
        city_pair = row['citypair']  # NOTE(review): shadows the function parameter
        flight_numbers = row['flight_numbers']
        baggage_weight = row['baggage_weight']
        from_date = row['from_date']
        if flight_numbers == "UO235" and from_date == "2026-04-25":  # debugging hook
            pass
        days_to_departure = row['days_to_departure']
        hours_until_departure = row['hours_until_departure']
        price_change_percent = row['price_change_percent']
        price_change_amount = row['price_change_amount']
        price_duration_hours = row['price_duration_hours']
        price_amount = row['price_total']
        length_drop = 0
        length_rise = 0
        # --- match against historical PRICE DROPS ------------------------
        if not df_drop_nodes.empty:
            # Same route, flight number and baggage allowance.
            df_drop_nodes_part = df_drop_nodes[
                (df_drop_nodes['citypair'] == city_pair) &
                (df_drop_nodes['flight_numbers'] == flight_numbers) &
                (df_drop_nodes['baggage_weight'] == baggage_weight)
            ]
            # Match on the pre-drop change percentage and price level.
            if not df_drop_nodes_part.empty and pd.notna(price_change_percent):
                pct_base = float(price_change_percent)
                pct_vals = pd.to_numeric(df_drop_nodes_part['high_price_change_percent'], errors='coerce')
                df_drop_gap = df_drop_nodes_part.loc[
                    pct_vals.notna(),
                    ['drop_days_to_departure', 'drop_hours_until_departure', 'drop_price_change_percent', 'drop_price_change_amount',
                     'high_price_duration_hours', 'high_price_change_percent', 'high_price_change_amount', 'high_price_amount', 'relative_position'
                     ]
                ].copy()
                df_drop_gap['pct_gap'] = (pct_vals.loc[pct_vals.notna()] - pct_base)
                df_drop_gap['pct_abs_gap'] = df_drop_gap['pct_gap'].abs()
                price_base = pd.to_numeric(price_amount, errors='coerce')
                high_price_vals = pd.to_numeric(df_drop_gap['high_price_amount'], errors='coerce')
                df_drop_gap['price_gap'] = high_price_vals - price_base
                df_drop_gap['price_abs_gap'] = df_drop_gap['price_gap'].abs()
                df_drop_gap = df_drop_gap.sort_values(['price_abs_gap', 'pct_abs_gap'], ascending=[True, True])
                df_match = df_drop_gap[(df_drop_gap['pct_abs_gap'] <= pct_threshold) & (df_drop_gap['price_abs_gap'] <= 3.0)].copy()
                # Historical drop scenarios that closely match the current move.
                if not df_match.empty:
                    dur_base = pd.to_numeric(price_duration_hours, errors='coerce')
                    # hud_base = pd.to_numeric(hours_until_departure, errors='coerce')
                    dtd_base = pd.to_numeric(days_to_departure, errors='coerce')
                    if pd.notna(dur_base) and pd.notna(dtd_base):
                        df_match_chk = df_match.copy()
                        drop_dtd_vals = pd.to_numeric(df_match_chk['drop_days_to_departure'], errors='coerce')
                        df_match_chk = df_match_chk.loc[drop_dtd_vals.notna()].copy()
                        df_match_chk = df_match_chk.loc[(drop_dtd_vals.loc[drop_dtd_vals.notna()] - float(dtd_base)).abs() <= 3].copy()
                        # Days-to-departure must also be close (within 3 days).
                        if not df_match_chk.empty:
                            length_drop = df_match_chk.shape[0]
                            df_min_hours.loc[idx, 'drop_price_sample_size'] = length_drop
                            drop_price_change_upper = df_match_chk['drop_price_change_amount'].max()  # largest historical drop
                            drop_price_change_lower = df_match_chk['drop_price_change_amount'].min()  # smallest historical drop
                            df_min_hours.loc[idx, 'drop_price_change_upper'] = round(drop_price_change_upper, 2)
                            df_min_hours.loc[idx, 'drop_price_change_lower'] = round(drop_price_change_lower, 2)
                            # Hours still to go before the historical drops fired,
                            # measured from the current segment duration.
                            remaining_hours = (
                                pd.to_numeric(df_match_chk['high_price_duration_hours'], errors='coerce') - float(dur_base)
                            ).clip(lower=0)
                            remaining_hours = remaining_hours.round().astype(int)
                            counts = remaining_hours.value_counts().sort_index()
                            probs = (counts / counts.sum()).round(4)
                            top_hours = int(probs.idxmax())
                            # NOTE(review): top_prob is computed but never used;
                            # the prob stored below is hard-coded to 1 — confirm intent.
                            top_prob = float(probs.max())
                            dist_items = list(zip(probs.index.tolist(), probs.tolist()))
                            dist_items = dist_items[:10]
                            dist_str = ' '.join([f"{int(h)}h->{float(p)}" for h, p in dist_items])
                            df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
                            df_min_hours.loc[idx, 'simple_drop_in_hours'] = top_hours
                            df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 1
                            df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = dist_str
                            df_min_hours.loc[idx, 'flag_dist'] = 'd0'
                            pass
            pass
        # --- match against historical PRICE RISES ------------------------
        if not df_rise_nodes.empty:
            # Same route, flight number and baggage allowance.
            df_rise_nodes_part = df_rise_nodes[
                (df_rise_nodes['citypair'] == city_pair) &
                (df_rise_nodes['flight_numbers'] == flight_numbers) &
                (df_rise_nodes['baggage_weight'] == baggage_weight)
            ]
            # Match on the pre-rise change percentage and price level.
            if not df_rise_nodes_part.empty and pd.notna(price_change_percent):
                pct_base_1 = float(price_change_percent)
                pct_vals_1 = pd.to_numeric(df_rise_nodes_part['prev_rise_change_percent'], errors='coerce')
                df_rise_gap_1 = df_rise_nodes_part.loc[
                    pct_vals_1.notna(),
                    ['rise_days_to_departure', 'rise_hours_until_departure', 'rise_price_change_percent', 'rise_price_change_amount',
                     'prev_rise_duration_hours', 'prev_rise_change_percent', 'prev_rise_change_amount', 'prev_rise_amount', 'relative_position']
                ].copy()
                df_rise_gap_1['pct_gap'] = (pct_vals_1.loc[pct_vals_1.notna()] - pct_base_1)
                df_rise_gap_1['pct_abs_gap'] = df_rise_gap_1['pct_gap'].abs()
                price_base_1 = pd.to_numeric(price_amount, errors='coerce')
                rise_price_vals_1 = pd.to_numeric(df_rise_gap_1['prev_rise_amount'], errors='coerce')
                df_rise_gap_1['price_gap'] = rise_price_vals_1 - price_base_1
                df_rise_gap_1['price_abs_gap'] = df_rise_gap_1['price_gap'].abs()
                df_rise_gap_1 = df_rise_gap_1.sort_values(['price_abs_gap', 'pct_abs_gap'], ascending=[True, True])
                df_match_1 = df_rise_gap_1.loc[(df_rise_gap_1['pct_abs_gap'] <= pct_threshold_1) & (df_rise_gap_1['price_abs_gap'] <= 3.0)].copy()
                # Historical rise scenarios that closely match the current move.
                if not df_match_1.empty:
                    dur_base_1 = pd.to_numeric(price_duration_hours, errors='coerce')
                    # hud_base_1 = pd.to_numeric(hours_until_departure, errors='coerce')
                    dtd_base_1 = pd.to_numeric(days_to_departure, errors='coerce')
                    if pd.notna(dur_base_1) and pd.notna(dtd_base_1):
                        df_match_chk_1 = df_match_1.copy()
                        drop_dtd_vals_1 = pd.to_numeric(df_match_chk_1['rise_days_to_departure'], errors='coerce')
                        df_match_chk_1 = df_match_chk_1.loc[drop_dtd_vals_1.notna()].copy()
                        df_match_chk_1 = df_match_chk_1.loc[(drop_dtd_vals_1.loc[drop_dtd_vals_1.notna()] - float(dtd_base_1)).abs() <= 3].copy()
                        # Days-to-departure must also be close (within 3 days).
                        if not df_match_chk_1.empty:
                            length_rise = df_match_chk_1.shape[0]
                            df_min_hours.loc[idx, 'rise_price_sample_size'] = length_rise
                            rise_price_change_upper = df_match_chk_1['rise_price_change_amount'].max()  # largest historical rise
                            rise_price_change_lower = df_match_chk_1['rise_price_change_amount'].min()  # smallest historical rise
                            df_min_hours.loc[idx, 'rise_price_change_upper'] = round(rise_price_change_upper, 2)
                            df_min_hours.loc[idx, 'rise_price_change_lower'] = round(rise_price_change_lower, 2)
                            # Only rise evidence found: confidently predict no drop.
                            if length_drop == 0:
                                df_min_hours.loc[idx, 'simple_will_price_drop'] = 0
                                df_min_hours.loc[idx, 'simple_drop_in_hours'] = 0
                                df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = 0.0
                                # df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'r0'
                                df_min_hours.loc[idx, 'flag_dist'] = 'r0'
                            # Conflicting evidence: arbitrate by sample proportion.
                            else:
                                drop_prob = round(length_drop / (length_rise + length_drop), 2)
                                # Keep the drop verdict but adjust its probability.
                                if drop_prob >= 0.7:
                                    df_min_hours.loc[idx, 'simple_will_price_drop'] = 1
                                    # df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'd1'
                                    df_min_hours.loc[idx, 'flag_dist'] = 'd1'
                                # Overturn to "no drop" and adjust the probability.
                                else:
                                    df_min_hours.loc[idx, 'simple_will_price_drop'] = 0
                                    # df_min_hours.loc[idx, 'simple_drop_in_hours_dist'] = 'r1'
                                    df_min_hours.loc[idx, 'flag_dist'] = 'r1'
                                df_min_hours.loc[idx, 'simple_drop_in_hours_prob'] = drop_prob
    print("判定循环结束")
    # Validity window: [departure - 360h, departure - 72h], floored to the hour.
    _dep_hour = pd.to_datetime(df_min_hours["from_time"], errors="coerce").dt.floor("h")
    df_min_hours["valid_begin_hour"] = (_dep_hour - pd.to_timedelta(360, unit="h")).dt.strftime("%Y-%m-%d %H:%M:%S")
    df_min_hours["valid_end_hour"] = (_dep_hour - pd.to_timedelta(72, unit="h")).dt.strftime("%Y-%m-%d %H:%M:%S")
    # Columns exposed in the prediction table, in display order.
    order_cols = [
        "citypair", "flight_numbers", "baggage_weight", "from_date", "from_time",
        "cabins", "ticket_amount", "currency", "price_base", "price_tax",
        "price_total", 'relative_position', 'days_to_departure', 'hours_until_departure',
        'price_change_amount', 'price_change_percent', 'price_duration_hours',
        "update_hour", "create_time",
        'valid_begin_hour', 'valid_end_hour',
        'simple_will_price_drop', 'simple_drop_in_hours', 'simple_drop_in_hours_prob', 'simple_drop_in_hours_dist',
        'flag_dist',
        'drop_price_change_upper', 'drop_price_change_lower', 'drop_price_sample_size',
        'rise_price_change_upper', 'rise_price_change_lower', 'rise_price_sample_size',
    ]
    df_predict = df_min_hours[order_cols]
    # Drop the 'simple_' prefix for the published column names.
    df_predict = df_predict.rename(columns={
        'simple_will_price_drop': 'will_price_drop',
        'simple_drop_in_hours': 'drop_in_hours',
        'simple_drop_in_hours_prob': 'drop_in_hours_prob',
        'simple_drop_in_hours_dist': 'drop_in_hours_dist',
    }
    )
    # Stable sort for a deterministic output order.
    df_predict = df_predict.sort_values(
        by=['citypair', 'flight_numbers', 'baggage_weight', 'from_date'],
        kind='mergesort',
        na_position='last',
    ).reset_index(drop=True)
    total_cnt = len(df_predict)
    if "will_price_drop" in df_predict.columns:
        _wpd = pd.to_numeric(df_predict["will_price_drop"], errors="coerce")
        drop_1_cnt = int((_wpd == 1).sum())
        drop_0_cnt = int((_wpd == 0).sum())
    else:
        drop_1_cnt = 0
        drop_0_cnt = 0
    print(f"will_price_drop 分类数量统计: 1(会降)={drop_1_cnt}, 0(不降)={drop_0_cnt}, 总数={total_cnt}")
    # Append to the rolling predictions CSV (header only on first write).
    csv_path1 = os.path.join(predict_dir, f'future_predictions_{pred_time_str}.csv')
    df_predict.to_csv(csv_path1, mode='a', index=False, header=not os.path.exists(csv_path1), encoding='utf-8-sig')
    print("预测结果已追加")
    return df_predict