浏览代码

修正起飞时间的补齐过程

node04 1 月之前
父节点
当前提交
506ab5a9be
共有 1 个文件被更改,包括 34 次插入7 次删除
  1. 34 7
      data_loader.py

+ 34 - 7
data_loader.py

@@ -151,18 +151,45 @@ def query_flight_range_status(db, table_name, city_pair, flight_numbers, from_da
                     df = df.drop(columns=['_id'])
                 
                 if 'from_time' in df.columns and 'from_date' in df.columns:
+                    key_cols = ['citypair', 'flight_numbers', 'from_date']
+                    group_cols = [col for col in key_cols if col in df.columns]
+
                     from_time_raw = df['from_time']
                     from_time_str = from_time_raw.fillna('').astype(str).str.strip()
-                    non_empty = from_time_str[from_time_str.ne('')]  # 找到原始 from_time 非空的记录
+                    extracted_time = from_time_str.str.extract(r'(\d{2}:\d{2}:\d{2})$')[0]
+                    valid_time_mask = from_time_str.ne('') & extracted_time.notna()
 
-                    extracted_time = non_empty.str.extract(r'(\d{2}:\d{2}:\d{2})$')[0].dropna()
-                    if not extracted_time.empty:
-                        more_time = extracted_time.value_counts().idxmax()   # 按众数分配给其它行 构造from_time
+                    if valid_time_mask.any():
                         missing_mask = from_time_raw.isna() | from_time_str.eq('')
-                        if missing_mask.any():
-                            df.loc[missing_mask, 'from_time'] = df.loc[missing_mask, 'from_date'].astype(str).str.strip() + ' ' + more_time
+                    
+                        if group_cols:
+                            mode_by_group = (
+                                df.loc[valid_time_mask, group_cols]
+                                .assign(_mode_time=extracted_time.loc[valid_time_mask].values)
+                                .groupby(group_cols, dropna=False)['_mode_time']
+                                .agg(lambda s: s.value_counts().idxmax())
+                                .reset_index()
+                            )
+                            df = df.merge(mode_by_group, on=group_cols, how='left')
+                            fill_mask = missing_mask & df['_mode_time'].notna()
+                            if fill_mask.any():
+                                df.loc[fill_mask, 'from_time'] = (
+                                    df.loc[fill_mask, 'from_date'].astype(str).str.strip() + ' ' + df.loc[fill_mask, '_mode_time']
+                                )
+                            df = df.drop(columns=['_mode_time'])
+
+                            remaining_missing_mask = df['from_time'].isna() | df['from_time'].astype(str).str.strip().eq('')
+                            if remaining_missing_mask.any():
+                                more_time = extracted_time.loc[valid_time_mask].value_counts().idxmax()
+                                df.loc[remaining_missing_mask, 'from_time'] = (
+                                    df.loc[remaining_missing_mask, 'from_date'].astype(str).str.strip() + ' ' + more_time
+                                )
+                                pass
+                        else:
+                            more_time = extracted_time.loc[valid_time_mask].value_counts().idxmax()
+                            if missing_mask.any():
+                                df.loc[missing_mask, 'from_time'] = df.loc[missing_mask, 'from_date'].astype(str).str.strip() + ' ' + more_time
                     else:
-                        # 无法得到起飞日期的抛弃
                         print(f"⚠️ 无法提取有效起飞时间,抛弃该条记录")
                         return pd.DataFrame()