data_loader.py 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254
  1. import time
  2. import random
  3. from datetime import datetime, timedelta
  4. import pymongo
  5. from pymongo.errors import PyMongoError, ServerSelectionTimeoutError
  6. import pandas as pd
  7. from uo_atlas_import import mongo_con_parse
  8. from config import mongo_config, mongo_table_uo, uo_city_pairs
  9. def query_groups_of_city_pair(db, city_pair, table_name, min_days=10, max_retries=3, base_sleep=1.0):
  10. """根据city_pair查询航线, 筛选1个月内至少有10天起飞的航线"""
  11. print(f"{city_pair} 查找所有分组")
  12. date_begin = (datetime.today() - timedelta(days=30)).strftime("%Y-%m-%d")
  13. date_end = datetime.today().strftime("%Y-%m-%d")
  14. # 聚合查询管道
  15. pipeline = [
  16. {
  17. "$match": {
  18. "citypair": city_pair,
  19. "from_date": {
  20. "$gte": date_begin,
  21. "$lte": date_end
  22. }
  23. }
  24. },
  25. {
  26. "$group": {
  27. "_id": {
  28. "flight_numbers": "$flight_numbers",
  29. "from_date": "$from_date"
  30. }
  31. }
  32. },
  33. {
  34. "$group": {
  35. "_id": "$_id.flight_numbers",
  36. "days": {"$sum": 1},
  37. "details": {"$push": "$_id.from_date"}
  38. }
  39. },
  40. {
  41. "$match": {
  42. "days": {"$gte": min_days}
  43. }
  44. },
  45. {
  46. "$addFields": {
  47. "details": {"$sortArray": {"input": "$details", "sortBy": 1}}
  48. }
  49. },
  50. {
  51. "$sort": {"_id": 1}
  52. }
  53. ]
  54. for attempt in range(1, max_retries + 1):
  55. try:
  56. print(f" 第 {attempt}/{max_retries} 次尝试查询")
  57. # 执行聚合查询
  58. collection = db[table_name]
  59. results = list(collection.aggregate(pipeline))
  60. # 格式化结果,使字段名更清晰
  61. formatted_results = [
  62. {
  63. "flight_numbers": r["_id"],
  64. "days": r["days"],
  65. "flight_dates": r["details"]
  66. }
  67. for r in results
  68. ]
  69. return formatted_results
  70. except (ServerSelectionTimeoutError, PyMongoError) as e:
  71. print(f"⚠️ Mongo 查询失败: {e}")
  72. if attempt == max_retries:
  73. print("❌ 达到最大重试次数,放弃")
  74. return []
  75. # 指数退避 + 随机抖动
  76. sleep_time = base_sleep * (2 ** (attempt - 1)) + random.random()
  77. print(f"⏳ {sleep_time:.2f}s 后重试...")
  78. time.sleep(sleep_time)
  79. def query_flight_range_status(db, table_name, city_pair, flight_numbers, from_date_begin, from_date_end,
  80. limit=0, max_retries=3, base_sleep=1.0):
  81. for attempt in range(1, max_retries + 1):
  82. try:
  83. print(f"🔁 第 {attempt}/{max_retries} 次尝试查询")
  84. # 构建查询条件
  85. projection = {
  86. # "_id": 0 # 一般会关掉
  87. "citypair": 1,
  88. "flight_numbers": 1,
  89. "from_date": 1,
  90. "from_time": 1,
  91. "create_time": 1,
  92. "baggage_weight": 1,
  93. "cabins": 1,
  94. "ticket_amount": 1,
  95. "currency": 1,
  96. "price_total": 1
  97. }
  98. pipeline = [
  99. {
  100. "$match": {
  101. "citypair": city_pair,
  102. "flight_numbers": flight_numbers,
  103. "baggage_weight": {"$in": [0, 20]},
  104. "from_date": {
  105. "$gte": from_date_begin,
  106. "$lte": from_date_end
  107. }
  108. }
  109. },
  110. {
  111. "$project": projection # 就是这里
  112. },
  113. {
  114. "$sort": {
  115. "from_date": 1,
  116. "baggage_weight": 1,
  117. "create_time": 1
  118. }
  119. }
  120. ]
  121. # print(f" 查询条件: {pipeline}")
  122. # 执行查询
  123. collection = db[table_name]
  124. results = list(collection.aggregate(pipeline))
  125. print(f"✅ 查询成功,找到 {len(results)} 条记录")
  126. if results:
  127. df = pd.DataFrame(results)
  128. if '_id' in df.columns:
  129. df = df.drop(columns=['_id'])
  130. if 'from_time' in df.columns and 'from_date' in df.columns:
  131. from_time_raw = df['from_time']
  132. from_time_str = from_time_raw.fillna('').astype(str).str.strip()
  133. non_empty = from_time_str[from_time_str.ne('')] # 找到原始 from_time 非空的记录
  134. extracted_time = non_empty.str.extract(r'(\d{2}:\d{2}:\d{2})$')[0].dropna()
  135. if not extracted_time.empty:
  136. more_time = extracted_time.value_counts().idxmax() # 按众数分配给其它行 构造from_time
  137. missing_mask = from_time_raw.isna() | from_time_str.eq('')
  138. if missing_mask.any():
  139. df.loc[missing_mask, 'from_time'] = df.loc[missing_mask, 'from_date'].astype(str).str.strip() + ' ' + more_time
  140. else:
  141. # 无法得到起飞日期的抛弃
  142. return pd.DataFrame()
  143. print(f"📊 已转换为 DataFrame,形状: {df.shape}")
  144. return df
  145. else:
  146. print("⚠️ 查询结果为空")
  147. return pd.DataFrame()
  148. except (ServerSelectionTimeoutError, PyMongoError) as e:
  149. print(f"⚠️ Mongo 查询失败: {e}")
  150. if attempt == max_retries:
  151. print("❌ 达到最大重试次数,放弃")
  152. return pd.DataFrame()
  153. # 指数退避 + 随机抖动
  154. sleep_time = base_sleep * (2 ** (attempt - 1)) + random.random()
  155. print(f"⏳ {sleep_time:.2f}s 后重试...")
  156. time.sleep(sleep_time)
def fill_hourly_create_time(df):
    """Pad records out to hourly ``create_time`` granularity.

    Placeholder — not yet implemented. Currently always returns None, so
    callers (e.g. process_flight_numbers) receive None until this is filled in.
    """
    pass
  160. def process_flight_numbers(args):
  161. process_id, db_config, city_pair, flight_numbers, from_date_begin, from_date_end, is_train, plot_flag, output_dir = args
  162. print(f"[进程{process_id}] 开始处理航班号: {flight_numbers}")
  163. # 为每个进程创建独立的数据库连接
  164. try:
  165. client, db = mongo_con_parse(db_config)
  166. print(f"[进程{process_id}] ✅ 数据库连接创建成功")
  167. except Exception as e:
  168. print(f"[进程{process_id}] ❌ 数据库连接创建失败: {e}")
  169. return pd.DataFrame()
  170. try:
  171. # 查询
  172. df_1 = query_flight_range_status(db, mongo_table_uo, city_pair, flight_numbers, from_date_begin, from_date_end)
  173. df_f1 = fill_hourly_create_time(df_1)
  174. except Exception as e:
  175. print(f"[进程{process_id}] ❌ 处理航班号:{flight_numbers} 时发生异常: {e}")
  176. return pd.DataFrame()
  177. finally:
  178. # 确保关闭数据库连接
  179. try:
  180. client.close()
  181. print(f"[进程{process_id}] ✅ 数据库连接已关闭")
  182. except:
  183. pass
  184. def load_data(db_config, city_pair, from_date_begin, from_date_end, is_train=True, plot_flag=False, output_dir='.',
  185. use_multiprocess=False, max_workers=None):
  186. print(f"开始处理航线: {city_pair}")
  187. main_client, main_db = mongo_con_parse(db_config)
  188. all_groups = query_groups_of_city_pair(main_db, city_pair, mongo_table_uo)
  189. main_client.close()
  190. all_groups_len = len(all_groups)
  191. print(f"该航线共有{all_groups_len}组航班号")
  192. print("使用单进程处理")
  193. process_id = 0
  194. for each_group in all_groups:
  195. flight_numbers = each_group.get("flight_numbers", "未知")
  196. args = (process_id, db_config, city_pair, flight_numbers, from_date_begin, from_date_end, is_train, plot_flag, output_dir)
  197. try:
  198. df_mid = process_flight_numbers(args)
  199. pass
  200. except Exception as e:
  201. print(f"❌ 航班号:{flight_numbers} 处理异常: {e}")
  202. if __name__ == "__main__":
  203. from_date_begin = "2026-03-17"
  204. from_date_end = "2026-04-01"
  205. uo_city_pair_list = [f"{pair[:3]}-{pair[3:]}" for pair in uo_city_pairs]
  206. for idx, uo_city_pair in enumerate(uo_city_pair_list, start=1):
  207. # 使用默认配置
  208. # client, db = mongo_con_parse()
  209. print(f"第 {idx} 组 :", uo_city_pair)
  210. start_time = time.time()
  211. load_data(mongo_config, uo_city_pair, from_date_begin, from_date_end)
  212. end_time = time.time()
  213. run_time = round(end_time - start_time, 3)
  214. print(f"用时: {run_time} 秒")