# request_pyhttpx.py (10 KB)


import json
import threading
import time
from datetime import datetime, timedelta
from queue import Queue

import execjs
import pyhttpx
import requests
import retrying
from loguru import logger
from lxml import etree
# import pandas as pd
  13. class GK:
  14. def __init__(self):
  15. self.search_flights_api = "https://booking.jetstar.com/hk/zh/booking/search-flights"
  16. self.headers = {
  17. 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
  18. # 'user-agent': get_random_user_agent(),
  19. 'Accept-Encoding': 'gzip, deflate, br',
  20. 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
  21. 'Connection': 'keep-alive',
  22. 'Content-Type': 'application/x-www-form-urlencoded',
  23. 'accept-language': 'zh-CN,zh;q=0.9',
  24. 'cache-control': 'no-cache',
  25. 'pragma': 'no-cache',
  26. 'priority': 'u=0, i',
  27. 'referer': 'https://booking.jetstar.com/',
  28. 'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
  29. 'sec-ch-ua-mobile': '?0',
  30. 'sec-ch-ua-platform': '"Windows"',
  31. 'sec-fetch-dest': 'document',
  32. 'sec-fetch-mode': 'navigate',
  33. 'sec-fetch-site': 'same-origin',
  34. 'sec-fetch-user': '?1',
  35. 'upgrade-insecure-requests': '1'
  36. }
  37. with open('../akm/逆向1.js', encoding='utf-8') as f:
  38. js = f.read()
  39. self.ctx = execjs.compile(js)
  40. # self.session = pyhttpx
  41. # self.ip = 100000000000
  42. # self.proxies = {
  43. # 'http': f'http://B_3351_HK___5_ss-{self.ip}:ev2pjj@proxy.renlaer.com:7778',
  44. # 'https': f'http://B_3351_HK___5_ss-{self.ip}:ev2pjj@proxy.renlaer.com:7778'
  45. # }
  46. self.lock = threading.Lock()
  47. # self.cookies_queue = CookieQueue()
  48. self.cookies_queue = Queue()
  49. self.ja3_queue = Queue()
  50. self.task_queue = Queue()
  51. self.resp_data_queue = Queue()
  52. @retrying.retry(stop_max_attempt_number=3, wait_fixed=4000)
  53. def get_ja3_str(self):
  54. url = "http://8.218.51.130:9003/api/v1/ja3"
  55. payload = {}
  56. headers = {
  57. 'cid': '750B5141EDBF7FA6F73A99C768130099'
  58. }
  59. while True:
  60. if self.ja3_queue.qsize() < 5:
  61. response = requests.get(url, headers=headers, data=payload)
  62. if response.status_code == 200:
  63. res_json = response.json()
  64. if res_json.get("code") == 0:
  65. ja3 = res_json.get("data").get("ja3_str")
  66. ua = res_json.get("data").get("ua")
  67. if "--" not in ja3 and ",," not in ja3:
  68. end_data = (
  69. ua,
  70. ja3
  71. )
  72. self.ja3_queue.put(end_data)
  73. time.sleep(3)
  74. @retrying.retry(stop_max_attempt_number=3, wait_fixed=4000)
  75. def request_new_cookie(self):
  76. logger.debug('正在获取 cookie bm-sz...')
  77. ua, ja3 = self.ja3_queue.get()
  78. # print(ua, ja3)
  79. sess = pyhttpx.HttpSession(
  80. ja3=ja3, # 自定义 JA3 字符串
  81. http2=True, # 启用 HTTP/2
  82. )
  83. # akm js file url
  84. akm_url = "https://booking.jetstar.com/MkuYlo/pcp/LD0/PPluEQ/1ik7QcJffXbmL53i/QTcvXmg7/KS5kC3N/VRQcB"
  85. data = {
  86. 'sensor_data': self.ctx.call('encrypt1')
  87. }
  88. response = sess.post(akm_url, headers=self.headers, verify=False, data=data,
  89. # proxies=self.proxies
  90. )
  91. # print(response.text)
  92. bmsz = response.cookies['bm_sz']
  93. data2 = {
  94. "sensor_data": self.ctx.call('encrypt2', bmsz)
  95. }
  96. data2 = json.dumps(data2)
  97. response2 = sess.post(akm_url, headers=self.headers, data=data2, verify=False)
  98. # print(response2.text)
  99. # print(response2.status_code)
  100. # print(response2.cookies.get_dict())
  101. # with self.lock:
  102. logger.debug(f'成功获取到 cookie :{bmsz}')
  103. return response.cookies
  104. # return bmsz
  105. def _refresh_cookie(self):
  106. while True:
  107. if self.cookies_queue.qsize() < 2:
  108. cookie = self.request_new_cookie()
  109. self.cookies_queue.put(cookie)
  110. time.sleep(3)
  111. def gen_task(self, start_date, end_date):
  112. """将每个城市对与日期组合生成独立任务, 上传到任务队列"""
  113. # 获取采集城市对
  114. # for city_code in self.gen_city():
  115. # 获取采集时间
  116. for datetime_str in self.gen_datetime(start_date, end_date):
  117. # self.task_queue.put((city_code, datetime_str))
  118. self.task_queue.put((datetime_str))
  119. @retrying.retry(stop_max_attempt_number=2)
  120. def send_get(self, url, params, bmsz_cookie):
  121. ua, ja3_str = self.ja3_queue.get()
  122. sess = pyhttpx.HttpSession(
  123. ja3=ja3_str, # 自定义 JA3 字符串
  124. http2=True, # 启用 HTTP/2
  125. )
  126. headers = {
  127. # 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
  128. 'user-agent': ua,
  129. 'Accept-Encoding': 'gzip, deflate, br',
  130. 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
  131. 'Connection': 'keep-alive',
  132. 'Content-Type': 'application/x-www-form-urlencoded',
  133. 'accept-language': 'zh-CN,zh;q=0.9',
  134. 'cache-control': 'no-cache',
  135. 'pragma': 'no-cache',
  136. 'priority': 'u=0, i',
  137. 'referer': 'https://booking.jetstar.com/',
  138. 'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
  139. 'sec-ch-ua-mobile': '?0',
  140. 'sec-ch-ua-platform': '"Windows"',
  141. 'sec-fetch-dest': 'document',
  142. 'sec-fetch-mode': 'navigate',
  143. 'sec-fetch-site': 'same-origin',
  144. 'sec-fetch-user': '?1',
  145. 'upgrade-insecure-requests': '1'
  146. }
  147. response = sess.get(
  148. url,
  149. headers=headers, params=params, cookies=bmsz_cookie,
  150. timeout=15,
  151. verify=False,
  152. )
  153. self.ja3_queue.put((ua, ja3_str)) # 回收破烂
  154. # print(response.text)
  155. # logger.info(f'')
  156. return response
  157. def get_data(self):
  158. while True:
  159. datetime_str = self.task_queue.get()
  160. bmsz_cookie = self.cookies_queue.get()
  161. params = {
  162. "s": "true",
  163. "adults": "1", # 成年人
  164. "children": "0", # 儿童
  165. "infants": "0", # 婴儿
  166. "selectedclass1": "economy", # 选择类型:经济舱
  167. "currency": "CNY", # 货币
  168. "mon": "true",
  169. "channel": "DESKTOP",
  170. "origin1": "PVG", # 出发地
  171. "destination1": "NRT", # 目的地
  172. "departuredate1": datetime_str # 出发时间
  173. }
  174. logger.info(f'正在采集 {datetime_str} 航班数据...')
  175. try:
  176. response = self.send_get(self.search_flights_api, params, bmsz_cookie)
  177. self.resp_data_queue.put((datetime_str, response))
  178. # 请求成功,归还Cookie
  179. self.cookies_queue.put(bmsz_cookie) # 成功时放回cookie
  180. except Exception as e:
  181. logger.error(f"错误发生: {e}")
  182. self.task_queue.put(datetime_str)
  183. finally:
  184. self.task_queue.task_done()
  185. def parse_data(self):
  186. while True:
  187. datetime_str, response = self.resp_data_queue.get()
  188. html = etree.HTML(response.text)
  189. data = html.xpath("//script[@id='bundle-data-v2']/text()")
  190. if data:
  191. json_data = json.loads(data[0])
  192. logger.info(f'获取数据成功 {datetime_str} => {json_data}')
  193. else:
  194. logger.warning(f'{datetime_str} 当天暂无数据 / 触发验证码')
  195. print(response.text)
  196. self.resp_data_queue.task_done()
  197. def gen_city(self):
  198. """提取Excel表格的城市对信息, 用set去重"""
  199. # 只读取Excel的Sheet1的航段信息, 将读取的数据存储在 df(DataFrame 对象)中。
  200. df = pd.read_excel(
  201. self.excel_path,
  202. sheet_name="Sheet1",
  203. usecols=["出发机场", "到达机场"] # 只读取 "出发机场" 和 "到达机场" 两列。
  204. )
  205. segment_info = set()
  206. segment_info.add('YNT,XIY')
  207. # 遍历 DataFrame 的每一行
  208. # for row in df.itertuples(index=True, name='Pandas'):
  209. # # 访问行中的数据
  210. # segment_info.add(row.出发机场 + ',' + row.到达机场)
  211. # logger.info(f'去重后的航段长度: {len(segment_info)}, {segment_info}')
  212. return segment_info
  213. @staticmethod
  214. def gen_datetime(start_date, end_date):
  215. current_date = datetime.strptime(start_date, '%Y-%m-%d')
  216. end_date = datetime.strptime(end_date, '%Y-%m-%d')
  217. date_list = []
  218. while current_date <= end_date:
  219. date_list.append(current_date.strftime('%Y-%m-%d')) # 转换为字符串格式存储
  220. current_date += timedelta(days=1)
  221. return date_list
  222. def run(self, start_date, end_date):
  223. thread_list = []
  224. self.gen_task(start_date, end_date)
  225. for _ in range(2):
  226. t_get_cookie = threading.Thread(target=self._refresh_cookie)
  227. thread_list.append(t_get_cookie)
  228. t_get_ja3 = threading.Thread(target=self.get_ja3_str)
  229. thread_list.append(t_get_ja3)
  230. for _ in range(6):
  231. t_get_data = threading.Thread(target=self.get_data)
  232. thread_list.append(t_get_data)
  233. t_parse_data = threading.Thread(target=self.parse_data)
  234. thread_list.append(t_parse_data)
  235. for t_obj in thread_list:
  236. t_obj.setDaemon(True)
  237. t_obj.start()
  238. for q in [self.task_queue, self.resp_data_queue]:
  239. q.join()
  240. if __name__ == '__main__':
  241. gk = GK()
  242. gk.run(start_date='2025-06-01', end_date='2025-06-30')