# req_tls_client.py (17 KB)
  1. import threading
  2. import time
  3. from queue import Queue
  4. import requests
  5. from lxml import etree
  6. import json
  7. import random
  8. from datetime import datetime, timedelta
  9. import execjs
  10. from loguru import logger
  11. import tls_client
  12. import retrying
  13. from urllib.parse import urljoin
  14. # requests = requests.Session()
  15. class GK:
  16. def __init__(self):
  17. self.search_flights_api = "https://booking.jetstar.com/hk/zh/booking/search-flights"
  18. self.headers = {
  19. "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
  20. "Accept-Language": "zh-CN,zh;q=0.9",
  21. "Connection": "keep-alive",
  22. "Referer": "https://www.jetstar.com/",
  23. "Sec-Fetch-Dest": "document",
  24. "Sec-Fetch-Mode": "navigate",
  25. "Sec-Fetch-Site": "same-site",
  26. "Sec-Fetch-User": "?1",
  27. "Upgrade-Insecure-Requests": "1",
  28. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
  29. "sec-ch-ua": "\"Google Chrome\";v=\"135\", \"Not-A.Brand\";v=\"8\", \"Chromium\";v=\"135\"",
  30. "sec-ch-ua-mobile": "?0",
  31. "sec-ch-ua-platform": "\"Windows\""
  32. }
  33. with open('akm/akm_5.26.js', encoding='utf-8') as f:
  34. js = f.read()
  35. self.ctx = execjs.compile(js)
  36. self.proxies_url = 'http://B_3351_HK___5_ss-{}:ev2pjj@proxy.renlaer.com:7778'
  37. self.cookies_queue = Queue()
  38. self.ja3_queue = Queue()
  39. self.task_queue = Queue()
  40. self.resp_data_queue = Queue()
  41. # 使用本地代理软件
  42. self.proxies = {
  43. "http": "http://127.0.0.1:7897",
  44. "https": "http://127.0.0.1:7897"
  45. }
  46. def get_ja3_str(self):
  47. url = "http://8.218.51.130:9003/api/v1/ja3"
  48. payload = {}
  49. headers = {
  50. 'cid': '750B5141EDBF7FA6F73A99C768130099'
  51. }
  52. while True:
  53. if self.ja3_queue.qsize() < 3:
  54. try:
  55. response = requests.get(url, headers=headers, data=payload, timeout=15)
  56. if response.status_code == 200:
  57. # print(response.json())
  58. res_json = response.json()
  59. if res_json.get("code") == 0:
  60. ja3 = res_json.get("data").get("ja3_str")
  61. ua = res_json.get("data").get("ua")
  62. if "--" not in ja3 and ",," not in ja3:
  63. end_data = (
  64. ua,
  65. ja3
  66. )
  67. # logger.debug('获取ja3成功...')
  68. self.ja3_queue.put(end_data)
  69. except Exception as e:
  70. logger.error(f'ja3接口错误: {e}')
  71. time.sleep(3)
  72. @retrying.retry(stop_max_attempt_number=3, wait_fixed=3000)
  73. def request_new_cookie(self):
  74. statusTs = int(time.time() * 1000)
  75. # ua, ja3_string = self.ja3_queue.get()
  76. ua, ja3_string = self.ja3_queue.get()
  77. # ip = ''.join(random.choices('0123456789', k=12))
  78. # proxies = {
  79. # 'http': self.proxies_url.format(ip),
  80. # 'https': self.proxies_url.format(ip)
  81. # }
  82. # print(proxies)
  83. # browser = random.choice(self.BROWSER)
  84. """
  85. 注意:ja3 和 client_identifier、 random_tls_extension_order不能同时使用
  86. 否则存在 潜在冲突,可能导致 JA3 指纹不符合预期,
  87. 具体原因如下:
  88. 参数优先级冲突
  89. ja3_string 是直接定义 JA3 指纹的 完整参数,会覆盖 client_identifier 的默认配置。
  90. 若同时设置 ja3_string 和 client_identifier,client_identifier 的浏览器预置参数会被忽略,仅 ja3_string 生效。
  91. 随机扩展顺序干扰
  92. random_tls_extension_order=True 会打乱 TLS 扩展的顺序,导致 JA3 指纹动态变化。
  93. 虽然增强了匿名性,但若目标网站检测 JA3 的稳定性(如固定指纹校验),此配置会触发反爬机制。
  94. """
  95. get_ck_session = tls_client.Session(
  96. ja3_string=ja3_string,
  97. )
  98. headers = {
  99. # 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
  100. 'user-agent': ua,
  101. 'Accept-Encoding': 'gzip, deflate, br',
  102. 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
  103. 'Connection': 'keep-alive',
  104. 'Content-Type': 'application/x-www-form-urlencoded',
  105. 'accept-language': 'zh-CN,zh;q=0.9',
  106. 'cache-control': 'no-cache',
  107. 'pragma': 'no-cache',
  108. 'priority': 'u=0, i',
  109. 'referer': 'https://booking.jetstar.com/',
  110. 'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
  111. 'sec-ch-ua-mobile': '?0',
  112. 'sec-ch-ua-platform': '"Windows"',
  113. 'sec-fetch-dest': 'document',
  114. 'sec-fetch-mode': 'navigate',
  115. 'sec-fetch-site': 'same-origin',
  116. 'sec-fetch-user': '?1',
  117. 'upgrade-insecure-requests': '1'
  118. }
  119. # akm js file url
  120. akm_url = "https://www.jetstar.com/w6N0DejiHPXeE_PZTkeSCCzH/3XiJLNQzXNpfYO/EA1mYRtQBw/LV/Y8ahE0KUo"
  121. data = {
  122. 'sensor_data': self.ctx.call('encrypt1', statusTs)
  123. }
  124. response1 = get_ck_session.post(akm_url, headers=headers, data=data, timeout_seconds=15,
  125. proxy=self.proxies
  126. )
  127. # print(response1.status_code)
  128. # print(response1.text)
  129. # print(response1.cookies.get_dict())
  130. # print('111', response.headers)
  131. bmsz = response1.cookies.get_dict()['bm_sz']
  132. # print('bmsz => ', bmsz)
  133. data2 = {
  134. "sensor_data": self.ctx.call('encrypt2', statusTs, bmsz)
  135. }
  136. data2 = json.dumps(data2)
  137. response2 = get_ck_session.post(akm_url, headers=headers, data=data2, timeout_seconds=15,
  138. proxy=self.proxies
  139. )
  140. logger.debug('成功获取 cookie bm-sz: {}'.format(bmsz[10:]))
  141. # print(response2.text)
  142. # print(response2.cookies.get_dict())
  143. if response2.status_code == 201:
  144. # print('响应cookie1', response1.cookies.get_dict())
  145. # 返回第一次请求响应的cookie
  146. return response1.cookies.get_dict()
  147. else:
  148. logger.error('状态码错误{}'.format(response2.status_code))
  149. print(response2.text)
  150. def _refresh_cookie(self):
  151. while True:
  152. if self.cookies_queue.qsize() < 3:
  153. cookie = self.request_new_cookie()
  154. self.cookies_queue.put(cookie)
  155. time.sleep(3)
  156. @retrying.retry(stop_max_attempt_number=5, wait_fixed=3000)
  157. def request_with_redirect(self, url, params, bmsz_cookie, max_redirects=3):
  158. """"""
  159. ua, ja3_string = self.ja3_queue.get()
  160. # ip = ''.join(random.choices('0123456789', k=12))
  161. # proxies = {
  162. # 'http': self.proxies_url.format(ip),
  163. # 'https': self.proxies_url.format(ip)
  164. # }
  165. req_session = tls_client.Session(
  166. ja3_string=ja3_string, # 直接注入自定义指纹
  167. )
  168. headers = {
  169. 'user-agent': ua,
  170. 'Accept-Encoding': 'gzip, deflate, br',
  171. 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
  172. 'Connection': 'keep-alive',
  173. 'Content-Type': 'application/x-www-form-urlencoded',
  174. 'accept-language': 'zh-CN,zh;q=0.9',
  175. 'cache-control': 'no-cache',
  176. 'pragma': 'no-cache',
  177. 'priority': 'u=0, i',
  178. 'referer': 'https://booking.jetstar.com/',
  179. 'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
  180. 'sec-ch-ua-mobile': '?0',
  181. 'sec-ch-ua-platform': '"Windows"',
  182. 'sec-fetch-dest': 'document',
  183. 'sec-fetch-mode': 'navigate',
  184. 'sec-fetch-site': 'same-origin',
  185. 'sec-fetch-user': '?1',
  186. 'upgrade-insecure-requests': '1'
  187. }
  188. redirect_count = 0
  189. current_url = url
  190. while redirect_count < max_redirects:
  191. response = req_session.get(current_url, headers=headers, cookies=bmsz_cookie, params=params,
  192. timeout_seconds=15, # timeout
  193. proxy=self.proxies
  194. )
  195. print(response.status_code)
  196. # print(response.text)
  197. # 检查是否为重定向状态码
  198. if response.status_code in (301, 302, 303, 307, 308):
  199. # 获取 Location 头(需处理相对路径)
  200. location = response.headers.get("Location")
  201. # if not location:
  202. # break
  203. current_url = urljoin(current_url, location)
  204. redirect_count += 1
  205. # print(f"Redirecting to: {current_url}")
  206. # 可选:继承原始请求的特定 Headers(如 Referer)
  207. # headers["Referer"] = response.url
  208. else:
  209. # 请求成功,归还Cookie
  210. # print('请求成功的cookie', req_session.cookies.get_dict())
  211. # print('请求响应的cookie', response.cookies.get_dict())
  212. self.cookies_queue.put(bmsz_cookie) # 成功时放回cookie
  213. return response # 返回最终响应
  214. raise Exception(f" ({max_redirects})")
  215. def get_data(self):
  216. while True:
  217. city_code, datetime_str = self.task_queue.get()
  218. origin1, destination1 = city_code.split('\t')
  219. bmsz_cookie = self.cookies_queue.get()
  220. params = {
  221. "s": "true",
  222. "adults": "1",
  223. "children": "0",
  224. "infants": "0",
  225. "selectedclass1": "economy",
  226. # "currency": "CNY",
  227. "mon": "true",
  228. "channel": "DESKTOP",
  229. "origin1": origin1,
  230. "destination1": destination1,
  231. "departuredate1": datetime_str
  232. }
  233. # # print(params1)
  234. # params = {
  235. # "s": "true",
  236. # "adults": "1",
  237. # "children": "0",
  238. # "infants": "0",
  239. # "selectedclass1": "economy",
  240. # "currency": "CNY",
  241. # "mon": "true",
  242. # "channel": "DESKTOP",
  243. # "origin1": "CTS",
  244. # "destination1": "KOJ",
  245. # "departuredate1": "2025-05-30" # !!!
  246. # }
  247. # print(params)
  248. logger.info(f'正在采集{city_code} {datetime_str} 航班数据...')
  249. try:
  250. response = self.request_with_redirect(self.search_flights_api, params, bmsz_cookie)
  251. self.resp_data_queue.put((city_code, datetime_str, response))
  252. except Exception as e:
  253. logger.error(e)
  254. # 失败时重新上传任务
  255. # self.task_queue.put(datetime_str)
  256. finally:
  257. self.task_queue.task_done()
  258. def parse_data(self):
  259. while True:
  260. city_code, datetime_str, response = self.resp_data_queue.get()
  261. html = etree.HTML(response.text)
  262. data = html.xpath("//script[@id='bundle-data-v2']/text()")
  263. if data:
  264. json_data = json.loads(data[0])
  265. print('获取数据成功', city_code, datetime_str, ' => ', json_data)
  266. # print(response.text)
  267. else:
  268. logger.warning(f'{datetime_str} 触发验证码或拒绝访问错误, => {response.text}')
  269. self.resp_data_queue.task_done()
  270. @staticmethod
  271. def gen_datetime(start_date, end_date):
  272. current_date = datetime.strptime(start_date, '%Y-%m-%d')
  273. end_date = datetime.strptime(end_date, '%Y-%m-%d')
  274. date_list = []
  275. while current_date <= end_date:
  276. date_list.append(current_date.strftime('%Y-%m-%d')) # 转换为字符串格式存储
  277. current_date += timedelta(days=1)
  278. return date_list
  279. def gen_city(self):
  280. routes = """
  281. CTS KOJ
  282. """
  283. temp_list = [i.strip() for i in routes.split("\n") if i.strip()]
  284. routes = list(set(temp_list)) # 去重
  285. print(routes)
  286. return routes
  287. @staticmethod
  288. def gen_date_format(date_str):
  289. """20250531 => 2025-05-31 """
  290. original_date = datetime.strptime(date_str, "%Y%m%d")
  291. return original_date.strftime("%Y-%m-%d")
  292. def gen_task(self, start_date, end_date):
  293. """将每个城市对与日期组合生成独立任务, 上传到任务队列"""
  294. # 获取采集城市对
  295. for city_code in self.gen_city():
  296. # 获取采集时间
  297. flight_date_list = self.search_flight_date(city_code, start_date, end_date)
  298. if not flight_date_list:
  299. print(city_code, start_date, end_date, '无航班')
  300. continue
  301. for datetime_str in flight_date_list:
  302. # 日期格式转为 20250531 => 2025-05-31
  303. self.task_queue.put((city_code, self.gen_date_format(datetime_str)))
  304. # self.task_queue.put((datetime_str))
  305. def search_flight_date(self, city_pair, start_date, end_date):
  306. """查询航班日期, 即那天有航班"""
  307. departures, arrivals = city_pair.split('\t')
  308. url = "https://digitalapi.jetstar.com/v1/farecache/flights/batch/availability-with-fareclasses"
  309. params = {
  310. "flightCount": "1",
  311. "includeSoldOut": "true",
  312. "requestType": "StarterOnly",
  313. "from": start_date, # 采集开始时间
  314. "end": end_date, # 采集结束时间,可随意写 后面写完实例属性
  315. "departures": departures,
  316. "arrivals": arrivals,
  317. "direction": "outbound",
  318. "paxCount": "1",
  319. "includeFees": "false"
  320. }
  321. headers = {
  322. "accept": "application/json, text/plain, */*",
  323. "accept-language": "zh-CN,zh;q=0.9",
  324. "cache-control": "no-cache",
  325. "culture": "zh-HK",
  326. "origin": "https://www.jetstar.com",
  327. "pragma": "no-cache",
  328. "priority": "u=1, i",
  329. "referer": "https://www.jetstar.com/",
  330. "sec-ch-ua": "\"Google Chrome\";v=\"135\", \"Not-A.Brand\";v=\"8\", \"Chromium\";v=\"135\"",
  331. "sec-ch-ua-mobile": "?0",
  332. "sec-ch-ua-platform": "\"Windows\"",
  333. "sec-fetch-dest": "empty",
  334. "sec-fetch-mode": "cors",
  335. "sec-fetch-site": "same-site",
  336. "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"
  337. }
  338. response = requests.get(url, headers=headers, params=params, verify=False)
  339. response.raise_for_status()
  340. # for i in response.json():
  341. # print(i)
  342. json_data = response.json()[0]['routes']
  343. # 只有一个键值对
  344. for key, val in json_data.items():
  345. flight_dates = list(val.get('flights', {}).keys())
  346. print(f'航段: {key} 对应有航班的日期为: {flight_dates}')
  347. return flight_dates
  348. def main(self, start_date, end_date):
  349. thread_list = list()
  350. self.gen_task(start_date, end_date)
  351. for _ in range(1):
  352. t_get_cookie = threading.Thread(target=self._refresh_cookie)
  353. thread_list.append(t_get_cookie)
  354. t_get_ja3 = threading.Thread(target=self.get_ja3_str)
  355. thread_list.append(t_get_ja3)
  356. for _ in range(1):
  357. t_get_data = threading.Thread(target=self.get_data)
  358. thread_list.append(t_get_data)
  359. t_parse_data = threading.Thread(target=self.parse_data)
  360. thread_list.append(t_parse_data)
  361. for t_obj in thread_list:
  362. t_obj.setDaemon(True)
  363. t_obj.start()
  364. for q in [self.task_queue, self.resp_data_queue]:
  365. q.join()
  366. if __name__ == '__main__':
  367. gk = GK()
  368. gk.main(start_date='2025-05-30', end_date='2025-06-05')
  369. # gk.gen_city()
  370. # http://B_3351_AU___5_ss-XXXXXXXXXXXX:ev2pjj@proxy.renlaer.com:7778
  371. # curl -x http://B_3351_SG___5_ss-XXXXXXXXXXXX:ev2pjj@proxy.renlaer.com:7778 cip.cc
  372. # curl -x http://B_3351_TW___5_ss-XXXXXXXXXXXX:ev2pjj@proxy.renlaer.com:7778 cip.cc
  373. # curl -x http://B_3351_HK___5_ss-115511111111:ev2pjj@proxy.renlaer.com:7778 cip.cc