| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432 |
- import threading
- import time
- from queue import Queue
- import requests
- from lxml import etree
- import json
- import random
- from datetime import datetime, timedelta
- import execjs
- from loguru import logger
- import tls_client
- import retrying
- from urllib.parse import urljoin
- # requests = requests.Session()
class GK:
    """Multi-threaded scraper for Jetstar flight-search result pages.

    Two background producer threads keep ``ja3_queue`` (TLS fingerprints) and
    ``cookies_queue`` (``bm_sz`` cookie jars from the site's sensor endpoint)
    topped up.  A worker thread consumes ``(route, date)`` tasks from
    ``task_queue``, downloads the search page and hands the response to a
    parser thread through ``resp_data_queue``.
    """

    # Seconds the producer threads wait between refill checks.
    _POLL_INTERVAL = 3
    # Producers stop refilling once their queue holds this many items.
    _QUEUE_TARGET = 3

    def __init__(self):
        """Load the sensor-data script and create the work queues.

        Raises:
            OSError: if ``akm/akm_5.26.js`` (resolved against the current
                working directory) cannot be read.
        """
        self.search_flights_api = "https://booking.jetstar.com/hk/zh/booking/search-flights"
        # Navigation-style headers. No method of this class reads them; the
        # attribute is kept for code outside this class that may use it.
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Referer": "https://www.jetstar.com/",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "same-site",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
            "sec-ch-ua": "\"Google Chrome\";v=\"135\", \"Not-A.Brand\";v=\"8\", \"Chromium\";v=\"135\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\""
        }
        # JavaScript that produces the ``sensor_data`` payloads (functions
        # ``encrypt1`` / ``encrypt2`` are called in request_new_cookie).
        with open('akm/akm_5.26.js', encoding='utf-8') as f:
            js = f.read()
        self.ctx = execjs.compile(js)
        # Rotating-proxy URL template; ``{}`` takes a session id.
        # NOTE(review): credentials are embedded in source - consider loading
        # them from the environment instead.
        self.proxies_url = 'http://B_3351_HK___5_ss-{}:ev2pjj@proxy.renlaer.com:7778'
        self.cookies_queue = Queue()
        self.ja3_queue = Queue()
        self.task_queue = Queue()
        self.resp_data_queue = Queue()
        # All tls_client traffic goes through a local proxy application.
        self.proxies = {
            "http": "http://127.0.0.1:7897",
            "https": "http://127.0.0.1:7897"
        }

    @staticmethod
    def _browser_headers(ua):
        """Return the document-navigation headers sent with tls_client requests."""
        return {
            'user-agent': ua,
            'Accept-Encoding': 'gzip, deflate, br',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
            'pragma': 'no-cache',
            'priority': 'u=0, i',
            'referer': 'https://booking.jetstar.com/',
            'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1'
        }

    @staticmethod
    def _split_route(city_pair):
        """Split an ``"ORIGIN<TAB>DESTINATION"`` route into ``(origin, destination)``.

        Any run of whitespace is accepted as the separator, so a route whose
        tab was converted to spaces still parses.

        Raises:
            ValueError: if the string does not contain exactly two codes.
        """
        codes = city_pair.split()
        if len(codes) != 2:
            raise ValueError(
                f'invalid route {city_pair!r}: expected "ORIGIN<TAB>DESTINATION"'
            )
        return codes[0], codes[1]

    def _fetch_fingerprint(self):
        """Request one ``(ua, ja3)`` pair from the fingerprint service.

        Returns:
            The ``(ua, ja3)`` tuple, or ``None`` when the service reply is not
            usable (non-200 status, non-zero ``code``, missing fields, or a
            ja3 string containing ``--`` / ``,,``).
        """
        url = "http://8.218.51.130:9003/api/v1/ja3"
        headers = {
            'cid': '750B5141EDBF7FA6F73A99C768130099'
        }
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code != 200:
            return None
        res_json = response.json()
        if res_json.get("code") != 0:
            return None
        payload = res_json.get("data") or {}
        ja3 = payload.get("ja3_str")
        ua = payload.get("ua")
        if not ja3 or not ua:
            return None
        # Strings with "--" or ",," are rejected (presumably they have an
        # empty field and would not form a valid fingerprint).
        if "--" in ja3 or ",," in ja3:
            return None
        return ua, ja3

    def get_ja3_str(self):
        """Keep ``ja3_queue`` stocked with ``(ua, ja3)`` pairs.

        Runs forever and is meant to be a daemon-thread target.  Every
        ``_POLL_INTERVAL`` seconds it fetches one fingerprint if the queue
        holds fewer than ``_QUEUE_TARGET`` entries.  Service errors are logged
        and never propagate, so the thread stays alive.
        """
        while True:
            if self.ja3_queue.qsize() < self._QUEUE_TARGET:
                try:
                    fingerprint = self._fetch_fingerprint()
                except Exception as e:
                    logger.error(f'ja3接口错误: {e}')
                else:
                    if fingerprint is not None:
                        self.ja3_queue.put(fingerprint)
            time.sleep(self._POLL_INTERVAL)

    @retrying.retry(stop_max_attempt_number=3, wait_fixed=3000)
    def request_new_cookie(self):
        """Obtain a fresh cookie jar by posting sensor data twice.

        Blocks until a fingerprint is available in ``ja3_queue``.  The first
        POST yields the ``bm_sz`` cookie; the second POST submits sensor data
        derived from that cookie.

        Returns:
            The cookies set by the first response (dict) when the second POST
            answers 201, otherwise ``None``.

        Raises:
            KeyError: if the first response sets no ``bm_sz`` cookie.  Any
                exception triggers the retry decorator (3 attempts, 3 s
                apart); the last one is re-raised to the caller.
        """
        status_ts = int(time.time() * 1000)
        ua, ja3_string = self.ja3_queue.get()
        # Note from the original author: do not combine ``ja3_string`` with
        # ``client_identifier`` or ``random_tls_extension_order``.
        # ``ja3_string`` fully defines the fingerprint and overrides the
        # browser preset chosen by ``client_identifier``, while
        # ``random_tls_extension_order=True`` shuffles the TLS extensions so
        # the fingerprint changes between connections, which can trip sites
        # that check for a stable fingerprint.
        session = tls_client.Session(
            ja3_string=ja3_string,
        )
        headers = self._browser_headers(ua)
        # URL of the sensor ("akm") script endpoint.
        akm_url = "https://www.jetstar.com/w6N0DejiHPXeE_PZTkeSCCzH/3XiJLNQzXNpfYO/EA1mYRtQBw/LV/Y8ahE0KUo"

        first_payload = {
            'sensor_data': self.ctx.call('encrypt1', status_ts)
        }
        first_response = session.post(
            akm_url, headers=headers, data=first_payload,
            timeout_seconds=15, proxy=self.proxies,
        )
        first_cookies = first_response.cookies.get_dict()
        bmsz = first_cookies['bm_sz']

        # The second payload is sent as a JSON string, unlike the first.
        second_payload = json.dumps({
            "sensor_data": self.ctx.call('encrypt2', status_ts, bmsz)
        })
        second_response = session.post(
            akm_url, headers=headers, data=second_payload,
            timeout_seconds=15, proxy=self.proxies,
        )
        if second_response.status_code != 201:
            logger.error('状态码错误{}'.format(second_response.status_code))
            print(second_response.text)
            return None

        # Only report success once the second POST has been accepted.
        logger.debug('成功获取 cookie bm-sz: {}'.format(bmsz[10:]))
        # The cookies of the FIRST response are the ones handed out.
        return first_cookies

    def _refresh_cookie(self):
        """Keep ``cookies_queue`` stocked with usable cookie jars.

        Runs forever and is meant to be a daemon-thread target.  A failed
        attempt (exception or ``None`` result) is skipped and tried again on
        the next poll, so a bad cookie is never queued and the thread never
        dies.
        """
        while True:
            if self.cookies_queue.qsize() < self._QUEUE_TARGET:
                try:
                    cookie = self.request_new_cookie()
                except Exception as e:
                    logger.error(f'cookie refresh failed: {e}')
                else:
                    if cookie:
                        self.cookies_queue.put(cookie)
            time.sleep(self._POLL_INTERVAL)

    @retrying.retry(stop_max_attempt_number=5, wait_fixed=3000)
    def request_with_redirect(self, url, params, bmsz_cookie, max_redirects=3):
        """GET ``url`` and follow redirects manually.

        Blocks until a fingerprint is available in ``ja3_queue``.  Every
        request in the chain is sent with the same ``params`` and cookies.

        Args:
            url: Initial URL.
            params: Query parameters passed to every request.
            bmsz_cookie: Cookie dict to send; put back on ``cookies_queue``
                once a non-redirect response is received.
            max_redirects: Maximum number of requests made before giving up.

        Returns:
            The first response whose status is not 301/302/303/307/308.

        Raises:
            RuntimeError: if a redirect carries no ``Location`` header or the
                redirect budget is exhausted.  Any exception triggers the
                retry decorator (5 attempts, 3 s apart).
        """
        ua, ja3_string = self.ja3_queue.get()
        req_session = tls_client.Session(
            ja3_string=ja3_string,  # inject the custom fingerprint directly
        )
        headers = self._browser_headers(ua)

        current_url = url
        for _ in range(max_redirects):
            response = req_session.get(
                current_url, headers=headers, cookies=bmsz_cookie, params=params,
                timeout_seconds=15,
                proxy=self.proxies,
            )
            print(response.status_code)
            if response.status_code not in (301, 302, 303, 307, 308):
                # Final response reached: make the cookie available again.
                self.cookies_queue.put(bmsz_cookie)
                return response

            location = response.headers.get("Location")
            if not location:
                # Without a target the same URL would just be requested again.
                raise RuntimeError(
                    f"Redirect ({response.status_code}) without a Location header: {current_url}"
                )
            # Location may be relative; resolve it against the current URL.
            current_url = urljoin(current_url, location)

        raise RuntimeError(f"Exceeded the maximum number of redirects ({max_redirects})")

    def get_data(self):
        """Download the search page for every queued ``(route, date)`` task.

        Runs forever and is meant to be a daemon-thread target.  Successful
        responses are pushed to ``resp_data_queue``; failures are logged and
        the task is dropped.  ``task_done()`` is called for every task, even
        when the route is malformed, so ``task_queue.join()`` cannot hang.
        """
        while True:
            city_code, datetime_str = self.task_queue.get()
            try:
                origin1, destination1 = self._split_route(city_code)
                bmsz_cookie = self.cookies_queue.get()
                params = {
                    "s": "true",
                    "adults": "1",
                    "children": "0",
                    "infants": "0",
                    "selectedclass1": "economy",
                    "mon": "true",
                    "channel": "DESKTOP",
                    "origin1": origin1,
                    "destination1": destination1,
                    "departuredate1": datetime_str
                }
                logger.info(f'正在采集{city_code} {datetime_str} 航班数据...')
                response = self.request_with_redirect(self.search_flights_api, params, bmsz_cookie)
                self.resp_data_queue.put((city_code, datetime_str, response))
            except Exception as e:
                # Failed tasks are not re-queued.
                logger.error(e)
            finally:
                self.task_queue.task_done()

    def parse_data(self):
        """Extract the embedded JSON bundle from each downloaded page.

        Runs forever and is meant to be a daemon-thread target.  The bundle is
        read from ``<script id="bundle-data-v2">``; pages without it are
        logged as blocked.  ``task_done()`` is called for every item, even if
        parsing raises, so ``resp_data_queue.join()`` cannot hang.
        """
        while True:
            city_code, datetime_str, response = self.resp_data_queue.get()
            try:
                html = etree.HTML(response.text)
                # An unparsable/empty document is treated as "no bundle".
                data = [] if html is None else html.xpath("//script[@id='bundle-data-v2']/text()")
                if data:
                    json_data = json.loads(data[0])
                    print('获取数据成功', city_code, datetime_str, ' => ', json_data)
                else:
                    logger.warning(f'{datetime_str} 触发验证码或拒绝访问错误, => {response.text}')
            except Exception as e:
                logger.error(f'failed to parse {city_code} {datetime_str}: {e}')
            finally:
                self.resp_data_queue.task_done()

    @staticmethod
    def gen_datetime(start_date, end_date):
        """Return every date from ``start_date`` to ``end_date`` inclusive.

        Args:
            start_date: First day, formatted ``YYYY-MM-DD``.
            end_date: Last day, formatted ``YYYY-MM-DD``.

        Returns:
            List of ``YYYY-MM-DD`` strings; empty when ``start_date`` is after
            ``end_date``.

        Raises:
            ValueError: if either argument is not a valid ``YYYY-MM-DD`` date.
        """
        first_day = datetime.strptime(start_date, '%Y-%m-%d')
        last_day = datetime.strptime(end_date, '%Y-%m-%d')
        day_count = (last_day - first_day).days + 1
        return [
            (first_day + timedelta(days=offset)).strftime('%Y-%m-%d')
            for offset in range(max(day_count, 0))
        ]

    def gen_city(self):
        """Return the routes to crawl, de-duplicated in first-seen order.

        Each route is one line of the form ``ORIGIN<TAB>DESTINATION``.
        """
        # The tab is written as an escape so it cannot be lost to whitespace
        # normalisation by editors or copy/paste.
        routes = """
        CTS\tKOJ
        """
        cleaned = [line.strip() for line in routes.split("\n") if line.strip()]
        # dict.fromkeys removes duplicates while keeping a stable order.
        routes = list(dict.fromkeys(cleaned))
        print(routes)
        return routes

    @staticmethod
    def gen_date_format(date_str):
        """Convert ``YYYYMMDD`` to ``YYYY-MM-DD`` (e.g. 20250531 => 2025-05-31).

        Raises:
            ValueError: if ``date_str`` is not a valid ``YYYYMMDD`` date.
        """
        original_date = datetime.strptime(date_str, "%Y%m%d")
        return original_date.strftime("%Y-%m-%d")

    def gen_task(self, start_date, end_date):
        """Queue one task per (route, date with flights) combination.

        Args:
            start_date: Start of the crawl window, passed to the availability API.
            end_date: End of the crawl window, passed to the availability API.
        """
        for city_code in self.gen_city():
            flight_date_list = self.search_flight_date(city_code, start_date, end_date)
            if not flight_date_list:
                print(city_code, start_date, end_date, '无航班')
                continue
            for datetime_str in flight_date_list:
                # The availability API reports dates as 20250531; the search
                # page expects 2025-05-31.
                self.task_queue.put((city_code, self.gen_date_format(datetime_str)))

    def search_flight_date(self, city_pair, start_date, end_date):
        """Ask the availability API on which dates a route has flights.

        Args:
            city_pair: Route as ``ORIGIN<TAB>DESTINATION``.
            start_date: Start of the window, sent as the ``from`` parameter.
            end_date: End of the window, sent as the ``end`` parameter.

        Returns:
            List of date keys from the route's ``flights`` mapping; empty when
            the reply contains no route or no flights.

        Raises:
            ValueError: if ``city_pair`` is malformed.
            requests.RequestException: on network failure, timeout or a
                non-2xx status.
        """
        departures, arrivals = self._split_route(city_pair)
        url = "https://digitalapi.jetstar.com/v1/farecache/flights/batch/availability-with-fareclasses"
        params = {
            "flightCount": "1",
            "includeSoldOut": "true",
            "requestType": "StarterOnly",
            "from": start_date,
            "end": end_date,
            "departures": departures,
            "arrivals": arrivals,
            "direction": "outbound",
            "paxCount": "1",
            "includeFees": "false"
        }
        headers = {
            "accept": "application/json, text/plain, */*",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "no-cache",
            "culture": "zh-HK",
            "origin": "https://www.jetstar.com",
            "pragma": "no-cache",
            "priority": "u=1, i",
            "referer": "https://www.jetstar.com/",
            "sec-ch-ua": "\"Google Chrome\";v=\"135\", \"Not-A.Brand\";v=\"8\", \"Chromium\";v=\"135\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\"",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-site",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"
        }
        # NOTE(review): verify=False disables TLS certificate checking; kept
        # to preserve existing behaviour, but it should be removed if the
        # environment allows.  The timeout stops a stalled connection from
        # blocking task generation forever.
        response = requests.get(url, headers=headers, params=params, verify=False, timeout=15)
        response.raise_for_status()

        payload = response.json()
        routes = (payload[0].get('routes') or {}) if payload else {}
        # The reply is expected to hold a single route entry.
        flight_dates = []
        for key, val in routes.items():
            flight_dates = list(val.get('flights') or {})
            print(f'航段: {key} 对应有航班的日期为: {flight_dates}')
        return flight_dates

    def main(self, start_date, end_date):
        """Generate the tasks, start the worker threads and wait for completion.

        Args:
            start_date: Start of the crawl window (``YYYY-MM-DD``).
            end_date: End of the crawl window (``YYYY-MM-DD``).
        """
        self.gen_task(start_date, end_date)

        thread_targets = [
            self._refresh_cookie,  # cookie producer
            self.get_ja3_str,      # fingerprint producer
            self.get_data,         # page downloader
            self.parse_data,       # response parser
        ]
        for target in thread_targets:
            # Daemon threads: the endless worker loops must not keep the
            # process alive once both queues have been drained.
            threading.Thread(target=target, daemon=True).start()

        for q in (self.task_queue, self.resp_data_queue):
            q.join()
if __name__ == '__main__':
    # Script entry point: crawl every configured route for the hard-coded
    # departure-date window.  To only inspect the route list, call
    # GK().gen_city() instead.
    GK().main(start_date='2025-05-30', end_date='2025-06-05')
- # http://B_3351_AU___5_ss-XXXXXXXXXXXX:ev2pjj@proxy.renlaer.com:7778
- # curl -x http://B_3351_SG___5_ss-XXXXXXXXXXXX:ev2pjj@proxy.renlaer.com:7778 cip.cc
- # curl -x http://B_3351_TW___5_ss-XXXXXXXXXXXX:ev2pjj@proxy.renlaer.com:7778 cip.cc
- # curl -x http://B_3351_HK___5_ss-115511111111:ev2pjj@proxy.renlaer.com:7778 cip.cc
|