request_curl_cffi.py 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169
  1. from datetime import datetime, timedelta
  2. from curl_cffi import requests
  3. # import requests
  4. from requests.exceptions import Timeout
  5. import retrying
  6. import execjs
  7. import json
  8. import time
  9. from loguru import logger
  10. class GK:
  11. def __init__(self):
  12. self.akm_url = 'https://www.jetstar.com/c9NCrswc1aL9a_poKlkL/Y5OpJhrfcSzf/MwUVAg/SE0/adRNiWCo'
  13. self.search_flights_api = "https://booking.jetstar.com/hk/zh/booking/search-flights"
  14. self.headers = {
  15. "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
  16. "accept-language": "zh-CN,zh;q=0.9",
  17. "cache-control": "no-cache",
  18. "pragma": "no-cache",
  19. "priority": "u=0, i",
  20. "referer": "https://booking.jetstar.com/",
  21. "sec-ch-ua": "\"Google Chrome\";v=\"135\", \"Not-A.Brand\";v=\"8\", \"Chromium\";v=\"135\"",
  22. "sec-ch-ua-mobile": "?0",
  23. "sec-ch-ua-platform": "\"Windows\"",
  24. "sec-fetch-dest": "document",
  25. "sec-fetch-mode": "navigate",
  26. "sec-fetch-site": "same-origin",
  27. "sec-fetch-user": "?1",
  28. "upgrade-insecure-requests": "1",
  29. "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"
  30. }
  31. with open('../akm/akm_5.26.js', encoding='utf-8') as f:
  32. js = f.read()
  33. self.ctx = execjs.compile(js)
  34. self.session = requests.Session()
  35. def get_cookie(self):
  36. # akm js file url
  37. akm_url = self.akm_url
  38. statusTs = str(int(time.time() * 1000))
  39. data = {
  40. 'sensor_data': self.ctx.call('encrypt1', statusTs)
  41. }
  42. response = self.session.post(akm_url, headers=self.headers, verify=False, data=data,
  43. impersonate='chrome100',
  44. # 不指定 impersonate 时,TLS 指纹是 curl 原生的,而非浏览器指纹(依旧过不了检测)。需显式设置该参数以绕过 TLS 指纹检测
  45. http_version=2
  46. )
  47. logger.info(f'第一次请求cookie bmsz 状态吗 {response.status_code}')
  48. print('内容 ', response.text)
  49. print('响应cookie ', response.cookies.get_dict())
  50. bmsz = response.cookies.get_dict()['bm_sz']
  51. print('bmsz =>', bmsz)
  52. data2 = {
  53. "sensor_data": self.ctx.call('encrypt2', statusTs, bmsz)
  54. }
  55. data2 = json.dumps(data2)
  56. response2 = self.session.post(akm_url, headers=self.headers, data=data2, verify=False,
  57. impersonate='chrome101',
  58. http_version=2
  59. )
  60. logger.info(f'第2次请求验证 cookie bmsz 状态吗 {response.status_code}')
  61. print(response2.text)
  62. print(response2.cookies.get_dict())
  63. @retrying.retry(stop_max_attempt_number=3)
  64. def send_get(self, url, params):
  65. try:
  66. response = self.session.get(
  67. url,
  68. headers=self.headers, params=params,
  69. timeout=20,
  70. verify=False,
  71. # proxies=proxies
  72. impersonate='chrome99',
  73. http_version=2
  74. )
  75. response.raise_for_status()
  76. print('请求返回cookie', response.cookies.get_dict())
  77. return response
  78. # 捕获超时请求,可能是cookie不行了,更新后报错触发重试
  79. except Timeout as e:
  80. print(f"请求超时,重新更换cookie: {e}")
  81. # # 清除旧 Cookie
  82. # self.session.cookies.clear()
  83. # print(self.session.cookies.get_dict())
  84. # self.get_cookie()
  85. raise
  86. # except Exception as e:
  87. # logger.error(e)
  88. #
  89. # return None
  90. def get_data(self, datetime_str):
  91. params = {
  92. "s": "true",
  93. "adults": "1", # 成年人
  94. "children": "0", # 儿童
  95. "infants": "0", # 婴儿
  96. "selectedclass1": "economy", # 选择类型:经济舱
  97. "currency": "CNY", # 货币
  98. "mon": "true",
  99. "channel": "DESKTOP",
  100. "origin1": "PVG", # 出发地
  101. "destination1": "NRT", # 目的地
  102. "departuredate1": datetime_str # 出发时间
  103. }
  104. response = self.send_get(self.search_flights_api, params)
  105. if not response:
  106. return
  107. # print(response.text)
  108. print(response)
  109. from lxml import etree
  110. import json
  111. html = etree.HTML(response.text)
  112. data = html.xpath("//script[@id='bundle-data-v2']/text()")
  113. if data:
  114. json_data = json.loads(data[0])
  115. print(datetime_str, ' => ', json_data)
  116. else:
  117. print(response.text)
  118. @staticmethod
  119. def gen_datetime(start_date, end_date):
  120. """生成抓取日期: 2025-03-09 传入这种格式"""
  121. # 将字符串转换为 datetime 对象
  122. current_date = datetime.strptime(start_date, '%Y-%m-%d')
  123. end_date = datetime.strptime(end_date, '%Y-%m-%d')
  124. # 初始化一个空列表来存储日期
  125. date_list = []
  126. # 使用 timedelta 循环遍历每一天
  127. while current_date <= end_date:
  128. date_list.append(current_date.strftime('%Y-%m-%d')) # 转换为字符串格式存储
  129. current_date += timedelta(days=1)
  130. return date_list
  131. def run(self, start_date, end_date):
  132. self.get_cookie()
  133. # # 获取采集时间
  134. for num, datetime_str in enumerate(self.gen_datetime(start_date, end_date), start=1):
  135. # if num % 5 == 0:
  136. # self.session = requests.Session()
  137. # self.get_cookie()
  138. self.get_data(datetime_str)
  139. # # time.sleep(1)
  140. if __name__ == '__main__':
  141. gk = GK()
  142. gk.run(start_date='2025-05-29', end_date='2025-06-29')