# gk_flights_spider.py

import threading
import queue
import time
import json
import re
import execjs
import random
import retrying
from lxml import etree
import datetime
import requests
import tls_client
from urllib.parse import urljoin
from bson.objectid import ObjectId
from flights_utils import get_now_time, put_data_by_queue, get_data_by_queue, \
    time_format_conversion
from my_logger import Logger
from flights_mongodb import mongo_con_parse
from settings import CRAWL_DATE_CONF_STATUS_TAB, FLIGHTS_WEBSITES_ROUTE_CONF_TAB, PROXY_TAIL


def generate_date_range(days):
    """
    Generate a date range spanning `days` days, starting today.
    """
    # Start date: today.
    start_date = datetime.datetime.today()
    # End date: today + `days` days (so the inclusive range covers one extra day, see thread_task).
    end_date = start_date + datetime.timedelta(days=days)
    # Return both dates formatted as YYYY-MM-DD.
    return (
        start_date.strftime("%Y-%m-%d"),
        end_date.strftime("%Y-%m-%d")
    )
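
# Illustrative example only (not in the original module): with the online setting crawl_days = 10
# and assuming today is 2025-05-26, generate_date_range(10) returns
# ("2025-05-26", "2025-06-05"), i.e. today through today plus `days` days.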


class GKSpider():

    def __init__(self):
        self.website = 'gk'  # website identifier
        self.is_proxy = True
        self.is_online = True
        if self.is_proxy:
            if self.is_online:
                # proxies = {
                #     'http': f'http://B_3351_HK___5_ss-{ip}:ev2pjj@proxy.renlaer.com:7778',
                #     'https': f'http://B_3351_HK___5_ss-{ip}:ev2pjj@proxy.renlaer.com:7778'
                # }
                self.proxy_meta = f"http://B_3351_HK___5_ss-xxxxxxxxxxxx:{PROXY_TAIL}"  # AU / HK
                self.time_sleep = 0.5
            else:
                self.proxy_meta = "http://127.0.0.1:7897"
                # self.time_sleep = 5.5
                self.time_sleep = 0.5
            self.proxies = {
                "http": self.proxy_meta,
                "https": self.proxy_meta,
            }
        else:
            self.proxies = None
            self.time_sleep = 5.5
        self.search_flights_api = "https://booking.jetstar.com/hk/zh/booking/search-flights"
        with open('./js_files/akm逆向5.26.js', encoding='utf-8') as f:
            js = f.read()
        self.ctx = execjs.compile(js)
        self.task_queue = queue.Queue()
        self.cookies_queue = queue.Queue()
        self.ja3_queue = queue.Queue()  # recycle usable ja3 fingerprints; switching ja3 too often raises the chance of triggering a captcha
        self.headers = {
            "accept": "application/json, text/plain, */*",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "no-cache",
            "culture": "zh-HK",
            "origin": "https://www.jetstar.com",
            "pragma": "no-cache",
            "priority": "u=1, i",
            "referer": "https://www.jetstar.com/",
            "sec-ch-ua": "\"Google Chrome\";v=\"135\", \"Not-A.Brand\";v=\"8\", \"Chromium\";v=\"135\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\"",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-site",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"
        }
        if self.is_online:
            self.cookie_thread_num = 3
            self.thread_numbers = 8
            self.crawl_days = 10
        else:
            self.cookie_thread_num = 3
            self.thread_numbers = 8
            self.crawl_days = 5
        self.cookie_queue_num = 4  # number of cookies kept in the queue
        self.ja3_queue_num = 4  # number of ja3 fingerprints kept in the queue
        self.retry_number = 2  # 5
        self.db = mongo_con_parse()
        self.info_tab = "flights_{}_info_tab".format(self.website)
        self.logger = Logger('./logs/{}_flights_spiders.log'.format(self.website), 'debug').logger

    def init_crawl_date_data(self):
        # Initialize the status record for this crawl batch:
        # a batch timestamp is created when crawling starts and its status is updated once crawling finishes.
        self.db.get_collection(CRAWL_DATE_CONF_STATUS_TAB).insert_one(
            {
                "website": self.website,
                "crawl_date": self.crawl_date,
                "crawl_status": 0,  # crawl status
                "clean_status": 0,  # data-cleaning status
                "to_csv_status": 0  # file-export status
            }
        )

    def init_crawl_conf(self):
        # Record the crawl batch time
        self.crawl_date = get_now_time()
        self.logger.info("Crawl batch for this run: {}".format(self.crawl_date))
        # Initialize the crawl-batch status record
        self.init_crawl_date_data()
        # Pull the routes to crawl from the route-configuration table
        search_routes = self.db.get_collection(
            FLIGHTS_WEBSITES_ROUTE_CONF_TAB
        ).find(
            {
                "source_website": self.website,  # source website of the data
                "website_status": 1,  # whether this website should be crawled
                "flight_route_status": 1  # whether this route should be crawled
            },
            {
                "_id": 0  # exclude _id from the results; MongoDB returns it by default but it is not needed here
            }
        )
        self.logger.info("Fetched the routes to crawl; feeding them to the task queue")
        for search_route in search_routes:
            put_data_by_queue(self.task_queue, search_route)
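
    # Illustrative sketch only (not in the original module): based on the filter above and on the
    # fields read later in thread_task(), a route document in FLIGHTS_WEBSITES_ROUTE_CONF_TAB is
    # expected to look roughly like
    #   {"source_website": "gk", "website_status": 1, "flight_route_status": 1,
    #    "from_city_code": "PVG", "to_city_code": "<IATA code>", ...}
    # where the city codes shown here are hypothetical examples.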

    def general_proxies(self):
        if self.is_online:
            proxy_meta = self.proxy_meta
            random_no = ''.join(random.choices('0123456789', k=12))
            # region = ''.join(random.choices(["US"], k=1))
            # Hong Kong, Singapore and Taiwan all work; Singapore is the fastest and most stable.
            # proxy_meta_mid = re.sub(r"_(US)_", f"_{region}_", proxy_meta)
            proxy_meta_new = re.sub(r"-(x+):", f"-{random_no}:", proxy_meta)
            # print(f"proxy_meta_new: {proxy_meta_new}")
            proxies = {
                "http": proxy_meta_new,
                "https": proxy_meta_new,
            }
        else:
            proxies = self.proxies
        return proxies
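
    # Illustrative example only: assuming PROXY_TAIL is "ev2pjj@proxy.renlaer.com:7778" (the value
    # shown in the commented-out sample in __init__), general_proxies() rewrites
    #   http://B_3351_HK___5_ss-xxxxxxxxxxxx:ev2pjj@proxy.renlaer.com:7778
    # into something like
    #   http://B_3351_HK___5_ss-483920174655:ev2pjj@proxy.renlaer.com:7778
    # i.e. the run of "x" placeholders becomes a random 12-digit session id, so each request can
    # go out through a different proxy session.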

    @retrying.retry(stop_max_attempt_number=3, wait_fixed=3000)
    def request_new_cookie(self):
        statusTs = int(time.time() * 1000)
        try:
            ua, ja3_string = self.ja3_queue.get()
            proxy = self.general_proxies()
            get_ck_session = tls_client.Session(
                ja3_string=ja3_string,
            )
            headers = {
                'user-agent': ua,
                'Accept-Encoding': 'gzip, deflate, br',
                'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                'Connection': 'keep-alive',
                'Content-Type': 'application/x-www-form-urlencoded',
                'accept-language': 'zh-CN,zh;q=0.9',
                'cache-control': 'no-cache',
                'pragma': 'no-cache',
                'priority': 'u=0, i',
                'referer': 'https://booking.jetstar.com/',
                'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
                'sec-ch-ua-mobile': '?0',
                'sec-ch-ua-platform': '"Windows"',
                'sec-fetch-dest': 'document',
                'sec-fetch-mode': 'navigate',
                'sec-fetch-site': 'same-origin',
                'sec-fetch-user': '?1',
                'upgrade-insecure-requests': '1'
            }
            # akm js file url
            akm_url = "https://www.jetstar.com/xsRfGu1Zb-8uTxanFA/9kX3LGbVJLVb/FB0BajANQQk/YE94/HSh0EkU"
            data = {
                'sensor_data': self.ctx.call('encrypt1', statusTs)
            }
            response1 = get_ck_session.post(akm_url, headers=headers, data=data,
                                            proxy=proxy
                                            )
            # print(response1.status_code)
            # print(response1.text)
            # print(response1.cookies.get_dict())
            # print('111', response.headers)
            bmsz = response1.cookies.get_dict()['bm_sz']
            # print('bmsz => ', bmsz)
            data2 = {
                "sensor_data": self.ctx.call('encrypt2', statusTs, bmsz)
            }
            data2 = json.dumps(data2)
            response2 = get_ck_session.post(akm_url, headers=headers, data=data2,
                                            proxy=proxy
                                            )
            # print(response2.text)
            # print(response2.cookies.get_dict())
            if response2.status_code == 201:
                self.logger.debug('Successfully obtained cookie bm_sz: {}'.format(bmsz[-16:]))
                # Return the cookies from the first response
                return response1.cookies.get_dict()
            else:
                self.logger.error('Unexpected status code {}, {}'.format(response2.status_code, response2.text))
        except Exception as e:
            print('request cookie error, retrying..', e)
            raise
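
    # Flow note (added for clarity, describing the method above): the first sensor_data POST yields
    # the bm_sz cookie; that value is fed into the second sensor payload, and a 201 on the second
    # POST is taken to mean the sensor was accepted, so the cookies from the first response are
    # reused for the search-flights requests.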

    @retrying.retry(stop_max_attempt_number=3, wait_fixed=3000)
    def get_ja3(self):
        url = "http://8.218.51.130:9003/api/v1/ja3"
        payload = {}
        headers = {
            'cid': '750B5141EDBF7FA6F73A99C768130099'
        }
        response = requests.get(url, headers=headers, data=payload, timeout=15)
        if response.status_code == 200:
            # print(response.json())
            res_json = response.json()
            if res_json.get("code") == 0:
                ja3 = res_json.get("data").get("ja3_str")
                ua = res_json.get("data").get("ua")
                if "--" not in ja3 and ",," not in ja3:
                    return ua, ja3
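
    # Illustrative shape only (field names taken from the accessors above, values hypothetical):
    # the ja3 endpoint is expected to answer with JSON like
    #   {"code": 0, "data": {"ja3_str": "771,4865-4866-...,0-23-65281-...,29-23-24,0",
    #                        "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ..."}}
    # Results containing "--" or ",," are discarded as malformed fingerprints.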

    def _refresh_ja3(self):
        while True:
            if self.ja3_queue.qsize() < self.ja3_queue_num:
                try:
                    ua, ja3 = self.get_ja3()
                    # logger.debug('got a ja3 fingerprint...')
                    self.ja3_queue.put((ua, ja3))
                except Exception as e:
                    self.logger.error(f'ja3 API error: {e}')
            time.sleep(3)

    def _refresh_cookie(self):
        while True:
            if self.cookies_queue.qsize() < self.cookie_queue_num:
                cookie = self.request_new_cookie()
                self.cookies_queue.put(cookie)
            time.sleep(3)
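
    # Design note (added for clarity): _refresh_ja3 and _refresh_cookie run as daemon producer
    # threads (started in run()) that keep the queues topped up to ja3_queue_num and
    # cookie_queue_num, while tls_client_get_request is the consumer and only returns a
    # cookie / ja3 pair to its queue after a successful request, so failed fingerprints and
    # cookies are silently discarded.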

    @retrying.retry(stop_max_attempt_number=5, wait_fixed=3000)
    def tls_client_get_request(self, url, params, max_redirects=3):
        ua, ja3_string = self.ja3_queue.get()
        bmsz_cookie = self.cookies_queue.get()
        proxy = self.general_proxies()
        req_session = tls_client.Session(
            ja3_string=ja3_string,  # inject the custom TLS fingerprint directly
        )
        headers = {
            'user-agent': ua,
            'Accept-Encoding': 'gzip, deflate, br',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
            'pragma': 'no-cache',
            'priority': 'u=0, i',
            'referer': 'https://booking.jetstar.com/',
            'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1'
        }
        redirect_count = 0
        current_url = url
        while redirect_count < max_redirects:
            response = req_session.get(current_url, headers=headers, cookies=bmsz_cookie, params=params,
                                       timeout_seconds=15,  # timeout
                                       proxy=proxy
                                       )
            # print(response.status_code)
            # print(response.text)
            # Is this a redirect status code?
            if response.status_code in (301, 302, 303, 307, 308):
                # Read the Location header (relative paths must be resolved)
                location = response.headers.get("Location")
                # if not location:
                #     break
                current_url = urljoin(current_url, location)
                redirect_count += 1
            elif response.status_code == 200:
                html = etree.HTML(response.text)
                data = html.xpath("//script[@id='bundle-data-v2']/text()")
                if data:
                    json_data = json.loads(data[0])
                    # Request succeeded: return the cookie and ja3 to their queues
                    self.cookies_queue.put(bmsz_cookie)  # put the cookie back on success
                    self.ja3_queue.put((ua, ja3_string))  # recycle the working ja3; switching ja3 too often raises the captcha risk
                    return json_data  # return the extracted data
                else:
                    self.logger.warning(f'Captcha triggered or access denied, retrying... => {response.text}')
                    raise Exception('captcha triggered or access denied')
            else:
                self.logger.error(f'Unexpected status code {response.status_code}, response body: {response.text}')
                # raise so the retry decorator gets a fresh cookie / ja3 instead of re-requesting the same URL forever
                raise Exception(f'unexpected status code {response.status_code}')
        raise Exception(f"Exceeded the maximum number of redirects ({max_redirects})")
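
    # Illustrative note: the JSON embedded in <script id="bundle-data-v2"> is expected to carry a
    # "Trips" list whose first element holds a "Flights" list (those are the only fields the
    # spider inspects downstream, in thread_task()); the rest of the payload is stored in MongoDB
    # verbatim.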

    @retrying.retry(stop_max_attempt_number=3, wait_fixed=3000)
    def get_cookie_with_verify_price(self):
        """Price-verification path: fetch a fresh cookie without using the queues."""
        statusTs = int(time.time() * 1000)
        try:
            ua, ja3_string = self.get_ja3()
            proxy = self.general_proxies()
            get_ck_session = tls_client.Session(
                ja3_string=ja3_string,
            )
            headers = {
                'user-agent': ua,
                'Accept-Encoding': 'gzip, deflate, br',
                'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                'Connection': 'keep-alive',
                'Content-Type': 'application/x-www-form-urlencoded',
                'accept-language': 'zh-CN,zh;q=0.9',
                'cache-control': 'no-cache',
                'pragma': 'no-cache',
                'priority': 'u=0, i',
                'referer': 'https://booking.jetstar.com/',
                'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
                'sec-ch-ua-mobile': '?0',
                'sec-ch-ua-platform': '"Windows"',
                'sec-fetch-dest': 'document',
                'sec-fetch-mode': 'navigate',
                'sec-fetch-site': 'same-origin',
                'sec-fetch-user': '?1',
                'upgrade-insecure-requests': '1'
            }
            # akm js file url
            akm_url = "https://www.jetstar.com/xsRfGu1Zb-8uTxanFA/9kX3LGbVJLVb/FB0BajANQQk/YE94/HSh0EkU"
            data = {
                'sensor_data': self.ctx.call('encrypt1', statusTs)
            }
            response1 = get_ck_session.post(akm_url, headers=headers, data=data,
                                            proxy=proxy
                                            )
            # print(response1.status_code)
            # print(response1.text)
            # print(response1.cookies.get_dict())
            # print('111', response.headers)
            bmsz = response1.cookies.get_dict()['bm_sz']
            # print('bmsz => ', bmsz)
            data2 = {
                "sensor_data": self.ctx.call('encrypt2', statusTs, bmsz)
            }
            data2 = json.dumps(data2)
            response2 = get_ck_session.post(akm_url, headers=headers, data=data2,
                                            proxy=proxy
                                            )
            # print(response2.text)
            # print(response2.cookies.get_dict())
            if response2.status_code == 201:
                self.logger.debug('Successfully obtained cookie bm_sz: {}'.format(bmsz[-16:]))
                # Return the cookies from the first response
                return response1.cookies.get_dict()
            else:
                self.logger.error('Unexpected status code {}, {}'.format(response2.status_code, response2.text))
        except Exception as e:
            print('request cookie error, retrying..', e)
            raise

    @retrying.retry(stop_max_attempt_number=5, wait_fixed=3000)
    def get_flights_with_verify_price(self, from_city_code, to_city_code, search_date, thread_number=0, max_redirects=3):
        """Price-verification logic; does not use the queues."""
        sleep_r = self.time_sleep if self.time_sleep >= 1 else 1
        time.sleep(sleep_r)
        params = {
            "s": "true",
            "adults": "1",
            "children": "0",
            "infants": "0",
            "selectedclass1": "economy",
            "currency": "CNY",
            "mon": "true",
            "channel": "DESKTOP",
            "origin1": from_city_code,  # e.g. "PVG"
            "destination1": to_city_code,
            "departuredate1": search_date
        }
        ua, ja3_string = self.get_ja3()
        bmsz_cookie = self.get_cookie_with_verify_price()
        proxy = self.general_proxies()
        req_session = tls_client.Session(
            ja3_string=ja3_string,  # inject the custom TLS fingerprint directly
        )
        headers = {
            'user-agent': ua,
            'Accept-Encoding': 'gzip, deflate, br',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
            'pragma': 'no-cache',
            'priority': 'u=0, i',
            'referer': 'https://booking.jetstar.com/',
            'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1'
        }
        redirect_count = 0
        current_url = self.search_flights_api
        while redirect_count < max_redirects:
            response = req_session.get(current_url, headers=headers, cookies=bmsz_cookie, params=params,
                                       timeout_seconds=15,  # timeout
                                       proxy=proxy
                                       )
            # print(response.status_code)
            # print(response.text)
            # Is this a redirect status code?
            if response.status_code in (301, 302, 303, 307, 308):
                # Read the Location header (relative paths must be resolved)
                location = response.headers.get("Location")
                # if not location:
                #     break
                current_url = urljoin(current_url, location)
                redirect_count += 1
            elif response.status_code == 200:
                html = etree.HTML(response.text)
                data = html.xpath("//script[@id='bundle-data-v2']/text()")
                if data:
                    json_data = json.loads(data[0])
                    return {"inner_data": json_data}  # return the extracted data
                else:
                    self.logger.warning(f'Captcha triggered or access denied, retrying... => {response.text}')
                    raise Exception('captcha triggered or access denied')
            else:
                self.logger.error(f'Unexpected status code {response.status_code}, response body: {response.text}')
                # raise so the retry decorator gets a fresh cookie / ja3 instead of re-requesting the same URL forever
                raise Exception(f'unexpected status code {response.status_code}')
        raise Exception(f"Exceeded the maximum number of redirects ({max_redirects})")
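
    # Illustrative usage only (values are hypothetical): the price-verification path can be called
    # directly, without starting the queue threads, e.g.
    #   GKSpider().get_flights_with_verify_price("PVG", "NRT", "2025-05-31")
    # and returns {"inner_data": <bundle-data-v2 JSON>} on success.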

    def get_flights(self, from_city_code, to_city_code, search_date, thread_number=0):
        sleep_r = self.time_sleep if self.time_sleep >= 1 else 1
        time.sleep(sleep_r)
        params = {
            "s": "true",
            "adults": "1",
            "children": "0",
            "infants": "0",
            "selectedclass1": "economy",
            "currency": "CNY",
            "mon": "true",
            "channel": "DESKTOP",
            "origin1": from_city_code,  # e.g. "PVG"
            "destination1": to_city_code,
            "departuredate1": search_date
        }
        try:
            resp_json = self.tls_client_get_request(self.search_flights_api, params)
            self.logger.info(f'Fetched data for {from_city_code}-{to_city_code} {search_date} => {str(resp_json)[:200]}')
            return {"inner_data": resp_json}
        except Exception as e:
            self.logger.error(
                f"thread_number:{thread_number} {from_city_code}-{to_city_code}-{search_date} - all 5 retries failed: {str(e)}")

    @retrying.retry(stop_max_attempt_number=3, wait_fixed=3000)
    def search_flight_date(self, from_city_code, to_city_code, start_date, end_date):
        """Query which dates actually have flights on this route."""
        url = "https://digitalapi.jetstar.com/v1/farecache/flights/batch/availability-with-fareclasses"
        params = {
            "flightCount": "1",
            "includeSoldOut": "true",
            "requestType": "StarterOnly",
            "from": start_date,  # crawl start date
            "end": end_date,  # crawl end date; can be an arbitrary later date
            "departures": from_city_code,
            "arrivals": to_city_code,
            "direction": "outbound",
            "paxCount": "1",
            "includeFees": "false"
        }
        response = requests.get(url, headers=self.headers, params=params, verify=False)
        response.raise_for_status()
        # for i in response.json():
        #     print(i)
        json_data = response.json()[0]['routes']
        # There is only a single key/value pair in routes
        for key, val in json_data.items():
            flight_dates = list(val.get('flights', {}).keys())
            # print(f'Segment: {key}; dates with flights: {flight_dates}')
            return flight_dates
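
    # Illustrative return shape only (values hypothetical): search_flight_date() returns the keys
    # of routes[<segment>]["flights"], e.g. ["20250526", "20250531", ...] in %Y%m%d form, which
    # thread_task() converts to %Y-%m-%d before calling the search-flights page.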

    def thread_task(self, thread_number):
        while True:
            try:
                # Pop one route record from the task queue
                queue_data = get_data_by_queue(self.task_queue)
                search_route = queue_data  # the route handled in this iteration
                from_city_code = search_route['from_city_code']
                to_city_code = search_route['to_city_code']
                # Build the date range for the configured number of crawl days (plus one extra day)
                start_date, end_date = generate_date_range(self.crawl_days)
                # Ask the website which of those dates have flights; the date format is converted below
                search_date_list = self.search_flight_date(from_city_code, to_city_code, start_date, end_date)
                if not search_date_list:
                    self.logger.info(f'No flights on segment {from_city_code}-{to_city_code} between {start_date} and {end_date}')
                    continue
                for search_date in search_date_list:
                    self.logger.info(f"Thread {thread_number} crawling => {search_date} {from_city_code}-{to_city_code}")
                    # Convert the date format: 20250531 => 2025-05-31
                    search_date = time_format_conversion(
                        search_date,
                        in_strftime_str="%Y%m%d",
                        out_strftime_str="%Y-%m-%d"
                    )
                    flights_json = self.get_flights(from_city_code, to_city_code, search_date, thread_number)
                    if flights_json:
                        Trips = flights_json.get('inner_data', {}).get("Trips", [])
                        if not Trips:
                            self.logger.info(
                                f'Trips is empty, {search_date}:{from_city_code}-{to_city_code}----{flights_json}')
                            continue
                        Flights = Trips[0].get('Flights', [])
                        if len(Flights) > 0:
                            self.logger.info("Fetched flight info {1}:{2}-{3} successfully, thread_number:{0}".format(
                                thread_number, search_date, from_city_code, to_city_code))
                            self.save_mongodb_datas(flights_json, from_city_code, to_city_code, search_date)
                        else:
                            self.logger.info(
                                f"Flight info is empty for {search_date}:{from_city_code}-{to_city_code}, thread_number:{thread_number}")
                    else:
                        self.logger.info("thread_number:{0}, flight request failed / this route may need to be re-crawled {1}:{2}-{3}".format(
                            thread_number, search_date, from_city_code, to_city_code))
            except Exception as e:
                self.logger.error(f"thread_number:{thread_number} - unknown error: {str(e)}")
            finally:
                self.task_queue.task_done()

    def run(self):
        # Feed tasks to the queue and initialize the crawl-batch status record
        self.init_crawl_conf()
        thread_list = []
        # cookie refresher threads
        for _ in range(1, self.cookie_thread_num + 1):
            t_get_cookie = threading.Thread(target=self._refresh_cookie)
            thread_list.append(t_get_cookie)
        # ja3 refresher thread
        t_get_ja3 = threading.Thread(target=self._refresh_ja3)
        thread_list.append(t_get_ja3)
        # data crawler threads
        for thread_number in range(1, self.thread_numbers + 1):
            t_get_data = threading.Thread(target=self.thread_task, args=(thread_number,))
            thread_list.append(t_get_data)
        for t_obj in thread_list:
            t_obj.daemon = True  # preferred over the deprecated setDaemon()
            t_obj.start()
        self.task_queue.join()
        # Mark this crawl batch as finished
        self.update_crawl_date_status()
        # Close the database connection
        self.db.client.close()

    def save_mongodb_datas(self, info_data, from_city_code, to_city_code, search_date):
        # Store the crawled data and tag it with the corresponding metadata
        info_data["website"] = self.website
        info_data["crawl_date"] = self.crawl_date
        info_data["search_from_city_code"] = from_city_code
        # info_data["search_from_airport_code"] = from_airport_code
        info_data["search_to_city_code"] = to_city_code
        # info_data["search_to_airport_code"] = to_airport_code
        info_data["search_date"] = search_date  # search date, i.e. the flight date being crawled
        info_data["clean_status"] = 0  # mark the cleaning status of every record
        self.db.get_collection(self.info_tab).insert_one(info_data)

    def update_crawl_date_status(self):
        # Update the status record for this crawl batch
        find_data = self.db.get_collection(CRAWL_DATE_CONF_STATUS_TAB).find_one(
            {
                "website": self.website,
                "crawl_date": self.crawl_date,
                "crawl_status": 0
            }
        )
        if find_data:
            self.db.get_collection(CRAWL_DATE_CONF_STATUS_TAB).update_one(
                {
                    "_id": ObjectId(find_data.get("_id"))  # match exactly by the document's _id
                },
                {
                    "$set":
                        {
                            "crawl_status": 1  # set crawl_status to 1 (crawled)
                        }
                }
            )


if __name__ == "__main__":
    spider = GKSpider()
    spider.run()