start_node02.sh 338 B

12345678
  1. #!/bin/bash
  2. SCRIPT_NAME="main_tr.py"
  3. LOG_NAME="nohup_node02.log"
  4. rm nohup_*.log
  5. /usr/bin/pkill -f "torchrun.*${SCRIPT_NAME}" || true
  6. sleep 2
  7. nohup ~/anaconda3/bin/torchrun --nproc_per_node=2 --nnodes=4 --node_rank=2 --master_addr=node04 --master_port=29600 ~/yuzhou/jiangcang_vj/${SCRIPT_NAME} > ~/yuzhou/jiangcang_vj/${LOG_NAME} 2>&1 &