#!/bin/bash
#
# start_node01.sh
#
# (Re)start the torchrun launcher for node rank 1 of a 4-node job
# (4 processes per node, rendezvous at node04:29600).
#
# Steps:
#   1. Verify that the torchrun binary and the training script exist.
#   2. Delete old nohup_*.log files from the project directory.
#   3. Kill any torchrun process whose command line mentions the script.
#   4. Launch torchrun detached from the terminal, with stdout and stderr
#      captured in the project directory.
#
# Usage: ./start_node01.sh   (takes no arguments)

set -u

SCRIPT_NAME="main_tr.py"
LOG_NAME="nohup_node01.log"
WORK_DIR=~/yuzhou/jiangcang_vj
TORCHRUN=~/anaconda3/bin/torchrun
readonly SCRIPT_NAME LOG_NAME WORK_DIR TORCHRUN

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Fail before touching anything if the launch cannot possibly succeed;
# otherwise the old job would be killed and the error would only show up
# inside the nohup log.
[[ -x "${TORCHRUN}" ]] || die "torchrun not found or not executable: ${TORCHRUN}"
[[ -f "${WORK_DIR}/${SCRIPT_NAME}" ]] || die "training script not found: ${WORK_DIR}/${SCRIPT_NAME}"

# Clean up logs where they are actually written (not in whatever the current
# directory happens to be). -f keeps a first run, with no logs yet, silent.
# NOTE(review): the glob also matches logs that do not belong to this node;
# if the project directory is shared between nodes, confirm that is intended.
rm -f -- "${WORK_DIR}"/nohup_*.log

# -f matches against the full command line. "|| true" because pkill exits
# non-zero when nothing matched, which is the normal case on a fresh start.
/usr/bin/pkill -f "torchrun.*${SCRIPT_NAME}" || true

# Presumably gives the killed processes time to exit (and release the
# rendezvous port) before relaunching.
sleep 2

nohup "${TORCHRUN}" \
  --nproc_per_node=4 \
  --nnodes=4 \
  --node_rank=1 \
  --master_addr=node04 \
  --master_port=29600 \
  "${WORK_DIR}/${SCRIPT_NAME}" \
  > "${WORK_DIR}/${LOG_NAME}" 2>&1 &