#!/bin/bash
#
# Restart the torchrun workers for this node of a 4-node job.
#
# Steps:
#   1. Delete old nohup_*.log files from the current working directory.
#   2. Terminate any torchrun process already running SCRIPT_NAME.
#   3. Relaunch torchrun in the background under nohup, sending stdout and
#      stderr to LOG_NAME inside PROJECT_DIR.
#
# Usage: run on the node that should join as --node_rank=3.
# NOTE(review): the log cleanup is relative to the caller's working directory,
# while the new log is written to PROJECT_DIR — presumably the script is meant
# to be started from PROJECT_DIR; confirm.

set -euo pipefail

readonly SCRIPT_NAME="main_tr.py"
readonly LOG_NAME="nohup_node03.log"
readonly PROJECT_DIR="${HOME}/yuzhou/jiangcang_vj"
readonly TORCHRUN="${HOME}/anaconda3/bin/torchrun"

# -f keeps this quiet and successful when no previous log exists.
rm -f -- nohup_*.log

# pkill -f matches against the full command line using a regular expression;
# "[.]" makes the dots in the script name literal. pkill exits non-zero when
# nothing matched, which is expected on a fresh start, so that is tolerated.
/usr/bin/pkill -f "torchrun.*${SCRIPT_NAME//./[.]}" || true

# Short pause before relaunching so the signalled processes can exit.
sleep 2

# Launch detached from the terminal; all output goes to the log file.
nohup "${TORCHRUN}" \
  --nproc_per_node=4 \
  --nnodes=4 \
  --node_rank=3 \
  --master_addr=node04 \
  --master_port=29600 \
  "${PROJECT_DIR}/${SCRIPT_NAME}" \
  > "${PROJECT_DIR}/${LOG_NAME}" 2>&1 &