Running multiple multi-GPU training files sequentially with a shell script
A typical single-node, multi-GPU training launch looks like this:
python -m torch.distributed.run --nproc_per_node 2 train.py
The command above trains on 2 GPUs (torchrun is the newer, equivalent entry point for the same launcher).
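For reference, a training file launched this way typically builds its DistributedDataParallel setup from the environment variables that torch.distributed.run provides to each worker process. The sketch below is illustrative only; the linear model and random data are placeholders, not from the original post:

import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

def main():
    # torch.distributed.run sets LOCAL_RANK/RANK/WORLD_SIZE for each process
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend="nccl")

    # Placeholder model wrapped in DDP; replace with the real network
    model = DDP(torch.nn.Linear(10, 1).cuda(local_rank), device_ids=[local_rank])
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    for step in range(100):  # placeholder training loop on random data
        x = torch.randn(32, 10, device=local_rank)
        loss = model(x).pow(2).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    dist.destroy_process_group()

if __name__ == "__main__":
    main()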
With a shell script, we can launch several such multi-GPU training files sequentially in a single run, for example train1.py and train2.py.
Key property: train2.py starts only after train1.py has finished. The full script is shown below.
#!/bin/bash
# Set environment variables
export CUDA_VISIBLE_DEVICES=0,1   # GPUs to use
export NCCL_DEBUG=INFO            # NCCL debug verbosity
export MASTER_ADDR="127.0.0.1"    # master node IP address
export MASTER_PORT="29500"        # master node port

# Print the start time
echo "Training started at $(date)"

# Run the first training script
echo "Starting training script 1: train1.py"
python -m torch.distributed.run --nproc_per_node 2 train1.py
# Check whether the first script succeeded
if [ $? -ne 0 ]; then
    echo "Training script 1 failed at $(date)"
    exit 1
else
    echo "Training script 1 finished successfully at $(date)"
fi

# Make sure the first run has fully terminated before starting the second
echo "Waiting for 5 seconds to ensure the first script is fully terminated..."
sleep 5

# Run the second training script
echo "Starting training script 2: train2.py"
python -m torch.distributed.run --nproc_per_node 2 train2.py
# Check whether the second script succeeded
if [ $? -ne 0 ]; then
    echo "Training script 2 failed at $(date)"
    exit 1
else
    echo "Training script 2 finished successfully at $(date)"
fi

# Print the finish time
echo "All training scripts finished at $(date)"