1. 基本主机检测测试
# 测试MPI是否能发现所有节点
mpirun -np 2 --allow-run-as-root hostname
# 使用MPI内置的进程信息
mpirun -np 4 --allow-run-as-root printenv OMPI_COMM_WORLD_RANK OMPI_COMM_WORLD_SIZE
# np参数按需修改
2. MPI 点对点通信测试
# 简单的ping-pong测试(需要编写测试程序)
cat > mpi_pingpong.c << 'EOF'
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * MPI ping-pong smoke test.
 *
 * Rank 0 fills a buffer of doubles, sends it to rank 1, waits for rank 1 to
 * send it back, and prints the elapsed round-trip time. Rank 1 receives the
 * buffer and echoes it back. Any additional ranks allocate and free the
 * buffer but take no part in the exchange.
 *
 * Returns 0 on success, 1 if fewer than two processes were started or the
 * buffer could not be allocated.
 */
int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int world_rank, world_size;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    /* A ping-pong needs a partner; bail out cleanly when run with -np 1. */
    if (world_size < 2) {
        printf("This test requires at least 2 processes\n");
        MPI_Finalize();
        return 1;
    }

    const int num_elements = 1000;
    double *data = malloc(num_elements * sizeof *data);
    if (data == NULL) {
        fprintf(stderr, "rank %d: failed to allocate %d doubles\n",
                world_rank, num_elements);
        /* Abort the whole job so the peer rank is not left blocked in
         * MPI_Send/MPI_Recv waiting for a process that cannot proceed. */
        MPI_Abort(MPI_COMM_WORLD, 1);
        return 1;
    }

    if (world_rank == 0) {
        /* Process 0 sends to process 1 */
        for (int i = 0; i < num_elements; i++) {
            data[i] = i * 1.0;
        }

        double start_time = MPI_Wtime();
        MPI_Send(data, num_elements, MPI_DOUBLE, 1, 0, MPI_COMM_WORLD);
        MPI_Recv(data, num_elements, MPI_DOUBLE, 1, 0, MPI_COMM_WORLD,
                 MPI_STATUS_IGNORE);
        double end_time = MPI_Wtime();

        printf("Ping-pong time for %d doubles: %f seconds\n",
               num_elements, end_time - start_time);
    } else if (world_rank == 1) {
        /* Process 1 receives from process 0 and sends back */
        MPI_Recv(data, num_elements, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD,
                 MPI_STATUS_IGNORE);
        MPI_Send(data, num_elements, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
    }

    free(data);
    MPI_Finalize();
    return 0;
}
EOF
# 编译运行
mpicc -o mpi_pingpong mpi_pingpong.c
mpirun -np 2 --allow-run-as-root ./mpi_pingpong
4. 使用 OpenMPI 自带的性能测试工具
# 测试点对点通信带宽和延迟 测试双节点,验证可行性
mpirun -np 2 --allow-run-as-root $(which osu_bw)
# 测试延迟
mpirun -np 2 --allow-run-as-root $(which osu_latency)
# 测试集体操作性能(如果安装了osu-micro-benchmarks)
mpirun -np 4 --allow-run-as-root $(which osu_allreduce)
# 如果没有安装,可以先安装:
# Ubuntu: sudo apt-get install libopenmpi-dev openmpi-bin openssh-client
# CentOS: sudo yum install openmpi-devel openmpi openssh-clients
5. MPI排障思路
# 我用的是主机文件
# 例如
192.168.1.100 slots=8
192.168.1.101 slots=8
配置 SSH 无密码登录
ssh-keygen -t rsa
# 将公钥复制到另一台机器
ssh-copy-id your-other-machine
# 测试无密码登录
ssh your-other-machine
# 如果仍然有问题,检查 MPI 配置
# 查看 MPI 的详细调试信息
mpirun -hostfile /etc/mpi_hosts -np 2 --mca plm_base_verbose 5 --allow-run-as-root hostname
# 检查 MPI 的进程启动方法
ompi_info --param plm all
# 尝试使用不同的进程启动方法
mpirun -hostfile /etc/mpi_hosts --mca plm_rsh_agent ssh -np 2 --allow-run-as-root hostname