Installation and Testing of OpenMPI 1.7.5 (MPI-3 compliant) with GCC, Intel, and PGI compilers

For a very long time, OpenMPI has described itself as "an open source, freely available implementation of both the MPI-1 and MPI-2 documents", which allows for parallel programming. The team has just released version 1.7.5, and they can proudly announce that Open MPI is now fully MPI-3.0 compliant. This is a "feature release" that will be part of the 1.8 series.

This is exciting news, although only a narrow collection of people will feel this sense of excitement. A somewhat larger set will enjoy the advantages of running parallel programs with an MPI-3 implementation, and a much larger set (possibly containing most of the species) will enjoy the products that arise from these improved capabilities. A full list of the changes is available.

The following is an example installation of OpenMPI 1.7.5 on a Linux cluster. It assumes the existence of the recent versions of the GCC, Intel, and PGI compilers, and the use of environment modules. It makes use of some small configuration files for each of these compilers.

First download to a sensible place and extract.


# Fetch and unpack the Open MPI 1.7.5 sources.  The steps are chained with &&
# so that a failed cd or download does not let the later commands run in the
# wrong directory or against a missing tarball.
cd /usr/local/src/OPENMPI &&
wget http://www.open-mpi.org/software/ompi/v1.7/downloads/openmpi-1.7.5.tar.bz2 &&
tar xvf openmpi-1.7.5.tar.bz2 &&
cd openmpi-1.7.5

Then walk through the compiler configurations for GCC, PGI, and Intel respectively.


# Build and install Open MPI once per compiler toolchain (GCC, PGI, Intel).
# Every pass starts from an empty module environment.  The loop stops at the
# first failing step so that a broken build is never installed.
for toolchain in gcc/4.8.2 pgi/12.10 intel/14.0.0; do
    module purge
    module load "${toolchain}" || break
    # Drop objects left behind by the previous compiler.  A freshly unpacked
    # tree has no Makefile yet, so there is nothing to clean on the first pass.
    if [ -f Makefile ]; then
        make clean || break
    fi
    # ${toolchain%%/*} strips the version: gcc/4.8.2 -> gcc -> ../config-gcc
    "../config-${toolchain%%/*}" || break
    make -j 2 all || break
    make install || break
done

One may even write a config-all script to run the entire set with a single command.

The config-gcc file takes the following form.


#!/bin/bash
# Configure Open MPI for the GCC toolchain.
# Must be run from inside the unpacked source directory: its name, with the
# first "-" replaced by "/", becomes part of the install prefix
# (openmpi-1.7.5 -> /usr/local/openmpi/1.7.5-gcc).
# Quoting keeps the script working when the path contains whitespace.
BASE=$(basename "$PWD" | sed -e 's,-,/,')
./configure --prefix="/usr/local/${BASE}-gcc" --with-openib --with-tm=/usr/local/torque/latest --enable-static --enable-shared

The config-intel file takes the following form.


#!/bin/bash
# Configure Open MPI for the Intel toolchain (icc / icpc / ifort).
# Must be run from inside the unpacked source directory: its name, with the
# first "-" replaced by "/", becomes part of the install prefix
# (openmpi-1.7.5 -> /usr/local/openmpi/1.7.5-intel).
# Quoting keeps the script working when the path contains whitespace.
BASE=$(basename "$PWD" | sed -e 's,-,/,')
CC=icc CXX=icpc F77=ifort FC=ifort ./configure --prefix="/usr/local/${BASE}-intel" --with-openib --with-tm=/usr/local/torque/latest --enable-static --enable-shared

The config-pgi file takes the following form.


#!/bin/bash
# Configure Open MPI for the PGI toolchain (pgcc / pgcpp / pgf77 / pgf90).
# Must be run from inside the unpacked source directory: its name, with the
# first "-" replaced by "/", becomes part of the install prefix
# (openmpi-1.7.5 -> /usr/local/openmpi/1.7.5-pgi).
# Quoting keeps the script working when the path contains whitespace.
BASE=$(basename "$PWD" | sed -e 's,-,/,')
CC=pgcc CXX=pgcpp F77=pgf77 FC=pgf90 ./configure --prefix="/usr/local/${BASE}-pgi" --with-openib --with-tm=/usr/local/torque/latest --enable-static --enable-shared

Each compiler version of openmpi now requires its own environment module.


# Expose one versioned modulefile per compiler build; each name is a symlink
# to the hidden .base-<compiler> file in the same directory.
cd /usr/local/Modules/modulefiles/openmpi
for compiler in intel gcc pgi; do
    ln -s ".base-${compiler}" "1.7.5-${compiler}"
done

A module file for intel is as follows:


#%Module1.0#####################################################################
##
## $name modulefile
##
## Wrapper for the Intel build: makes sure the Intel compiler module is loaded,
## then loads the compiler-specific module <name>-<compiler>/<subver>.
##
# The module name has the form <name>/<subver>-<compiler>,
# e.g. openmpi/1.7.5-intel -> name=openmpi, subver=1.7.5, compiler=intel.
set ver [lindex [split [module-info name] /] 1]
set name [lindex [split [module-info name] /] 0]
set loading [module-info mode load]
# Description text is read from the .desc file next to the modulefiles;
# the channel is closed explicitly instead of being leaked.
set descfile [open "/usr/local/Modules/modulefiles/$name/.desc"]
set desc [join [read $descfile]]
close $descfile
set subver [lindex [split $ver -] 0]
set compiler [lindex [split $ver -] 1]

proc ModulesHelp { } {
    # name and ver are set at file scope; a Tcl proc only sees them via "global".
    global name ver
    puts stderr "\tThis module sets the environment for $name v$ver"
}
module-whatis "$desc (v$ver)"
# Load the same compiler version that is tested for, not whatever the default is.
if { $loading && ![is-loaded intel/14.0.0] } {
    module load intel/14.0.0
}
module load $name-$compiler/$subver

A module file for gcc is as follows:


#%Module1.0#####################################################################
##
## $name modulefile
##
## Wrapper for the GCC build: makes sure the GCC compiler module is loaded,
## then loads the compiler-specific module <name>-<compiler>/<subver>.
##
# The module name has the form <name>/<subver>-<compiler>,
# e.g. openmpi/1.7.5-gcc -> name=openmpi, subver=1.7.5, compiler=gcc.
set ver [lindex [split [module-info name] /] 1]
set name [lindex [split [module-info name] /] 0]
set loading [module-info mode load]
# Description text is read from the .desc file next to the modulefiles;
# the channel is closed explicitly instead of being leaked.
set descfile [open "/usr/local/Modules/modulefiles/$name/.desc"]
set desc [join [read $descfile]]
close $descfile
set subver [lindex [split $ver -] 0]
set compiler [lindex [split $ver -] 1]

proc ModulesHelp { } {
    # name and ver are set at file scope; a Tcl proc only sees them via "global".
    global name ver
    puts stderr "\tThis module sets the environment for $name v$ver"
}
module-whatis "$desc (v$ver)"
# Pull in the compiler first, unless it is already loaded.
if { $loading && ![is-loaded gcc/4.8.2] } {
    module load gcc/4.8.2
}
module load $name-$compiler/$subver

A module file for pgi is as follows:


#%Module1.0#####################################################################
##
## $name modulefile
##
## Wrapper for the PGI build: makes sure the PGI compiler module is loaded,
## then loads the compiler-specific module <name>-<compiler>/<subver>.
##
# The module name has the form <name>/<subver>-<compiler>,
# e.g. openmpi/1.7.5-pgi -> name=openmpi, subver=1.7.5, compiler=pgi.
set ver [lindex [split [module-info name] /] 1]
set name [lindex [split [module-info name] /] 0]
set loading [module-info mode load]
# Description text is read from the .desc file next to the modulefiles;
# the channel is closed explicitly instead of being leaked.
set descfile [open "/usr/local/Modules/modulefiles/$name/.desc"]
set desc [join [read $descfile]]
close $descfile
set subver [lindex [split $ver -] 0]
set compiler [lindex [split $ver -] 1]

proc ModulesHelp { } {
    # name and ver are set at file scope; a Tcl proc only sees them via "global".
    global name ver
    puts stderr "\tThis module sets the environment for $name v$ver"
}
module-whatis "$desc (v$ver)"
# Pull in the compiler first, unless it is already loaded.
if { $loading && ![is-loaded pgi/12.10] } {
    module load pgi/12.10
}
module load $name-$compiler/$subver

When this is done, exit and log in as an ordinary user. Copy the PBS test scripts to a directory and conduct a minimal test of each version, using mpi-pong.c.


# Compile and run the ping-pong benchmark once per Open MPI build.
# mpiexec is only started when mpicc succeeded; otherwise the binary left over
# from the previous compiler would be run and the test would falsely "pass".
for compiler in intel gcc pgi; do
    module purge
    module load "openmpi-${compiler}/1.7.5"
    mpicc -o mpi-pong mpi-pong.c && mpiexec -np 2 ./mpi-pong
done


/* mpi-pong.c Generic Benchmark code
* Dave Turner - Ames Lab - July of 1994+++
*
* Most Unix timers can't be trusted for very short times, so take this
* into account when looking at the results. This code also only times
* a single message passing event for each size, so the results may vary
* between runs. For more accurate measurements, grab NetPIPE from
* http://www.scl.ameslab.gov/ .
*/
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
/* Buffer capacity in doubles and the largest message size in bytes.  The code
 * converts bytes to elements with size / 8 (i.e. it assumes 8-byte doubles),
 * so the largest message is 131072 elements and fits in BUF_DOUBLES. */
enum { BUF_DOUBLES = 132000, MAX_MSG_BYTES = 1048576 };

/* Load the payload pattern a[i] = i and clear b for the first count elements. */
static void reset_buffers(double *a, double *b, int count)
{
    int i;

    for (i = 0; i < count; i++) {
        a[i] = (double) i;
        b[i] = 0.0;
    }
}

/* Mark a received buffer by adding 1 to its first and last element
 * (only once when the message has a single element). */
static void bump_ends(double *buf, int last)
{
    buf[0] += 1.0;
    if (last != 0)
        buf[last] += 1.0;
}

/* Non-zero when both end elements carry the +1 mark applied by bump_ends(). */
static int ends_ok(const double *buf, int last)
{
    return buf[0] == 1.0 && buf[last] == last + 1;
}

/* Print every interior element (indices 1 .. last-1) that does not hold its
 * own index.  label is the buffer name used in the message. */
static void check_interior(const double *buf, const char *label, int last)
{
    int i;

    for (i = 1; i < last; i++) {
        if (buf[i] != (double) i)
            printf("ERROR - %s[%d] = %f\n", label, i, buf[i]);
    }
}

/* On rank 0, print the timing for one message size and fold it into the
 * running maximum rate / minimum latency.  usec is the round-trip time in
 * microseconds; other ranks print nothing. */
static void report(int myproc, int size, double usec,
                   double *max_rate, double *min_latency)
{
    double rate;

    if (myproc != 0)
        return;

    if (usec > 0.000001) {
        /* size bytes travel in each direction, hence the factor 2 */
        rate = 2.0 * size / usec;
        printf(" %7d bytes took %9.0f usec (%8.3f MB/sec)\n", size, usec, rate);
        if (rate > *max_rate) *max_rate = rate;
        if (usec / 2 < *min_latency) *min_latency = usec / 2;
    } else {
        printf(" %7d bytes took less than the timer accuracy\n", size);
    }
}

/* Two-process ping-pong benchmark: times blocking, pre-posted asynchronous and
 * bi-directional exchanges for message sizes 8 bytes .. MAX_MSG_BYTES and
 * verifies the payload after every exchange.  Returns 0 on success and 1 when
 * the buffers cannot be allocated or the job does not have exactly two
 * processes; a corrupted blocking exchange aborts the whole MPI job. */
int main (int argc, char **argv)
{
    int myproc, size, other_proc, nprocs, count, last;
    double t0, t1, elapsed_us;
    double *a, *b;
    double max_rate = 0.0, min_latency = 10e6;
    MPI_Request request, request_a, request_b;
    MPI_Status status;

#if defined (_CRAYT3E)
    a = (double *) shmalloc (BUF_DOUBLES * sizeof (double));
    b = (double *) shmalloc (BUF_DOUBLES * sizeof (double));
#else
    a = malloc (BUF_DOUBLES * sizeof *a);
    b = malloc (BUF_DOUBLES * sizeof *b);
#endif
    if (a == NULL || b == NULL) {
        /* MPI is not initialised yet, so a plain exit status is enough. */
        fprintf(stderr, "Error: could not allocate the message buffers.\n");
        return 1;
    }
    reset_buffers(a, b, BUF_DOUBLES);

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
    if (nprocs != 2) {
        /* Every rank sees the same nprocs, so all of them can shut MPI down
         * cleanly here instead of exiting with MPI still initialised. */
        printf("Error: You don't have two processors available.\n");
        MPI_Finalize();
        return 1;
    }
    other_proc = (myproc + 1) % 2;
    printf("Hello from %d of %d\n", myproc, nprocs);
    MPI_Barrier(MPI_COMM_WORLD);

    /* Timer accuracy test: spin until MPI_Wtime() advances. */
    t0 = MPI_Wtime();
    t1 = MPI_Wtime();
    while (t1 == t0) t1 = MPI_Wtime();
    if (myproc == 0)
        printf("Timer accuracy of ~%f usecs\n\n", (t1 - t0) * 1000000);

    /* Communications between nodes
     * - Blocking sends and recvs
     * - No guarantee of prepost, so might pass through comm buffer
     */
    for (size = 8; size <= MAX_MSG_BYTES; size *= 2) {
        count = size / 8;
        last = count - 1;
        reset_buffers(a, b, count);
        MPI_Barrier(MPI_COMM_WORLD);
        t0 = MPI_Wtime();
        if (myproc == 0) {
            MPI_Send(a, count, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD);
            MPI_Recv(b, count, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD, &status);
        } else {
            MPI_Recv(b, count, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD, &status);
            bump_ends(b, last);
            MPI_Send(b, count, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD);
        }
        t1 = MPI_Wtime();
        elapsed_us = 1.e6 * (t1 - t0);
        MPI_Barrier(MPI_COMM_WORLD);
        if (!ends_ok(b, last)) {
            printf("ERROR - b[0] = %f b[%d] = %f\n", b[0], last, b[last]);
            /* Only one rank may detect the corruption; abort the whole job so
             * the peer is not left blocked.  Flush first because MPI_Abort
             * may not run the normal stdio shutdown. */
            fflush(stdout);
            MPI_Abort(MPI_COMM_WORLD, 1);
            exit (1);
        }
        check_interior(b, "b", last);
        report(myproc, size, elapsed_us, &max_rate, &min_latency);
    }

    /* Async communications
     * - Prepost receives to guarantee bypassing the comm buffer
     */
    MPI_Barrier(MPI_COMM_WORLD);
    if (myproc == 0) printf("\n Asynchronous ping-pong\n\n");
    for (size = 8; size <= MAX_MSG_BYTES; size *= 2) {
        count = size / 8;
        last = count - 1;
        reset_buffers(a, b, count);
        MPI_Irecv(b, count, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD, &request);
        MPI_Barrier(MPI_COMM_WORLD);
        t0 = MPI_Wtime();
        if (myproc == 0) {
            MPI_Send(a, count, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD);
            MPI_Wait(&request, &status);
        } else {
            MPI_Wait(&request, &status);
            bump_ends(b, last);
            MPI_Send(b, count, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD);
        }
        t1 = MPI_Wtime();
        elapsed_us = 1.e6 * (t1 - t0);
        MPI_Barrier(MPI_COMM_WORLD);
        if (!ends_ok(b, last))
            printf("ERROR - b[0] = %f b[%d] = %f\n", b[0], last, b[last]);
        check_interior(b, "b", last);
        report(myproc, size, elapsed_us, &max_rate, &min_latency);
    }

    /* Bidirectional communications
     * - Prepost receives to guarantee bypassing the comm buffer
     * Both ranks send a (matched by the peer's first posted receive, b), mark
     * b, and send it back (matched by the peer's second receive, a).
     * NOTE(review): a is passed to MPI_Send while a receive is still posted
     * on it; the MPI standard says a pending receive buffer should not be
     * accessed.  Kept as in the original benchmark - confirm before changing
     * the exchange pattern.
     */
    MPI_Barrier(MPI_COMM_WORLD);
    if (myproc == 0) printf("\n Bi-directional asynchronous ping-pong\n\n");
    for (size = 8; size <= MAX_MSG_BYTES; size *= 2) {
        count = size / 8;
        last = count - 1;
        reset_buffers(a, b, count);
        MPI_Irecv(b, count, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD, &request_b);
        MPI_Irecv(a, count, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD, &request_a);
        MPI_Barrier(MPI_COMM_WORLD);
        t0 = MPI_Wtime();
        MPI_Send(a, count, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD);
        MPI_Wait(&request_b, &status);
        bump_ends(b, last);
        MPI_Send(b, count, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD);
        MPI_Wait(&request_a, &status);
        t1 = MPI_Wtime();
        elapsed_us = 1.e6 * (t1 - t0);
        MPI_Barrier(MPI_COMM_WORLD);
        if (!ends_ok(a, last))
            printf("ERROR - a[0] = %f a[%d] = %f\n", a[0], last, a[last]);
        check_interior(a, "a", last);
        report(myproc, size, elapsed_us, &max_rate, &min_latency);
    }

    if (myproc == 0)
        printf("\n Max rate = %f MB/sec Min latency = %f usec\n",
               max_rate, min_latency);

#if defined (_CRAYT3E)
    shfree(a);
    shfree(b);
#else
    free(a);
    free(b);
#endif
    MPI_Finalize();
    return (0);
}