Installing OpenMPI on a 64-bit AMD Opteron Cluster running CentOS 5 Linux

Open MPI is an open-source MPI library that merges three earlier major implementations (FT-MPI, LA-MPI, and LAM/MPI) into a single, complete MPI-2 implementation. MPI (Message Passing Interface) is a language-independent communications protocol used to program parallel computers.

To install, log in to the management node of the cluster, download the source code, extract it, and run the appropriate configure script for each of the compilers in use (e.g., pgi, gcc and intel), loading the matching environment module for each compiler before its build. Each of these configure scripts carries its own compiler-specific flags. The build step can be run in parallel with `make -j`, as is done here; otherwise a plain `make all install` is recommended.


cd /usr/local/src/OPENMPI
wget http://www.open-mpi.org/software/ompi/v1.4/downloads/openmpi-1.4.3.tar.bz2
tar xvf openmpi-1.4.3.tar.bz2
cd openmpi-1.4.3

# build and install the GCC version
module purge
module load gcc
../config-gcc
make -j 4 all
make install

# rebuild and install the PGI version
module purge
module load pgi
make clean
../config-pgi
make -j 4 all
make install

# rebuild and install the Intel version
module purge
module load intel
make clean
../config-intel
make -j 4 all
make install

The various scripts are in the following form:


cat config-gcc
#!/bin/bash
BASE=`basename $PWD | sed -e s,-,/,`
./configure --prefix=/usr/local/${BASE}-gcc --with-openib --with-tm=/usr/local/torque/latest --enable-static --enable-shared


cat config-pgi
#!/bin/bash
BASE=`basename $PWD | sed -e s,-,/,`
CC=pgcc CXX=pgcpp F77=pgf77 FC=pgf90 ./configure --prefix=/usr/local/${BASE}-pgi --with-openib --with-tm=/usr/local/torque/latest --enable-static --enable-shared


cat config-intel
#!/bin/bash
BASE=`basename $PWD | sed -e s,-,/,`
CC=icc CXX=icpc F77=ifort FC=ifort ./configure --prefix=/usr/local/${BASE}-intel --with-openib --with-tm=/usr/local/torque/latest --enable-static --enable-shared
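
Before building the environment modules it is worth confirming that each build installed where expected. Open MPI's `ompi_info` tool reports the compiler and configuration a build was made with; the paths below simply follow from the prefixes set in the config scripts above.


/usr/local/openmpi/1.4.3-gcc/bin/ompi_info | grep -i compiler
/usr/local/openmpi/1.4.3-pgi/bin/ompi_info | grep -i compiler
/usr/local/openmpi/1.4.3-intel/bin/ompi_info | grep -i compiler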

Each compiler-specific build of Open MPI now requires its own environment module. Create a modulefile of the following form for each compiler (e.g., openmpi-gcc/1.4.3, openmpi-pgi/1.4.3 and openmpi-intel/1.4.3); it derives the version and compiler from its own name:

#%Module1.0#####################################################################
##
## $name modulefile
##

set ver [lrange [split [ module-info name ] / ] 1 1 ]
set name [lrange [split [ module-info name ] / ] 0 0 ]
set loading [module-info mode load]
set subname [lrange [split $name - ] 0 0 ]
set compiler [lrange [split $name - ] 1 1 ]

proc ModulesHelp { } {
  global name ver
  puts stderr "\tThis module sets the environment for $name v$ver"
}

module-whatis   "Set environment variables to use $name version $ver"

if { $loading && ![ is-loaded $compiler ] } {
  module load $compiler
}

prepend-path --delim " " CPPFLAGS -I/usr/local/$subname/$ver-$compiler/include
prepend-path --delim " " LDFLAGS -L/usr/local/$subname/$ver-$compiler/lib

prepend-path LD_LIBRARY_PATH /usr/local/$subname/$ver-$compiler/lib
prepend-path MANPATH /usr/local/$subname/$ver-$compiler/share/man
prepend-path PATH /usr/local/$subname/$ver-$compiler/bin

setenv  MPI_DIR		/usr/local/$subname/$ver-$compiler/
setenv  MPI_HOME        /usr/local/$subname/$ver-$compiler/
setenv  OPENMPI_ROOT    /usr/local/$subname/$ver-$compiler/
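
Where this modulefile lives depends on the site's MODULEPATH; the directory below is only an assumption based on a stock Environment Modules install, and openmpi-modulefile is a placeholder name for the file shown above. Because the file derives the version and compiler from its own name, the same content is simply copied in under each compiler-specific name and then checked with module avail:


mkdir -p /usr/local/Modules/modulefiles/openmpi-{gcc,pgi,intel}
cp openmpi-modulefile /usr/local/Modules/modulefiles/openmpi-gcc/1.4.3
cp openmpi-modulefile /usr/local/Modules/modulefiles/openmpi-pgi/1.4.3
cp openmpi-modulefile /usr/local/Modules/modulefiles/openmpi-intel/1.4.3
module avail openmpi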

When this is done, exit the root account and log in as a regular user. Conduct a minimal test of each version with a parallel program:


ssh [username]@[cluster].[loginnode]
module load openmpi-intel/1.4.3
mpicc -o mpi-pong mpi-pong.c
mpiexec -np 2 ./mpi-pong
module purge
module load openmpi-gcc/1.4.3
mpicc -o mpi-pong mpi-pong.c
mpiexec -np 2 ./mpi-pong
module purge
module load openmpi-pgi/1.4.3
mpicc -o mpi-pong mpi-pong.c
mpiexec -np 2 ./mpi-pong
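
The interactive runs above execute on the login node. Because each build was configured --with-tm, mpiexec started inside a Torque job will launch ranks on the allocated compute nodes through the TM interface, with no machinefile needed. A minimal job script might look like the following sketch; pong.pbs is a placeholder name and the resource requests should match whatever this cluster actually provides.


cat pong.pbs
#!/bin/bash
#PBS -N mpi-pong
#PBS -l nodes=2:ppn=1
#PBS -l walltime=00:10:00

module load openmpi-gcc/1.4.3
cd $PBS_O_WORKDIR
mpiexec -np 2 ./mpi-pong

qsub pong.pbs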

The program mpi-pong takes the following form:

/*                  pong.c Generic Benchmark code
 *               Dave Turner - Ames Lab - July of 1994+++
 *
 *  Most Unix timers can't be trusted for very short times, so take this
 *  into account when looking at the results.  This code also only times
 *  a single message passing event for each size, so the results may vary
 *  between runs.  For more accurate measurements, grab NetPIPE from
 *  http://www.scl.ameslab.gov/ .
 */
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>

int main (int argc, char **argv)
{
   int myproc, size, other_proc, nprocs, i, last;
   double t0, t1, time;
   double *a, *b;
   double max_rate = 0.0, min_latency = 10e6;
   MPI_Request request, request_a, request_b;
   MPI_Status status;

#if defined (_CRAYT3E)
   a = (double *) shmalloc (132000 * sizeof (double));
   b = (double *) shmalloc (132000 * sizeof (double));
#else
   a = (double *) malloc (132000 * sizeof (double));
   b = (double *) malloc (132000 * sizeof (double));
#endif

   for (i = 0; i < 132000; i++) {
      a[i] = (double) i;
      b[i] = 0.0;
   }

   MPI_Init(&argc, &argv);
   MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
   MPI_Comm_rank(MPI_COMM_WORLD, &myproc);

   if (nprocs != 2) {
	   printf("Error: You don't have two processors available.");
	   exit (1);
   }

   other_proc = (myproc + 1) % 2;

   printf("Hello from %d of %d\n", myproc, nprocs);
   MPI_Barrier(MPI_COMM_WORLD);

/* Timer accuracy test */

   t0 = MPI_Wtime();
   t1 = MPI_Wtime();

   while (t1 == t0) t1 = MPI_Wtime();

   if (myproc == 0)
      printf("Timer accuracy of ~%f usecs\n\n", (t1 - t0) * 1000000);

/* Communications between nodes 
 *   - Blocking sends and recvs
 *   - No guarantee of prepost, so might pass through comm buffer
 */

   for (size = 8; size <= 1048576; size *= 2) {
      for (i = 0; i < size / 8; i++) {
         a[i] = (double) i;
         b[i] = 0.0;
      }
      last = size / 8 - 1;

      MPI_Barrier(MPI_COMM_WORLD);
      t0 = MPI_Wtime();

      if (myproc == 0) {

         MPI_Send(a, size/8, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD);
         MPI_Recv(b, size/8, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD, &status);

      } else {

         MPI_Recv(b, size/8, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD, &status);

         b[0] += 1.0;
         if (last != 0)
         b[last] += 1.0;

         MPI_Send(b, size/8, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD);

      }

      t1 = MPI_Wtime();
      time = 1.e6 * (t1 - t0);
      MPI_Barrier(MPI_COMM_WORLD);

      if ((b[0] != 1.0 || b[last] != last + 1)) {
         printf("ERROR - b[0] = %f b[%d] = %f\n", b[0], last, b[last]);
         exit (1);
      }
      for (i = 1; i < last - 1; i++)
         if (b[i] != (double) i)
            printf("ERROR - b[%d] = %f\n", i, b[i]);
      if (myproc == 0 && time > 0.000001) {
         printf(" %7d bytes took %9.0f usec (%8.3f MB/sec)\n",
                     size, time, 2.0 * size / time);
         if (2 * size / time > max_rate) max_rate = 2 * size / time;
         if (time / 2 < min_latency) min_latency = time / 2;
      } else if (myproc == 0) {
         printf(" %7d bytes took less than the timer accuracy\n", size);
      }
   }

/* Async communications
 *   - Prepost receives to guarantee bypassing the comm buffer
 */

   MPI_Barrier(MPI_COMM_WORLD);
   if (myproc == 0) printf("\n  Asynchronous ping-pong\n\n");

   for (size = 8; size <= 1048576; size *= 2) {
      for (i = 0; i < size / 8; i++) {
         a[i] = (double) i;
         b[i] = 0.0;
      }
      last = size / 8 - 1;

      MPI_Irecv(b, size/8, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD, &request);
      MPI_Barrier(MPI_COMM_WORLD);
      t0 = MPI_Wtime();

      if (myproc == 0) {

         MPI_Send(a, size/8, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD);
         MPI_Wait(&request, &status);

      } else {

         MPI_Wait(&request, &status);

         b[0] += 1.0;
         if (last != 0)
         b[last] += 1.0;

         MPI_Send(b, size/8, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD);
      }

      t1 = MPI_Wtime();

      time = 1.e6 * (t1 - t0);
      MPI_Barrier(MPI_COMM_WORLD);

      if ((b[0] != 1.0 || b[last] != last + 1))
         printf("ERROR - b[0] = %f b[%d] = %f\n", b[0], last, b[last]);
      for (i = 1; i < last - 1; i++)
         if (b[i] != (double) i)
            printf("ERROR - b[%d] = %f\n", i, b[i]);
      if (myproc == 0 && time > 0.000001) {
         printf(" %7d bytes took %9.0f usec (%8.3f MB/sec)\n",
                  size, time, 2.0 * size / time);
         if (2 * size / time > max_rate) max_rate = 2 * size / time;
         if (time / 2 < min_latency) min_latency = time / 2;
      } else if (myproc == 0) {
         printf(" %7d bytes took less than the timer accuracy\n", size);
      }
   }

/* Bidirectional communications
 *   - Prepost receives to guarantee bypassing the comm buffer
 */

   MPI_Barrier(MPI_COMM_WORLD);
   if (myproc == 0) printf("\n  Bi-directional asynchronous ping-pong\n\n");

   for (size = 8; size <= 1048576; size *= 2) {
      for (i = 0; i < size / 8; i++) {
         a[i] = (double) i;
         b[i] = 0.0;
      }
      last = size / 8 - 1;

      MPI_Irecv(b, size/8, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD, &request_b);
      MPI_Irecv(a, size/8, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD, &request_a);
      MPI_Barrier(MPI_COMM_WORLD);

      t0 = MPI_Wtime();

      MPI_Send(a, size/8, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD);
      MPI_Wait(&request_b, &status);

      b[0] += 1.0;
      if (last != 0)
      b[last] += 1.0;

      MPI_Send(b, size/8, MPI_DOUBLE, other_proc, 0, MPI_COMM_WORLD);
      MPI_Wait(&request_a, &status);

      t1 = MPI_Wtime();
      time = 1.e6 * (t1 - t0);
      MPI_Barrier(MPI_COMM_WORLD);

      if ((a[0] != 1.0 || a[last] != last + 1))
         printf("ERROR - a[0] = %f a[%d] = %f\n", a[0], last, a[last]);
      for (i = 1; i < last - 1; i++)
      if (a[i] != (double) i)
         printf("ERROR - a[%d] = %f\n", i, a[i]);
      if (myproc == 0 && time > 0.000001) {
         printf(" %7d bytes took %9.0f usec (%8.3f MB/sec)\n",
                    size, time, 2.0 * size / time);
         if (2 * size / time > max_rate) max_rate = 2 * size / time;
         if (time / 2 < min_latency) min_latency = time / 2;
      } else if (myproc == 0) {
         printf(" %7d bytes took less than the timer accuracy\n", size);
      }
   }

   if (myproc == 0)
      printf("\n Max rate = %f MB/sec  Min latency = %f usec\n",
               max_rate, min_latency);

   MPI_Finalize();
   return (0);
}

MPI applications should be compiled using the Open MPI "wrapper" compilers:

C programs: mpicc your-code.c
C++ programs: mpiCC your-code.cc or
mpic++ your-code.cc (for case-insensitive filesystems)
F77 programs: mpif77 your-code.f
F90 programs: mpif90 your-code.f90
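
The wrappers add the Open MPI include and library flags to whichever underlying compiler the build was configured with. Passing --showme prints the full command line without compiling anything, which is a quick way to check which compiler the currently loaded module points at, e.g.:


mpicc --showme your-code.c
mpif90 --showme your-code.f90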