I have written simulation software for highly parallel execution, using MPI for internode parallelization and threads for intranode parallelization to reduce the memory footprint by sharing memory within a node. Is it possible to have only one copy of a large lookup table per node, shared by all MPI processes on that node?
This can be done with MPI-3 shared-memory windows. Here is test code that sets up a small table on each shared-memory node: only one process (node rank 0) actually allocates and initialises the table, but all processes on the node can read it.
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int main(void)
{
    int i, flag;
    int nodesize, noderank;
    int size, rank;
    int tablesize, localtablesize;
    int *table, *localtable;
    int *model;

    MPI_Comm allcomm, nodecomm;
    MPI_Win wintable;

    char verstring[MPI_MAX_LIBRARY_VERSION_STRING];
    char nodename[MPI_MAX_PROCESSOR_NAME];

    MPI_Aint winsize;
    int windisp;
    int version, subversion, verstringlen, nodestringlen;

    allcomm = MPI_COMM_WORLD;
    tablesize = 5;

    MPI_Init(NULL, NULL);

    MPI_Comm_size(allcomm, &size);
    MPI_Comm_rank(allcomm, &rank);

    MPI_Get_processor_name(nodename, &nodestringlen);
    MPI_Get_version(&version, &subversion);
    MPI_Get_library_version(verstring, &verstringlen);

    if (rank == 0)
    {
        printf("Version %d, subversion %d\n", version, subversion);
        printf("Library <%s>\n", verstring);
    }

    // Create node-local communicator: all ranks on the same
    // shared-memory node end up in the same nodecomm
    MPI_Comm_split_type(allcomm, MPI_COMM_TYPE_SHARED, rank,
                        MPI_INFO_NULL, &nodecomm);

    MPI_Comm_size(nodecomm, &nodesize);
    MPI_Comm_rank(nodecomm, &noderank);

    // Only rank 0 on a node actually allocates memory
    localtablesize = 0;
    if (noderank == 0) localtablesize = tablesize;

    // Debug info
    printf("Rank %d of %d, rank %d of %d in node <%s>, localtablesize %d\n",
           rank, size, noderank, nodesize, nodename, localtablesize);

    MPI_Win_allocate_shared(localtablesize*sizeof(int), sizeof(int),
                            MPI_INFO_NULL, nodecomm, &localtable, &wintable);

    // Check that we have the unified memory model, so that direct
    // load/store access to the shared window behaves as expected
    MPI_Win_get_attr(wintable, MPI_WIN_MODEL, &model, &flag);

    if (1 != flag)
    {
        printf("Attribute MPI_WIN_MODEL not defined\n");
    }
    else
    {
        if (MPI_WIN_UNIFIED == *model)
        {
            if (rank == 0) printf("Memory model is MPI_WIN_UNIFIED\n");
        }
        else
        {
            if (rank == 0) printf("Memory model is *not* MPI_WIN_UNIFIED\n");

            MPI_Finalize();
            return 1;
        }
    }

    // Need to get a local pointer valid for the table on noderank 0
    table = localtable;

    if (noderank != 0)
    {
        MPI_Win_shared_query(wintable, 0, &winsize, &windisp, &table);
    }

    // All table pointers should now point to the copy on noderank 0

    // Initialise the table on noderank 0 with appropriate synchronisation
    MPI_Win_fence(0, wintable);

    if (noderank == 0)
    {
        for (i=0; i < tablesize; i++)
        {
            table[i] = rank*tablesize + i;
        }
    }

    MPI_Win_fence(0, wintable);

    // Check we did it right
    for (i=0; i < tablesize; i++)
    {
        printf("rank %d, noderank %d, table[%d] = %d\n",
               rank, noderank, i, table[i]);
    }

    MPI_Finalize();

    return 0;
}
Here is some sample output for 6 processes across two nodes:
Version 3, subversion 1
Library <SGI MPT 2.14 04/05/16 03:53:22>
Rank 3 of 6, rank 0 of 3 in node <r1i0n1>, localtablesize 5
Rank 4 of 6, rank 1 of 3 in node <r1i0n1>, localtablesize 0
Rank 5 of 6, rank 2 of 3 in node <r1i0n1>, localtablesize 0
Rank 0 of 6, rank 0 of 3 in node <r1i0n0>, localtablesize 5
Rank 1 of 6, rank 1 of 3 in node <r1i0n0>, localtablesize 0
Rank 2 of 6, rank 2 of 3 in node <r1i0n0>, localtablesize 0
Memory model is MPI_WIN_UNIFIED
rank 3, noderank 0, table[0] = 15
rank 3, noderank 0, table[1] = 16
rank 3, noderank 0, table[2] = 17
rank 3, noderank 0, table[3] = 18
rank 3, noderank 0, table[4] = 19
rank 4, noderank 1, table[0] = 15
rank 4, noderank 1, table[1] = 16
rank 4, noderank 1, table[2] = 17
rank 4, noderank 1, table[3] = 18
rank 4, noderank 1, table[4] = 19
rank 5, noderank 2, table[0] = 15
rank 5, noderank 2, table[1] = 16
rank 5, noderank 2, table[2] = 17
rank 5, noderank 2, table[3] = 18
rank 5, noderank 2, table[4] = 19
rank 0, noderank 0, table[0] = 0
rank 0, noderank 0, table[1] = 1
rank 0, noderank 0, table[2] = 2
rank 0, noderank 0, table[3] = 3
rank 0, noderank 0, table[4] = 4
rank 1, noderank 1, table[0] = 0
rank 1, noderank 1, table[1] = 1
rank 1, noderank 1, table[2] = 2
rank 1, noderank 1, table[3] = 3
rank 1, noderank 1, table[4] = 4
rank 2, noderank 2, table[0] = 0
rank 2, noderank 2, table[1] = 1
rank 2, noderank 2, table[2] = 2
rank 2, noderank 2, table[3] = 3
rank 2, noderank 2, table[4] = 4
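If this is the pattern you need for your lookup table, the allocate/query steps can be wrapped in a small helper so they drop into your simulation code more easily. The following is only a minimal sketch under my own assumptions (the name node_shared_alloc and the choice of double are mine, not from your code); it returns a pointer valid on every rank of the node communicator, plus the window so you can release everything later with MPI_Win_free.

#include <mpi.h>

// Sketch: allocate "count" doubles once per node and return a pointer
// that every rank on nodecomm can use. Only noderank 0 passes a
// non-zero size to MPI_Win_allocate_shared; the other ranks retrieve
// the base address of noderank 0's segment with MPI_Win_shared_query.
// The caller must eventually call MPI_Win_free(win), which also frees
// the memory.
double *node_shared_alloc(MPI_Aint count, MPI_Comm nodecomm, MPI_Win *win)
{
    int noderank, disp_unit;
    MPI_Aint localsize, winsize;
    double *ptr;

    MPI_Comm_rank(nodecomm, &noderank);

    localsize = (noderank == 0) ? count * (MPI_Aint) sizeof(double) : 0;

    MPI_Win_allocate_shared(localsize, sizeof(double),
                            MPI_INFO_NULL, nodecomm, &ptr, win);

    if (noderank != 0)
    {
        // Point at the segment owned by noderank 0
        MPI_Win_shared_query(*win, 0, &winsize, &disp_unit, &ptr);
    }

    return ptr;
}

Noderank 0 would then fill the table between two MPI_Win_fence calls, exactly as in the test program above, and all other ranks on the node simply read through the returned pointer.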