Can I use MPI with shared memory?

后端 未结 1 1362
感情败类
感情败类 2021-01-07 07:03

I have written a simulation software for highly parallelized execution, using MPI for internode and threads for intranode parallelization to reduce the memory footprint by u

相关标签:
1条回答
  • 2021-01-07 07:23

    This can be done. Here is a test program that sets up a small table on each shared-memory node. Only one process per node (node rank 0) actually allocates and initialises the table, but every process on that node can read it. (Apologies for the formatting; it seems to be a space/tab issue.)

    #include <stdio.h>
    #include <stdlib.h>
    #include <mpi.h>
    
    int main(void)
    {
      int i, flag;
    
      int nodesize, noderank;
      int size, rank, irank;
      int tablesize, localtablesize;
      int *table, *localtable;
      int *model;
    
      MPI_Comm allcomm, nodecomm;
    
      char verstring[MPI_MAX_LIBRARY_VERSION_STRING];
      char nodename[MPI_MAX_PROCESSOR_NAME];
    
      MPI_Aint winsize;
      int windisp;
      int *winptr;
    
      int version, subversion, verstringlen, nodestringlen;
    
      allcomm = MPI_COMM_WORLD;
    
      MPI_Win wintable;
    
      tablesize = 5;
    
      MPI_Init(NULL, NULL);
    
      MPI_Comm_size(allcomm, &size);
      MPI_Comm_rank(allcomm, &rank);
    
      MPI_Get_processor_name(nodename, &nodestringlen);
    
      MPI_Get_version(&version, &subversion);
      MPI_Get_library_version(verstring, &verstringlen);
    
      if (rank == 0)
        {
          printf("Version %d, subversion %d\n", version, subversion);
          printf("Library <%s>\n", verstring);
        }
    
      // Create node-local communicator
    
      MPI_Comm_split_type(allcomm, MPI_COMM_TYPE_SHARED, rank,
                  MPI_INFO_NULL, &nodecomm);
    
      MPI_Comm_size(nodecomm, &nodesize);
      MPI_Comm_rank(nodecomm, &noderank);
    
      // Only rank 0 on a node actually allocates memory
    
      localtablesize = 0;
    
      if (noderank == 0) localtablesize = tablesize;
    
      // debug info
    
      printf("Rank %d of %d, rank %d of %d in node <%s>, localtablesize %d\n",
         rank, size, noderank, nodesize, nodename, localtablesize);
    
    
      MPI_Win_allocate_shared(localtablesize*sizeof(int), sizeof(int),
                  MPI_INFO_NULL, nodecomm, &localtable, &wintable);
    
      MPI_Win_get_attr(wintable, MPI_WIN_MODEL, &model, &flag);
    
      if (1 != flag)
        {
          printf("Attribute MPI_WIN_MODEL not defined\n");
        }
      else
        {
          if (MPI_WIN_UNIFIED == *model)
        {
          if (rank == 0) printf("Memory model is MPI_WIN_UNIFIED\n");
        }
          else
        {
          if (rank == 0) printf("Memory model is *not* MPI_WIN_UNIFIED\n");
    
          MPI_Finalize();
          return 1;
        }
        }
    
      // need to get local pointer valid for table on rank 0
    
      table = localtable;
    
      if (noderank != 0)
        {
          MPI_Win_shared_query(wintable, 0, &winsize, &windisp, &table);
        }
    
      // All table pointers should now point to copy on noderank 0
    
      // Initialise table on rank 0 with appropriate synchronisation
    
      MPI_Win_fence(0, wintable);
    
      if (noderank == 0)
        {
          for (i=0; i < tablesize; i++)
        {
          table[i] = rank*tablesize + i;
        }
        }
    
      MPI_Win_fence(0, wintable);
    
      // Check we did it right
    
      for (i=0; i < tablesize; i++)
        {
          printf("rank %d, noderank %d, table[%d] = %d\n",
             rank, noderank, i, table[i]);
        }
    
      MPI_Finalize();
    }
    

    Here is some sample output for 6 processes across two nodes:

    Version 3, subversion 1
    Library <SGI MPT 2.14  04/05/16 03:53:22>
    Rank 3 of 6, rank 0 of 3 in node <r1i0n1>, localtablesize 5
    Rank 4 of 6, rank 1 of 3 in node <r1i0n1>, localtablesize 0
    Rank 5 of 6, rank 2 of 3 in node <r1i0n1>, localtablesize 0
    Rank 0 of 6, rank 0 of 3 in node <r1i0n0>, localtablesize 5
    Rank 1 of 6, rank 1 of 3 in node <r1i0n0>, localtablesize 0
    Rank 2 of 6, rank 2 of 3 in node <r1i0n0>, localtablesize 0
    Memory model is MPI_WIN_UNIFIED
    rank 3, noderank 0, table[0] = 15
    rank 3, noderank 0, table[1] = 16
    rank 3, noderank 0, table[2] = 17
    rank 3, noderank 0, table[3] = 18
    rank 3, noderank 0, table[4] = 19
    rank 4, noderank 1, table[0] = 15
    rank 4, noderank 1, table[1] = 16
    rank 4, noderank 1, table[2] = 17
    rank 4, noderank 1, table[3] = 18
    rank 4, noderank 1, table[4] = 19
    rank 5, noderank 2, table[0] = 15
    rank 5, noderank 2, table[1] = 16
    rank 5, noderank 2, table[2] = 17
    rank 5, noderank 2, table[3] = 18
    rank 5, noderank 2, table[4] = 19
    rank 0, noderank 0, table[0] = 0
    rank 0, noderank 0, table[1] = 1
    rank 0, noderank 0, table[2] = 2
    rank 0, noderank 0, table[3] = 3
    rank 0, noderank 0, table[4] = 4
    rank 1, noderank 1, table[0] = 0
    rank 1, noderank 1, table[1] = 1
    rank 1, noderank 1, table[2] = 2
    rank 1, noderank 1, table[3] = 3
    rank 1, noderank 1, table[4] = 4
    rank 2, noderank 2, table[0] = 0
    rank 2, noderank 2, table[1] = 1
    rank 2, noderank 2, table[2] = 2
    rank 2, noderank 2, table[3] = 3
    rank 2, noderank 2, table[4] = 4
    
    0 讨论(0)
提交回复
热议问题