Overlapping pages with mmap (MAP_FIXED)

前端 未结 3 634
慢半拍i
慢半拍i 2021-02-04 07:08

For some obscure reasons that are not relevant to this question, I need to resort to using MAP_FIXED in order to obtain a page close to where the text section of libc lives …

相关标签:
3条回答
  • 2021-02-04 07:33

    It seems that posix_mem_offset() is what I was looking for.

    Not only does it tell you whether an address is mapped; if the address happens to be mapped, it also implicitly gives you the boundaries of the mapped area to which it belongs (by providing SIZE_MAX in the len argument).

    So, before enforcing MAP_FIXED, I can use posix_mem_offset() to verify that the address I am using is not mapped yet.

    I could use msync() or mincore() too (checking for an ENOMEM error tells you whether an address is already mapped: ENOMEM means it is not), but then I would be blinder (no information about the area where the address is mapped). Also, msync() has side effects which may have a performance impact and mincore() is BSD-only (not POSIX).

    0 讨论(0)
  • 2021-02-04 07:43
    1. Use page = sysconf(_SC_PAGE_SIZE) to find out the page size, then scan each page-sized block you wish to check using msync(addr, page, 0) (with (unsigned long)addr % page == 0, i.e. addr aligned to pages). If it returns -1 with errno == ENOMEM, that page is not mapped.

      Edited: As fons commented below, mincore(addr,page,&dummy) is superior to msync(). (The implementation of the syscall is in mm/mincore.c in the Linux kernel sources, with C libraries usually providing a wrapper that updates errno. As the syscall does the mapping check immediately after making sure addr is page aligned, it is optimal in the not-mapped case (ENOMEM). It does some work if the page is already mapped, so if performance is paramount, try to avoid checking pages you know are mapped.)

      You must do this individually, separately per each page, because for regions larger than a single page, ENOMEM means that the region was not fully mapped; it might still be partially mapped. Mapping is always granular to page-sized units.

    2. As far as I can tell, there is no way to tell mmap() to fail if the region is already mapped, or contains already mapped pages. (The same applies to mremap(), so you cannot create a mapping, then move it to the desired region.)

      This means you run a risk of a race condition. It would be best to execute the actual syscalls yourself, instead of the C library wrappers, just in case they do memory allocation or change memory mappings internally:

      #define _GNU_SOURCE
      #include <unistd.h>
      #include <sys/syscall.h>
      
      /* Cached page size in bytes; stays zero until page_size() is first called. */
      static size_t page = 0;

      /* Return the system page size.
       * sysconf(_SC_PAGESIZE) is consulted only while the cache is still zero;
       * later calls return the cached value.
      */
      static inline size_t page_size(void)
      {
          if (page)
              return page;

          page = (size_t)sysconf(_SC_PAGESIZE);
          return page;
      }
      
      
      /* Issue the msync system call directly through syscall(), bypassing the
       * C library's msync() wrapper. The arguments are passed through unchanged
       * and the syscall() result is returned narrowed to int.
      */
      static inline int raw_msync(void *addr, size_t length, int flags)
      {
          const long result = syscall(SYS_msync, addr, length, flags);

          return (int)result;
      }
      
      /* Issue the mmap system call directly through syscall(), bypassing the
       * C library's mmap() wrapper. The descriptor argument is always -1 and
       * the offset is always zero; addr, length, prot and flags are passed
       * through unchanged. The syscall() result is returned as a pointer.
      */
      static inline void *raw_mmap(void *addr, size_t length, int prot, int flags)
      {
          const int   no_descriptor = -1;
          const off_t no_offset = (off_t)0;

          return (void *)syscall(SYS_mmap, addr, length, prot, flags,
                                 no_descriptor, no_offset);
      }
      

    However, I suspect that whatever it is you are trying to do, you eventually need to parse /proc/self/maps anyway.

    • I recommend avoiding standard I/O stdio.h altogether (as the various operations will allocate memory dynamically, and thus change the mappings), and instead use the lower-level unistd.h interfaces, which are much less likely to affect the mappings. Here is a set of simple, crude functions, that you can use to find out each mapped region and the protections enabled in that region (and discard the other info). In practice, it uses about a kilobyte of code and less than that in stack, so it is very useful even on limited architectures (say, embedded devices).

      #include <unistd.h>
      #include <fcntl.h>
      #include <errno.h>
      #include <string.h>
      
      /* Maximum number of bytes input_refill() keeps read into the buffer.
       * May be overridden by defining INPUT_BUFFER before this point.
      */
      #ifndef   INPUT_BUFFER
      #define   INPUT_BUFFER   512
      #endif /* INPUT_BUFFER */
      
      /* Value returned by input_next() and input_back() when no character can
       * be delivered. It is negative, so it cannot be confused with a byte
       * value (0..255). May be overridden by defining INPUT_EOF beforehand.
      */
      #ifndef   INPUT_EOF
      #define   INPUT_EOF     -256
      #endif /* INPUT_EOF */
      
      /* Bit flags combined into the per-mapping mode byte filled in by
       * read_maps(), one per permission character of a /proc/self/maps line.
      */
      #define   PERM_PRIVATE  16   /* 'p' */
      #define   PERM_SHARED    8   /* 's' */
      #define   PERM_READ      4   /* 'r' */
      #define   PERM_WRITE     2   /* 'w' */
      #define   PERM_EXEC      1   /* 'x' */
      
      /* State of a small buffered reader on top of a file descriptor.
       * The unread data is the byte range [next, ends) inside buffer.
      */
      typedef struct {
          int            descriptor;  /* File descriptor, or -1 when not open. */
          int            status;      /* 0 while usable; otherwise an errno value (ENODATA marks end of input). */
          unsigned char *next;        /* Next unread byte in buffer. */
          unsigned char *ends;        /* One past the last valid byte in buffer. */
          unsigned char  buffer[INPUT_BUFFER + 16]; /* Storage; input_refill() fills at most INPUT_BUFFER bytes of it. */
      } input_buffer;
      
      /* Read more data from the descriptor into the input buffer.
       *
       * Any unread bytes are first moved to the start of the buffer, then one
       * read() (restarted on EINTR) appends new data after them.
       * Returns the number of bytes added. Returns 0 if the status was already
       * nonzero, at end of input (status becomes ENODATA), or on a read error
       * (status becomes the errno value, or EIO for an unexpected result).
      */
      static size_t input_refill(input_buffer *const input)
      {
          size_t  pending;
          ssize_t got;

          /* A nonzero status (error or end of input) is sticky. */
          if (input->status)
              return (size_t)0;

          /* Compact the buffer so the unread bytes start at its beginning. */
          if (input->next > input->buffer) {
              if (input->ends > input->next) {
                  pending = (size_t)(input->ends - input->next);
                  memmove(input->buffer, input->next, pending);
              } else {
                  pending = (size_t)0;
              }
              input->next = input->buffer;
              input->ends = input->buffer + pending;
          }

          /* Restart the read if it is interrupted by a signal. */
          for (;;) {
              got = read(input->descriptor, input->ends,
                         INPUT_BUFFER - (size_t)(input->ends - input->buffer));
              if (got != (ssize_t)-1 || errno != EINTR)
                  break;
          }

          if (got > (ssize_t)0) {
              input->ends += got;
              return (size_t)got;
          }

          if (got == (ssize_t)0)
              input->status = ENODATA;
          else if (got == (ssize_t)-1)
              input->status = errno;
          else
              input->status = EIO;

          return (size_t)0;
      }
      
      /* Low-level getchar() equivalent.
       * Returns the next unread byte (0..255), refilling the buffer when it is
       * empty, or INPUT_EOF when the refill delivers no data.
      */
      static inline int input_next(input_buffer *const input)
      {
          /* Only an empty buffer triggers a refill. */
          if (input->next >= input->ends && input_refill(input) == 0)
              return INPUT_EOF;

          return *(input->next++);
      }
      
      /* Low-level ungetc() equivalent.
       * Pushes byte c back so that it is the next one input_next() returns.
       * Returns c on success, or INPUT_EOF if c is outside 0..255 or if there
       * is no room left in the buffer.
      */
      static inline int input_back(input_buffer *const input, const int c)
      {
          unsigned char *const limit = input->buffer + sizeof input->buffer;

          if (c < 0 || c > 255)
              return INPUT_EOF;

          /* Room in front of the unread data: just step back. */
          if (input->next > input->buffer) {
              input->next--;
              *input->next = (unsigned char)c;
              return *input->next;
          }

          if (input->ends >= limit)
              return INPUT_EOF;

          /* Otherwise shift the unread data up by one byte to make room. */
          memmove(input->next + 1, input->next, (size_t)(input->ends - input->next));
          input->ends++;
          *input->next = (unsigned char)c;
          return *input->next;
      }
      
      /* Low-level fopen() equivalent.
       * Resets *input and opens filename read-only (restarting on EINTR).
       * Returns 0 on success. On failure returns an errno value, which is also
       * left in errno: EINVAL for a NULL input or a NULL/empty filename,
       * otherwise the error reported by open(). Except for a NULL input, the
       * same value is recorded in input->status.
      */
      static int input_open(input_buffer *const input, const char *const filename)
      {
          int fd;

          if (!input) {
              errno = EINVAL;
              return EINVAL;
          }

          input->descriptor = -1;
          input->status = 0;
          input->next = input->buffer;
          input->ends = input->buffer;

          if (!filename || !*filename) {
              input->status = EINVAL;
              errno = EINVAL;
              return EINVAL;
          }

          for (;;) {
              fd = open(filename, O_RDONLY | O_NOCTTY);
              input->descriptor = fd;
              if (fd != -1 || errno != EINTR)
                  break;
          }

          if (fd == -1) {
              input->status = errno;
              return input->status;
          }

          return 0;
      }
      
      /* Low-level fclose() equivalent.
       *
       * Closes the descriptor (if one is open) and resets the buffer pointers.
       * Returns the final status, which is also stored in errno: 0 on success,
       * otherwise the first error recorded on this input (a close() failure is
       * recorded only if no earlier error was). End of input (ENODATA) is not
       * reported as an error. Returns EINVAL if input is NULL.
      */
      static int input_close(input_buffer *const input)
      {
          if (!input)
              return errno = EINVAL;

          /* EOF is not an error; we use ENODATA for that. */
          if (input->status == ENODATA)
              input->status = 0;

          if (input->descriptor != -1) {
              /* close() is deliberately called exactly once. On Linux the
               * descriptor is released even when close() fails with EINTR, so
               * retrying could close an unrelated descriptor that another
               * thread opened (and was given the same number) in the meantime.
              */
              if (close(input->descriptor) == -1 && !input->status)
                  input->status = errno;
          }

          input->descriptor = -1;
          input->next = input->buffer;
          input->ends = input->buffer;

          return errno = input->status;
      }
      
      /* Parse a run of hexadecimal digits (upper or lower case).
       * On entry *c holds the current character; further characters are taken
       * from input. On return *c holds the first character that is not a hex
       * digit (possibly a negative end-of-input value). Returns the accumulated
       * value, which is zero when there were no digits at all.
      */
      static unsigned long read_maps_hex(input_buffer *const input, int *const c)
      {
          unsigned long value = 0UL;

          while (1) {
              const int ch = *c;

              if (ch >= '0' && ch <= '9')
                  value = (16UL * value) + (unsigned long)(ch - '0');
              else if (ch >= 'A' && ch <= 'F')
                  value = (16UL * value) + (unsigned long)(ch - 'A' + 10);
              else if (ch >= 'a' && ch <= 'f')
                  value = (16UL * value) + (unsigned long)(ch - 'a' + 10);
              else
                  break;

              *c = input_next(input);
          }

          return value;
      }

      /* Read /proc/self/maps, and fill in the arrays corresponding to the fields.
       *
       * Each line is expected to begin with "start-end perms ", with start and
       * end in hexadecimal; the rest of the line is ignored.
       *
       *   n     Number of elements available in each non-NULL array.
       *   ptr   Receives the start address of each mapping (may be NULL).
       *   len   Receives the length, end - start, of each mapping (may be NULL).
       *   mode  Receives the PERM_* flags of each mapping (may be NULL).
       *
       * The function will return the number of mappings, even if not all are
       * saved (only the first n are stored), with errno set to 0.
       * On failure it returns 0 with errno set: the error from opening, reading
       * or closing the file, or EIO if a line does not have the expected form.
       * The file is closed on every path, including malformed input.
      */
      size_t read_maps(size_t const n,
                       void **const ptr, size_t *const len,
                       unsigned char *const mode)
      {
          input_buffer    input;
          size_t          i = 0;
          unsigned long   curr_start, curr_end;
          unsigned char   curr_mode;
          int             c;

          errno = 0;

          if (input_open(&input, "/proc/self/maps"))
              return (size_t)0; /* errno already set. */

          c = input_next(&input);
          while (c >= 0) {

              /* Skip leading controls and whitespace */
              while (c >= 0 && c <= 32)
                  c = input_next(&input);

              /* EOF? */
              if (c < 0)
                  break;

              /* Start of address range, followed by a dash. */
              curr_start = read_maps_hex(&input, &c);
              if (c != '-')
                  goto malformed;
              c = input_next(&input);

              /* End of address range, followed by a space. */
              curr_end = read_maps_hex(&input, &c);
              if (c != ' ')
                  goto malformed;
              c = input_next(&input);

              /* Permissions: any run of r, w, x, s, p and dashes, then a space. */
              curr_mode = 0U;
              while (1) {
                  if (c == 'r')
                      curr_mode |= PERM_READ;
                  else if (c == 'w')
                      curr_mode |= PERM_WRITE;
                  else if (c == 'x')
                      curr_mode |= PERM_EXEC;
                  else if (c == 's')
                      curr_mode |= PERM_SHARED;
                  else if (c == 'p')
                      curr_mode |= PERM_PRIVATE;
                  else if (c != '-')
                      break;
                  c = input_next(&input);
              }
              if (c != ' ')
                  goto malformed;
              c = input_next(&input);

              /* Skip the rest of the line. */
              while (c >= 0 && c != '\n')
                  c = input_next(&input);

              /* Add to arrays, if possible. */
              if (i < n) {
                  if (ptr) ptr[i] = (void *)curr_start;
                  if (len) len[i] = (size_t)(curr_end - curr_start);
                  if (mode) mode[i] = curr_mode;
              }
              i++;
          }

          if (input_close(&input))
              return (size_t)0; /* errno already set. */

          errno = 0;
          return i;

      malformed:
          /* Release the descriptor before reporting the parse failure;
           * input_close() overwrites errno, so EIO is set afterwards. */
          (void)input_close(&input);
          errno = EIO;
          return (size_t)0;
      }
      

      The read_maps() function reads up to n regions, start addresses as void * into the ptr array, lengths into the len array, and permissions into the mode array, returning the total number of maps (may be greater than n), or zero with errno set if an error occurs.

      It is quite possible to use syscalls for the low-level I/O above, so that you don't use any C library features, but I don't think it is at all necessary. (The C libraries, as far as I can tell, use very simple wrappers around the actual syscalls for these.)

    I hope you find this useful.

    0 讨论(0)
  • 2021-02-04 07:43

    "Which explains what I am seeing, but I have a couple of questions:"

    "Is there a way to detect if something was already mapped to certain address? without accessing /proc/maps?"

    Yes, use mmap without MAP_FIXED.

    "Is there a way to force mmap to fail in the case of finding overlapping pages?"

    Apparently not, but simply use munmap after the mmap if mmap returns a mapping at other than the requested address.

    When used without MAP_FIXED, mmap on both linux and Mac OS X (and I suspect elsewhere also) obeys the address parameter iff no existing mapping in the range [address, address + length) exists. So if mmap answers a mapping at a different address to the one you supply you can infer there already exists a mapping in that range and you need to use a different range. Since mmap will typically answer a mapping at a very high address when it ignores the address parameter, simply unmap the region using munmap, and try again at a different address.

    Using mincore to check for use of an address range is not only a waste of time (one has to probe a page at a time), it may not work. Older linux kernels will only fail mincore appropriately for file mappings. They won't answer anything at all for MAP_ANON mappings. But as I've pointed out, all you need is mmap and munmap.

    I've just been through this exercise in implementing a memory manager for a Smalltalk VM. I use sbrk(0) to find out the first address at which I can map the first segment, and then use mmap and an increment of 1Mb to search for room for subsequent segments:

    /* System page size in bytes; zero until sqAllocateMemory() initialises it. */
    static long          pageSize = 0;
    /* Mask that clears the offset-within-page bits; set to ~(pageSize - 1) by
     * sqAllocateMemory(). */
    static unsigned long pageMask = 0;
    
    /* Round v down, or up, to a multiple of the page size. Both macros read
     * pageMask (and pageSize), so they are only meaningful once those have
     * been initialised. */
    #define roundDownToPage(v) ((v)&pageMask)
    #define roundUpToPage(v) (((v)+pageSize-1)&pageMask)
    
    /* Allocate the initial heap.
     *
     * Initialises pageSize and pageMask, then asks for desiredHeapSize bytes
     * (rounded up to whole pages) at or above the current program break,
     * rounded up to a 1 MiB boundary (or to a page boundary, if pages are
     * larger than that). minHeapSize is not consulted here.
     *
     * May be called only once: a second call, or a failed allocation, prints a
     * message to stderr and terminates the process with a nonzero status.
     * Returns the start of the allocated region.
     */
    void *
    sqAllocateMemory(usqInt minHeapSize, usqInt desiredHeapSize)
    {
        char *hint, *address, *alloc;
        unsigned long alignment;
        sqInt allocBytes; /* must match the callee's sqInt * out-parameter */

        if (pageSize) {
            fprintf(stderr, "sqAllocateMemory: already called\n");
            exit(1);
        }
        pageSize = getpagesize();
        pageMask = ~(pageSize - 1);

        hint = sbrk(0); /* the first unmapped address above existing data */

        alignment = max(pageSize,1024*1024);
        address = (char *)(((usqInt)hint + alignment - 1) & ~(alignment - 1));

        alloc = sqAllocateMemorySegmentOfSizeAboveAllocatedSizeInto
                    (roundUpToPage(desiredHeapSize), address, &allocBytes);
        if (!alloc) {
            /* Capture errno before fprintf() can change it, and never exit
             * with status 0 (success) when the allocation has failed. */
            const int saved_errno = errno;

            fprintf(stderr, "sqAllocateMemory: initial alloc failed!\n");
            exit(saved_errno ? saved_errno : 1);
        }
        return alloc;
    }
    
    /* Map a private, anonymous, read/write region of at least size bytes
     * (rounded up to whole pages) at or above minAddress.
     *
     * Each attempt passes a candidate address to mmap() as a hint, without
     * MAP_FIXED. A mapping that lands below the candidate, or more than one
     * step (the larger of the page size and 1 MiB) above it, is unmapped and
     * the candidate is advanced by one step. The search ends when
     * candidate + length wraps around.
     *
     * On success, answers the start of the region and stores its size through
     * allocatedSizePointer. Answers null if mmap() fails (after perror()) or if
     * the search runs out of address space.
     */
    void *
    sqAllocateMemorySegmentOfSizeAboveAllocatedSizeInto(sqInt size, void *minAddress, sqInt *allocatedSizePointer)
    {
        char *candidate, *mapped;
        long length, step;

        length = roundUpToPage(size);
        step = max(pageSize,1024*1024);

        for (candidate = (char *)roundUpToPage((unsigned long)minAddress);
             (unsigned long)(candidate + length) > (unsigned long)candidate;
             candidate += step) {

            mapped = mmap(candidate, length, PROT_READ | PROT_WRITE,
                          MAP_ANON | MAP_PRIVATE, -1, 0);
            if (mapped == MAP_FAILED) {
                perror("sqAllocateMemorySegmentOfSizeAboveAllocatedSizeInto mmap");
                return 0;
            }

            /* Below the candidate, or too far above it: discard the mapping
             * and try again one step higher. */
            if (mapped < candidate || mapped > candidate + step) {
                if (munmap(mapped, length) != 0)
                    perror("sqAllocateMemorySegment... munmap");
                continue;
            }

            *allocatedSizePointer = length;
            return mapped;
        }
        return 0;
    }
    

    This appears to work well, allocating memory at ascending addresses while skipping over any existing mappings.

    HTH

    0 讨论(0)
提交回复
热议问题