Determining which gencode (compute_, arch_) values I need for nvcc - within CMake

前端 未结 3 763
情话喂你
情话喂你 2021-01-13 09:10

I'm using CMake as a build system for my code, which involves CUDA. I was thinking of automating the task of deciding which compute_XX and arch_XX values I need to pass to nvcc.

相关标签:
3条回答
  • 2021-01-13 09:33

    My strategy has been to run a bash script that compiles and runs a small probe program, which queries the card and returns the gencode for CMake. Inspiration came from University of Chicago's SLURM. To handle errors, multiple GPUs, or other circumstances, modify as necessary.

    In your project folder create a file cudaComputeVersion.bash and ensure it is executable from the shell. Into this file put:

    #!/bin/bash
    #
    # Prints the nvcc "-gencode" flag matching the compute capability of CUDA
    # device 0, e.g. "-gencode arch=compute_61,code=sm_61".
    #
    # Exit status: 0 on success; non-zero (with nothing on stdout) when the
    # probe cannot be compiled or the device properties cannot be queried.
    #
    # Environment (optional):
    #   NVCC   path to nvcc; defaults to the nvcc found on PATH, falling back
    #          to /usr/local/cuda/bin/nvcc

    nvcc_bin=${NVCC:-$(command -v nvcc || echo /usr/local/cuda/bin/nvcc)}

    # Work in a private scratch directory rather than fixed /tmp names, so
    # concurrent runs cannot clobber each other. The trap removes it on every
    # exit path, including failures.
    work_dir=$(mktemp -d) || exit 1
    trap 'rm -rf "$work_dir"' EXIT

    # create a 'here document' that is code we compile and use to probe the card
    # (the delimiter is quoted so the shell leaves the C source untouched)
    cat << 'EOF' > "$work_dir/probe.cu"
    #include <stdio.h>
    int main()
    {
        cudaDeviceProp prop;
        /* without this check a failed query would print a meaningless gencode */
        if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
            fprintf(stderr, "cudaGetDeviceProperties() failed for device 0\n");
            return 1;
        }
        int v = prop.major * 10 + prop.minor;
        printf("-gencode arch=compute_%d,code=sm_%d\n", v, v);
        return 0;
    }
    EOF

    # build the probe; stop here if nvcc is missing or the compile fails
    "$nvcc_bin" "$work_dir/probe.cu" -o "$work_dir/probe" || exit 1

    # probe the card; its exit status becomes the script's exit status
    "$work_dir/probe"
    
    

    And in your CMakeLists.txt put:

    # At CMake configure time, probe the card and capture the flag it prints.
    # OUTPUT_STRIP_TRAILING_WHITESPACE drops the newline the script emits, which
    # would otherwise become part of the compiler flag.
    execute_process(
        COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/cudaComputeVersion.bash"
        RESULT_VARIABLE GENCODE_RESULT
        OUTPUT_VARIABLE GENCODE
        OUTPUT_STRIP_TRAILING_WHITESPACE)

    if(GENCODE_RESULT EQUAL 0 AND NOT "${GENCODE}" STREQUAL "")
        # CUDA_NVCC_FLAGS is a ;-separated list with one element per argument,
        # so split "-gencode arch=...,code=..." into its two words first.
        # GENCODE itself is left as the original single string.
        separate_arguments(GENCODE_ARGS UNIX_COMMAND "${GENCODE}")
        # at project compile time these flags are handed to nvcc
        list(APPEND CUDA_NVCC_FLAGS ${GENCODE_ARGS})
    else()
        # keep configuring (best effort), but make the failure visible
        message(WARNING
            "Could not detect the CUDA compute capability "
            "(cudaComputeVersion.bash result: ${GENCODE_RESULT}); "
            "no -gencode flag was added to CUDA_NVCC_FLAGS.")
    endif()

    # this makes CMake all chatty and allows you to see that GENCODE was set correctly
    set(CMAKE_VERBOSE_MAKEFILE TRUE)
    
    

    cheers

    0 讨论(0)
  • 2021-01-13 09:39

    You can use the cuda_select_nvcc_arch_flags() macro in the FindCUDA module for this without any additional scripts when using CMake 3.7 or newer.

    # Load the FindCUDA module through find_package() rather than include(), so
    # the usual REQUIRED/QUIET handling applies and a missing toolkit is
    # reported at this line. The module provides cuda_select_nvcc_arch_flags().
    find_package(CUDA REQUIRED)

    # User-configurable list of target architectures; "Auto" asks the macro to
    # detect them.
    set(CUDA_ARCH_LIST Auto CACHE STRING
        "List of CUDA architectures (e.g. Pascal, Volta, etc) or \
    compute capability versions (6.1, 7.0, etc) to generate code for. \
    Set to Auto for automatic detection (default)."
    )

    # ${CUDA_ARCH_LIST} is deliberately unquoted: each list entry is passed to
    # the macro as its own argument. The resulting -gencode flags are returned
    # in CUDA_ARCH_FLAGS and then added to the nvcc command line.
    cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS ${CUDA_ARCH_LIST})
    list(APPEND CUDA_NVCC_FLAGS ${CUDA_ARCH_FLAGS})
    
    

    The above sets CUDA_ARCH_FLAGS to -gencode arch=compute_61,code=sm_61 on my machine, for example. The CUDA_ARCH_LIST cache variable can be configured by the user to generate code for specific compute capabilities instead of automatic detection.

    Note: the FindCUDA module has been deprecated since CMake 3.10. However, no equivalent alternative to the cuda_select_nvcc_arch_flags() macro appears to be provided yet in the latest CMake release (v3.14). See this relevant issue at the CMake issue tracker for further details.

    0 讨论(0)
  • A slight improvement over @orthopteroid's answer, which pretty much ensures a unique temporary file is generated, and only requires one instead of two temporary files.

    The following goes into scripts/get_cuda_sm.sh:

    #!/bin/bash
    #
    # Prints the compute capability (in XY form, e.g. 61 for 6.1) of the first
    # CUDA device installed on the system, or alternatively of the device whose
    # index is the first command-line argument.
    #
    # Exit status: 0 on success; non-zero when the index is not a number, the
    # probe cannot be compiled, or the CUDA runtime reports an error.
    #
    # Environment (all optional):
    #   CMAKE_CXX_COMPILER   C++ compiler used to build the probe
    #                        (default: the g++ found on PATH)
    #   CUDA_DIR             CUDA installation root (default: /usr/local/cuda)
    #   CUDA_INCLUDE_DIRS    directory containing cuda_runtime_api.h
    #                        (default: $CUDA_DIR/include)
    #   CUDA_CUDART_LIBRARY  CUDA runtime library to link against
    #                        (default: $CUDA_DIR/lib64/libcudart.so)

    device_index=${1:-0}
    # The index is spliced into the C source below, so accept digits only.
    case "$device_index" in
        ''|*[!0-9]*)
            echo "Invalid device index: $device_index" >&2
            exit 1
            ;;
    esac

    # Prefer the compiler exported by the calling CMake code, then g++ on PATH.
    cxx_binary=${CMAKE_CXX_COMPILER:-$(command -v g++)}
    cxx_binary=${cxx_binary:-g++}
    cuda_root=${CUDA_DIR:-/usr/local/cuda}
    CUDA_INCLUDE_DIRS=${CUDA_INCLUDE_DIRS:-${cuda_root}/include}
    CUDA_CUDART_LIBRARY=${CUDA_CUDART_LIBRARY:-${cuda_root}/lib64/libcudart.so}

    # mktemp guarantees a unique scratch directory; the trap removes it on
    # every exit path, so nothing is left behind when the compile or probe fails.
    work_dir=$(mktemp -d "${TMPDIR:-/tmp}/cuda-compute-version-helper-XXXXXX") || exit 1
    trap 'rm -rf "$work_dir"' EXIT
    generated_binary="$work_dir/probe"

    # Feed a 'here document' holding the probe's source straight to the
    # compiler. The delimiter is unquoted on purpose: ${device_index} is
    # expanded by the shell, while sequences such as \n reach the compiler as-is.
    "$cxx_binary" -x c++ -I"$CUDA_INCLUDE_DIRS" -o "$generated_binary" - -x none "$CUDA_CUDART_LIBRARY" << EOF || exit 1
    #include <stdio.h>
    #include <cuda_runtime_api.h>

    int main()
    {
            cudaDeviceProp prop;
            cudaError_t status;
            int device_count;
            status = cudaGetDeviceCount(&device_count);
            if (status != cudaSuccess) {
                    fprintf(stderr,"cudaGetDeviceCount() failed: %s\n", cudaGetErrorString(status));
                    return -1;
            }
            if (${device_index} >= device_count) {
                    fprintf(stderr, "Specified device index %d exceeds the maximum (the device count on this system is %d)\n", ${device_index}, device_count);
                    return -1;
            }
            status = cudaGetDeviceProperties(&prop, ${device_index});
            if (status != cudaSuccess) {
                    fprintf(stderr,"cudaGetDeviceProperties() for device ${device_index} failed: %s\n", cudaGetErrorString(status));
                    return -1;
            }
            int v = prop.major * 10 + prop.minor;
            printf("%d\n", v);
            return 0;
    }
    EOF

    # Probe the card. This is the last command, so its exit status (not that of
    # the cleanup) is what the caller sees.
    "$generated_binary"
    
    

    and the following goes into CMakeLists.txt or a CMake module:

    # Decide which compute capability to target, in XY form (e.g. 61 for 6.1).
    # Precedence: an already-set CUDA_TARGET_COMPUTE_CAPABILITY (for instance
    # from -D on the command line or the cache), then the CUDA_SM environment
    # variable, then probing the device with scripts/get_cuda_sm.sh.
    if(NOT CUDA_TARGET_COMPUTE_CAPABILITY)
        if("$ENV{CUDA_SM}" STREQUAL "")
            # Export what CMake knows about the toolchain for the probe script.
            set(ENV{CUDA_INCLUDE_DIRS} "${CUDA_INCLUDE_DIRS}")
            set(ENV{CUDA_CUDART_LIBRARY} "${CUDA_CUDART_LIBRARY}")
            set(ENV{CMAKE_CXX_COMPILER} "${CMAKE_CXX_COMPILER}")
            # Pass the script as an argument to bash (not through "bash -c") so
            # a source path containing spaces is not re-split by the shell.
            execute_process(
                COMMAND bash "${CMAKE_CURRENT_SOURCE_DIR}/scripts/get_cuda_sm.sh"
                OUTPUT_VARIABLE CUDA_TARGET_COMPUTE_CAPABILITY_
                OUTPUT_STRIP_TRAILING_WHITESPACE)
        else()
            set(CUDA_TARGET_COMPUTE_CAPABILITY_ "$ENV{CUDA_SM}")
        endif()

        # Remove any surrounding whitespace, then insist on a plain number.
        # An empty or malformed value would otherwise produce unusable
        # "compute_" flags further down.
        string(STRIP "${CUDA_TARGET_COMPUTE_CAPABILITY_}" CUDA_TARGET_COMPUTE_CAPABILITY_)
        if(NOT "${CUDA_TARGET_COMPUTE_CAPABILITY_}" MATCHES "^[0-9][0-9]+$")
            message(FATAL_ERROR
                "Could not determine the CUDA compute capability "
                "(got \"${CUDA_TARGET_COMPUTE_CAPABILITY_}\"). Set the CUDA_SM "
                "environment variable or pass "
                "-DCUDA_TARGET_COMPUTE_CAPABILITY=XY (e.g. 61 for 6.1).")
        endif()

        # Assemble the cache description from pieces so that no source
        # indentation leaks into the text.
        string(CONCAT CUDA_TARGET_COMPUTE_CAPABILITY_DOC_
            "CUDA compute capability of the (first) CUDA device on the system, "
            "in XY format (like the X.Y format but no dot); see table of "
            "features and capabilities by capability X.Y value at "
            "https://en.wikipedia.org/wiki/CUDA#Version_features_and_specifications")
        set(CUDA_TARGET_COMPUTE_CAPABILITY "${CUDA_TARGET_COMPUTE_CAPABILITY_}"
            CACHE STRING "${CUDA_TARGET_COMPUTE_CAPABILITY_DOC_}")
        # A stale, empty cache entry is not replaced by the call above (no
        # FORCE), so also set the normal variable used for the rest of this run.
        set(CUDA_TARGET_COMPUTE_CAPABILITY "${CUDA_TARGET_COMPUTE_CAPABILITY_}")

        # XY -> X.Y for display; the last digit is the minor version, so
        # three-digit values such as 100 are shown as 10.0.
        string(REGEX REPLACE "^([0-9]+)([0-9])$" "\\1.\\2"
            FORMATTED_COMPUTE_CAPABILITY "${CUDA_TARGET_COMPUTE_CAPABILITY}")

        message(STATUS
            "CUDA device-side code will assume compute capability ${FORMATTED_COMPUTE_CAPABILITY}")
    endif()

    # nvcc's documented form is "arch=...,code=..." with no space after the comma.
    set(CUDA_GENCODE
        "arch=compute_${CUDA_TARGET_COMPUTE_CAPABILITY},code=compute_${CUDA_TARGET_COMPUTE_CAPABILITY}")
    list(APPEND CUDA_NVCC_FLAGS -gencode "${CUDA_GENCODE}")
    
    
    0 讨论(0)
提交回复
热议问题