Determining which gencode (compute_, arch_) values I need for nvcc - within CMake

问题

I'm using CMake as a build system for my code, which involves CUDA. I was thinking of automating the task of deciding which compute_XX and arch_XX I need to to pass to my nvcc in order to compile for the GPU(s) on my current machine.

Is there a way to do this:
1. With the NVIDIA GPU deployment kit?
2. Without the NVIDIA GPU deployment kit?
Does CMake's FindCUDA help you in determining the values for these switches?

回答1:

My strategy has been to compile and run a bash script that probes the card and returns the gencode for cmake. Inspiration came from University of Chicago's SLURM. To handle errors or multiple gpus or other circumstances, modify as necessary.

In your project folder create a file cudaComputeVersion.bash and ensure it is executable from the shell. Into this file put:

#!/bin/bash

# create a 'here document' that is code we compile and use to probe the card
cat << EOF > /tmp/cudaComputeVersion.cu
#include <stdio.h>
int main()
{
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop,0);
int v = prop.major * 10 + prop.minor;
printf("-gencode arch=compute_%d,code=sm_%d\n",v,v);
}
EOF

# probe the card and cleanup
/usr/local/cuda/bin/nvcc /tmp/cudaComputeVersion.cu -o /tmp/cudaComputeVersion
/tmp/cudaComputeVersion
rm /tmp/cudaComputeVersion.cu
rm /tmp/cudaComputeVersion

And in your CMakeLists.txt put:

# at cmake-build-time, probe the card and set a cmake variable
execute_process(COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/cudaComputeVersion.bash OUTPUT_VARIABLE GENCODE)
# at project-compile-time, include the gencode into the compile options
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; "${GENCODE}")

# this makes CMake all chatty and allows you to see that GENCODE was set correctly
set(CMAKE_VERBOSE_MAKEFILE TRUE)

cheers

回答2:

A slight improvement over @orthopteroid's answer, which pretty much ensures a unique temporary file is generated, and only requires one instead of two temporary files.

The following goes into scripts/get_cuda_sm.sh:

#!/bin/bash 
#
# Prints the compute capability of the first CUDA device installed
# on the system, or alternatively the device whose index is the
# first command-line argument

device_index=${1:-0}
timestamp=$(date +%s.%N)
gcc_binary=$(which g++)
gcc_binary=${gcc_binary:-g++}
cuda_root=${CUDA_DIR:-/usr/local/cuda}
CUDA_INCLUDE_DIRS=${CUDA_INCLUDE_DIRS:-${cuda_root}/include}
CUDA_CUDART_LIBRARY=${CUDA_CUDART_LIBRARY:-${cuda_root}/lib64/libcudart.so}
generated_binary="/tmp/cuda-compute-version-helper-$$-$timestamp"
# create a 'here document' that is code we compile and use to probe the card
source_code="$(cat << EOF 
#include <stdio.h>
#include <cuda_runtime_api.h>

int main()
{
        cudaDeviceProp prop;
        cudaError_t status;
        int device_count;
        status = cudaGetDeviceCount(&device_count);
        if (status != cudaSuccess) { 
                fprintf(stderr,"cudaGetDeviceCount() failed: %s\n", cudaGetErrorString(status)); 
                return -1;
        }
        if (${device_index} >= device_count) {
                fprintf(stderr, "Specified device index %d exceeds the maximum (the device count on this system is %d)\n", ${device_index}, device_count);
                return -1;
        }
        status = cudaGetDeviceProperties(&prop, ${device_index});
        if (status != cudaSuccess) { 
                fprintf(stderr,"cudaGetDeviceProperties() for device ${device_index} failed: %s\n", cudaGetErrorString(status)); 
                return -1;
        }
        int v = prop.major * 10 + prop.minor;
        printf("%d\\n", v);
}
EOF
)"
echo "$source_code" | $gcc_binary -x c++ -I"$CUDA_INCLUDE_DIRS" -o "$generated_binary" - -x none "$CUDA_CUDART_LIBRARY"

# probe the card and cleanup

$generated_binary
rm $generated_binary

and the following goes into CMakeLists.txt or a CMake module:

if (NOT CUDA_TARGET_COMPUTE_CAPABILITY)
    if("$ENV{CUDA_SM}" STREQUAL "")
        set(ENV{CUDA_INCLUDE_DIRS} "${CUDA_INCLUDE_DIRS}")
        set(ENV{CUDA_CUDART_LIBRARY} "${CUDA_CUDART_LIBRARY}")
        set(ENV{CMAKE_CXX_COMPILER} "${CMAKE_CXX_COMPILER}")
        execute_process(COMMAND 
            bash -c "${CMAKE_CURRENT_SOURCE_DIR}/scripts/get_cuda_sm.sh" 
            OUTPUT_VARIABLE CUDA_TARGET_COMPUTE_CAPABILITY_)
    else()
        set(CUDA_TARGET_COMPUTE_CAPABILITY_ $ENV{CUDA_SM})
    endif()

    set(CUDA_TARGET_COMPUTE_CAPABILITY "${CUDA_TARGET_COMPUTE_CAPABILITY_}" 
        CACHE STRING "CUDA compute capability of the (first) CUDA device on \
        the system, in XY format (like the X.Y format but no dot); see table \
        of features and capabilities by capability X.Y value at \
        https://en.wikipedia.org/wiki/CUDA#Version_features_and_specifications")

    execute_process(COMMAND 
        bash -c "echo -n $(echo ${CUDA_TARGET_COMPUTE_CAPABILITY})" 
        OUTPUT_VARIABLE CUDA_TARGET_COMPUTE_CAPABILITY)
    execute_process(COMMAND 
        bash -c "echo ${CUDA_TARGET_COMPUTE_CAPABILITY} | sed 's/^\\([0-9]\\)\\([0-9]\\)/\\1.\\2/;' | xargs echo -n" 
        OUTPUT_VARIABLE FORMATTED_COMPUTE_CAPABILITY)

    message(STATUS 
        "CUDA device-side code will assume compute capability \
        ${FORMATTED_COMPUTE_CAPABILITY}")
endif()

set(CUDA_GENCODE
    "arch=compute_${CUDA_TARGET_COMPUTE_CAPABILITY}, code=compute_${CUDA_TARGET_COMPUTE_CAPABILITY}")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -gencode ${CUDA_GENCODE} )

回答3:

You can use the cuda_select_nvcc_arch_flags() macro in the FindCUDA module for this without any additional scripts when using CMake 3.7 or newer.

include(FindCUDA)
set(CUDA_ARCH_LIST Auto CACHE LIST
    "List of CUDA architectures (e.g. Pascal, Volta, etc) or \
compute capability versions (6.1, 7.0, etc) to generate code for. \
Set to Auto for automatic detection (default)."
)
cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS ${CUDA_ARCH_LIST})
list(APPEND CUDA_NVCC_FLAGS ${CUDA_ARCH_FLAGS})

The above sets CUDA_ARCH_FLAGS to -gencode arch=compute_61,code=sm_61 on my machine, for example. The CUDA_ARCH_LIST cache variable can be configured by the user to generate code for specific compute capabilites instead of automatic detection.

Note: the FindCUDA module has been deprecated since CMake 3.10. However, no equivalent alternative to the cuda_select_nvcc_arch_flags() macro appears to be provided yet in the latest CMake release (v3.14). See this relevant issue at the CMake issue tracker for further details.

来源：https://stackoverflow.com/questions/35485087/determining-which-gencode-compute-arch-values-i-need-for-nvcc-within-cmak

标签

cuda

cmake

build-automation

detection