Determining which gencode (compute_, arch_) values I need for nvcc - within CMake

末鹿安然 提交于 2019-12-19 07:34:44

问题


I'm using CMake as a build system for my code, which involves CUDA. I was thinking of automating the task of deciding which compute_XX and arch_XX I need to to pass to my nvcc in order to compile for the GPU(s) on my current machine.

  • Is there a way to do this:

    1. With the NVIDIA GPU deployment kit?
    2. Without the NVIDIA GPU deployment kit?
  • Does CMake's FindCUDA help you in determining the values for these switches?


回答1:


My strategy has been to compile and run a bash script that probes the card and returns the gencode for cmake. Inspiration came from University of Chicago's SLURM. To handle errors or multiple gpus or other circumstances, modify as necessary.

In your project folder create a file cudaComputeVersion.bash and ensure it is executable from the shell. Into this file put:

#!/bin/bash

# create a 'here document' that is code we compile and use to probe the card
cat << EOF > /tmp/cudaComputeVersion.cu
#include <stdio.h>
int main()
{
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop,0);
int v = prop.major * 10 + prop.minor;
printf("-gencode arch=compute_%d,code=sm_%d\n",v,v);
}
EOF

# probe the card and cleanup
/usr/local/cuda/bin/nvcc /tmp/cudaComputeVersion.cu -o /tmp/cudaComputeVersion
/tmp/cudaComputeVersion
rm /tmp/cudaComputeVersion.cu
rm /tmp/cudaComputeVersion

And in your CMakeLists.txt put:

# at cmake-build-time, probe the card and set a cmake variable
execute_process(COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/cudaComputeVersion.bash OUTPUT_VARIABLE GENCODE)
# at project-compile-time, include the gencode into the compile options
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; "${GENCODE}")

# this makes CMake all chatty and allows you to see that GENCODE was set correctly
set(CMAKE_VERBOSE_MAKEFILE TRUE)

cheers




回答2:


A slight improvement over @orthopteroid's answer, which pretty much ensures a unique temporary file is generated, and only requires one instead of two temporary files.

The following goes into scripts/get_cuda_sm.sh:

#!/bin/bash 
#
# Prints the compute capability of the first CUDA device installed
# on the system, or alternatively the device whose index is the
# first command-line argument

device_index=${1:-0}
timestamp=$(date +%s.%N)
gcc_binary=$(which g++)
gcc_binary=${gcc_binary:-g++}
cuda_root=${CUDA_DIR:-/usr/local/cuda}
CUDA_INCLUDE_DIRS=${CUDA_INCLUDE_DIRS:-${cuda_root}/include}
CUDA_CUDART_LIBRARY=${CUDA_CUDART_LIBRARY:-${cuda_root}/lib64/libcudart.so}
generated_binary="/tmp/cuda-compute-version-helper-$$-$timestamp"
# create a 'here document' that is code we compile and use to probe the card
source_code="$(cat << EOF 
#include <stdio.h>
#include <cuda_runtime_api.h>

int main()
{
        cudaDeviceProp prop;
        cudaError_t status;
        int device_count;
        status = cudaGetDeviceCount(&device_count);
        if (status != cudaSuccess) { 
                fprintf(stderr,"cudaGetDeviceCount() failed: %s\n", cudaGetErrorString(status)); 
                return -1;
        }
        if (${device_index} >= device_count) {
                fprintf(stderr, "Specified device index %d exceeds the maximum (the device count on this system is %d)\n", ${device_index}, device_count);
                return -1;
        }
        status = cudaGetDeviceProperties(&prop, ${device_index});
        if (status != cudaSuccess) { 
                fprintf(stderr,"cudaGetDeviceProperties() for device ${device_index} failed: %s\n", cudaGetErrorString(status)); 
                return -1;
        }
        int v = prop.major * 10 + prop.minor;
        printf("%d\\n", v);
}
EOF
)"
echo "$source_code" | $gcc_binary -x c++ -I"$CUDA_INCLUDE_DIRS" -o "$generated_binary" - -x none "$CUDA_CUDART_LIBRARY"

# probe the card and cleanup

$generated_binary
rm $generated_binary

and the following goes into CMakeLists.txt or a CMake module:

if (NOT CUDA_TARGET_COMPUTE_CAPABILITY)
    if("$ENV{CUDA_SM}" STREQUAL "")
        set(ENV{CUDA_INCLUDE_DIRS} "${CUDA_INCLUDE_DIRS}")
        set(ENV{CUDA_CUDART_LIBRARY} "${CUDA_CUDART_LIBRARY}")
        set(ENV{CMAKE_CXX_COMPILER} "${CMAKE_CXX_COMPILER}")
        execute_process(COMMAND 
            bash -c "${CMAKE_CURRENT_SOURCE_DIR}/scripts/get_cuda_sm.sh" 
            OUTPUT_VARIABLE CUDA_TARGET_COMPUTE_CAPABILITY_)
    else()
        set(CUDA_TARGET_COMPUTE_CAPABILITY_ $ENV{CUDA_SM})
    endif()

    set(CUDA_TARGET_COMPUTE_CAPABILITY "${CUDA_TARGET_COMPUTE_CAPABILITY_}" 
        CACHE STRING "CUDA compute capability of the (first) CUDA device on \
        the system, in XY format (like the X.Y format but no dot); see table \
        of features and capabilities by capability X.Y value at \
        https://en.wikipedia.org/wiki/CUDA#Version_features_and_specifications")

    execute_process(COMMAND 
        bash -c "echo -n $(echo ${CUDA_TARGET_COMPUTE_CAPABILITY})" 
        OUTPUT_VARIABLE CUDA_TARGET_COMPUTE_CAPABILITY)
    execute_process(COMMAND 
        bash -c "echo ${CUDA_TARGET_COMPUTE_CAPABILITY} | sed 's/^\\([0-9]\\)\\([0-9]\\)/\\1.\\2/;' | xargs echo -n" 
        OUTPUT_VARIABLE FORMATTED_COMPUTE_CAPABILITY)

    message(STATUS 
        "CUDA device-side code will assume compute capability \
        ${FORMATTED_COMPUTE_CAPABILITY}")
endif()

set(CUDA_GENCODE
    "arch=compute_${CUDA_TARGET_COMPUTE_CAPABILITY}, code=compute_${CUDA_TARGET_COMPUTE_CAPABILITY}")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -gencode ${CUDA_GENCODE} )



回答3:


You can use the cuda_select_nvcc_arch_flags() macro in the FindCUDA module for this without any additional scripts when using CMake 3.7 or newer.

include(FindCUDA)
set(CUDA_ARCH_LIST Auto CACHE LIST
    "List of CUDA architectures (e.g. Pascal, Volta, etc) or \
compute capability versions (6.1, 7.0, etc) to generate code for. \
Set to Auto for automatic detection (default)."
)
cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS ${CUDA_ARCH_LIST})
list(APPEND CUDA_NVCC_FLAGS ${CUDA_ARCH_FLAGS})

The above sets CUDA_ARCH_FLAGS to -gencode arch=compute_61,code=sm_61 on my machine, for example. The CUDA_ARCH_LIST cache variable can be configured by the user to generate code for specific compute capabilites instead of automatic detection.

Note: the FindCUDA module has been deprecated since CMake 3.10. However, no equivalent alternative to the cuda_select_nvcc_arch_flags() macro appears to be provided yet in the latest CMake release (v3.14). See this relevant issue at the CMake issue tracker for further details.



来源:https://stackoverflow.com/questions/35485087/determining-which-gencode-compute-arch-values-i-need-for-nvcc-within-cmak

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!