I need to dynamically allocate some arrays inside the kernel function. How can a I do that?
My code is something like that:
__global__ func(float *gr
If the value of n and nn were known before the kernel is called, then why not cudaMalloc the memory on host side and pass in the device memory pointer to the kernel?
Dynamic memory allocation is only supported on compute capability 2.x and newer hardware. You can use either the C++ new keyword or malloc in the kernel, so your example could become:
__global__ func(float *grid_d,int n, int nn){
int i,j;
float *x = new float[n], *y = new float[nn];
}
This allocates memory on a local memory runtime heap which has the lifetime of the context, so make sure you free the memory after the kernel finishes running if your intention is not to use the memory again. You should also note that runtime heap memory cannot be accessed directly from the host APIs, so you cannot pass a pointer allocated inside a kernel as an argument to cudaMemcpy
, for example.
Maybe you should test
cudaMalloc(&foo,sizeof(int) * ARRAY_SIZE * ITERATIONS);
cudaFree(foo);
instead
for (int i = 0; i < ITERATIONS; ++ i) {
if (i == 1) cuda_malloc_timer.start(); // let it warm up one cycle
int * foo;
cudaMalloc(&foo, sizeof(int) * ARRAY_SIZE);
cudaFree(foo);
}
@talonmies answered your question on how to dynamically allocate memory within a kernel. This is intended as a supplemental answer, addressing performance of __device__ malloc()
and an alternative you might want to consider.
Allocating memory dynamically in the kernel can be tempting because it allows GPU code to look more like CPU code. But it can seriously affect performance. I wrote a self contained test and have included it below. The test launches some 2.6 million threads. Each thread populates 16 integers of global memory with some values derived from the thread index, then sums up the values and returns the sum.
The test implements two approaches. The first approach uses __device__ malloc()
and the second approach uses memory that is allocated before the kernel runs.
On my 2.0 device, the kernel runs in 1500ms when using __device__ malloc()
and 27ms when using pre-allocated memory. In other words, the test takes 56x longer to run when memory is allocated dynamically within the kernel. The time includes the outer loop cudaMalloc()
/ cudaFree()
, which is not part of the kernel. If the same kernel is launched many times with the same number of threads, as is often the case, the cost of the cudaMalloc()
/ cudaFree()
is amortized over all the kernel launches. That brings the difference even higher, to around 60x.
Speculating, I think that the performance hit is in part caused by implicit serialization. The GPU must probably serialize all simultaneous calls to __device__ malloc()
in order to provide separate chunks of memory to each caller.
The version that does not use __device__ malloc()
allocates all the GPU memory before running the kernel. A pointer to the memory is passed to the kernel. Each thread calculates an index into the previously allocated memory instead of using a __device__ malloc()
.
The potential issue with allocating memory up front is that, if only some threads need to allocate memory, and it is not known which threads those are, it will be necessary to allocate memory for all the threads. If there is not enough memory for that, it might be more efficient to reduce the number of threads per kernel call then using __device__ malloc()
. Other workarounds would probably end up reimplementing what __device__ malloc()
is doing in the background, and would see a similar performance hit.
Test the performance of __device__ malloc()
:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
const int N_ITEMS(16);
#define USE_DYNAMIC_MALLOC
__global__ void test_malloc(int* totals)
{
int tx(blockIdx.x * blockDim.x + threadIdx.x);
int* s(new int[N_ITEMS]);
for (int i(0); i < N_ITEMS; ++i) {
s[i] = tx * i;
}
int total(0);
for (int i(0); i < N_ITEMS; ++i) {
total += s[i];
}
totals[tx] = total;
delete[] s;
}
__global__ void test_malloc_2(int* items, int* totals)
{
int tx(blockIdx.x * blockDim.x + threadIdx.x);
int* s(items + tx * N_ITEMS);
for (int i(0); i < N_ITEMS; ++i) {
s[i] = tx * i;
}
int total(0);
for (int i(0); i < N_ITEMS; ++i) {
total += s[i];
}
totals[tx] = total;
}
int main()
{
cudaError_t cuda_status;
cudaSetDevice(0);
int blocks_per_launch(1024 * 10);
int threads_per_block(256);
int threads_per_launch(blocks_per_launch * threads_per_block);
int* totals_d;
cudaMalloc((void**)&totals_d, threads_per_launch * sizeof(int));
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaDeviceSynchronize();
cudaEventRecord(start, 0);
#ifdef USE_DYNAMIC_MALLOC
cudaDeviceSetLimit(cudaLimitMallocHeapSize, threads_per_launch * N_ITEMS * sizeof(int));
test_malloc<<<blocks_per_launch, threads_per_block>>>(totals_d);
#else
int* items_d;
cudaMalloc((void**)&items_d, threads_per_launch * sizeof(int) * N_ITEMS);
test_malloc_2<<<blocks_per_launch, threads_per_block>>>(items_d, totals_d);
cudaFree(items_d);
#endif
cuda_status = cudaDeviceSynchronize();
if (cuda_status != cudaSuccess) {
printf("Error: %d\n", cuda_status);
exit(1);
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("Elapsed: %f\n", elapsedTime);
int* totals_h(new int[threads_per_launch]);
cuda_status = cudaMemcpy(totals_h, totals_d, threads_per_launch * sizeof(int), cudaMemcpyDeviceToHost);
if (cuda_status != cudaSuccess) {
printf("Error: %d\n", cuda_status);
exit(1);
}
for (int i(0); i < 10; ++i) {
printf("%d ", totals_h[i]);
}
printf("\n");
cudaFree(totals_d);
delete[] totals_h;
return cuda_status;
}
Output:
C:\rd\projects\test_cuda_malloc\Release>test_cuda_malloc.exe
Elapsed: 27.311169
0 120 240 360 480 600 720 840 960 1080
C:\rd\projects\test_cuda_malloc\Release>test_cuda_malloc.exe
Elapsed: 1516.711914
0 120 240 360 480 600 720 840 960 1080
Ran an experiment based on the concepts in @rogerdahl's post. Assumptions:
The malloc+free calls local to the GPU seemed to be much faster than the cudaMalloc
+ cudaFree
calls. The program's output:
Starting timer for cuda malloc timer
Stopping timer for cuda malloc timer
timer for cuda malloc timer took 1.169631s
Starting timer for device malloc timer
Stopping timer for device malloc timer
timer for device malloc timer took 0.029794s
I'm leaving out the code for timer.h
and timer.cpp
, but here's the code for the test itself:
#include "cuda_runtime.h"
#include <stdio.h>
#include <thrust/system/cuda/error.h>
#include "timer.h"
static void CheckCudaErrorAux (const char *, unsigned, const char *, cudaError_t);
#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)
const int BLOCK_COUNT = 1;
const int THREADS_PER_BLOCK = 32;
const int ITERATIONS = 1 << 12;
const int ITERATIONS_PER_BLOCKTHREAD = ITERATIONS / (BLOCK_COUNT * THREADS_PER_BLOCK);
const int ARRAY_SIZE = 64;
void CheckCudaErrorAux (const char *file, unsigned line, const char *statement, cudaError_t err) {
if (err == cudaSuccess)
return;
std::cerr << statement<<" returned " << cudaGetErrorString(err) << "("<<err<< ") at "<<file<<":"<<line << std::endl;
exit (1);
}
__global__ void mallocai() {
for (int i = 0; i < ITERATIONS_PER_BLOCKTHREAD; ++i) {
int * foo;
foo = (int *) malloc(sizeof(int) * ARRAY_SIZE);
free(foo);
}
}
int main() {
Timer cuda_malloc_timer("cuda malloc timer");
for (int i = 0; i < ITERATIONS; ++ i) {
if (i == 1) cuda_malloc_timer.start(); // let it warm up one cycle
int * foo;
cudaMalloc(&foo, sizeof(int) * ARRAY_SIZE);
cudaFree(foo);
}
cuda_malloc_timer.stop_and_report();
CUDA_CHECK_RETURN(cudaDeviceSynchronize());
Timer device_malloc_timer("device malloc timer");
device_malloc_timer.start();
mallocai<<<BLOCK_COUNT, THREADS_PER_BLOCK>>>();
CUDA_CHECK_RETURN(cudaDeviceSynchronize());
device_malloc_timer.stop_and_report();
}
If you find mistakes, please lmk in the comments, and I'll try to fix them.
And I ran them again with larger everything:
const int BLOCK_COUNT = 56;
const int THREADS_PER_BLOCK = 1024;
const int ITERATIONS = 1 << 18;
const int ITERATIONS_PER_BLOCKTHREAD = ITERATIONS / (BLOCK_COUNT * THREADS_PER_BLOCK);
const int ARRAY_SIZE = 1024;
And cudaMalloc was still slower by a lot:
Starting timer for cuda malloc timer
Stopping timer for cuda malloc timer
timer for cuda malloc timer took 74.878016s
Starting timer for device malloc timer
Stopping timer for device malloc timer
timer for device malloc timer took 0.167331s