The following attempt shows what I am trying to do; it fails to compile:
// Empty function compiled for both the host and the device; it is not declared __global__.
__host__ __device__ void f(){}
// Host entry point that tries to launch f with the kernel-launch syntax.
int main()
{
// This is the line that fails to compile: <<<...>>> requires a __global__ function,
// but f is declared __host__ __device__.
f<<<1,1>>>();
}
The tutorial you are looking at is quite old (2008?), so it might not be compatible with the version of CUDA you are using.
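You can declare the function __global__ instead. That makes it a kernel: it is launched from host code with <<<...>>> and executes on the device (note that this is not equivalent to __host__ __device__). This works: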
// Minimal kernel: every thread derives its flat index along x and then returns.
__global__ void f()
{
    // First thread index of this block plus this thread's position inside the block.
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
}
// Host entry point: launches kernel f on one block of one thread and reports
// success (0) or failure (1) through the process exit code.
int main()
{
    f<<<1, 1>>>();

    // A kernel launch returns no status itself; configuration errors are
    // reported through the runtime's last-error state.
    if (cudaGetLastError() != cudaSuccess) {
        return 1;
    }

    // Launches are asynchronous: wait for the kernel to finish so that
    // execution errors are observed before the process exits.
    if (cudaDeviceSynchronize() != cudaSuccess) {
        return 1;
    }

    return 0;
}
You need to create a CUDA kernel entry point, i.e. a __global__ function, and call your __host__ __device__ function from it. Something like:
#include <stdio.h>
// Prints a message identifying which side it was compiled for.
// __CUDA_ARCH__ is defined only during the device compilation pass, so the
// preprocessor selects one branch for device code and the other for host code.
__host__ __device__ void f()
{
#if defined(__CUDA_ARCH__)
    // Device build: report the calling thread's x index within its block.
    printf("Device Thread %d\n", threadIdx.x);
#else
    // Host build.
    printf("Host code!\n");
#endif
}
// Kernel entry point: each launched thread simply calls f(), which runs its
// device-side branch here.
__global__ void kernel()
{
    f();
}
// Host entry point: runs f() once on the device (through kernel) and once on
// the host. Returns 0 on success and 1 if the launch or its execution failed.
int main() {
    kernel<<<1, 1>>>();

    // The launch itself returns nothing; launch-configuration errors are
    // reported through cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        // Wait for the kernel to finish; execution errors surface here.
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        fprintf(stderr, "Cuda call failed: %s\n", cudaGetErrorString(status));
    }

    // The host-side call does not depend on the GPU, so it runs either way.
    f();

    return status == cudaSuccess ? 0 : 1;
}