Variable gets lost after allocating array of structs in cuda

前端 未结 1 581
温柔的废话
温柔的废话 2021-01-15 21:06

I have a structure with arrays of structures inside in C, and I need a copy of that in the GPU. For that I am writing a function that makes some cudaMalloc and

1条回答
  •  一向
    一向 (楼主)
    2021-01-15 21:16

    The problem is in the function cudaGraphMalloc where you are trying to allocate device memory to the members of outGraph which has already been allocated on the device. In process of doing so, you are de-referencing a device pointer on host which is illegal.

    To allocate device memory to members of struct type variable which exists on the device, we first have to create a temporary host variable of that struct type, then allocate device memory to its members, and then copy it to the struct which exists on the device.

    I have answered a similar question here. Please take a look at it.

    The fixed code may look like this:

    #include 
    #include 
    #include 
    
    // A point, part of some elements
    struct Node {
    
        float* position;
    
    };
    
    struct Graph {
        unsigned int nNode;
        Node* node;
        unsigned int nBoundary;
        unsigned int* boundary;
    };
    Graph* cudaGraphMalloc(const Graph* inGraph);
    #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
    inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
    {
        if (code != cudaSuccess)
        {
            fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
            if (abort) exit(code);
        }
    }
    
    __global__ void testKernel(Graph* graph, unsigned int * d_res) {
        d_res[0] = graph->nBoundary;
    
    };
    int main()
    {
    
        // Generate some fake data on the CPU
        Graph graph;
        graph.node = (Node*)malloc(2 * sizeof(Node));
        graph.boundary = (unsigned int*)malloc(3 * sizeof(unsigned int));
        for (int i = 0; i < 3; i++) {
            graph.boundary[i] = i + 10;
        }
        graph.nBoundary = 3;
        graph.nNode = 2;
        for (int i = 0; i < 2; i++) {
            // They can have different sizes in the original code
            graph.node[i].position = (float*)malloc(3 * sizeof(float));
            graph.node[i].position[0] = 45;
            graph.node[i].position[1] = 1;
            graph.node[i].position[2] = 2;
        }
    
        // allocate GPU memory
        Graph * d_graph = cudaGraphMalloc(&graph);
        // some dummy variables to test on GPU.
        unsigned int * d_res, *h_res;
        cudaMalloc((void **)&d_res, sizeof(unsigned int));
        h_res = (unsigned int*)malloc(sizeof(unsigned int));
    
        //Run kernel
        testKernel << <1, 1 >> >(d_graph, d_res);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(unsigned int), cudaMemcpyDeviceToHost));
    
        printf("%u\n", graph.nBoundary);
        printf("%u\n", h_res[0]);
    
        return 0;
    }
    
    Graph* cudaGraphMalloc(const Graph* inGraph) 
    {
        //Create auxiliary Graph variable on host
        Graph temp;
    
        //copy constants
        temp.nNode = inGraph->nNode;
        temp.nBoundary = inGraph->nBoundary;
    
        // copy boundary
        gpuErrchk(cudaMalloc((void**)&(temp.boundary), inGraph->nBoundary * sizeof(unsigned int)));
        gpuErrchk(cudaMemcpy(temp.boundary, inGraph->boundary, inGraph->nBoundary * sizeof(unsigned int), cudaMemcpyHostToDevice));
    
    
        //Create nodes 
        size_t nodeBytesTotal = temp.nNode * sizeof(Node);
        gpuErrchk(cudaMalloc((void**)&(temp.node), nodeBytesTotal));
    
        for (int i = 0; i < temp.nNode; i++)
        {
            //Create auxiliary node on host
            Node auxNodeHost;
    
            //Allocate device memory to position member of auxillary node
            size_t nodeBytes = 3 * sizeof(float);
            gpuErrchk(cudaMalloc((void**)&(auxNodeHost.position), nodeBytes));
            gpuErrchk(cudaMemcpy(auxNodeHost.position, inGraph->node[i].position, nodeBytes, cudaMemcpyHostToDevice));
    
            //Copy auxillary host node to device
            Node* dPtr = temp.node + i;
            gpuErrchk(cudaMemcpy(dPtr, &auxNodeHost, sizeof(Node), cudaMemcpyHostToDevice));
        }
    
    
        Graph* outGraph;
        gpuErrchk(cudaMalloc((void**)&outGraph, sizeof(Graph)));
        gpuErrchk(cudaMemcpy(outGraph, &temp, sizeof(Graph), cudaMemcpyHostToDevice));
    
        return outGraph;
    }
    

    Be advised that you will have to keep the host copies of internal device pointers (i.e. the auxiliary host variables). This is because you will have to free the device memory later and since you will only have a device copy of Graph in the main code, you won't be able to access its members from the host to call cudaFree on them. In this case the variable Node auxNodeHost (created in each iteration) and Graph temp are those variables.

    The above code does not do that and is just for demonstration purpose.

    Tested on Windows 10, Visual Studio 2015, CUDA 9.2, NVIDIA Driver 397.44.

    0 讨论(0)
提交回复
热议问题