GPU versions of OpenCV algorithms slower than CPU versions on my machine?


While trying to speed up a simple algorithm using the GPU with OpenCV, I noticed that on my machine (Ubuntu 12.10, NVIDIA 9800 GT, CUDA 4.2.9, g++ 4.7.2) the GPU version is slower than the CPU version.

1 Answer

    Thanks to the comments from hubs and Eric, I was able to change my test so that the GPU version actually became faster than the CPU version. The mistake that caused the two versions to produce different checksums is also fixed now. ;-)

    #include <opencv2/opencv.hpp>
    #include <opencv2/gpu/gpu.hpp>
    
    #include <chrono>
    #include <iostream>
    
    int main()
    {
        using namespace cv;
        using namespace std;
    
        Mat img1(512, 512, CV_32FC3, Scalar(1.0f, 2.0f, 3.0f));
        Mat img2(128, 128, CV_32FC3, Scalar(4.0f, 5.0f, 6.0f));
        Mat img3(128, 128, CV_32FC3, Scalar(7.0f, 8.0f, 9.0f));
        Mat resultCPU(img2.rows, img2.cols, CV_32FC3, Scalar(0.0f, 0.0f, 0.0f));
    
        // CPU version: slide img2 over img1 and accumulate |roi - img2| .* img3.
        auto startCPU = chrono::high_resolution_clock::now();
        cout << "CPU ... " << flush;
        for (int y(0); y < img1.rows - img2.rows; ++y)
        {
            for (int x(0); x < img1.cols - img2.cols; ++x)
            {
                Mat roi(img1(Rect(x, y, img2.cols, img2.rows)));
                Mat diff;
                absdiff(roi, img2, diff);
                Mat diffMult(diff.mul(img3));
                resultCPU += diffMult;
            }
        }
        auto endCPU = chrono::high_resolution_clock::now();
        auto elapsedCPU = endCPU - startCPU;
        Scalar meanCPU(mean(resultCPU));
        cout << "done. " << meanCPU << " - ticks: " << elapsedCPU.count() << endl;
    
        // Upload the inputs and allocate all temporaries once, outside the timed loop,
        // so the GPU timing measures only the kernels, not transfers or allocations.
        gpu::GpuMat img1GPU(img1);
        gpu::GpuMat img2GPU(img2);
        gpu::GpuMat img3GPU(img3);
        gpu::GpuMat diffGPU(img2.rows, img2.cols, CV_32FC3);
        gpu::GpuMat diffMultGPU(img2.rows, img2.cols, CV_32FC3);
        gpu::GpuMat resultGPU(img2.rows, img2.cols, CV_32FC3, Scalar(0.0f, 0.0f, 0.0f));
    
        // GPU version: the same sliding-window computation using the gpu module.
        auto startGPU = chrono::high_resolution_clock::now();
        cout << "GPU ... " << flush;
        for (int y(0); y < img1GPU.rows - img2GPU.rows; ++y)
        {
            for (int x(0); x < img1GPU.cols - img2GPU.cols; ++x)
            {
                gpu::GpuMat roiGPU(img1GPU, Rect(x, y, img2GPU.cols, img2GPU.rows));
                gpu::absdiff(roiGPU, img2GPU, diffGPU);
                gpu::multiply(diffGPU, img3GPU, diffMultGPU);
                gpu::add(resultGPU, diffMultGPU, resultGPU);
            }
        }
        auto endGPU = chrono::high_resolution_clock::now();
        auto elapsedGPU = endGPU - startGPU;
        // Download the result to the host so both versions can be checked the same way.
        Mat downloadedResultGPU(resultGPU);
        Scalar meanGPU(mean(downloadedResultGPU));
        cout << "done. " << meanGPU << " - ticks: " << elapsedGPU.count() << endl;
    }
    

    Output:

    CPU ... done. [3.09658e+06, 3.53894e+06, 3.98131e+06, 0] - ticks: 34021332
    GPU ... done. [3.09658e+06, 3.53894e+06, 3.98131e+06, 0] - ticks: 20609880
    

    That is not the speedup I expected, but my GPU is probably just not the best for this kind of workload. Thanks, guys.
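
    In case someone else wants to check whether their card is the limiting factor: below is a minimal sketch (assuming the same OpenCV 2.4 gpu module used above) that prints the detected CUDA device and its compute capability. On an older card such as the 9800 GT, a modest speedup like the one above is not surprising.

    #include <opencv2/gpu/gpu.hpp>
    #include <iostream>
    
    int main()
    {
        using namespace cv;
        using namespace std;
    
        // Number of CUDA-capable devices the OpenCV gpu module can use.
        int deviceCount = gpu::getCudaEnabledDeviceCount();
        cout << "CUDA devices: " << deviceCount << endl;
    
        if (deviceCount > 0)
        {
            gpu::DeviceInfo info(0); // query the first device
            cout << "Name: " << info.name() << endl;
            cout << "Compute capability: " << info.majorVersion()
                 << "." << info.minorVersion() << endl;
            cout << "Multiprocessors: " << info.multiProcessorCount() << endl;
            cout << "Compatible with this OpenCV build: "
                 << (info.isCompatible() ? "yes" : "no") << endl;
        }
    }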
