I copied two examples from the book 'Hands-On GPU-Accelerated Computer Vision with OpenCV and CUDA' to compare the performance of CPU & GPU.
1st code:
// CPU benchmark: run all five OpenCV threshold variants on one image and
// report the wall-clock time / FPS for the whole batch.
cv::Mat src = cv::imread("D:/Pics/Pen.jpg", 0); // Pen.jpg is a 4096 * 4096 grayscale picture; flag 0 loads it single-channel.
cv::Mat result_host1, result_host2, result_host3, result_host4, result_host5;
// Record the starting tick count; elapsed seconds = ticks / getTickFrequency().
int64 work_begin = getTickCount();
cv::threshold(src, result_host1, 128.0, 255.0, cv::THRESH_BINARY);
cv::threshold(src, result_host2, 128.0, 255.0, cv::THRESH_BINARY_INV);
cv::threshold(src, result_host3, 128.0, 255.0, cv::THRESH_TRUNC);
cv::threshold(src, result_host4, 128.0, 255.0, cv::THRESH_TOZERO);
cv::threshold(src, result_host5, 128.0, 255.0, cv::THRESH_TOZERO_INV);
// Ticks elapsed across the five threshold calls.
int64 delta = getTickCount() - work_begin;
// Timer frequency in ticks per second.
double freq = getTickFrequency();
double work_fps = freq / delta; // batches (of five thresholds) per second
std::cout << "Performance of Thresholding on CPU: " << std::endl;
std::cout << "Time: " << (1 / work_fps) << std::endl; // seconds per batch
std::cout << "FPS: " << work_fps << std::endl;
return 0;
2nd code:
// GPU benchmark: same five threshold variants via cv::cuda. Note that the
// timed region includes the host->device upload AND all five device->host
// downloads, so it measures transfer cost as well as kernel time.
cv::Mat h_img1 = cv::imread("D:/Pics/Pen.jpg", 0); // Pen.jpg is a 4096 * 4096 grayscale picture; flag 0 loads it single-channel.
cv::cuda::GpuMat d_result1, d_result2, d_result3, d_result4, d_result5, d_img1;
//Measure initial time ticks
int64 work_begin = getTickCount();
d_img1.upload(h_img1); // host -> device copy, inside the timed region
cv::cuda::threshold(d_img1, d_result1, 128.0, 255.0, cv::THRESH_BINARY);
cv::cuda::threshold(d_img1, d_result2, 128.0, 255.0, cv::THRESH_BINARY_INV);
cv::cuda::threshold(d_img1, d_result3, 128.0, 255.0, cv::THRESH_TRUNC);
cv::cuda::threshold(d_img1, d_result4, 128.0, 255.0, cv::THRESH_TOZERO);
cv::cuda::threshold(d_img1, d_result5, 128.0, 255.0, cv::THRESH_TOZERO_INV);
cv::Mat h_result1, h_result2, h_result3, h_result4, h_result5;
// device -> host copies, also inside the timed region
d_result1.download(h_result1);
d_result2.download(h_result2);
d_result3.download(h_result3);
d_result4.download(h_result4);
d_result5.download(h_result5);
//Measure difference in time ticks
int64 delta = getTickCount() - work_begin;
double freq = getTickFrequency();
//Measure frames per second
double work_fps = freq / delta;
std::cout << "Performance of Thresholding on GPU: " << std::endl;
std::cout << "Time: " << (1 / work_fps) << std::endl;
std::cout << "FPS: " << work_fps << std::endl;
return 0;
Everything works correctly, except for one problem:
"The GPU version is slower than the CPU version"
1st result:
Performance of Thresholding on CPU:
Time: 0.0475497
FPS: 21.0306
2nd result:
Performance of Thresholding on GPU:
Time: 0.599032
FPS: 1.66936
Then, I decided to exclude the upload & download times from the measurement:
3rd code:
// GPU benchmark, second attempt: the upload happens before the timer starts
// and the downloads happen after it stops, so only the five cuda::threshold
// calls fall inside the timed region. The result GpuMats are still only
// declared here, not pre-allocated — TODO confirm whether their device
// buffers get allocated inside cuda::threshold and thus inside the timing.
cv::Mat h_img1 = cv::imread("D:/Pics/Pen.jpg", 0); // Pen.jpg is a 4096 * 4096 grayscale picture; flag 0 loads it single-channel.
cv::cuda::GpuMat d_result1, d_result2, d_result3, d_result4, d_result5, d_img1;
d_img1.upload(h_img1); // host -> device copy, before timing starts
//Measure initial time ticks
int64 work_begin = getTickCount();
cv::cuda::threshold(d_img1, d_result1, 128.0, 255.0, cv::THRESH_BINARY);
cv::cuda::threshold(d_img1, d_result2, 128.0, 255.0, cv::THRESH_BINARY_INV);
cv::cuda::threshold(d_img1, d_result3, 128.0, 255.0, cv::THRESH_TRUNC);
cv::cuda::threshold(d_img1, d_result4, 128.0, 255.0, cv::THRESH_TOZERO);
cv::cuda::threshold(d_img1, d_result5, 128.0, 255.0, cv::THRESH_TOZERO_INV);
//Measure difference in time ticks
int64 delta = getTickCount() - work_begin;
double freq = getTickFrequency();
//Measure frames per second
double work_fps = freq / delta;
std::cout << "Performance of Thresholding on GPU: " << std::endl;
std::cout << "Time: " << (1 / work_fps) << std::endl;
std::cout << "FPS: " << work_fps << std::endl;
// device -> host copies, after timing stops
cv::Mat h_result1, h_result2, h_result3, h_result4, h_result5;
d_result1.download(h_result1);
d_result2.download(h_result2);
d_result3.download(h_result3);
d_result4.download(h_result4);
d_result5.download(h_result5);
return 0;
But the problem persists:
3rd result:
Performance of Thresholding on GPU:
Time: 0.136095
FPS: 7.34779
I am confused with this problem.
1st 2nd 3rd
CPU GPU GPU
Time: 0.0475497 0.599032 0.136095
FPS: 21.0306 1.66936 7.34779
Please help me.
GPU Specs:
*********************************************************
NVIDIA Quadro K2100M
Micro architecture: Kepler
Compute capability version: 3.0
CUDA Version: 10.1
*********************************************************
My System Specs:
*********************************************************
laptop hp ZBook
CPU: Intel(R) Core(TM) i7-4910MQ CPU @ 2.90GHz 2.90 GHZ
RAM: 8.00 GB
OS: Windows 7, 64-bit, Ultimate, Service Pack 1
*********************************************************
I can think of 2 reasons why the CPU version is faster even without memory operations:
1. in the 2nd and 3rd code versions you declare the result GpuMats but you don't actually initialize them, the initialization of the result GpuMats will occur inside the threshold method by a call to GpuMat.create, this leads to 80MB of GPU memory allocation for each execution, you can see the "performance improvement" by initializing the result GpuMats once and then reuse them. With the original 3rd code I get the following results(Geforce RTX 2080):
Time: 0.010208 FPS: 97.9624
When I change the code to:
...
d_resut1.create(h_img1.size(), CV_8UC1);
d_result2.create(h_img1.size(), CV_8UC1);
d_result3.create(h_img1.size(), CV_8UC1);
d_result4.create(h_img1.size(), CV_8UC1);
d_result5.create(h_img1.size(), CV_8UC1);
d_img1.upload(h_img1);
//Measure initial time ticks
int64 work_begin = getTickCount();
cv::cuda::threshold(d_img1, d_result1, 128.0, 255.0, cv::THRESH_BINARY);
cv::cuda::threshold(d_img1, d_result2, 128.0, 255.0, cv::THRESH_BINARY_INV);
cv::cuda::threshold(d_img1, d_result3, 128.0, 255.0, cv::THRESH_TRUNC);
cv::cuda::threshold(d_img1, d_result4, 128.0, 255.0, cv::THRESH_TOZERO);
cv::cuda::threshold(d_img1, d_result5, 128.0, 255.0, cv::THRESH_TOZERO_INV);
...
I get the following results(2x better) Time: 0.00503374 FPS: 198.659
While pre-allocating the result GpuMats introduces a major performance gain on the GPU, the same modification does not yield a comparable gain in the CPU version.
2. The K2100M is not a very strong GPU (576 cores @ 665 MHz). Taking into account that OpenCV is probably (depending on how you compiled it) using multi-threading with SIMD instructions under the hood for the CPU version (2.90 GHz with 8 virtual cores), the results are not that surprising.
Edit: By profiling the application using NVIDIA Nsight systems you can understand better the GPU memory operations penalties :
As you can see, only allocating and freeing the memory takes 10.5ms while the thresholding itself only takes 5ms
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.