为什么OpenCV Gpu模块的性能比VisionWorks快？

Question

I have tried several functions of the OpenCV gpu module and compared them with equivalent VisionWorks immediate-mode code. 我尝试了 OpenCV gpu 模块的几个函数，并与等效的 VisionWorks 立即模式（immediate mode）代码进行了比较。 Surprisingly, in all circumstances the OpenCV gpu module performs significantly faster than VisionWorks. 令人惊讶的是，在所有情况下，OpenCV 的 gpu 模块都明显快于 VisionWorks。

E.g., a 4-level Gaussian pyramid implemented manually using OpenCV: 例如，使用 OpenCV 手动实现的 4 级高斯金字塔：

#include <iostream>
#include <stdio.h>


#include <stdio.h>
#include <queue>
/* OPENCV RELATED */
#include <cv.h>
#include <highgui.h>
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/stitching/detail/util.hpp"
#include "opencv2/stitching/detail/warpers.hpp"
#include "opencv2/stitching/warpers.hpp"
#include <opencv2/gpu/gpu.hpp>  

#include "opencv2/opencv_modules.hpp"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/stitching/detail/autocalib.hpp"
#include "opencv2/stitching/detail/blenders.hpp"
#include "opencv2/stitching/detail/camera.hpp"
#include "opencv2/stitching/detail/exposure_compensate.hpp"
#include "opencv2/stitching/detail/matchers.hpp"
#include "opencv2/stitching/detail/motion_estimators.hpp"
#include "opencv2/stitching/detail/seam_finders.hpp"
#include "opencv2/stitching/detail/util.hpp"
#include "opencv2/stitching/detail/warpers.hpp"
#include "opencv2/stitching/warpers.hpp"
#include <opencv2/opencv.hpp>


using namespace std;
using namespace cv;

using namespace gpu;
using namespace cv::detail;


int main()
{
    Mat m = imread("br1.png");

    GpuMat d_m  = GpuMat (m);
    GpuMat d_m2;
    GpuMat l1,l2,l3,l4;
    int iter = 100;
    int64 e = getTickCount();
    float sum = 0;

    sum = 0;

    for(int i = 0 ; i < iter;  i++)
    {
        e = getTickCount();
        gpu::pyrDown(d_m,l1);
        gpu::pyrDown(l1,l2);
        gpu::pyrDown(l2,l3);
        gpu::pyrDown(l3,l4);
        sum+= (getTickCount() - e) / getTickFrequency(); 
    }

    cout <<"Time taken by Gussian Pyramid Level 4 \t\t\t"<<sum/iter<<" sec"<<endl;

    //imwrite("cv_res.jpg",res);
    return 0;
}

This takes 2.5 ms on average over 100 iterations. 100 次迭代平均需要 2.5 毫秒。 Whereas the VisionWorks version: 而 VisionWorks 版本：

    #include <VX/vx.h>
#include <VX/vxu.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <stdio.h>


#include <stdio.h>
#include <queue>
/* OPENCV RELATED */
#include <cv.h>
#include <highgui.h>
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/stitching/detail/util.hpp"
#include "opencv2/stitching/detail/warpers.hpp"
#include "opencv2/stitching/warpers.hpp"
#include <opencv2/gpu/gpu.hpp>  

#include "opencv2/opencv_modules.hpp"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/stitching/detail/autocalib.hpp"
#include "opencv2/stitching/detail/blenders.hpp"
#include "opencv2/stitching/detail/camera.hpp"
#include "opencv2/stitching/detail/exposure_compensate.hpp"
#include "opencv2/stitching/detail/matchers.hpp"
#include "opencv2/stitching/detail/motion_estimators.hpp"
#include "opencv2/stitching/detail/seam_finders.hpp"
#include "opencv2/stitching/detail/util.hpp"
#include "opencv2/stitching/detail/warpers.hpp"
#include "opencv2/stitching/warpers.hpp"
#include <opencv2/opencv.hpp>


using namespace std;
using namespace cv;

using namespace gpu;
using namespace cv::detail;



vx_image createImageFromMat(vx_context& context, cv::Mat& mat);


vx_status createMatFromImage(vx_image& image, cv::Mat& mat);


/* Entry point. */
int main(int argc,char* argv[])
{

    Mat cv_src1 = imread("br1.png", IMREAD_GRAYSCALE);
  int width = 1280;
  int height = 720;

  int half_width = width/2;
  int half_height = height/2;
    Mat dstMat(cv_src1.size(), cv_src1.type());
  Mat half_dstMat(Size(width/16,height/16),cv_src1.type());

  /* Image data. */


    if (cv_src1.empty() )
    {
        std::cerr << "Can't load input images" << std::endl;
        return -1;
    }


  /* Create our context. */
  vx_context context = vxCreateContext();

  /* Image to process. */
  vx_image image = createImageFromMat(context, cv_src1);
   //NVXIO_CHECK_REFERENCE(image);

  /* Intermediate images. */
  vx_image dx = vxCreateImage(context, width, height, VX_DF_IMAGE_S16);
  vx_image dy = vxCreateImage(context, width, height, VX_DF_IMAGE_S16);
  vx_image mag = vxCreateImage(context, width, height, VX_DF_IMAGE_S16);
  vx_image half_image = vxCreateImage(context, half_width, half_height,  VX_DF_IMAGE_U8);
  vx_image half_image_2 = vxCreateImage(context, half_width/2, half_height/2,  VX_DF_IMAGE_U8);
  vx_image half_image_3 = vxCreateImage(context, half_width/4, half_height/4,  VX_DF_IMAGE_U8);
  vx_image half_image_4 = vxCreateImage(context, half_width/8, half_height/8,  VX_DF_IMAGE_U8);


  int64 e = getTickCount();
  int iter = 100;
  float sum = 0.0;



  e = getTickCount();
  iter = 100;
  for(int i = 0 ; i < iter; i ++)
  {
    /* RESIZEZ OPERATION */
    if(vxuHalfScaleGaussian(context,image,half_image,3) != VX_SUCCESS)
    {
      cout <<"ERROR :"<<"failed to perform scaling"<<endl;
    }

    if(vxuHalfScaleGaussian(context,half_image,half_image_2,3) != VX_SUCCESS)
    {
      cout <<"ERROR :"<<"failed to perform scaling"<<endl;
    }

    if(vxuHalfScaleGaussian(context,half_image_2,half_image_3,3) != VX_SUCCESS)
    {
      cout <<"ERROR :"<<"failed to perform scaling"<<endl;
    }

    if(vxuHalfScaleGaussian(context,half_image_3,half_image_4,3) != VX_SUCCESS)
    {
      cout <<"ERROR :"<<"failed to perform scaling"<<endl;
    }


    sum += (getTickCount() - e) / getTickFrequency();  
  }

  cout <<"Resize to half " <<sum/iter<<endl;

  createMatFromImage(half_image_4,half_dstMat);

  imwrite("RES.jpg",half_dstMat);
  /* Tidy up. */
  vxReleaseImage(&dx);
  vxReleaseImage(&dy);
  vxReleaseImage(&mag);
  vxReleaseContext(&context);
}



/*
 * Creates a VX_DF_IMAGE_U8 vx_image from the pixel buffer of an OpenCV matrix.
 *
 * context: OpenVX context that will own the image.
 * mat:     source matrix; its data pointer is handed to OpenVX as a host
 *          handle (VX_IMPORT_TYPE_HOST).
 *          NOTE(review): assumes mat is single-channel 8-bit and outlives the
 *          returned image, since only the pointer is passed - confirm at the
 *          call sites.
 *
 * Returns the image produced by vxCreateImageFromHandle; the caller releases
 * it with vxReleaseImage.
 */
vx_image createImageFromMat(vx_context& context, cv::Mat& mat)
{
    // Fill the addressing structure field by field: avoids narrowing in a
    // brace initialiser and makes the meaning of each value explicit.
    vx_imagepatch_addressing_t src_addr = vx_imagepatch_addressing_t();
    src_addr.dim_x    = static_cast<vx_uint32>(mat.cols);
    src_addr.dim_y    = static_cast<vx_uint32>(mat.rows);
    src_addr.stride_x = sizeof(vx_uint8);
    // Use the matrix's real row pitch; it is larger than cols for padded or
    // ROI matrices, where cols * sizeof(vx_uint8) would skew every row.
    src_addr.stride_y = static_cast<vx_int32>(mat.step[0]);
    src_addr.scale_x  = VX_SCALE_UNITY;
    src_addr.scale_y  = VX_SCALE_UNITY;
    src_addr.step_x   = 1;
    src_addr.step_y   = 1;

    void* src_ptr = mat.data;

    return vxCreateImageFromHandle(context, VX_DF_IMAGE_U8, &src_addr, &src_ptr, VX_IMPORT_TYPE_HOST);
}


/*
 * Copies plane 0 of the valid region of an 8-bit OpenVX image into a cv::Mat.
 *
 * image: source image (expected to be VX_DF_IMAGE_U8).
 * mat:   destination; filled through cv::Mat::copyTo, which reuses the
 *        existing buffer when size and type already match and reallocates
 *        otherwise. The result owns its pixels and does not alias OpenVX
 *        memory.
 *
 * Returns VX_SUCCESS, or the first failing status from
 * vxGetValidRegionImage / vxAccessImagePatch / vxCommitImagePatch.
 */
vx_status createMatFromImage(vx_image& image, cv::Mat& mat)
{
    cout <<"Creating image "<<mat.cols << " " <<mat.rows <<endl;

    vx_rectangle_t rect;
    vx_status status = vxGetValidRegionImage(image, &rect);
    if (status != VX_SUCCESS)
    {
        return status;
    }

    // A NULL base pointer asks the implementation for a pointer to its own
    // buffer; it then fills in addr (dimensions and strides) - per the OpenVX
    // 1.0 description of vxAccessImagePatch.
    vx_imagepatch_addressing_t addr = vx_imagepatch_addressing_t();
    void* base = NULL;
    status = vxAccessImagePatch(image, &rect, 0, &addr, &base, VX_READ_ONLY);
    if (status != VX_SUCCESS)
    {
        return status;
    }

    // Wrap the mapped patch using the stride OpenVX reported (it need not be
    // equal to the row width) and deep-copy it, so mat stays valid after the
    // patch is committed and the image is released.
    const cv::Mat mapped(static_cast<int>(addr.dim_y), static_cast<int>(addr.dim_x),
                         CV_8UC1, base, static_cast<size_t>(addr.stride_y));
    mapped.copyTo(mat);

    // Every access must be paired with a commit; a NULL rectangle indicates
    // that nothing has to be written back (read-only access).
    return vxCommitImagePatch(image, NULL, 0, &addr, base);
}

This takes 11.1 ms for a single execution, and 96 ms on average over 100 iterations. 单次执行需要 11.1 毫秒，而 100 次迭代平均需要 96 毫秒。

If this is generally true, then what does VisionWorks offer? 如果这种情况普遍成立，那么 VisionWorks 的优势是什么？

I am running "cuda-repo-l4t-r21.3-6-5-local_6.5-50" version of L4T on Jetson TK1 我在Jetson TK1上运行L4T的“ cuda-repo-l4t-r21.3-6-5-local_6.5-50”版本

Answer 1

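You've made a mistake in the VisionWorks code. 您在 VisionWorks 代码中犯了一个错误。 You start the timer only once, with e = getTickCount(); right before the loop, but you need to restart it on each iteration. 您只在循环之前用 e = getTickCount(); 启动了一次计时器，但需要在每次迭代时重新启动它。 Otherwise every iteration adds the time elapsed since the start of the whole loop, which inflates the average. 否则每次迭代累加的都是从整个循环开始以来的耗时，平均值会被严重高估。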

iter = 100;
for(int i = 0 ; i < iter; i ++)
{
    // START TIMER
    e = getTickCount();

    /* RESIZEZ OPERATION */
    if(vxuHalfScaleGaussian(context,image,half_image,3) != VX_SUCCESS)
    {
        cout <<"ERROR :"<<"failed to perform scaling"<<endl;
    }

    if(vxuHalfScaleGaussian(context,half_image,half_image_2,3) != VX_SUCCESS)
    {
        cout <<"ERROR :"<<"failed to perform scaling"<<endl;
    }

    if(vxuHalfScaleGaussian(context,half_image_2,half_image_3,3) != VX_SUCCESS)
    {
        cout <<"ERROR :"<<"failed to perform scaling"<<endl;
    }

    if(vxuHalfScaleGaussian(context,half_image_3,half_image_4,3) != VX_SUCCESS)
    {
        cout <<"ERROR :"<<"failed to perform scaling"<<endl;
    }

    // STOP TIMER
    sum += (getTickCount() - e) / getTickFrequency();  
}

Answer 2

I think the following code is a mistake. 我认为以下代码有误。

  // Loads the input with the IMREAD_GRAYSCALE flag.
  Mat cv_src1 = imread("br1.png", IMREAD_GRAYSCALE);
  // Fixed dimensions: these values are not derived from cv_src1.
  int width = 1280;
  int height = 720;

I think you should set them as follows. 我认为应该按如下方式设置。

  // Loads the input with the IMREAD_GRAYSCALE flag.
  Mat cv_src1 = imread("br1.png", IMREAD_GRAYSCALE);
  // Take the dimensions from the loaded matrix instead of fixed constants.
  vx_uint32 width  = cv_src1.cols;
  vx_uint32 height = cv_src1.rows;

I also wrote sample code to reproduce this. 另外，我编写了用于复现的示例代码。
However, in my environment VisionWorks (about 0.3 ms) is faster than GpuMat (about 0.4 ms). 但是，在我的环境中，VisionWorks（约 0.3 毫秒）比 GpuMat（约 0.4 毫秒）快。

https://gist.github.com/atinfinity/9c8c067db739b190ba17f2bd8dbe75d6 https://gist.github.com/atinfinity/e8c2f2da6486be51881e3924c13a311c https://gist.github.com/atinfinity/9c8c067db739b190ba17f2bd8dbe75d6 https://gist.github.com/atinfinity/e8c2f2da6486be51881e3924c13a311c

My environment is as follows. 我的环境如下。

GPU: NVIDIA GeForce GTX 680 GPU：NVIDIA GeForce GTX 680
OS: Windows 10 Pro 64bit 操作系统：Windows 10 Pro 64bit
Compiler: Visual Studio 2013 Update5 编译器：Visual Studio 2013 Update5
VisionWorks: NVIDIA VisionWorks v1.0.25 VisionWorks：NVIDIA VisionWorks v1.0.25
OpenCV: OpenCV 3.1 OpenCV：OpenCV 3.1

为什么OpenCV Gpu模块的性能比VisionWorks快？

问题描述

2 个解决方案

解决方案1
2 2016-04-12 16:12:20

解决方案2
1 2016-04-15 14:56:05

为什么OpenCV Gpu模块的性能比VisionWorks快？

问题描述

2 个解决方案

解决方案1 2 2016-04-12 16:12:20

解决方案2 1 2016-04-15 14:56:05

解决方案1
2 2016-04-12 16:12:20

解决方案2
1 2016-04-15 14:56:05