OpenCV CUDA stream optimization

Hi all,
I want to reduce processing time by using asynchronous calls and CUDA streams.

This is my main function :

void TestCudaStream(bool WithStream)
{
//Create CUDA Streams Array
std::shared_ptr<std::vector<cv::cuda::Stream >> streamsArray = std::make_shared<std::vector<cv::cuda::Stream >>();

cv::cuda::Stream streamA, streamB, streamC, streamD;
streamsArray->push_back(streamA);
streamsArray->push_back(streamB);
streamsArray->push_back(streamC);
streamsArray->push_back(streamD);

//Create Pinned Memory (PAGE_LOCKED) arrays
std::shared_ptr<std::vector<cv::cuda::HostMem >> srcMemArray = std::make_shared<std::vector<cv::cuda::HostMem >>();
std::shared_ptr<std::vector<cv::cuda::HostMem >> dstMemArray = std::make_shared<std::vector<cv::cuda::HostMem >>();

//Create GpuMat arrays to use them on OpenCV CUDA Methods
std::shared_ptr<std::vector< cv::cuda::GpuMat >> gpuSrcArray = std::make_shared<std::vector<cv::cuda::GpuMat>>();
std::shared_ptr<std::vector< cv::cuda::GpuMat >> gpuDstArray = std::make_shared<std::vector<cv::cuda::GpuMat>>();

//Create Output array for CPU Mat
std::shared_ptr<std::vector< cv::Mat >> outArray = std::make_shared<std::vector<cv::Mat>>();

std::shared_ptr<std::vector< Ptr<cv::cuda::StereoSGM>>> tab_sbm = std::make_shared<std::vector<Ptr<cv::cuda::StereoSGM>>>();

for (int i = 0; i < 4; i++) {
    cv::cuda::GpuMat srcMat0;
    cv::cuda::GpuMat srcMat1;
    cv::cuda::GpuMat dstMat;

    cv::Mat outMat;
    string nom1 = "D:/U1_Z";
    string nom2 = "D:/U2_Z";
    string type = ".tiff";

    cv::Mat srcHostImage0 = cv::imread(nom1 + std::to_string(i) + type, IMREAD_GRAYSCALE);
    cv::cuda::HostMem srcHostMem0 = cv::cuda::HostMem(srcHostImage0, cv::cuda::HostMem::PAGE_LOCKED);
    srcMemArray->push_back(srcHostMem0);

    gpuSrcArray->push_back(srcMat0);

    cv::Mat srcHostImage1 = cv::imread(nom2 + std::to_string(i) + type, IMREAD_GRAYSCALE);
    cv::cuda::HostMem srcHostMem1 = cv::cuda::HostMem(srcHostImage1, cv::cuda::HostMem::PAGE_LOCKED);
    srcMemArray->push_back(srcHostMem1);

    gpuSrcArray->push_back(srcMat1);

    cv::cuda::HostMem srcDstMem = cv::cuda::HostMem(outMat, cv::cuda::HostMem::PAGE_LOCKED);
    dstMemArray->push_back(srcDstMem);

    gpuDstArray->push_back(dstMat);
    outArray->push_back(outMat);

    Ptr<cv::cuda::StereoSGM> ssgm = cv::cuda::createStereoSGM(0, 256, 10, 120, 5, cv::cuda::StereoSGM::MODE_HH4);

    tab_sbm->push_back(ssgm);
}

GpuTimer my_time;
my_time.Start();
if(WithStream) computeArrayWithStream(srcMemArray, dstMemArray, gpuSrcArray, gpuDstArray, outArray, streamsArray, tab_sbm);
else computeArrayWithoutStream(srcMemArray, dstMemArray, gpuSrcArray, gpuDstArray, outArray, streamsArray, tab_sbm);
my_time.Stop();
std::printf("RESIZE time (gpu ms) = %f\n", my_time.Elapsed());

}

This is the synchronous function:

void computeArrayWithoutStream(std::shared_ptr<std::vector< cv::cuda::HostMem >> srcMemArray,
std::shared_ptr<std::vector< cv::cuda::HostMem >> dstMemArray,
std::shared_ptr<std::vector< cv::cuda::GpuMat >> gpuSrcArray,
std::shared_ptr<std::vector< cv::cuda::GpuMat >> gpuDstArray,
std::shared_ptr<std::vector< cv::Mat >> outArray,
std::shared_ptr<std::vector< cv::cuda::Stream >> streamsArray,
std::shared_ptr < std::vector < Ptr< cv::cuda::StereoSGM>>> tab_sbm) {

    // Synchronous baseline: every upload/compute/download runs on the
    // default stream and blocks, so the four stereo pairs are processed
    // strictly one after another. (streamsArray is accepted for signature
    // parity with the async variant but is not used here.)
    std::vector<cv::cuda::HostMem>& srcMem = *srcMemArray;
    std::vector<cv::cuda::HostMem>& dstMem = *dstMemArray;
    std::vector<cv::cuda::GpuMat>&  gpuSrc = *gpuSrcArray;
    std::vector<cv::cuda::GpuMat>&  gpuDst = *gpuDstArray;
    std::vector<cv::Mat>&           out    = *outArray;

    for (int pair = 0; pair < 4; ++pair) {
        const int left  = 2 * pair;
        const int right = left + 1;

        // Blocking host->device copies of the pinned input images
        gpuSrc[left].upload(srcMem[left]);
        gpuSrc[right].upload(srcMem[right]);

        // Blocking disparity computation on the default stream
        (*tab_sbm)[pair]->compute(gpuSrc[left], gpuSrc[right], gpuDst[pair]);

        // Blocking device->host copy into pinned output memory
        gpuDst[pair].download(dstMem[pair]);

        // Wrap the pinned buffer in a cv::Mat header (no data copy)
        out[pair] = dstMem[pair].createMatHeader();
    }

}

This is the asynchronous function:

void computeArrayWithStream(std::shared_ptr<std::vector< cv::cuda::HostMem >> srcMemArray,
std::shared_ptr<std::vector< cv::cuda::HostMem >> dstMemArray,
std::shared_ptr<std::vector< cv::cuda::GpuMat >> gpuSrcArray,
std::shared_ptr<std::vector< cv::cuda::GpuMat >> gpuDstArray,
std::shared_ptr<std::vector< cv::Mat >> outArray,
std::shared_ptr<std::vector< cv::cuda::Stream >> streamsArray,
std::shared_ptr<std::vector < Ptr < cv::cuda::StereoSGM>>> tab_sbm) {

    // Asynchronous variant: each stereo pair is enqueued on its own CUDA
    // stream so transfers and kernels of different pairs can overlap.
    for (int pair = 0; pair < 4; ++pair) {
        cv::cuda::Stream& stream = (*streamsArray)[pair];
        const int left  = 2 * pair;
        const int right = left + 1;

        // Non-blocking host->device copies (sources are pinned, so the
        // copies can be truly asynchronous)
        (*gpuSrcArray)[left].upload((*srcMemArray)[left], stream);
        (*gpuSrcArray)[right].upload((*srcMemArray)[right], stream);

        // Enqueue the disparity computation on this pair's stream
        (*tab_sbm)[pair]->compute((*gpuSrcArray)[left], (*gpuSrcArray)[right], (*gpuDstArray)[pair], stream);

        // Non-blocking device->host copy into pinned output memory
        (*gpuDstArray)[pair].download((*dstMemArray)[pair], stream);

        // Header only -- no data is read here, so this is safe to do
        // before the stream has completed
        (*outArray)[pair] = (*dstMemArray)[pair].createMatHeader();
    }

    // All calls above are non-blocking, therefore wait for every
    // stream's completion before returning
    for (int s = 0; s < 4; ++s)
        (*streamsArray)[s].waitForCompletion();

}

My question:
If I use cv::cuda::absdiff((*gpuSrcArray)[2 * i], (*gpuSrcArray)[2 * i + 1], (*gpuDstArray)[i], (*streamsArray)[i]); (the call that is commented out in the functions above), the async version runs faster than the sync version.

But if I use (*tab_sbm)[i]->compute((*gpuSrcArray)[2 * i], (*gpuSrcArray)[2 * i + 1], (*gpuDstArray)[i], (*streamsArray)[i]); there is no measurable difference between the async and sync versions. Can you help me understand why?

It looks like the StereoSGM implementation internally uses a number of different internal streams to run a winner-takes-all algorithm. To do this it performs a lot of extra internal synchronization, which serializes the work regardless of the stream you pass in. In more detail:

  1. First, the stream you pass is used to run census_transform::censusTransform on the left and right images.
  2. Then pathAggregation.operator() is run using 8 internal streams, after first synchronizing on your stream, and then synchronizing on all of its internal streams when it is complete.
  3. Finally, a number of additional kernels are executed in your stream without synchronization.