OpenCV CUDA stream optimization

Hi all,
I want to reduce processing time by using asynchronous calls and CUDA streams.

This is my main function :

void TestCudaStream(bool WithStream)
{
//Create CUDA Streams Array
std::shared_ptr<std::vector<cv::cuda::Stream >> streamsArray = std::make_shared<std::vector<cv::cuda::Stream >>();

cv::cuda::Stream streamA, streamB, streamC, streamD;
streamsArray->push_back(streamA);
streamsArray->push_back(streamB);
streamsArray->push_back(streamC);
streamsArray->push_back(streamD);

//Create Pinned Memory (PAGE_LOCKED) arrays
std::shared_ptr<std::vector<cv::cuda::HostMem >> srcMemArray = std::make_shared<std::vector<cv::cuda::HostMem >>();
std::shared_ptr<std::vector<cv::cuda::HostMem >> dstMemArray = std::make_shared<std::vector<cv::cuda::HostMem >>();

//Create GpuMat arrays to use them on OpenCV CUDA Methods
std::shared_ptr<std::vector< cv::cuda::GpuMat >> gpuSrcArray = std::make_shared<std::vector<cv::cuda::GpuMat>>();
std::shared_ptr<std::vector< cv::cuda::GpuMat >> gpuDstArray = std::make_shared<std::vector<cv::cuda::GpuMat>>();

//Create Output array for CPU Mat
std::shared_ptr<std::vector< cv::Mat >> outArray = std::make_shared<std::vector<cv::Mat>>();

std::shared_ptr<std::vector< Ptr<cv::cuda::StereoSGM>>> tab_sbm = std::make_shared<std::vector<Ptr<cv::cuda::StereoSGM>>>();

for (int i = 0; i < 4; i++) {
    cv::cuda::GpuMat srcMat0;
    cv::cuda::GpuMat srcMat1;
    cv::cuda::GpuMat dstMat;

    cv::Mat outMat;
    string nom1 = "D:/U1_Z";
    string nom2 = "D:/U2_Z";
    string type = ".tiff";

    cv::Mat srcHostImage0 = cv::imread(nom1 + std::to_string(i) + type, IMREAD_GRAYSCALE);
    cv::cuda::HostMem srcHostMem0 = cv::cuda::HostMem(srcHostImage0, cv::cuda::HostMem::PAGE_LOCKED);
    srcMemArray->push_back(srcHostMem0);

    gpuSrcArray->push_back(srcMat0);

    cv::Mat srcHostImage1 = cv::imread(nom2 + std::to_string(i) + type, IMREAD_GRAYSCALE);
    cv::cuda::HostMem srcHostMem1 = cv::cuda::HostMem(srcHostImage1, cv::cuda::HostMem::PAGE_LOCKED);
    srcMemArray->push_back(srcHostMem1);

    gpuSrcArray->push_back(srcMat1);

    cv::cuda::HostMem srcDstMem = cv::cuda::HostMem(outMat, cv::cuda::HostMem::PAGE_LOCKED);
    dstMemArray->push_back(srcDstMem);

    gpuDstArray->push_back(dstMat);
    outArray->push_back(outMat);

    Ptr<cv::cuda::StereoSGM> ssgm = cv::cuda::createStereoSGM(0, 256, 10, 120, 5, cv::cuda::StereoSGM::MODE_HH4);

    tab_sbm->push_back(ssgm);
}

GpuTimer my_time;
my_time.Start();
if(WithStream) computeArrayWithStream(srcMemArray, dstMemArray, gpuSrcArray, gpuDstArray, outArray, streamsArray, tab_sbm);
else computeArrayWithoutStream(srcMemArray, dstMemArray, gpuSrcArray, gpuDstArray, outArray, streamsArray, tab_sbm);
my_time.Stop();
std::printf("RESIZE time (gpu ms) = %f\n", my_time.Elapsed());

}

This is the synchronous function:

void computeArrayWithoutStream(std::shared_ptr<std::vector< cv::cuda::HostMem >> srcMemArray,
std::shared_ptr<std::vector< cv::cuda::HostMem >> dstMemArray,
std::shared_ptr<std::vector< cv::cuda::GpuMat >> gpuSrcArray,
std::shared_ptr<std::vector< cv::cuda::GpuMat >> gpuDstArray,
std::shared_ptr<std::vector< cv::Mat >> outArray,
std::shared_ptr<std::vector< cv::cuda::Stream >> streamsArray,
std::shared_ptr < std::vector < Ptr< cv::cuda::StereoSGM>>> tab_sbm) {

    // Synchronous baseline: every upload/compute/download runs on the
    // default stream and blocks, so the four stereo pairs are processed
    // strictly one after another. (streamsArray is accepted for signature
    // parity with the async variant but is not used here.)
    std::vector<cv::cuda::HostMem>& srcMem = *srcMemArray;
    std::vector<cv::cuda::HostMem>& dstMem = *dstMemArray;
    std::vector<cv::cuda::GpuMat>&  gpuSrc = *gpuSrcArray;
    std::vector<cv::cuda::GpuMat>&  gpuDst = *gpuDstArray;
    std::vector<cv::Mat>&           out    = *outArray;

    for (int pair = 0; pair < 4; ++pair) {
        const int left  = 2 * pair;
        const int right = left + 1;

        // Blocking host->device copies of the pinned input images
        gpuSrc[left].upload(srcMem[left]);
        gpuSrc[right].upload(srcMem[right]);

        // Blocking disparity computation on the default stream
        (*tab_sbm)[pair]->compute(gpuSrc[left], gpuSrc[right], gpuDst[pair]);

        // Blocking device->host copy into pinned output memory
        gpuDst[pair].download(dstMem[pair]);

        // Wrap the pinned buffer in a cv::Mat header (no data copy)
        out[pair] = dstMem[pair].createMatHeader();
    }

}

This is the asynchronous function:

void computeArrayWithStream(std::shared_ptr<std::vector< cv::cuda::HostMem >> srcMemArray,
std::shared_ptr<std::vector< cv::cuda::HostMem >> dstMemArray,
std::shared_ptr<std::vector< cv::cuda::GpuMat >> gpuSrcArray,
std::shared_ptr<std::vector< cv::cuda::GpuMat >> gpuDstArray,
std::shared_ptr<std::vector< cv::Mat >> outArray,
std::shared_ptr<std::vector< cv::cuda::Stream >> streamsArray,
std::shared_ptr<std::vector < Ptr < cv::cuda::StereoSGM>>> tab_sbm) {

    // Asynchronous variant: each stereo pair is enqueued on its own CUDA
    // stream so transfers and kernels of different pairs can overlap.
    for (int pair = 0; pair < 4; ++pair) {
        cv::cuda::Stream& stream = (*streamsArray)[pair];
        const int left  = 2 * pair;
        const int right = left + 1;

        // Non-blocking host->device copies (sources are pinned, so the
        // copies can be truly asynchronous)
        (*gpuSrcArray)[left].upload((*srcMemArray)[left], stream);
        (*gpuSrcArray)[right].upload((*srcMemArray)[right], stream);

        // Enqueue the disparity computation on this pair's stream
        (*tab_sbm)[pair]->compute((*gpuSrcArray)[left], (*gpuSrcArray)[right], (*gpuDstArray)[pair], stream);

        // Non-blocking device->host copy into pinned output memory
        (*gpuDstArray)[pair].download((*dstMemArray)[pair], stream);

        // Header only -- no data is read here, so this is safe to do
        // before the stream has completed
        (*outArray)[pair] = (*dstMemArray)[pair].createMatHeader();
    }

    // All calls above are non-blocking, therefore wait for every
    // stream's completion before returning
    for (int s = 0; s < 4; ++s)
        (*streamsArray)[s].waitForCompletion();

}

My question:
If I use cv::cuda::absdiff((*gpuSrcArray)[2 * i], (*gpuSrcArray)[2 * i + 1], (*gpuDstArray)[i], (*streamsArray)[i]); (the call that is commented out in the functions above), the async version runs faster than the sync version.

But if I use (*tab_sbm)[i]->compute((*gpuSrcArray)[2 * i], (*gpuSrcArray)[2 * i + 1], (*gpuDstArray)[i], (*streamsArray)[i]); there is no measurable difference between the async and sync versions. Can you help me understand why?

It looks like the StereoSGM implementation internally uses a number of different internal streams to run a winner-takes-all algorithm. To do this it performs a lot of extra internal synchronization, which serializes the work regardless of the stream you pass in. In more detail:

  1. First, the stream you pass is used to run census_transform::censusTransform on the left and right images.
  2. Then pathAggregation.operator() is run using 8 internal streams, after first synchronizing on your stream, and then synchronizing on all of its internal streams when it is complete.
  3. Finally, a number of additional kernels are executed in your stream without synchronization.