Hi all,
I want to reduce the processing time by using asynchronous calls and CUDA streams.
This is my main function:
void TestCudaStream(bool WithStream)
{
//Create CUDA Streams Array
std::shared_ptr<std::vector<cv::cuda::Stream >> streamsArray = std::make_shared<std::vector<cv::cuda::Stream >>();cv::cuda::Stream streamA, streamB, streamC, streamD; streamsArray->push_back(streamA); streamsArray->push_back(streamB); streamsArray->push_back(streamC); streamsArray->push_back(streamD); //Create Pinned Memory (PAGE_LOCKED) arrays std::shared_ptr<std::vector<cv::cuda::HostMem >> srcMemArray = std::make_shared<std::vector<cv::cuda::HostMem >>(); std::shared_ptr<std::vector<cv::cuda::HostMem >> dstMemArray = std::make_shared<std::vector<cv::cuda::HostMem >>(); //Create GpuMat arrays to use them on OpenCV CUDA Methods std::shared_ptr<std::vector< cv::cuda::GpuMat >> gpuSrcArray = std::make_shared<std::vector<cv::cuda::GpuMat>>(); std::shared_ptr<std::vector< cv::cuda::GpuMat >> gpuDstArray = std::make_shared<std::vector<cv::cuda::GpuMat>>(); //Create Output array for CPU Mat std::shared_ptr<std::vector< cv::Mat >> outArray = std::make_shared<std::vector<cv::Mat>>(); std::shared_ptr<std::vector< Ptr<cv::cuda::StereoSGM>>> tab_sbm = std::make_shared<std::vector<Ptr<cv::cuda::StereoSGM>>>(); for (int i = 0; i < 4; i++) { cv::cuda::GpuMat srcMat0; cv::cuda::GpuMat srcMat1; cv::cuda::GpuMat dstMat; cv::Mat outMat; string nom1 = "D:/U1_Z"; string nom2 = "D:/U2_Z"; string type = ".tiff"; cv::Mat srcHostImage0 = cv::imread(nom1 + std::to_string(i) + type, IMREAD_GRAYSCALE); cv::cuda::HostMem srcHostMem0 = cv::cuda::HostMem(srcHostImage0, cv::cuda::HostMem::PAGE_LOCKED); srcMemArray->push_back(srcHostMem0); gpuSrcArray->push_back(srcMat0); cv::Mat srcHostImage1 = cv::imread(nom2 + std::to_string(i) + type, IMREAD_GRAYSCALE); cv::cuda::HostMem srcHostMem1 = cv::cuda::HostMem(srcHostImage1, cv::cuda::HostMem::PAGE_LOCKED); srcMemArray->push_back(srcHostMem1); gpuSrcArray->push_back(srcMat1); cv::cuda::HostMem srcDstMem = cv::cuda::HostMem(outMat, cv::cuda::HostMem::PAGE_LOCKED); dstMemArray->push_back(srcDstMem); gpuDstArray->push_back(dstMat); 
outArray->push_back(outMat); Ptr<cv::cuda::StereoSGM> ssgm = cv::cuda::createStereoSGM(0, 256, 10, 120, 5, cv::cuda::StereoSGM::MODE_HH4); tab_sbm->push_back(ssgm); } GpuTimer my_time; my_time.Start(); if(WithStream) computeArrayWithStream(srcMemArray, dstMemArray, gpuSrcArray, gpuDstArray, outArray, streamsArray, tab_sbm); else computeArrayWithoutStream(srcMemArray, dstMemArray, gpuSrcArray, gpuDstArray, outArray, streamsArray, tab_sbm); my_time.Stop(); std::printf("RESIZE time (gpu ms) = %f\n", my_time.Elapsed());
}
This is the synchronous function:
/// Computes the disparity of four stereo pairs with blocking calls only.
///
/// For each pair i the two pinned source buffers at indices 2*i and 2*i + 1 are
/// uploaded, the matcher tab_sbm[i] is run, the result is downloaded into
/// dstMemArray[i] and a Mat header over that buffer is stored in outArray[i].
///
/// @param srcMemArray   Pinned host inputs, two consecutive entries per pair.
/// @param dstMemArray   Pinned host outputs, one entry per pair.
/// @param gpuSrcArray   Device inputs, laid out like srcMemArray.
/// @param gpuDstArray   Device outputs, one entry per pair.
/// @param outArray      Receives one Mat header per pair.
/// @param streamsArray  Not used by this variant; kept so both variants share a signature.
/// @param tab_sbm       One StereoSGM matcher per pair.
void computeArrayWithoutStream(std::shared_ptr<std::vector< cv::cuda::HostMem >> srcMemArray,
    std::shared_ptr<std::vector< cv::cuda::HostMem >> dstMemArray,
    std::shared_ptr<std::vector< cv::cuda::GpuMat >> gpuSrcArray,
    std::shared_ptr<std::vector< cv::cuda::GpuMat >> gpuDstArray,
    std::shared_ptr<std::vector< cv::Mat >> outArray,
    std::shared_ptr<std::vector< cv::cuda::Stream >> streamsArray,
    std::shared_ptr < std::vector < Ptr< cv::cuda::StereoSGM>>> tab_sbm)
{
    constexpr int kPairCount = 4;

    // Compute each input pair with blocking (stream-less) calls.
    for (int i = 0; i < kPairCount; ++i) {
        cv::cuda::GpuMat& left = (*gpuSrcArray)[2 * i];
        cv::cuda::GpuMat& right = (*gpuSrcArray)[2 * i + 1];
        cv::cuda::GpuMat& disparity = (*gpuDstArray)[i];
        cv::cuda::HostMem& pinnedResult = (*dstMemArray)[i];

        // Upload input pinned memory to the GPU matrices.
        left.upload((*srcMemArray)[2 * i]);
        right.upload((*srcMemArray)[2 * i + 1]);

        // Use the CUDA kernel method (absdiff kept as the alternative workload).
        //cv::cuda::absdiff(left, right, disparity);
        (*tab_sbm)[i]->compute(left, right, disparity);

        // Download the result to the output pinned memory.
        disparity.download(pinnedResult);

        // Expose the pinned buffer to the CPU side through a Mat header.
        (*outArray)[i] = pinnedResult.createMatHeader();
    }
}
This is the asynchronous function:
/// Computes the disparity of four stereo pairs, issuing the work of pair i on
/// streamsArray[i].
///
/// For each pair i the two pinned source buffers at indices 2*i and 2*i + 1 are
/// uploaded, the matcher tab_sbm[i] is run and the result is downloaded into
/// dstMemArray[i], every call receiving stream i. After all work has been
/// queued the function waits for each stream and then stores a Mat header over
/// each destination buffer in outArray.
///
/// @param srcMemArray   Pinned host inputs, two consecutive entries per pair.
/// @param dstMemArray   Pinned host outputs, one entry per pair.
/// @param gpuSrcArray   Device inputs, laid out like srcMemArray.
/// @param gpuDstArray   Device outputs, one entry per pair.
/// @param outArray      Receives one Mat header per pair.
/// @param streamsArray  One stream per pair.
/// @param tab_sbm       One StereoSGM matcher per pair.
void computeArrayWithStream(std::shared_ptr<std::vector< cv::cuda::HostMem >> srcMemArray,
    std::shared_ptr<std::vector< cv::cuda::HostMem >> dstMemArray,
    std::shared_ptr<std::vector< cv::cuda::GpuMat >> gpuSrcArray,
    std::shared_ptr<std::vector< cv::cuda::GpuMat >> gpuDstArray,
    std::shared_ptr<std::vector< cv::Mat >> outArray,
    std::shared_ptr<std::vector< cv::cuda::Stream >> streamsArray,
    std::shared_ptr<std::vector < Ptr < cv::cuda::StereoSGM>>> tab_sbm)
{
    constexpr int kPairCount = 4;

    // Queue the work of every input pair on its own stream.
    for (int i = 0; i < kPairCount; ++i) {
        cv::cuda::Stream& stream = (*streamsArray)[i];
        cv::cuda::GpuMat& left = (*gpuSrcArray)[2 * i];
        cv::cuda::GpuMat& right = (*gpuSrcArray)[2 * i + 1];
        cv::cuda::GpuMat& disparity = (*gpuDstArray)[i];

        // Upload input pinned memory to the GPU matrices.
        left.upload((*srcMemArray)[2 * i], stream);
        right.upload((*srcMemArray)[2 * i + 1], stream);

        // Use the CUDA kernel method (absdiff kept as the alternative workload).
        //cv::cuda::absdiff(left, right, disparity, stream);
        (*tab_sbm)[i]->compute(left, right, disparity, stream);

        // Download the result to the output pinned memory.
        disparity.download((*dstMemArray)[i], stream);
    }

    // The calls above are issued on streams, therefore wait for each stream to
    // complete before touching the results.
    for (int i = 0; i < kPairCount; ++i) {
        (*streamsArray)[i].waitForCompletion();
    }

    // Publish the CPU-side headers only after synchronisation, so outArray never
    // refers to a buffer whose download may still be in flight.
    for (int i = 0; i < kPairCount; ++i) {
        (*outArray)[i] = (*dstMemArray)[i].createMatHeader();
    }
}
My question:
If I use cv::cuda::absdiff((*gpuSrcArray)[2 * i], (*gpuSrcArray)[2 * i + 1], (*gpuDstArray)[i], (*streamsArray)[i]);
(which is commented out in the code above), the asynchronous version runs faster than the synchronous one.
But if I use (*tab_sbm)[i]->compute((*gpuSrcArray)[2 * i], (*gpuSrcArray)[2 * i + 1], (*gpuDstArray)[i],(*streamsArray)[i]);
there is no difference between the asynchronous and synchronous versions. Can you help me understand why?