Thanks for your helpful reply.
I found a batch-based DFT function named cufftPlanMany in the CUDA libraries. I did not use it because I assumed that assigning multiple tasks to each stream would let the streams run in parallel. However, the time cost is larger than expected. Any suggestions on how to find the cause or improve the strategy?
// This is the main part of the code
// Hand one image pair to each stream: element i of mat1_ls / mat2_ls is
// processed with stream i.
// NOTE(review): assumes mat1_ls and mat2_ls each hold at least
// cudaStreams.size() elements - confirm against the caller.
for (size_t i = 0; i < cudaStreams.size(); ++i)
{
    // Index the same container whose size bounds the loop (the original
    // indexed `cudaStream`, which does not match `cudaStreams.size()`).
    fun_single_stream(mat1_ls[i], mat2_ls[i], cudaStreams[i]);
}
/// Runs two forward DFTs, a spectrum multiplication and an inverse DFT for one
/// image pair, passing stream_local to every OpenCV CUDA call.
///
/// @param img_mat1      first input image (forward-transformed into tmp_gpu_mat1)
/// @param img_mat2      second input image (forward-transformed into tmp_gpu_mat2)
/// @param stream_local  stream handed to each cv::cuda call below
///
/// NOTE(review): tmp_gpu_mat1..3 and regionSize are not declared in this
/// snippet. If they are shared by all callers, calls issued on different
/// streams would read and write the same buffers - confirm that each stream
/// has its own temporaries.
void fun_single_stream(cv::cuda::GpuMat& img_mat1, cv::cuda::GpuMat& img_mat2, cv::cuda::Stream stream_local)
{
    // Forward transforms of both inputs (the original referenced mat1/mat2,
    // which are not the parameter names, and dropped a comma in the first call).
    cv::cuda::dft(img_mat1, tmp_gpu_mat1, regionSize, cv::DFT_SCALE, stream_local);
    cv::cuda::dft(img_mat2, tmp_gpu_mat2, regionSize, cv::DFT_SCALE, stream_local);

    // Per-element product of the two spectra; the `true` argument is conjB.
    cv::cuda::mulSpectrums(tmp_gpu_mat1, tmp_gpu_mat2, tmp_gpu_mat3, cv::DFT_COMPLEX_OUTPUT, true, stream_local);

    // Inverse-transform the product. The original fed tmp_gpu_mat1 here, which
    // left the mulSpectrums result in tmp_gpu_mat3 unused.
    cv::cuda::dft(tmp_gpu_mat3, tmp_gpu_mat2, regionSize,
                  cv::DFT_REAL_OUTPUT | cv::DFT_INVERSE | cv::DFT_COMPLEX_INPUT, stream_local);
...
}