Hi,
I’ve been using cv::cuda::createGoodFeaturesToTrackDetector and I found some pretty weird behaviour memory- and stream-wise.
I profiled the CUDA GFTT detector by tracing NVTX ranges and CUDA API calls.
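For reference, the instrumentation is nothing fancy, just an NVTX range around the detect call. Here is a minimal sketch of what I mean (the detector parameters and names below are placeholders, not my actual code):

#include <opencv2/cudaimgproc.hpp>
#include <nvtx3/nvToolsExt.h> // header-only NVTX3 shipped with the CUDA toolkit

// Placeholder wrapper: puts detect() inside a named NVTX range so it shows up
// as a block in the Nsight Systems timeline next to the traced CUDA API calls.
void tracedDetect(const cv::cuda::GpuMat& frame) // assumed CV_8UC1
{
    static cv::Ptr<cv::cuda::CornersDetector> detector =
        cv::cuda::createGoodFeaturesToTrackDetector(frame.type(), 1000, 0.01, 10.0);

    cv::cuda::Stream stream;  // non-default stream passed to detect()
    cv::cuda::GpuMat corners;

    nvtxRangePushA("gftt_detect");
    detector->detect(frame, corners, cv::noArray(), stream);
    stream.waitForCompletion();
    nvtxRangePop();
}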
The issues I saw were:
- The detect function runs some (but not all) of its GPU operations on the default CUDA stream, even though it is passed a non-default stream.
- The detect function also performs some very slow host-side allocations/copies/frees.
To understand this, I took a look at the CUDA GFTT implementation, from which I removed a verbose loop at the end:
void GoodFeaturesToTrackDetector::detect(InputArray _image, OutputArray _corners, InputArray _mask, Stream& stream)
{
    using namespace cv::cuda::device::gfft;

    GpuMat image = _image.getGpuMat();
    GpuMat mask = _mask.getGpuMat();

    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()) );

    ensureSizeIsEnough(image.size(), CV_32FC1, eig_);
    cornerCriteria_->compute(image, eig_, stream);

    double maxVal = 0;
    cuda::minMax(eig_, 0, &maxVal); // <-- no stream argument, and the result comes back through a host double

    cudaStream_t stream_ = StreamAccessor::getStream(stream);

    ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);

    int total = findCorners_gpu(eig_, static_cast<float>(maxVal * qualityLevel_), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols, stream_);

    if (total == 0)
    {
        _corners.release();
        return;
    }

    sortCorners_gpu(eig_, tmpCorners_.ptr<float2>(), total, stream_);

    // ... (CPU code not relevant to my issue)
}
It then becomes clear why one operation is run on the default CUDA stream → cuda::minMax cannot be given a custom CUDA stream. On top of that, cuda::minMax outputs two doubles and does not let you use a GpuMat as output, so it forces a host allocation/copy/free.
Under the hood, cuda::minMax uses cuda::findMinMax, which does accept a custom stream and can write its result to a GpuMat (see the sketch after the two listings below):
void cv::cuda::findMinMax(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream)
{
    typedef void (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _dst, Stream& stream);
    static const func_t funcs[] =
    {
        minMaxImpl<uchar, int>,
        minMaxImpl<schar, int>,
        minMaxImpl<ushort, int>,
        minMaxImpl<short, int>,
        minMaxImpl<int, int>,
        minMaxImpl<float, float>,
        minMaxImpl<double, double>
    };
    const GpuMat src = getInputMat(_src, stream);
    const GpuMat mask = getInputMat(_mask, stream);
    CV_Assert( src.channels() == 1 );
    CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
    const int src_depth = src.depth();
    const int dst_depth = src_depth < CV_32F ? CV_32S : src_depth;
    GpuMat dst = getOutputMat(_dst, 1, 2, dst_depth, stream);
    const func_t func = funcs[src.depth()];
    func(src, mask, dst, stream);
    syncOutput(dst, _dst, stream);
}
void cv::cuda::minMax(InputArray _src, double* minVal, double* maxVal, InputArray _mask)
{
    Stream& stream = Stream::Null();
    HostMem dst;
    findMinMax(_src, dst, _mask, stream);
    stream.waitForCompletion();
    double vals[2];
    dst.createMatHeader().convertTo(Mat(1, 2, CV_64FC1, &vals[0]), CV_64F);
    if (minVal)
        *minVal = vals[0];
    if (maxVal)
        *maxVal = vals[1];
}
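To make the question concrete, here is roughly the replacement I have in mind. This is just a sketch: the helper name and the reused buffers are mine, not OpenCV's, and the remaining synchronization is there because the max is needed on the host as a kernel argument for findCorners_gpu.

#include <opencv2/cudaarithm.hpp>

// Sketch of the min/max step done entirely on the caller's stream.
// minMaxDev / minMaxHost would be reused buffers (e.g. members of the detector),
// so nothing is allocated per call and nothing touches the default stream.
static float maxEigenvalueOnStream(const cv::cuda::GpuMat& eig,    // CV_32FC1, like eig_
                                   cv::cuda::GpuMat& minMaxDev,    // reused 1x2 CV_32F device buffer
                                   cv::cuda::HostMem& minMaxHost,  // reused pinned host buffer
                                   cv::cuda::Stream& stream)
{
    cv::cuda::findMinMax(eig, minMaxDev, cv::noArray(), stream); // {min, max} computed on 'stream'
    minMaxDev.download(minMaxHost, stream);                      // async D2H copy into pinned memory
    stream.waitForCompletion();   // still needed: the threshold is passed by value to findCorners_gpu
    return minMaxHost.createMatHeader().at<float>(0, 1);         // max is the second element
}

With something like this, the only remaining cost is a small async copy plus one wait on the user's stream, instead of a blocking copy on the default stream and a fresh pinned host allocation/free on every call.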
My question is then: is there a reason not to use cuda::findMinMax here that I’m completely missing?
And a follow-up: open source is new to me, but would this be a good GitHub issue to open on the OpenCV repository?
My specs:
- GPU : RTX 4060
- CUDA version : 12.6
- OpenCV version : 4.11.0
