I tested several cv::cuda filter functions and found them slower than the equivalent CPU code on a Jetson Xavier NX. The OpenCV version is 4.5.0 and the CUDA version is 10.2. The code is as follows:
const int iternum = 100;
Mat img = imread("../../LenaGRAY.bmp", 0);
resize(img, img, Size(8000, 4000));
Mat element = getStructuringElement(MORPH_RECT, Size(15, 15));
Mat dst0 = Mat::zeros(img.size(), CV_8UC1);
Mat img0 = img.clone();
double t0_start = getTickCount();
for(int i = 0; i < iternum; i++)
boxFilter(img0, dst0, CV_8UC1, Size(10, 10));
//GaussianBlur(img0, dst0, CV_8UC1, Size(5, 5), 0);
//morphologyEx(img0, dst0, MORPH_DILATE, element);
double t0_end = getTickCount();
cout<<"time:"<<(t0_end - t0_start) / getTickFrequency() * 1000 / iternum << "ms"<<endl;
cuda::GpuMat imgGpu;
cuda::GpuMat dstGpu;
dstGpu.create(img.size(), CV_8UC1);
Mat dst1;
double t1_start = getTickCount();
for(int i = 0; i < iternum; i++)
cv::Ptr<cv::cuda::Filter>f1 = cv::cuda::createBoxFilter(CV_8UC1, CV_8UC1, Size(10, 10));
f1->apply(imgGpu, dstGpu);
//cv::Ptr<cv::cuda::Filter>f2 = cv::cuda::createGaussianFilter(CV_8UC1, CV_8UC1, Size(5, 5), 0);
//f2->apply(imgGpu, dstGpu);
//cv::Ptr<cv::cuda::Filter>f3 = cv::cuda::createMorphologyFilter(MORPH_DILATE, CV_8UC1, element);
//f3->apply(imgGpu, dstGpu);
double t1_end = getTickCount();
cout<<"time:"<<(t1_end - t1_start) / getTickFrequency() * 1000 / iternum << "ms"<<endl;
I have tested boxFilter, GaussianBlur and morphologyEx; in every case the CUDA function is slower than its CPU counterpart.
I have also tested on Windows 10 PC. The GPU is NVIDIA GeForce RTX 3060, and CUDA version is 11.1. CUDA filter functions are still slower than CPU.