Hey! If someone could provide me with examples of using OpenCV CUDA with shared memory, it would be very helpful. How do we define a Mat variable in shared memory so we can avoid the upload/download process?
I don’t think it’s possible to do what you want, which sounds like a UMat structure for the cuda namespace. Mat and GpuMat are handled differently inside OpenCV and, as far as I am aware, there is no support for managed memory, so the normal workflow is explicit upload/download (see the sketch below).
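For reference, without managed memory the usual Mat/GpuMat workflow needs explicit copies in both directions, something like this (a minimal sketch; "input.png" is just a placeholder file name):

#include <opencv2/opencv.hpp>
#include <opencv2/cudafilters.hpp>

int main()
{
    cv::Mat h_img = cv::imread("input.png");   // regular host allocation
    cv::cuda::GpuMat d_img, d_out;

    d_img.upload(h_img);                        // explicit host -> device copy

    cv::Ptr<cv::cuda::Filter> sobel =
        cv::cuda::createSobelFilter(d_img.type(), d_img.type(), 1, 1);
    sobel->apply(d_img, d_out);                 // GPU work on device memory

    cv::Mat h_out;
    d_out.download(h_out);                      // explicit device -> host copy
    return 0;
}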
I came across this post on the NVIDIA forum. This piece of code is from the accepted answer in that post, though I didn’t understand it well. Can you help me out? Can you confirm whether the approach mentioned in the post is correct?
#include <iostream>
#include <cuda_runtime.h>
#include <opencv2/opencv.hpp>
#include <opencv2/videoio.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/cudafilters.hpp>
// comment out this definition to use pinned memory instead of unified memory
#define USE_UNIFIED_MEM
int main()
{
    //std::cout << cv::getBuildInformation() << std::endl;
    const char* gst = "nvarguscamerasrc ! video/x-raw(memory:NVMM), format=(string)NV12, width=(int)640, height=(int)480, framerate=(fraction)30/1 ! "
                      "nvvidconv ! video/x-raw, format=(string)BGRx, framerate=(fraction)30/1 ! "
                      "videoconvert ! queue ! video/x-raw, format=(string)BGR, framerate=(fraction)30/1 ! "
                      "appsink";
    cv::VideoCapture cap(gst, cv::CAP_GSTREAMER);
    if (!cap.isOpened()) {
        std::cout << "Failed to open camera." << std::endl;
        return -1;
    }

    unsigned int width  = cap.get(cv::CAP_PROP_FRAME_WIDTH);
    unsigned int height = cap.get(cv::CAP_PROP_FRAME_HEIGHT);
    unsigned int fps    = cap.get(cv::CAP_PROP_FPS);
    unsigned int pixels = width * height;
    std::cout << "Frame size : " << width << " x " << height << ", " << pixels << " Pixels " << fps << " FPS" << std::endl;

    cv::namedWindow("frame_out", cv::WINDOW_AUTOSIZE);
    bool hasOpenGlSupport = true;
    try {
        cv::namedWindow("d_frame_out", cv::WINDOW_AUTOSIZE | cv::WINDOW_OPENGL);
    }
    catch (cv::Exception& e) {
        hasOpenGlSupport = false;
    }

    unsigned int frameByteSize = pixels * 3;

#ifndef USE_UNIFIED_MEM
    /* Pinned memory. No cache */
    std::cout << "Using pinned memory" << std::endl;
    void *device_ptr, *host_ptr;
    cudaSetDeviceFlags(cudaDeviceMapHost);
    cudaHostAlloc((void **)&host_ptr, frameByteSize, cudaHostAllocMapped);
    cudaHostGetDevicePointer((void **)&device_ptr, (void *)host_ptr, 0);
    cv::Mat frame_out(height, width, CV_8UC3, host_ptr);
    cv::cuda::GpuMat d_frame_out(height, width, CV_8UC3, device_ptr);
#else
    /* Unified memory */
    std::cout << "Using unified memory" << std::endl;
    void *unified_ptr;
    cudaMallocManaged(&unified_ptr, frameByteSize);
    cv::Mat frame_out(height, width, CV_8UC3, unified_ptr);
    cv::cuda::GpuMat d_frame_out(height, width, CV_8UC3, unified_ptr);
#endif

    cv::Ptr<cv::cuda::Filter> filter = cv::cuda::createSobelFilter(CV_8UC3, CV_8UC3, 1, 1, 1, 1, cv::BORDER_DEFAULT);
    cv::Mat frame_in;
    while (1)
    {
        if (!cap.read(frame_in)) {
            std::cout << "Capture read error" << std::endl;
            break;
        }
        else {
            frame_in.copyTo(frame_out);
            // no need to copy to device
            filter->apply(d_frame_out, d_frame_out);
            if (hasOpenGlSupport)
                cv::imshow("d_frame_out", d_frame_out);
            // no need to copy back to host
            cv::imshow("frame_out", frame_out);
            cv::waitKey(1);
        }
    }
    cap.release();
    return 0;
}
All they are doing is allocating managed memory using the CUDA API and then explicitly creating a Mat and a GpuMat which are wrappers around that memory. Then you can pass the Mat into host functions and the GpuMat into device functions. In its current form I am not sure I see the advantage over explicitly using upload and download; stripped down, the core of it is just the sketch below.
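Without the camera and display code, the essence of what that post does is roughly this (a sketch; I am assuming a device where cudaMallocManaged is usable, e.g. a Jetson):

#include <cuda_runtime.h>
#include <opencv2/opencv.hpp>
#include <opencv2/cudafilters.hpp>

int main()
{
    const int width = 640, height = 480;
    void* unified_ptr = nullptr;
    cudaMallocManaged(&unified_ptr, (size_t)width * height * 3);  // one allocation, visible to CPU and GPU

    // Both headers wrap the same buffer; neither owns it.
    cv::Mat          frame  (height, width, CV_8UC3, unified_ptr);
    cv::cuda::GpuMat d_frame(height, width, CV_8UC3, unified_ptr);

    frame.setTo(cv::Scalar(0, 0, 0));                             // host write, no upload

    cv::Ptr<cv::cuda::Filter> sobel =
        cv::cuda::createSobelFilter(CV_8UC3, CV_8UC3, 1, 1);
    sobel->apply(d_frame, d_frame);                               // device read/write, no download
    cudaDeviceSynchronize();                                      // conservative: ensure GPU work is done before the host touches the buffer

    cudaFree(unified_ptr);
    return 0;
}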
If you wrapped the functionality in your own class which implicitly converted to Mat/GpuMat/InputArray/OutputArray, then that would be more useful because you could pass your new class to both host and device functions; see the sketch below. At the very least I would create a struct with both the Mat and the GpuMat as members so you don't lose track of which ones relate to each other.
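Something along these lines, a rough sketch of what I mean (ManagedImage is made up, not an existing OpenCV class):

#include <cuda_runtime.h>
#include <opencv2/opencv.hpp>
#include <opencv2/core/cuda.hpp>

// Hypothetical helper: one managed allocation exposed as both a Mat and a
// GpuMat, so the two views can't get out of sync with each other.
class ManagedImage
{
public:
    ManagedImage(int rows, int cols, int type)
    {
        cudaMallocManaged(&ptr_, (size_t)rows * cols * CV_ELEM_SIZE(type));
        host_   = cv::Mat(rows, cols, type, ptr_);
        device_ = cv::cuda::GpuMat(rows, cols, type, ptr_);
    }
    ~ManagedImage() { cudaFree(ptr_); }

    ManagedImage(const ManagedImage&) = delete;             // single owner of the buffer
    ManagedImage& operator=(const ManagedImage&) = delete;

    // Implicit conversions: host functions see a Mat, CUDA functions see a GpuMat.
    operator cv::Mat&()          { return host_; }
    operator cv::cuda::GpuMat&() { return device_; }

private:
    void*            ptr_ = nullptr;
    cv::Mat          host_;
    cv::cuda::GpuMat device_;
};

Host functions taking a cv::Mat& and device functions taking a cv::cuda::GpuMat& will then accept the same object directly; for InputArray/OutputArray parameters you would still add the corresponding conversion operators (or explicit accessors), since C++ won't chain two implicit user-defined conversions.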
I am probably not the best person to advise you here, as I don't use managed memory because I prefer to have more control over when I upload to and download from the device.