I am looking to copy a GpuMat into a 1D array using a custom Kernel. I do know that opencv has function to perform the copy, I would like to do it this way since it will be extended to do some custom padding and convert the image from HWC to CHW. The issue I am having is that I am simply getting garbage after making the copy. My code is shown below.
In short, what I can tell is that when I am making the copy in the kernal from the GpuMat to the 1D array, I am misunderstanding how data is structured in the GpuMat, as a result it is not copying properly.
If anyone has any suggestions it would be greatly appreciated!!!
Thanks,
Aidan
int main() {
/* .... Some other code that is working just fine ....*/
cv::cuda::GpuMat d_frame_resized; // This is for context. You can assume that d_frame_resized contains a valid image (I verified by writing the image to disc) of dimension 320 * 640 * 3 (HWC).
int out_height = 320;
int out_width - 640;
unsigned char * h_out = copy(&d_frame_resized, out_height, out_width);
cv::Mat A(out_height, out_width, CV_8UC3, h_out);
cv::imwrite("test.jpg", A); // This image is a bunch of garbage.
}
unsigned char * copy(
cv::cuda::GpuMat* d_in,
int in_height,
int in_width
) {
// Computing the output dimensions
int out_height = in_height;
int out_width = in_width;
// Making sure the height and width is valid
if (out_width <= 0 || out_height <= 0 || in_height <= 0 || in_width <= 0 || left_pad < 0 || right_pad < 0 || top_pad < 0 || bottom_pad < 0) {
throw std::runtime_error(std::string("INVALID: out_width <= 0 || out_height <= 0 || in_height <= 0 || in_width <= 0 || left_pad < 0 || right_pad < 0 || top_pad < 0 || bottom_pad < 0 in Cuda_Frame::crop_frame"));
}
// NOTE: Size of image is dynamic, hence the dynamic determination of the threads. If there is a better way to do this, please do let me know.
// Defining initial grid dimensions
dim3 gridDim(32, 32);
// Defining initial block dimensions
dim3 blockDim(iDivUp(out_width,gridDim.x), iDivUp(out_height,gridDim.y), 3);
// Computing the number of threads per block
int num_threads_per_block = blockDim.x * blockDim.y * 3;
// Making the number of threads per block fit requirements
while (num_threads_per_block > 512 || num_threads_per_block < 32) {
// If there are too many threads in a block, we decrease the number of threads per block
if (num_threads_per_block > 512) {
gridDim.x *= 2;
gridDim.y *= 2;
blockDim.x = iDivUp(out_width,gridDim.x);
blockDim.y = iDivUp(out_height,gridDim.y);
}
else { // We increase the number of threads per block
if (blockDim.x > 2 | blockDim.y > 2) {
gridDim.x /= 2;
gridDim.y /= 2;
blockDim.x = iDivUp(out_width,gridDim.x);
blockDim.y = iDivUp(out_height,gridDim.y);
} else {
break;
}
}
num_threads_per_block = blockDim.x * blockDim.y * 3;
}
// Pointer to the frame on device memory
unsigned char* d_out;
gpuErrchk( cudaMalloc((void**)&d_out, sizeof(unsigned char) * (out_width) * (out_height) * 3));
// Launching kernel
pad_and_convert_to_chw_kernel<<<gridDim,blockDim>>>(
*d_in,
d_out,
in_width,
in_height,
out_width,
out_height
);
gpuErrchk( cudaPeekAtLastError() );
// Forcing thread to wait for crop
gpuErrchk( cudaDeviceSynchronize() );
// Creating space for the cropped frame on the host
unsigned char * h_out = (unsigned char*) malloc(sizeof(unsigned char) * out_width * out_height * 3);
// Copying the cropped frame from the device to host
gpuErrchk( cudaMemcpy(h_out, d_out, sizeof(unsigned char) * out_width * out_height * 3, cudaMemcpyDeviceToHost));
// Freeing the device output
gpuErrchk( cudaFree(d_out));
return h_out;
}
__global__ void pad_and_convert_to_chw_kernel(
cv::cuda::PtrStepSzb input,
unsigned char * output,
int in_width,
int in_height,
int out_width,
int out_height
) {
// Computing the x position of the thread relative to the frame
int out_x = (blockIdx.x * blockDim.x) + threadIdx.x;
// Computing the y position of the thread relative to the frame
int out_y = (blockIdx.y * blockDim.y) + threadIdx.y;
// Computing the z position of the thread relative to the frame
int out_in_z = threadIdx.z;
// Performing check to make sure thread position is inside frame
if (out_x >= out_width || out_y >= out_height || out_in_z >= 3) {
return;
}
// Computing the x position of the thread relative to the original frame
int in_x = out_x
// Computing the y position of the thread relative to the original frame
int in_y = out_y
// Computing the index of the thread for the flattened original frame
int in_pos = (in_y * input.step) + (in_x * 3) + out_in_z;
// Computing the index of the thread for the flattened new frame
int out_pos = (out_y * out_width) + (out_x * 3) + out_in_z;
output[out_pos] = input[in_pos];
}