How to resize cv::mat image size with a cuda kernel?

I know that there is specific OpenCV functions for this purpose, but I really would like to implement my own solution.
I would like to shrink frames from a video, each of them represented by cv::mat. The data of cv::mat is copied to GPU side, then a cuda kernel is called to use nearest neighbour algorithm. Unfortunately, when I copy the resized image from device to host side, it is just a black picture. I suppose there is some problem when I am using uchar* from the cv::Mat::data in the cuda kernel. You can check the whole source code at GitHub - foxakarmi/imageResize

Main function

#include <iostream>
#include "cuda_utils.h"
#include "yololayer.h"
#include <opencv2/highgui/highgui.hpp>

void *buffers[3];

int main() {

    cv::VideoCapture capture;
    cv::Mat frame;

    capture.open("/p.mp4");

    if (!capture.isOpened()) {
        std::cout << "can not open" << std::endl;
        return -1;
    }
    capture.read(frame);

    CUDA_CHECK(cudaMalloc(&buffers[0], frame.cols * frame.step[0]));
    CUDA_CHECK(cudaMalloc(&buffers[1], 3 * 640 * 640));
    buffers[2] = malloc(3 * 640 * 640);

    while (capture.read(frame)) {
        CUDA_CHECK(cudaMemcpy(buffers[0], frame.ptr(), frame.step[0] * frame.rows, cudaMemcpyHostToDevice))

        cudaNearestResize((uchar *) buffers[0], (uchar *) buffers[1], frame.cols, frame.rows, 640, 640);

        CUDA_CHECK(cudaMemcpy(buffers[2], buffers[1], 640 * 640 * 3, cudaMemcpyDeviceToHost))

        cv::Mat foo;
        foo.data = static_cast<uchar *>(buffers[2]);
        cv::imshow("img", foo);
        cv::waitKey(1);
    }

    capture.release();
    return 0;
}

Kernel and Kernel wrapper function

#include <opencv2/core/hal/interface.h>
#include "yololayer.h"
#include "cuda_utils.h"

__global__ void kernelNearestNeighbourResize(uchar *src_img, uchar *dst_img, int src_w, int src_h, int dst_w, int dst_h) {
    int i = blockDim.y * blockIdx.y + threadIdx.y;
    int j = blockDim.x * blockIdx.x + threadIdx.x;

    int channel = 3;

    if (i < dst_h && j < dst_w) {
        int iIn = i * src_h / dst_h;
        int jIn = j * src_w / dst_h;

        dst_img[(i * dst_w + j) * channel + 0] = src_img[(iIn * src_w + jIn) * channel + 0];
        dst_img[(i * dst_w + j) * channel + 1] = src_img[(iIn * src_w + jIn) * channel + 1];
        dst_img[(i * dst_w + j) * channel + 2] = src_img[(iIn * src_w + jIn) * channel + 2];
    }
}

cudaError_t cudaNearestResize(uchar *src_img, uchar *dst_img, int src_w, int src_h, int dst_w, int dst_h) {
    if (!src_img || !dst_img)
        return cudaErrorInvalidDevicePointer;

    if (src_w == 0 || src_h == 0 || dst_w == 0 || dst_h == 0)
        return cudaErrorInvalidValue;

    kernelNearestNeighbourResize <<< 3600, 256>>>(
            src_img, dst_img, src_w,
            src_h, dst_w, dst_h);

    return cudaGetLastError();
}

I am not sure if the code you have shown is the exact version you are using but you are trying to display an image with no size. To use buffers[2] in a cv::Mat you should do
cv::Mat foo(h, w, CV_8UC3, buffers[2]);

Your kernel assumes you are using blocks of threads however you are not passing block and grid dimensions to the kernel so your still won’t get a resized frame. I would suggest passing suitable grid block dims such as

dim3 block(32, 32);
dim3 grid((dst_w + block.x - 1) / block.x, (dst_h + block.y - 1) / block.y);
kernelNearestNeighbourResize << < grid, block >> > (src_img, dst_img, src_w,src_h, dst_w, dst_h);

Your code should now work however I would suggest testing with non symetric sizes because you have

        int jIn = j * src_w / dst_h;

and not

        int jIn = j * src_w / dst_w;

it will fail when dst_w=dst_h.

1 Like

Many thanks to you! :slightly_smiling_face: Now it is perfectly working! I bet there was something wrong with those kernel dimensions as well.