I am running my OpenCV CUDA project to compare the execution time between the CPU and the GPU. When I read the log file, it shows that the GPU execution time becomes noticeably slower after 6 runs. I know that the very first CUDA kernel launch takes longer because of GPU initialization, which is why the very first run has the longest execution time. My question is: why is the execution time so different after 6 runs? After series 6, the processing starts to take a long time to execute. The operations I run with CUDA are color conversion, blurring, dilation and the morphological (opening) operation.
// My OpenCV CUDA code.
// Image-processing pipeline mixing GPU (cv::cuda) and CPU (cv) stages. Every
// stage is bracketed by a pair of Start/END log lines so that per-stage timing
// can be read from the log.
LOG_INFO << "----------------------Process Program----------------------" << ENDLOG;

// Load the input image on the host.
// NOTE(review): cv::imread returns an empty Mat when the file cannot be read;
// nothing in this section checks for that before the upload below.
LOG_INFO << "Read Image - Start" << ENDLOG;
_imageCvMat = cv::imread(image_path);
LOG_INFO << "Read Image - END" << ENDLOG;

// Copy the host image to device memory.
LOG_INFO << "Upload to GPU - Start" << ENDLOG;
_imageCvCudaMat.upload(_imageCvMat);
LOG_INFO << "isContinuous " << _imageCvCudaMat.isContinuous()<< ENDLOG;
LOG_INFO << "Upload to GPU - END" << ENDLOG;

// BGR -> grayscale on the GPU.
LOG_INFO << "Convert Image Cuda - Start" << ENDLOG;
cv::cuda::cvtColor(_imageCvCudaMat, _grayCvCudaMat, cv::COLOR_BGR2GRAY);
LOG_INFO << "Convert Image Cuda - END " << ENDLOG;

// Bring the grayscale image back to the host for the CPU-only stages below.
LOG_INFO << "GPU to CPU gray image - Start" << ENDLOG;
_grayCvCudaMat.download(_grayCvMat);
LOG_INFO << "GPU to CPU gray image- END" << ENDLOG;

// 5x5 Gaussian blur on the GPU (sigma 0 lets OpenCV derive it from the kernel size).
// Both filter types are taken from the grayscale source: the destination
// matrix is only written by apply() below, so its type() is not a reliable
// input when the filter is being created.
// NOTE(review): _blurredCvCudaMat is not read again in this section; the
// threshold below works on _grayCvMat, which was downloaded before blurring.
// Confirm that is intended.
// NOTE(review): the filter objects in this section are rebuilt on every pass;
// if this code runs repeatedly, consider creating them once and reusing them.
LOG_INFO << "Blurring Image Cuda - Start " << ENDLOG;
cv::Ptr<cv::cuda::Filter> gaussian_filter = cv::cuda::createGaussianFilter(_grayCvCudaMat.type(), _grayCvCudaMat.type(), cv::Size(5, 5), 0);
gaussian_filter->apply(_grayCvCudaMat, _blurredCvCudaMat);
LOG_INFO << "Blurring Image Cuda - END " << ENDLOG;

// Inverted binary threshold on the CPU; the Otsu flag makes OpenCV pick the
// threshold value, so the 0 passed here is ignored.
LOG_INFO << "Threshold Image - Start " << ENDLOG;
cv::threshold(_grayCvMat, _threshCvmat, 0, 255, cv::THRESH_BINARY_INV | cv::THRESH_OTSU);
LOG_INFO << "Threshold Image - END " << ENDLOG;

// Euclidean (L2) distance transform of the binary mask, 5x5 mask size.
LOG_INFO << "Distance Transform Image - Start " << ENDLOG;
cv::distanceTransform(_threshCvmat, _distCvMat, cv::DIST_L2, 5);
LOG_INFO << "Distance Transform Image - END " << ENDLOG;

// Min-max scale the distance transform to the range [0, 12] as 32-bit float,
// multiply by 255, then convert to unsigned 8-bit. convertTo saturates, so
// every scaled value above 255 is clamped to 255.
// NOTE(review): an upper bound of 1.0 would map the result exactly onto
// [0, 255]; 12.0 is kept as written -- confirm it is intentional.
LOG_INFO << "Normalize Image - Start " << ENDLOG;
cv::normalize(_distCvMat, _distCvMat, 0, 12.0, cv::NORM_MINMAX, CV_32FC1);
_distCvMat = _distCvMat * 255;
_distCvMat.convertTo(_distCvMat, CV_8U);
LOG_INFO << "Normalize Image - END " << ENDLOG;

// Threshold the distance transform using Otsu's method (inverted binary, in place).
LOG_INFO << "Threshold Image - Start " << ENDLOG;
cv::threshold(_distCvMat, _distCvMat, 0, 255, cv::THRESH_BINARY_INV | cv::THRESH_OTSU);
LOG_INFO << "Threshold Image - END " << ENDLOG;

// Send the thresholded distance map to the GPU for the morphology stages.
LOG_INFO << "Threshold CPU to GPU - Start " << ENDLOG;
_distCvCudaMat.upload(_distCvMat);
LOG_INFO << "Threshold CPU to GPU - END " << ENDLOG;

// Dilation on the GPU: 7x7 rectangular kernel, default (centered) anchor, 1 iteration.
LOG_INFO << "Dilate Image - Start " << ENDLOG;
m_kernelT1CvMat = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(7,7));
cv::Ptr<cv::cuda::Filter> dilate_filter = cv::cuda::createMorphologyFilter(cv::MORPH_DILATE, _distCvCudaMat.type(), m_kernelT1CvMat, cv::Point(-1, -1), 1);
dilate_filter->apply(_distCvCudaMat, _dilateCvCudaMat);
LOG_INFO << "Dilate Image - END " << ENDLOG;

// Morphological opening on the GPU: 7x7 elliptical kernel, default anchor, 3 iterations.
LOG_INFO << "Opening morphological Image - Start " << ENDLOG;
m_kernelT2CvMat = cv::getStructuringElement(cv::MORPH_ELLIPSE, cv::Size(7, 7));
cv::Ptr<cv::cuda::Filter> open_filter = cv::cuda::createMorphologyFilter(cv::MORPH_OPEN, _dilateCvCudaMat.type(), m_kernelT2CvMat, cv::Point(-1, -1), 3);
open_filter->apply(_dilateCvCudaMat, _openningCvCudaMat);