Using the following code:
// Number of timed iterations each benchmark below runs; the printed time is divided by this.
constexpr int N{1000};
void test_surf_performance1(){
auto surf = SURF::create();
surf->setUpright(true);
Mat src = imread("aloe.png", IMREAD_GRAYSCALE);
std::uint64_t sum = 0;
volatile auto t0 = std::chrono::high_resolution_clock::now().time_since_epoch().count();
for (int i = N; i--;){
std::vector<KeyPoint> keypoints;
std::vector<float> descriptors;
surf->detectAndCompute(src, Mat(), keypoints, descriptors);
sum += descriptors.size();
}
volatile auto t1 = std::chrono::high_resolution_clock::now().time_since_epoch().count();
std::cout << sum << std::endl;
std::cout << (t1 - t0) * hrc / N << std::endl;
}
void test_surf_performance2(){
SURF_CUDA surf;
surf.upright = true;
surf.extended = false;
GpuMat img_gpu;
GpuMat keypoints_gpu;
GpuMat descriptors_gpu;
auto img = imread("aloe.png", IMREAD_GRAYSCALE);
img_gpu.upload(img);
std::uint64_t sum = 0;
volatile auto t0 = std::chrono::high_resolution_clock::now().time_since_epoch().count();
for (int i = N; i--;){
surf(img_gpu, GpuMat(), keypoints_gpu, descriptors_gpu);
std::vector<float> descriptors;
surf.downloadDescriptors(descriptors_gpu, descriptors);
sum += descriptors.size();
}
volatile auto t1 = std::chrono::high_resolution_clock::now().time_since_epoch().count();
std::cout << sum << std::endl;
std::cout << (t1 - t0) * hrc / N << std::endl;
}
the measured time is 4.34 ms per call on the CPU and 2.13 ms per call on the GPU. Commenting out downloadDescriptors() changes the GPU timing by 0.1 ms.