SURF_CUDA performance

Using the following code:

// Number of timed iterations; also the divisor used for the per-call figure.
constexpr int N = 1000;

void test_surf_performance1(){
	auto surf = SURF::create();
	surf->setUpright(true);
	Mat src = imread("aloe.png", IMREAD_GRAYSCALE);
	std::uint64_t sum = 0;
	volatile auto t0 = std::chrono::high_resolution_clock::now().time_since_epoch().count();
	for (int i = N; i--;){
		std::vector<KeyPoint> keypoints;
		std::vector<float> descriptors;
		surf->detectAndCompute(src, Mat(), keypoints, descriptors);
		sum += descriptors.size();
	}
	volatile auto t1 = std::chrono::high_resolution_clock::now().time_since_epoch().count();
	std::cout << sum << std::endl;
	std::cout << (t1 - t0) * hrc / N << std::endl;
}

void test_surf_performance2(){
	SURF_CUDA surf;
	surf.upright = true;
	surf.extended = false;
	GpuMat img_gpu;
	GpuMat keypoints_gpu;
	GpuMat descriptors_gpu;
	auto img = imread("aloe.png", IMREAD_GRAYSCALE);
	img_gpu.upload(img);
	std::uint64_t sum = 0;
	volatile auto t0 = std::chrono::high_resolution_clock::now().time_since_epoch().count();
	for (int i = N; i--;){
		surf(img_gpu, GpuMat(), keypoints_gpu, descriptors_gpu);
		std::vector<float> descriptors;
		surf.downloadDescriptors(descriptors_gpu, descriptors);
		sum += descriptors.size();
	}
	volatile auto t1 = std::chrono::high_resolution_clock::now().time_since_epoch().count();
	std::cout << sum << std::endl;
	std::cout << (t1 - t0) * hrc / N << std::endl;
}

the measured time is 4.34 ms per call on the CPU and 2.13 ms per call on the GPU. Commenting out downloadDescriptors() changes the GPU time per call by 0.1 ms.