So if cuda bases Sift and Surf aren’t much faster with OpenCV does that only leave PopSift(GitHub - alicevision/popsift: PopSift is an implementation of the SIFT algorithm in CUDA.) and CudaSift(GitHub - Celebrandil/CudaSift: A CUDA implementation of SIFT for NVidia GPUs (1.2 ms on a GTX 1060)) as fast SIFT alternatives?
Because in my testing of PopSift it ran very slow, 200ms on default settings with the code below on the same gpu and 1080p video:
// main.cpp
//#include <opencv2/opencv.hpp>
//#include <opencv2/core/core.hpp>
#include <opencv2/opencv.hpp>
#include <opencv2/opencv_modules.hpp>
#include <opencv2/core.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/cudafeatures2d.hpp>
#include <opencv2/cudaarithm.hpp>
#include <opencv2/xfeatures2d/cuda.hpp>
#include <popsift/popsift.h>
#include <popsift/features.h>
#include <stdio.h>
#include <string>
#include <iostream>
#include <opencv2/features2d.hpp>
using namespace cv;
using namespace std;
using std::string;
int main()
{
cudaDeviceReset();
std::clock_t start;
popsift::Config config;
config.setDownsampling(0);
config.setFilterMaxExtrema(false);
config.setVerbose(true);
PopSift PopSift(
config,
popsift::Config::ExtractingMode,
PopSift::ByteImages
);
popsift::cuda::device_prop_t deviceInfo;
deviceInfo.print();
string filename = "1080.mp4";
VideoCapture cap(filename);
Mat frame;
if (!cap.isOpened())
{
std::cerr << "Couldn't open capture." << std::endl;
return -1;
}
cv::Mat bgr_frame, gray;
cv::cuda::GpuMat img1, img2;
cv::cuda::GpuMat keypoints1GPU;
cv::cuda::GpuMat descriptors1GPU;
for (;;)
{
cap >> frame;
if (frame.empty()) break;
cv::cvtColor(frame, gray, cv::COLOR_BGR2GRAY);
unsigned char* dataMat = gray.data;
// PopSift
SiftJob* job = PopSift.enqueue(frame.cols, frame.rows, dataMat);
start = std::clock();
popsift::Features* feature_list = job->get();
std::cout << "Time: " << (std::clock() - start) / (double)(CLOCKS_PER_SEC / 1000) << " ms" << std::endl;
cerr << "Number of feature points: " << feature_list->getFeatureCount()
<< " number of feature descriptors: " << feature_list->getDescriptorCount()
<< endl;
char c = cv::waitKey(10);
if (c == 27) break;
}
cap.release();
return 0;
}
And with CudaSift I get around 10ms but with some frames as high as 40ms
#include <iostream>
#include <cmath>
#include <iomanip>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include "cudaImage.h"
#include "cudaSift.h"
using namespace cv;
using namespace std;
using std::string;
int main(int argc, char** argv){
/* Reserve memory space for a whole bunch of SIFT features. */
SiftData siftData;
InitSiftData(siftData, 185000, true, true);
CudaImage img;
int numOctaves = 2; /* Number of octaves in Gaussian pyramid */
float initBlur = 1.0f; /* Amount of initial Gaussian blurring in standard deviations */
float thresh = 3.5f; /* Threshold on difference of Gaussians for feature pruning */
float minScale = 0.0f; /* Minimum acceptable scale to remove fine-scale features */
bool upScale = false; /* Whether to upscale image before extraction */
std::clock_t start;
cout << "Is mainSift running?";
string filename = "1080.mp4";
VideoCapture cap(filename);
Mat frame, tmp;
if (!cap.isOpened())
{
std::cerr << "Couldn't open capture." << std::endl;
return -1;
}
for (;;)
{
//cap >> frame;
cap.read(frame);
if (frame.empty()) break;
cv::imshow("frame", frame);
//cv::cvtColor(frame, gray, cv::COLOR_BGR2GRAY);
frame.convertTo(tmp, CV_32FC1);
cout << frame.cols;
/* Allocate 1280x960 pixel image with device side pitch of 1280 floats. */
/* Memory on host side already allocated by OpenCV is reused. */
int64 t1 = cv::getTickCount();
img.Allocate(frame.cols, frame.rows, 1280, false, NULL, (float*) tmp.data);
img.Download();
/* Extract SIFT features */
ExtractSift(siftData, img, numOctaves, initBlur, thresh, minScale, upScale);
int64 t2 = cv::getTickCount();
/* Free space allocated from SIFT features */
//FreeSiftData(siftData);
cout << "CURRENT END TIME OF A FRAME INFERENCE PLEASE SHOW: \n";
double time_elapsed = (t2 - t1) / cv::getTickFrequency() * 1000.0;
std::cout << "Time elapsed: " << time_elapsed << " ms" << std::endl;
char c = cv::waitKey(10);
if (c == 27) break;
}
cout << "End of loop?";
cap.release();
return 0;
}
I’m not very familiar with c, is there something wrong with the loop?
I’m not understanding how these repos claim in the order of ~1-4ms on a weaker gpu?