I have a cv::dnn::Net model that takes an image as input and outputs another one with reduced noise. The function cv::dnn::Net::forward() seems not to be multi-threaded, whether I set a single large image as input, or split the image into tiles and provide the batch of tiles as input. This makes it quite slow.
I can get a significant speed up by multi-threading it myself. I divide the tiles into as many batches as there are processors, and process all the batches in parallel.
However, I’ve found that I have to load the net separately in each thread (by calling readNet with the PB file path as the argument). I can’t just load the model once and then call setInput(blob) and forward() on that shared instance from every thread without getting errors. I’m not surprised - sharing one net between threads doesn’t look thread-safe. This is the error I get:
OpenCV(4.6.0) Error: Assertion failed (mapIt != reuseMap.end()) in releaseReference, file C:\opencv-4.6.0\modules\dnn\src\legacy_backend.hpp, line 167
But loading the net from disk separately in each thread doesn’t seem like a great solution to me either. I also thought of deep-copying the net into each thread, but apparently this is not possible. Am I just going about this in completely the wrong way?
#include <algorithm>
#include <cstddef>
#include <exception>
#include <iostream>
#include <string>
#include <thread>
#include <vector>
#include <opencv2/highgui.hpp>
#include <opencv2/dnn/dnn.hpp>
#include <opencv2/core/utils/filesystem.hpp>
#include <opencv2/imgproc/imgproc.hpp>
cv::Mat OpenImage(const std::string& file);
int main(int argc,char** argv)
{
int overlap=5; // tile overlap - allows edge artefacts to be cropped out
int dim=100; // tile dimension
if(argc!=3)
{
std::cout<<" Usage: " << argv[0] << " Model.pb Image"<<std::endl;
return -1;
}
// Load noisy image
cv::Mat noisy;
noisy=OpenImage(argv[2]); // Read the noisy image
if(noisy.empty()) // Check for invalid input
{
std::cout<< "Could not open or find the noisy image"<<std::endl ;
return -1;
}
// Add border around noisy image for tile overlapping
cv::Mat nsyPlusBrdr;
cv::copyMakeBorder(noisy,nsyPlusBrdr,overlap,overlap,overlap,overlap,cv::BORDER_REFLECT);
int sizeX=nsyPlusBrdr.cols; // image width
int sizeY=nsyPlusBrdr.rows; // image height
// Split into tiles
std::vector<cv::Mat> tiles;
int fromRow=0; // row pixel index
int fromCol=0; // col pixel index
int nRows=0;
int nCols=0;
bool finalRow=false;
while(true) // loop rows
{
nRows++;
if(fromRow+dim>sizeY)
{
fromRow=sizeY-dim;
finalRow=true;
}
bool finalCol=false;
while(true) // loop cols
{
if(finalRow) nCols++; // count columns on final row only
if(fromCol+dim>sizeX)
{
fromCol=sizeX-dim;
finalCol=true;
}
tiles.push_back(nsyPlusBrdr(cv::Rect(fromCol,fromRow,dim,dim)));
if(finalCol)
{
fromCol=0;
break;
}
else fromCol+=(dim-2*overlap);
}
if(finalRow) break;
else fromRow+=(dim-2*overlap);
}
std::cout<<nRows<<" rows of tiles"<<std::endl;
std::cout<<nCols<<" columns of tiles"<<std::endl;
std::vector<cv::Mat> inferred_tiles(tiles.size());
auto f=[&](unsigned int from,unsigned int num) // lambda expression to process batch in thread
{
// Load Net
cv::dnn::Net model=cv::dnn::readNet(argv[1]); // each thread has its own model duplicate
// model.setPreferableBackend(cv::dnn::DNN_BACKEND_DEFAULT);
// model.setPreferableTarget(cv::dnn::DNN_TARGET_OPENCL);
// Infer a batch
std::vector<cv::Mat> batch;
for(unsigned int i=from;i<tiles.size()&&i<from+num;++i)
{
batch.push_back(tiles[i]);
}
cv::Mat blob=cv::dnn::blobFromImages(batch);
model.setInput(blob);
cv::Mat outblob=model.forward();
std::vector<cv::Mat> outarray;
cv::dnn::imagesFromBlob(outblob,outarray);
int count=0;
for(unsigned int i=from;i<tiles.size()&&i<from+num;++i)
{
inferred_tiles[i]=outarray[count];
count++;
}
};
unsigned int num_threads=std::thread::hardware_concurrency();
if(num_threads==0) num_threads=1;
int batch_size=tiles.size()/num_threads;
std::vector<std::thread> threads;
for(unsigned int i=0;i<num_threads;i++)
{
std::thread th(f,batch_size*i,(i==num_threads-1?tiles.size()-batch_size*i:batch_size));
threads.push_back(std::move(th));
}
for(auto& i:threads) i.join();
// Reassemble image
cv::Mat inferred=cv::Mat::zeros(noisy.size(),noisy.type());
fromRow=0; // row pixel index
fromCol=0; // col pixel index
int placeDim=dim-2*overlap; // dimension of each tile without overlap to be placed in assembled image
int count=0;
for(int i=0;i<nRows;++i)
{
if(i==nRows-1) fromRow=(sizeY-2*overlap)-placeDim;
for(int j=0;j<nCols;++j)
{
if(j==nCols-1) fromCol=(sizeX-2*overlap)-placeDim;
inferred_tiles[count](cv::Rect(overlap,overlap,placeDim,placeDim)).copyTo(
inferred(cv::Rect(fromCol,fromRow,placeDim,placeDim)));
count=count+1;
if(j==nCols-1)
{
fromCol=0;
break;
}
else fromCol+=placeDim;
}
fromRow+=placeDim;
}
inferred=cv::min(cv::max(inferred,0.),1.);
// View result
cv::namedWindow("Noisy",cv::WINDOW_NORMAL);
cv::imshow("Noisy",noisy); // Show noisy image
cv::namedWindow("Inferred",cv::WINDOW_NORMAL);
cv::imshow("Inferred",inferred); // Show noisy image
cv::waitKey(0); // Wait for a keystroke in the window
return 0;
}
// Reads the image at `file` and returns it as 32 bit float.
// - A 4 channel image is converted with COLOR_BGRA2BGR before the depth conversion.
// - 8 bit and 16 bit unsigned integer data is scaled to the range 0 to 1.
// - 16 bit float data is converted without rescaling; 32 bit float data is returned as read.
// Returns an empty cv::Mat if the file does not exist, cannot be read, or has any other depth.
cv::Mat OpenImage(const std::string& file) // returns 32 bit float
{
    if(!cv::utils::fs::exists(file)) return cv::Mat(); // no such file

    cv::Mat loaded=cv::imread(file,cv::IMREAD_UNCHANGED);
    if(loaded.empty()) return loaded; // file exists but could not be read

    if(loaded.channels()==4) cv::cvtColor(loaded,loaded,cv::COLOR_BGRA2BGR);

    switch(loaded.depth())
    {
    case CV_8U:
        loaded.convertTo(loaded,CV_32F,1.0/255.0); // 8 bit integer -> float in range 0 to 1
        break;
    case CV_16U:
        loaded.convertTo(loaded,CV_32F,1.0/65535.0); // 16 bit integer -> float in range 0 to 1
        break;
    case CV_16F:
        loaded.convertTo(loaded,CV_32F); // 16 bit float -> 32 bit float, no rescaling
        break;
    case CV_32F:
        break; // already 32 bit float
    default:
        return cv::Mat(); // unsupported depth
    }
    return loaded;
}