Multi-thread cv::dnn::Net::forward()

I have a cv::dnn::Net model that takes an image as input and outputs another one with reduced noise. The function cv::dnn::Net::forward() seems not to be multi-threaded, whether I set a single large image as input, or split the image into tiles and provide the batch of tiles as input. This makes it quite slow.

I can get a significant speed up by multi-threading it myself. I divide the tiles into as many batches as there are processors, and process all the batches in parallel.

However, I’ve found that I have to load the net separately (by calling readNet with the PB file path as the argument) in each thread. I can’t just load the model once and then call setInput(blob) and forward() on that shared instance from each thread without getting errors such as the one below. I’m not surprised - sharing a single Net between threads doesn’t look thread-safe.

OpenCV(4.6.0) Error: Assertion failed (mapIt != reuseMap.end()) in releaseReference, file C:\opencv-4.6.0\modules\dnn\src\legacy_backend.hpp, line 167

But my solution of loading the net from disk separately in each thread doesn’t seem great to me either. I also thought of deep-copying the net into each thread, but apparently this is not possible. Am I just going about this in completely the wrong way?

#include <algorithm>
#include <cstddef>
#include <exception>
#include <iostream>
#include <stdexcept>
#include <string>
#include <thread>
#include <vector>

#include <opencv2/core/utils/filesystem.hpp>
#include <opencv2/dnn/dnn.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>

cv::Mat OpenImage(const std::string& file);

int main(int argc,char** argv)
{
	int overlap=5; // tile overlap - allows edge artefacts to be cropped out
	int dim=100; // tile dimension
	
	if(argc!=3)
	{
		std::cout<<" Usage: " << argv[0] << " Model.pb Image"<<std::endl;
		return -1;
	}
	
	// Load noisy image
	cv::Mat noisy;
	noisy=OpenImage(argv[2]); // Read the noisy image
	if(noisy.empty()) // Check for invalid input
	{
		std::cout<< "Could not open or find the noisy image"<<std::endl ;
		return -1;
	}

    // Add border around noisy image for tile overlapping
	cv::Mat nsyPlusBrdr;
    cv::copyMakeBorder(noisy,nsyPlusBrdr,overlap,overlap,overlap,overlap,cv::BORDER_REFLECT);
    int sizeX=nsyPlusBrdr.cols; // image width
    int sizeY=nsyPlusBrdr.rows; // image height

	// Split into tiles
	std::vector<cv::Mat> tiles;	
	int fromRow=0; // row pixel index
	int fromCol=0; // col pixel index
	int nRows=0;
	int nCols=0;
	bool finalRow=false;
	while(true) // loop rows
	{
		nRows++;
		if(fromRow+dim>sizeY)
		{
			fromRow=sizeY-dim;
			finalRow=true;
		}
		
		bool finalCol=false;
		while(true) // loop cols
		{
			if(finalRow) nCols++; // count columns on final row only
            
			if(fromCol+dim>sizeX)
			{
				fromCol=sizeX-dim;
				finalCol=true;
			}
            
			tiles.push_back(nsyPlusBrdr(cv::Rect(fromCol,fromRow,dim,dim)));
            
			if(finalCol)
			{
				fromCol=0;
				break;
			}
			else fromCol+=(dim-2*overlap);
		}
                
		if(finalRow) break;
		else fromRow+=(dim-2*overlap);
	}
	
	std::cout<<nRows<<" rows of tiles"<<std::endl;
	std::cout<<nCols<<" columns of tiles"<<std::endl;
	
	std::vector<cv::Mat> inferred_tiles(tiles.size());
	auto f=[&](unsigned int from,unsigned int num) // lambda expression to process batch in thread
	{
		// Load Net
		cv::dnn::Net model=cv::dnn::readNet(argv[1]); // each thread has its own model duplicate
		// model.setPreferableBackend(cv::dnn::DNN_BACKEND_DEFAULT);
		// model.setPreferableTarget(cv::dnn::DNN_TARGET_OPENCL);
		
		// Infer a batch
		std::vector<cv::Mat> batch;
		for(unsigned int i=from;i<tiles.size()&&i<from+num;++i)
		{
			batch.push_back(tiles[i]);
		}
		cv::Mat blob=cv::dnn::blobFromImages(batch);
		model.setInput(blob);
		cv::Mat outblob=model.forward();
		std::vector<cv::Mat> outarray;
		cv::dnn::imagesFromBlob(outblob,outarray);
		int count=0;
		for(unsigned int i=from;i<tiles.size()&&i<from+num;++i)
		{
			inferred_tiles[i]=outarray[count];
			count++;
		}
	};
	unsigned int num_threads=std::thread::hardware_concurrency();
	if(num_threads==0) num_threads=1;
	int batch_size=tiles.size()/num_threads;
	std::vector<std::thread> threads;
	for(unsigned int i=0;i<num_threads;i++)
	{
		std::thread th(f,batch_size*i,(i==num_threads-1?tiles.size()-batch_size*i:batch_size));
		threads.push_back(std::move(th));
	}
	for(auto& i:threads) i.join();

	// Reassemble image
	cv::Mat inferred=cv::Mat::zeros(noisy.size(),noisy.type());
	fromRow=0; // row pixel index
	fromCol=0; // col pixel index
	int placeDim=dim-2*overlap; // dimension of each tile without overlap to be placed in assembled image
	int count=0;
	for(int i=0;i<nRows;++i)
	{
		if(i==nRows-1) fromRow=(sizeY-2*overlap)-placeDim;
		
		for(int j=0;j<nCols;++j)
		{
			if(j==nCols-1) fromCol=(sizeX-2*overlap)-placeDim;
            
			inferred_tiles[count](cv::Rect(overlap,overlap,placeDim,placeDim)).copyTo(
				inferred(cv::Rect(fromCol,fromRow,placeDim,placeDim)));
			count=count+1;
            
			if(j==nCols-1)
			{
				fromCol=0;
				break;
			}
			else fromCol+=placeDim;
		}
        
		fromRow+=placeDim;
	}
	inferred=cv::min(cv::max(inferred,0.),1.);
	
	// View result
	cv::namedWindow("Noisy",cv::WINDOW_NORMAL);
	cv::imshow("Noisy",noisy); // Show noisy image
	cv::namedWindow("Inferred",cv::WINDOW_NORMAL);
	cv::imshow("Inferred",inferred); // Show noisy image
	
	cv::waitKey(0); // Wait for a keystroke in the window
	return 0;
}

// Reads an image file and returns it as 32 bit float.
// 8 bit and 16 bit unsigned integer images are rescaled to the range 0 to 1;
// 16 bit (half) float and 32 bit float images keep their values. A four
// channel image is reduced to three channels with COLOR_BGRA2BGR. An empty
// cv::Mat is returned when the file does not exist, cannot be read, or has
// any other depth.
cv::Mat OpenImage(const std::string& file)
{
	if(!cv::utils::fs::exists(file)) return cv::Mat(); // nothing to read

	cv::Mat loaded=cv::imread(file,cv::IMREAD_UNCHANGED); // keep the stored depth and channel count
	if(loaded.empty()) return loaded; // reading failed

	if(loaded.channels()==4) cv::cvtColor(loaded,loaded,cv::COLOR_BGRA2BGR); // drop the fourth channel

	switch(loaded.depth())
	{
		case CV_8U: // 8 bit integer -> float in the range 0 to 1
			loaded.convertTo(loaded,CV_32F,1.0/255.0);
			break;
		case CV_16U: // 16 bit integer -> float in the range 0 to 1
			loaded.convertTo(loaded,CV_32F,1.0/65535.0);
			break;
		case CV_16F: // 16 bit float -> 32 bit float, values unchanged
			loaded.convertTo(loaded,CV_32F);
			break;
		case CV_32F: // already 32 bit float
			break;
		default: // unsupported depth
			return cv::Mat();
	}
	return loaded;
}