Is cv::dnn::Net::forward() thread-safe?

I have created 3 threads. Each thread creates its own cv::dnn::Net. When I call cv::dnn::Net::forward(), it crashes and I get:

{1} Thread - New shapes [1,256,3,3] make Kernels(3x3), Channels(256), Output depth(32), Groups(1) not matching weights size: 73728 vs 864

If I use cv::dnn::Network.forwardAsync() it runs fine. But I need cv::dnn::Network.forward() for cv::dnn::DNN_BACKEND_CUDA and cv::dnn::DNN_TARGET_CUDA.

It also does not seem to crash if I use a global mutex in main, outside of the threads and lock cv::dnn::Network.forward() in each thread.

Should I not be able to have an independent cv::dnn::Network in each thread?

Thread creation:
// Futures collected from the per-thread promises; each worker thread
// fulfils its promise with an Output when it finishes.
std::list<std::future<Output>> results;
for (unsigned i = 0; i < g_num_threads; ++i)
{
// One promise per worker; the future is kept, the promise is moved
// into the detached thread.
std::promise<Output> fc_promise;
results.push_back(fc_promise.get_future());
// NOTE(review): detached threads cannot be joined — if main exits before a
// worker fulfils its promise, the process tears down while the thread runs.
// Presumably the caller blocks on every future in `results` before exiting;
// verify, or prefer joinable threads (std::jthread) over .detach().
std::thread(Thread, raw_frames, classifier_config, std::move(fc_promise)) .detach();
}

Thread:

//--------------------------------------------------//
// Worker-thread entry point: builds a per-thread, independent DNN, runs
// inference over `frames`, and fulfils `output` with the number of frames
// successfully processed (Output(counter)).
//
// The promise is now fulfilled on EVERY exit path (including error paths),
// so the caller's future.get() never throws std::future_error
// (broken_promise) when inference fails mid-way.
static void Thread(std::vector<cv::Mat> frames,
                   const NS_ACLS::Config& classifier_config,
                   std::promise<Output> output)
{
	std::cout << "Started Thread\n";

	// Disable OpenCV's internal thread pool in this worker — we already
	// parallelise at the thread level, so avoid oversubscription.
	cv::setNumThreads(0);

	std::string name = "mobilenet_v1_coco";
	std::string model =
	  absl::StrCat("./assets/algorithms/classifiers/", name, "/", name);
	std::string model_bin = absl::StrCat(model, ".pb");
	std::string model_txt = absl::StrCat(model, ".pbtxt");
	cv::dnn::Target target = cv::dnn::DNN_TARGET_OPENCL;
	cv::dnn::Backend backend = cv::dnn::DNN_BACKEND_INFERENCE_ENGINE;

	// Each thread creates its own network instance; nothing DNN-related is
	// shared across threads here.
	auto network = cv::dnn::readNet(model_bin, model_txt);
	// set target and backend
	setGpu(target);
	network.setPreferableBackend(backend);
	network.setPreferableTarget(target);

	int64_t counter = 0;
	for (const auto& image : frames)
		{
			std::queue<cv::AsyncArray> future_outputs;

			// pre-process and set input
			cv::Mat input_blob = preProcess(image, classifier_config);
			network.setInput(input_blob);

			// infer
			std::vector<cv::Mat> detections;
			try
				{
#if 0
					future_outputs.push(network.forwardAsync());
					while (!future_outputs.empty() &&
					       future_outputs.front().wait_for(std::chrono::seconds(0)))
						{
							cv::AsyncArray async_out = future_outputs.front();
							future_outputs.pop();
							cv::Mat out;
							async_out.get(out);
							detections = {out};
						}
#else
					{
						// std::lock_guard<std::mutex> l(g_global_lock);
						network.forward(detections);
					}
#endif
				}
			catch (const cv::Exception& e)
				{
					std::cout << e.what() << " - code:" << e.code << "\n";
					// Fulfil the promise with the partial count so the caller's
					// future.get() does not throw broken_promise.
					output.set_value(Output(counter));
					return;
				}
			catch (const std::exception& e)
				{
					// BUG FIX: std::exception has no `code` member — the original
					// `e.code` here did not compile; report what() instead.
					std::cout << e.what() << "\n";
					output.set_value(Output(counter));
					return;
				}
			catch (...)
				{
					CHECKF();
					output.set_value(Output(counter));
					return;
				}

			processDetection(detections, image, classifier_config);

			++counter;
		}

	output.set_value(Output(counter));

	std::cout << "Finished Thread\n";
}

The cv::dnn::Net::forward() call with the CUDA backend is expected to be thread-safe across independent Net instances — if separate per-thread networks interfere with each other, that is a bug worth reporting upstream.