I have created 3 threads. Each thread creates its own cv::dnn::Network. When I use cv::dnn::Network.forward(), it crashes and I get:
{1} Thread - New shapes [1,256,3,3] make Kernels(3x3), Channels(256), Output depth(32), Groups(1) not matching weights size: 73728 vs 864
If I use cv::dnn::Network.forwardAsync() it runs fine. But I need cv::dnn::Network.forward() for cv::dnn::DNN_BACKEND_CUDA and cv::dnn::DNN_TARGET_CUDA.
It also does not seem to crash if I use a global mutex (declared in main, outside of the threads) and lock it around the cv::dnn::Network.forward() call in each thread.
Should I not be able to have an independent cv::dnn::Network in each thread?
Thread creation:
std::list<std::future<Output>> results;
for (unsigned i = 0; i < g_num_threads; ++i)
{
std::promise<Output> fc_promise;
results.push_back(fc_promise.get_future());
std::thread(Thread, raw_frames, classifier_config, std::move(fc_promise)) .detach();
}
Thread:
//--------------------------------------------------//
static void Thread(std::vector<cv::Mat> frames,
const NS_ACLS::Config& classifier_config,
std::promise<Output> output)
{
std::cout << "Started Thread\n";
cv::setNumThreads(0);
std::string name = "mobilenet_v1_coco";
std::string model =
absl::StrCat("./assets/algorithms/classifiers/", name, "/", name);
std::string model_bin = absl::StrCat(model, ".pb");
std::string model_txt = absl::StrCat(model, ".pbtxt");
cv::dnn::Target target = cv::dnn::DNN_TARGET_OPENCL;
cv::dnn::Backend backend = cv::dnn::DNN_BACKEND_INFERENCE_ENGINE;
auto network = cv::dnn::readNet(model_bin, model_txt);
// set target and backend
setGpu(target);
network.setPreferableBackend(backend);
network.setPreferableTarget(target);
int64_t counter = 0;
for (const auto& image : frames)
{
std::queue<cv::AsyncArray> future_outputs;
// pre-process and set input
cv::Mat input_blob = preProcess(image, classifier_config);
network.setInput(input_blob);
// infer
std::vector<cv::Mat> detections;
try
{
#if 0
future_outputs.push(network.forwardAsync());
while (!future_outputs.empty() &&
future_outputs.front().wait_for(std::chrono::seconds(0)))
{
cv::AsyncArray async_out = future_outputs.front();
future_outputs.pop();
cv::Mat out;
async_out.get(out);
detections = {out};
}
#else
{
// std::lock_guard<std::mutex> l(g_global_lock);
network.forward(detections);
}
#endif
}
catch (cv::Exception& e)
{
std::cout << e.what() << " - code:" << e.code << "\n";
return;
}
catch (std::exception& e)
{
std::cout << "code:" << e.code << "\n";
return;
}
catch (...)
{
CHECKF();
return;
}
processDetection(detections, image, classifier_config);
++counter;
}
output.set_value(Output(counter));
std::cout << "Finished Thread\n";
}