Error: Number of input channels should be multiple of 3 but got 1 in function 'cv::dnn::ConvolutionLayerImpl::getMemoryShapes'

Hi!
I recently trained a model in Keras and wanted to run it in OpenCV, so I used the readNetFromTensorflow function. I make model predictions with the blobFromImage() and .forward() methods, yet I keep getting this error:

`error: OpenCV(4.5.3) Error: Number of input channels should be multiple of 3 but got 1 in function 'cv::dnn::ConvolutionLayerImpl::getMemoryShapes'`

Here is the code used to train the model:

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
model = tf.keras.models.Sequential([
    
    tf.keras.layers.Conv2D(32, (3,3), activation='relu', input_shape=(150, 150, 3)),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Conv2D(64, (3,3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2,2),
    
    tf.keras.layers.Conv2D(128, (3,3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2,2),
    
    tf.keras.layers.Conv2D(128, (3,3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2,2),
    
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation='relu'),
    
    tf.keras.layers.Dense(29, activation='softmax')
])

model.summary()
model.compile(loss = 'categorical_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])

class myCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        if logs.get('accuracy', 0) > 0.95:  # default of 0 guards against a missing key
            print("\nReached >95% accuracy so cancelling training!")
            self.model.stop_training = True
        
callbacks = myCallback()
train_datagen = ImageDataGenerator(
      rescale=1./255,
      rotation_range=40,
      width_shift_range=0.2, # Shifting image width by 20%
      height_shift_range=0.2,# Shifting image height by 20%
      shear_range=0.2,       # Shearing across X-axis by 20%
      zoom_range=0.2,        # Image zooming by 20%
      horizontal_flip=True,
      fill_mode='nearest')

train_generator = train_datagen.flow_from_directory(
    "/content/drive/MyDrive/train_asl",
    target_size = (150, 150),
    class_mode = 'categorical',
    batch_size = 20)
validation_datagen = ImageDataGenerator(rescale=1./255)

validation_generator = validation_datagen.flow_from_directory(
    "/content/drive/MyDrive/test_asl",
    target_size = (150, 150),
    class_mode = 'categorical',
    batch_size = 20
)
import numpy as np
history = model.fit(  # fit_generator is deprecated; fit() accepts generators directly
      train_generator,
      steps_per_epoch = int(np.ceil(870/20)),   # 870 images / batch size of 20
      epochs = 80,
      validation_data = validation_generator,
      validation_steps = int(np.ceil(870/20)),  # 870 images / batch size of 20
      callbacks = [callbacks],
      verbose = 2)
model.save('mymodel6', save_format='tf')  # SavedModel format; 'pb' is not a valid save_format ('tf' or 'h5')

Here is the code used to run the OpenCV Image Recognition Program:

import numpy as np
import cv2
import keras
from keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf

model = keras.models.load_model(r"C:\Users\mymodel3.pb")  # raw string: "\U" is otherwise an invalid escape

background = None
accumulated_weight = 0.5

ROI_top = 100
ROI_bottom = 300
ROI_right = 150
ROI_left = 350



def cal_accum_avg(frame, accumulated_weight):

    global background
    
    if background is None:
        background = frame.copy().astype("float")
        return None

    cv2.accumulateWeighted(frame, background, accumulated_weight)



def segment_hand(frame, threshold=25):
    global background
    
    diff = cv2.absdiff(background.astype("uint8"), frame)

    
    _ , thresholded = cv2.threshold(diff, threshold, 255, cv2.THRESH_BINARY)
    
    #Fetching contours in the frame (These contours can be of hand or any other object in foreground) ...
    contours, hierarchy = cv2.findContours(thresholded.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)  # OpenCV 4.x returns two values

    # If length of contours list = 0, means we didn't get any contours...
    if len(contours) == 0:
        return None
    else:
        # The largest external contour should be the hand 
        hand_segment_max_cont = max(contours, key=cv2.contourArea)
        
        # Returning the hand segment(max contour) and the thresholded image of hand...
        return (thresholded, hand_segment_max_cont)

cam = cv2.VideoCapture(0)
num_frames = 0
while True:
    ret, frame = cam.read()

    # flipping the frame to prevent a mirrored image of the captured frame...
    frame = cv2.flip(frame, 1)

    frame_copy = frame.copy()

    # ROI from the frame
    roi = frame[ROI_top:ROI_bottom, ROI_right:ROI_left]

    gray_frame = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    gray_frame = cv2.GaussianBlur(gray_frame, (9, 9), 0)


    if num_frames < 70:
        
        cal_accum_avg(gray_frame, accumulated_weight)
        
        cv2.putText(frame_copy, "FETCHING BACKGROUND...PLEASE WAIT", (80, 400), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0,0,255), 2)
    
    else: 
        # segmenting the hand region
        hand = segment_hand(gray_frame)
        

        # Checking if we are able to detect the hand...
        if hand is not None:
            
            thresholded, hand_segment = hand

            # Drawing contours around hand segment
            cv2.drawContours(frame_copy, [hand_segment + (ROI_right, ROI_top)], -1, (255, 0, 0),1)
            
            cv2.imshow("Thesholded Hand Image", thresholded)
            
            thresholded = cv2.resize(thresholded, (64, 64))
            thresholded = cv2.cvtColor(thresholded, cv2.COLOR_GRAY2RGB)
            thresholded = np.reshape(thresholded, (1,thresholded.shape[0],thresholded.shape[1],3))

            
            model.setInput(cv2.dnn.blobFromImage(thresholded,size=(300,300),swapRB=True,crop=False))
            predict = model.forward()
            cv2.putText(frame_copy, word_dict[np.argmax(predict)], (170, 45), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,0,255), 2)
            
    # Draw ROI on frame_copy
    cv2.rectangle(frame_copy, (ROI_left, ROI_top), (ROI_right, ROI_bottom), (255,128,0), 3)

    # incrementing the number of frames for tracking
    num_frames += 1

    # Display the frame with segmented hand
    cv2.putText(frame_copy, "DataFlair hand sign recognition_ _ _", (10, 20), cv2.FONT_ITALIC, 0.5, (51,255,51), 1)
    cv2.imshow("Sign Detection", frame_copy)


    # Close windows with Esc
    k = cv2.waitKey(1) & 0xFF

    if k == 27:
        break

# Release the camera and destroy all the windows
cam.release()
cv2.destroyAllWindows()

i guess your model was trained on rgb images
(and now you feed it a thresholded 1-channel image)
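
you can see the mismatch directly from the blob shapes (a quick sketch):

import cv2, numpy as np

gray = np.zeros((150, 150), np.uint8)     # 1-channel, like your thresholded image
bgr  = np.zeros((150, 150, 3), np.uint8)  # 3-channel, like your training data

print(cv2.dnn.blobFromImage(gray).shape)  # (1, 1, 150, 150) -> triggers the channel error
print(cv2.dnn.blobFromImage(bgr).shape)   # (1, 3, 150, 150) -> what your first conv expects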

can you show one of your train images ?
(if those are tightly cropped, you probably should crop the (bgr !) frame to the hand rect, and feed that into your nn)

I’m not quite sure what you mean… but this is one of my training images
[attached training image: B0002_test]

i’d suggest you throw out the whole thresholding / findContours part and use the bgr frame as is (maybe cropped with your fixed(?) ROI_xxx coords)

(having different train / inference image processing is a bad idea anyway)
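
something like this (a sketch only, reusing your ROI coords; the .pb filename is a placeholder):

import cv2

ROI_top, ROI_bottom, ROI_right, ROI_left = 100, 300, 150, 350  # your fixed roi coords

net = cv2.dnn.readNetFromTensorflow("frozen_graph.pb")  # put your .pb path here
cam = cv2.VideoCapture(0)
ret, frame = cam.read()

roi = frame[ROI_top:ROI_bottom, ROI_right:ROI_left]  # plain bgr crop, no thresholding

# 1/255 to match your ImageDataGenerator rescale, 150x150 to match training,
# swapRB=True because opencv reads bgr but the model was trained on rgb
blob = cv2.dnn.blobFromImage(roi, scalefactor=1.0/255, size=(150, 150), swapRB=True, crop=False)
net.setInput(blob)
out = net.forward()  # (1, 29) class scores
print(out.shape, out.argmax())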

btw, proof that it wants 3 channels !
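
you can check that yourself by looking at the first conv kernel of the loaded net (a sketch; layer access via getLayerId/getLayer):

import cv2

net = cv2.dnn.readNet("frozen_graph.pb")   # put your .pb path here
for name in net.getLayerNames():
    layer = net.getLayer(net.getLayerId(name))
    if layer.blobs:                        # first layer that carries weights
        print(name, layer.blobs[0].shape)  # e.g. (32, 3, 3, 3) -> 3 input channels
        break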

and please, close your issue here (it’s not a library bug, but an error on your side !)

Hi,
I’m new to OpenCV, so I’m not quite sure what you mean by BGR frames. I removed all the extra findContours parts and am still getting the same error. I also changed the size numbers in the blobFromImage() function - no change.
Any suggestions?

My new code is below:

from imutils.video import VideoStream
from imutils.video import FPS
import numpy as np
import argparse
import imutils
import time
import cv2


ap = argparse.ArgumentParser()
ap.add_argument("-c", "--confidence", type=float, default=0.8,
    help="minimum probability to filter weak detections")
args = vars(ap.parse_args())


CLASSES = ["a", "b", "c", "d", "del", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "nothing", "o", "p", "q", "r", "s", "space", "t", "u", "v", "w", "x", "y", "z"]


# Path to the frozen model graph used by cv2.dnn
weightsPath = "/home/mhhs/Downloads/frozen_graph.pb"


COLORS = np.random.uniform(0, 255, size=(len(CLASSES), 3))

cvNet = cv2.dnn.readNetFromTensorflow(weightsPath)

cap = cv2.VideoCapture(0)  

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    h = frame.shape[0]
    w = frame.shape[1]
    img = np.array(frame)
    cvNet.setInput(cv2.dnn.blobFromImage(img, size=(150, 150), swapRB=True, crop=False))
    detections = cvNet.forward()


    for i in np.arange(0, detections.shape[2]):
        # extract the confidence (i.e., probability) associated with
        # the prediction
        confidence = detections[0, 0, i, 2]

        if confidence > args["confidence"]:
            idx = int(detections[0, 0, i, 1])
            print(idx)
            box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
            (startX, startY, endX, endY) = box.astype("int")

            label = "{}: {:.2f}%".format(CLASSES[idx],
                confidence * 100)
            cv2.rectangle(img, (startX, startY), (endX, endY),
                COLORS[idx], 2)
            y = startY - 15 if startY - 15 > 15 else startY + 15
            cv2.putText(img, label, (startX, y),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, COLORS[idx], 2)

            #print(label)


    out_img = cv2.resize(img, (640, 480))
    cv2.imshow('img', out_img)
    if cv2.waitKey(25) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

i’m having a hard time believing it (simply works for me …)

import cv2, numpy as np
print(cv2.__version__)

n = cv2.dnn.readNet("mdl.pb") # link taken from gh issue
print(n.empty())

im = cv2.imread("img.png") # your image above ! 
b = cv2.dnn.blobFromImage(im, size=(150, 150), swapRB=True, crop=False)
   
n.setInput(b)
o = n.forward()
print(o.shape)
print(np.argmax(o))

############
4.5.5-dev
False
(1, 29)
8

I’m a little confused as to why this is happening… could it be that I’m using a video stream instead of a single image?
I probably should have mentioned: I am now getting an error at the np.arange(0, detections.shape[2]) line, which says “IndexError: tuple index out of range”.

yea, the output shape is (1,29), so there is no detections.shape[2]

you blindly copypasted that from an object detection code ?
your network does classification only, and all you need is a

gesture_id = np.argmax(network_output)

(throw away all that useless junk below cvNet.forward())
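
your whole loop then boils down to something like this (a sketch, reusing your CLASSES list and .pb path):

import cv2
import numpy as np

CLASSES = ["a", "b", "c", "d", "del", "e", "f", "g", "h", "i", "j", "k", "l", "m",
           "n", "nothing", "o", "p", "q", "r", "s", "space", "t", "u", "v", "w",
           "x", "y", "z"]

cvNet = cv2.dnn.readNetFromTensorflow("/home/mhhs/Downloads/frozen_graph.pb")
cap = cv2.VideoCapture(0)

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    cvNet.setInput(cv2.dnn.blobFromImage(frame, size=(150, 150), swapRB=True, crop=False))
    out = cvNet.forward()        # (1, 29) class scores, no boxes

    gesture_id = np.argmax(out)  # classification: just pick the best class
    label = "{}: {:.2f}".format(CLASSES[gesture_id], float(out[0, gesture_id]))
    cv2.putText(frame, label, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

    cv2.imshow('img', frame)
    if cv2.waitKey(25) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()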