We are trying to do real-time object detection using YOLO. We tested our code with two different approaches.
The first approach is to save the video first and then run detection on the recorded file.
The second approach is to detect objects in real time, using a GStreamer pipeline to pass frames to OpenCV.
In the first approach, as you can see in the profile, the forward() call of cv2.dnn_Net takes 0.035 s, which is sufficient for our application.
In the second approach, as you can see, the same call takes about 3x longer.
(I can't embed more than one item because I am a new user. The following link is the profile of our second approach.)
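For reference, the forward time can also be measured in isolation like this (a minimal sketch, not our exact profiler run; "video.avi" is a placeholder for our recorded file):

import time

import cv2

net = cv2.dnn.readNet("yolo_files/yolov4-tiny-custom_best.weights",
                      "yolo_files/yolov4-tiny-custom.cfg")
layer_names = net.getLayerNames()
# flatten() handles both the old (2-D) and new (1-D) return shapes
# of getUnconnectedOutLayers()
output_layers = [layer_names[i - 1]
                 for i in net.getUnconnectedOutLayers().flatten()]

cap = cv2.VideoCapture("video.avi")  # placeholder for our recorded file
while True:
    ret, frame = cap.read()
    if not ret:
        break
    h, w = frame.shape[:2]
    blob = cv2.dnn.blobFromImage(frame, 0.00392, (w, h),
                                 (0, 0, 0), True, crop=False)
    net.setInput(blob)
    t0 = time.perf_counter()
    net.forward(output_layers)
    # time spent in the network forward pass only
    print("forward: {:.3f} s".format(time.perf_counter() - t0))
cap.release()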
The GStreamer pipeline we use to capture frames:
cv2.VideoCapture(
    "udpsrc port=5600 ! application/x-rtp,media=video,payload=26,"
    "clock-rate=90000,encoding-name=H264,framerate=30/1 ! "
    "rtph264depay ! avdec_h264 ! videoconvert ! appsink",
    cv2.CAP_GSTREAMER)
The rest of the code is identical in both versions, and we tested both on the same machine and the same OS.
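To separate capture cost from detection cost, the same pipeline can also be timed on its own (a minimal sketch, not part of our application):

import time

import cv2

pipeline = ("udpsrc port=5600 ! application/x-rtp,media=video,payload=26,"
            "clock-rate=90000,encoding-name=H264,framerate=30/1 ! "
            "rtph264depay ! avdec_h264 ! videoconvert ! appsink")
cap = cv2.VideoCapture(pipeline, cv2.CAP_GSTREAMER)
while True:
    t0 = time.perf_counter()
    ret, frame = cap.read()
    if not ret:
        break
    # per-frame time spent waiting on the GStreamer appsink
    print("read: {:.3f} s".format(time.perf_counter() - t0))
cap.release()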
Thanks in advance for your help. Don't hesitate to ask for details; I can share more of the code if needed.
Edit: Here is the code that we use to detect objects:
import atexit
from datetime import datetime

import cv2
import numpy as np


class ProcessImage:
    def __init__(self):
        self.net = cv2.dnn.readNet("yolo_files/yolov4-tiny-custom_best.weights",
                                   "yolo_files/yolov4-tiny-custom.cfg")
        with open("yolo_files/obj.names", "r") as f:
            self.classes = [line.strip() for line in f.readlines()]
        self.layer_names = self.net.getLayerNames()
        # flatten() handles both the old (2-D) and new (1-D) return shapes
        # of getUnconnectedOutLayers()
        self.output_layers = [self.layer_names[i - 1]
                              for i in self.net.getUnconnectedOutLayers().flatten()]
        self.colors = np.random.uniform(0, 255, size=(len(self.classes), 3))
        self.prev_frame_time = 0
        self.new_frame_time = 0
        self.center_x, self.center_y = 0, 0
        self.x_axis, self.y_axis = 0, 0
        self.box_width, self.box_height = 0, 0
        self.process_image = True  # toggled elsewhere in the full application
        self.cap = cv2.VideoCapture(
            "udpsrc port=5600 ! application/x-rtp,media=video,payload=26,"
            "clock-rate=90000,encoding-name=H264,framerate=30/1 ! "
            "rtph264depay ! avdec_h264 ! videoconvert ! appsink",
            cv2.CAP_GSTREAMER)
        self.w = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        self.h = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        atexit.register(self.cleanup)

    def cleanup(self):
        if self.cap.isOpened():
            self.cap.release()
        # video_writer and logger are created elsewhere in the full application
        if hasattr(self, "video_writer"):
            self.video_writer.release()
        if hasattr(self, "logger"):
            self.logger.info(
                "{} | Video capture and video writer released".format(datetime.now()))

    def process(self):
        while True:
            ret, frame = self.cap.read()
            if not ret:
                exit(1)
            if self.process_image:
                yoloResults = self.yoloRecognition(frame)
                if yoloResults[0]:
                    # targetObjectPositionWithFOV and drawBoxAndInformation
                    # are defined elsewhere in our code
                    positionMessage = self.targetObjectPositionWithFOV()
                    self.drawBoxAndInformation(
                        frame, yoloResults[1], yoloResults[2],
                        yoloResults[3], yoloResults[4])
            cv2.imshow("Frame", frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

    def yoloRecognition(self, frame):
        # Detecting objects: the blob is built at the full frame size
        blob = cv2.dnn.blobFromImage(
            frame, 0.00392, (self.w, self.h), (0, 0, 0), True, crop=False)
        self.net.setInput(blob)
        outs = self.net.forward(self.output_layers)
        isObjectFound = False
        class_ids = []
        confidences = []
        boxes = []
        for out in outs:
            for detection in out:
                scores = detection[5:]
                class_id = np.argmax(scores)
                confidence = scores[class_id]
                if confidence > 0.5:
                    # Object detected
                    isObjectFound = True
                    self.center_x = int(detection[0] * self.w)
                    self.center_y = int(detection[1] * self.h)
                    self.box_width = int(detection[2] * self.w)
                    self.box_height = int(detection[3] * self.h)
                    # Rectangle coordinates
                    self.x_axis = int(self.center_x - self.box_width / 2)
                    self.y_axis = int(self.center_y - self.box_height / 2)
                    boxes.append([self.x_axis, self.y_axis,
                                  self.box_width, self.box_height])
                    confidences.append(float(confidence))
                    class_ids.append(class_id)
        # Non-maximum suppression to eliminate multiple boxes for the same object
        indexes = cv2.dnn.NMSBoxes(boxes, confidences, 0.5, 0.4)
        center_coordinates = str(self.center_x) + "," + str(self.center_y)
        return [isObjectFound, boxes, indexes, class_ids, center_coordinates]
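We run the class roughly like this (simplified entry point; in the full application the logger and video writer are set up first):

if __name__ == "__main__":
    ProcessImage().process()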