Help with optimizing OpenCV VideoCapture for optical flow analysis

Hello,

I am currently using Python OpenCV in Docker to analyse video files. There are about 24 files of 1 hour each, on which I do movement analysis with NVIDIA optical flow.

However, it takes about 40 minutes to analyse one day of video (24 hours, so 24 files of 1 hour each).

I would like to speed this up. I currently do the processing in parallel, and my CPU is at 100%, I think because of the reading of the video files.

The video files are 30 fps, but I am comparing frame 1 with frame 31 by skipping 30 frames.

I think the bottleneck is the reading of the files by the CPU; is this correct?

How could I speed this up? I tried using hardware acceleration with OpenCV's FFmpeg backend for reading the video files, but it was not faster. Would using CUDA VideoCapture be faster? That is not available in Python, is it?
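
For reference, this is roughly what I tried for the hardware-accelerated reading (a sketch, assuming OpenCV >= 4.5.2 built against FFmpeg; the commented-out cudacodec reader needs an opencv_contrib build with NVCUVID, like in my Dockerfile below, but I have not verified it from Python):

import cv2

# Ask the FFmpeg backend for any available hardware decoder
params = [cv2.CAP_PROP_HW_ACCELERATION, cv2.VIDEO_ACCELERATION_ANY]
cap = cv2.VideoCapture("video.mp4", cv2.CAP_FFMPEG, params)

# The CUDA video reader does exist in the Python bindings when OpenCV is
# built with NVCUVID and opencv_contrib; it returns frames as GpuMat:
#reader = cv2.cudacodec.createVideoReader("video.mp4")
#ok, gpu_frame = reader.nextFrame()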

As you can see in my code, I tried experimenting with cv2.cuda functions, but it was not faster, maybe because of the downloading and uploading between CPU and GPU while still using cv2.VideoCapture?
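
For completeness, this is the shape of that cv2.cuda experiment, assembled from the commented-out lines in my code below (a sketch; prvs_gpu would be the previous grayscale GpuMat, and the idea is to upload once and keep everything on the GPU, since the optical flow calc also accepts GpuMat inputs; I have not confirmed this is faster):

frame_gpu = cv2.cuda_GpuMat()
frame_gpu.upload(frame)                                      # one CPU->GPU copy per frame
gray_gpu = cv2.cuda.cvtColor(frame_gpu, cv2.COLOR_BGR2GRAY)  # grayscale on the GPU
flow = optical_flow.calc(prvs_gpu, gray_gpu, None)           # no GPU->CPU round-trip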

The actual optical flow analysis using NVIDIA optical flow is really fast! So I think my code could run much faster if the reading of frames were sped up. I do not even resize the frames, because the optical flow analysis itself is so fast that it does not matter.
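
(To see where the time goes, I timed the two stages separately, roughly like this; just a sketch, where cap, prvs and optical_flow are the same objects as in my code below, and gray is the current grayscale frame:)

import time

t0 = time.perf_counter()
ret, frame = cap.read()                     # decoding one frame
t1 = time.perf_counter()
flow = optical_flow.calc(prvs, gray, None)  # the NVOF calculation itself
t2 = time.perf_counter()
print(f"read: {(t1 - t0) * 1000:.1f} ms, flow: {(t2 - t1) * 1000:.1f} ms")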

Or is the best option simply to get a faster CPU?

I would love to get some input; I am not very experienced and would love to learn! I have already tried to optimize as much as possible, but for now I am stuck, and on the verge of buying a faster CPU. If anybody could help me and tell me whether it would be better to use my NVIDIA GPU (an RTX 3080), that would be greatly appreciated!

My CPU now is a Ryzen 5 3600XT, but I want to switch to an Intel i7-14700K.
Kind regards,

Jari

relevant code:

import cv2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import os
import concurrent.futures
from datetime import datetime, timedelta
import pandas as pd
from tqdm import tqdm
import threading  # Import threading for the lock
from collections import defaultdict
import pathlib
import sys


frame_counter_value = 30  # Set the frame counter value
movement_threshold = 800.0  # Adjust this value as needed
resample_frequency = '5T'

# Define the device_id
device_id = 0

# Get the video directory path from the command line arguments
video_dir_path = sys.argv[1] if len(sys.argv) > 1 else '.'

#logging code
log_file_path = 'processed_files.log'
# Check if the log file exists, if not, create an empty one
if not pathlib.Path(log_file_path).is_file():
    with open(log_file_path, 'w') as f:
        pass
# Read the log file into a set (for faster lookups)
with open(log_file_path, 'r') as f:
    processed_files = set(line.strip() for line in f)
#end of logging code

# Create a global progress bar and lock
pbar = None
pbar_lock = threading.Lock()

def calculate_movement(video_file):
    global pbar  # Access the global progress bar
    #os.environ['OPENCV_FFMPEG_CAPTURE_OPTIONS'] = 'video_codec;hevc_cuvid'
    cap = cv2.VideoCapture(video_file)
    fps = cap.get(cv2.CAP_PROP_FPS)  # Adjust fps for frame skipping

    frame_counter = 0
    movement_percentages = []  # Initialize an empty list to store the movement percentages
    
    # Read the first frame before the loop
    ret, frame = cap.read()
    if not ret:
        return [], fps
    
    # Upload the first frame to GPU memory
    #frame_gpu = cv2.cuda_GpuMat()
    #frame_gpu.upload(frame)

    #Resize the frame on the GPU
    #frame_gpu = cv2.cuda.resize(frame_gpu, (framesize, int(frame.shape[0] * framesize / frame.shape[1])))

    # Convert the frame to grayscale on the GPU
    #frame_gpu = cv2.cuda.cvtColor(frame_gpu, cv2.COLOR_BGR2GRAY)

    # Convert the frame to grayscale (on the CPU; the frame_gpu name is left over from the cv2.cuda experiments)
    frame_gpu = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    prvs = frame_gpu

    # Get the original size of the frames
    frame_size = (prvs.shape[1], prvs.shape[0])

    # Initialize the optical flow object using NVIDIA Optical Flow 2.0
    # (the CUDA Farneback variant is left commented out for reference)
    #flow = cv2.cuda_FarnebackOpticalFlow.create(winSize=WIN_SIZE, numIters=NUM_ITERS)
    optical_flow = cv2.cuda_NvidiaOpticalFlow_2_0.create(
        frame_size, 
        cv2.cuda.NvidiaOpticalFlow_2_0_NV_OF_PERF_LEVEL_SLOW, 
        cv2.cuda.NvidiaOpticalFlow_2_0_NV_OF_OUTPUT_VECTOR_GRID_SIZE_4, 
        cv2.cuda.NvidiaOpticalFlow_2_0_NV_OF_HINT_VECTOR_GRID_SIZE_4, 
        False, 
        False, 
        False, 
        device_id
    )

    # Main loop: seek ahead, decode one frame, and compute the flow against prvs
    while True:
        frame_counter += 1

        # Seek ahead to frame frame_counter * frame_counter_value
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_counter * frame_counter_value)

        ret, frame = cap.read()
        if not ret:
            break

        # Resize the frame and convert it to grayscale (on the CPU)
        #frame = cv2.resize(frame, (framesize, int(frame.shape[0] * framesize / frame.shape[1])))
        #frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

        # Upload the frame to GPU memory
        #frame_gpu = cv2.cuda_GpuMat()
        #frame_gpu.upload(frame)

        # Convert the frame to grayscale
        frame_gpu = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

        next_frame = frame_gpu

        flow = optical_flow.calc(prvs, next_frame, None)

        # calc() returns a (flow, cost) pair; keep the flow vectors as float32
        flow = flow[0].astype(np.float32)

        # Calculate the magnitude and angle of the 2D vectors
        mag, ang = cv2.cartToPolar(flow[..., 0], flow[..., 1])

        # Print the mag array
        #if frame_counter % 100 == 0: print(mag)

        # Calculate the percentage of pixels where the magnitude is above the threshold
        movement_percentage = np.sum(mag > movement_threshold) / mag.size * 100
        movement_percentages.append(movement_percentage)

        # Update prvs every frame_counter_value iterations; note that each
        # iteration already seeks frame_counter_value frames ahead, so the
        # reference frame is only refreshed every frame_counter_value**2 source frames
        if frame_counter % frame_counter_value == 0:
            prvs = next_frame

        # Update the progress bar with a lock to ensure thread safety
        with pbar_lock:
            pbar.update(1)

    cap.release()

    return movement_percentages, fps





Later in the code I have this for the parallel processing:

            with concurrent.futures.ThreadPoolExecutor() as executor:
                futures = {executor.submit(calculate_movement, os.path.join(video_dir_path, video_file)) for video_file in video_files}
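
and the results are then collected along these lines (a sketch, not my exact code):

                for future in concurrent.futures.as_completed(futures):
                    movement_percentages, fps = future.result()
                    # ...aggregate the per-file results here...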

This is my Dockerfile for building OpenCV:

# Use nvidia/cuda as base image
FROM nvidia/cuda:12.3.1-devel-ubuntu20.04

# Install unzip
RUN apt-get update && apt-get install -y unzip

# Copy the unzipped NVIDIA Video Codec SDK and NVIDIA Optical Flow SDK into the Docker image
COPY Video_Codec_SDK_12.1.14 /tmp/Video_Codec_SDK
COPY Optical_Flow_SDK5.0.7 /home/Downloads/OpticalFlowSDK


ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64
# Move the necessary libraries to the appropriate directories
RUN mv /tmp/Video_Codec_SDK/Lib/linux/stubs/x86_64/libnvcuvid.so /usr/lib/x86_64-linux-gnu/ && \
    cp /tmp/Video_Codec_SDK/Interface/* /usr/local/include/

# Move the necessary headers to the appropriate directories
RUN cp /home/Downloads/OpticalFlowSDK/NvOFInterface/* /usr/local/include/

# Update package lists and install dependencies
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
    python3-pip \
    python3-dev \
    build-essential \
    cmake \
    git \
    libgtk2.0-dev \
    pkg-config \
    #libavcodec-dev \
    #libavformat-dev \
    libswscale-dev \
    libtbb2 \
    libtbb-dev \
    libjpeg-dev \
    libpng-dev \
    libtiff-dev \
    libdc1394-22-dev \
    libssl-dev \
    libffi-dev \
    libxml2-dev \
    libxslt1-dev \
    zlib1g-dev \
    python-pil \
    python-lxml \
    python-tk \
    #ffmpeg \
    libgstreamer1.0-0 \
    gstreamer1.0-plugins-base \
    gstreamer1.0-plugins-good \
    gstreamer1.0-plugins-bad \
    gstreamer1.0-plugins-ugly \
    gstreamer1.0-libav \
    gstreamer1.0-doc \
    gstreamer1.0-tools \
    gstreamer1.0-x \
    gstreamer1.0-alsa \
    gstreamer1.0-gl \
    gstreamer1.0-gtk3 \
    gstreamer1.0-qt5 \
    gstreamer1.0-pulseaudio \
    libgstreamer1.0-dev \
    libgstreamer-plugins-base1.0-dev \
    yasm \
    libx264-dev \
    libx265-dev \
    libvpx-dev \
    libfdk-aac-dev \
    libmp3lame-dev \
    libopus-dev \
    libunistring-dev \
    libgnutls28-dev \
    libass-dev \
    libwebp-dev \
    librtmp-dev \
    libtheora-dev \
    libvorbis-dev \
    libopencore-amrnb-dev \
    libopencore-amrwb-dev \
    libsdl2-dev \
    libva-dev \
    libvdpau-dev \
    libdrm-dev \
    libopenjp2-7-dev \
    libxvidcore-dev \
    liblzma-dev \
    libgsm1-dev \
    libspeex-dev \
    libsoxr-dev \
    libgme-dev \
    libshine-dev \
    libmodplug-dev \
    libbluray-dev \
    libtwolame-dev \
    libwavpack-dev \
    libopenmpt-dev \
    libchromaprint-dev \
    libvidstab-dev \
    libaom-dev \
    #libsvtav1-dev \
    #libdav1d-dev \
    libzmq3-dev \
    libzvbi-dev \
    #libvmaf-dev \
    #libxavs2-dev \
    libnuma-dev \
    libssl-dev \
    libomxil-bellagio-dev \
    #libnvenc-dev \
    #libnpp-dev \
    #libcuda1 \
    libavresample-dev \
    libmfx-dev && \
    rm -rf /var/lib/apt/lists/*

# Set the pkg-config path to include CUDA libraries
ENV PKG_CONFIG_PATH=$PKG_CONFIG_PATH:/usr/local/cuda/lib64/pkgconfig

# Clone ffnvcodec from the FFmpeg GitHub
RUN git clone https://github.com/FFmpeg/nv-codec-headers.git
WORKDIR nv-codec-headers
RUN make install
WORKDIR /
# Clone FFmpeg source
RUN git clone https://github.com/ffmpeg/ffmpeg.git /ffmpeg

WORKDIR /ffmpeg

ENV CFLAGS="-fPIC" CXXFLAGS="-fPIC"
    # not sure about this one:
    #--enable-libnpp \
# Configure FFmpeg with necessary flags
RUN ./configure \
    --enable-nonfree \
    --enable-shared \
    --enable-gpl \
    --enable-version3 \
    --enable-pic \  
    --enable-cuda-nvcc \
    --enable-cuvid \
    --enable-nvenc \
    --extra-cflags="-I/usr/local/include" \
    --extra-ldflags="-L/usr/local/lib" && \
    make -j$(nproc) && \
    make install && \
    cd .. && \
    rm -rf /ffmpeg

#add variable
ENV LD_LIBRARY_PATH=/ffmpeg/libavcodec:$LD_LIBRARY_PATH
# Add Python binary to PATH
ENV PATH="/usr/bin:${PATH}"


# Install the shared version of libavcodec
WORKDIR /
RUN git clone https://github.com/opencv/opencv.git
RUN git clone https://github.com/opencv/opencv_contrib.git


WORKDIR /opencv
RUN mkdir build

WORKDIR /opencv/build
RUN cmake -D CMAKE_BUILD_TYPE=RELEASE \
    -D CMAKE_INSTALL_PREFIX=/usr/local \
    -D INSTALL_C_EXAMPLES=ON \
    -D INSTALL_PYTHON_EXAMPLES=ON \
    -D OPENCV_GENERATE_PKGCONFIG=ON \
    -D OPENCV_EXTRA_MODULES_PATH=/opencv_contrib/modules \
    -D BUILD_EXAMPLES=ON \
    -D WITH_CUDA=ON \
    -D WITH_CUBLAS=ON \
    -D WITH_FFMPEG=ON \
    -D WITH_GSTREAMER=ON \
    -D WITH_NVCUVID=ON \
    -D CUDA_ARCH_BIN=8.6 \
    -D WITH_NVIDIA_OPTICAL_FLOW=ON \
    -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
    ..
#position-independent code flag is new
#verbose make -j$(nproc)
#RUN make VERBOSE=1 -j$(nproc)
RUN make -j$(nproc)
RUN make install

# Create a symbolic link to the cv2 module
RUN ln -s /opencv/build/lib/python3/cv2.cpython-38-x86_64-linux-gnu.so /usr/local/lib/python3.8/dist-packages/cv2.so

# Install additional Python packages
RUN pip3 install numpy matplotlib pandas tqdm cupy

so you're decoding at 36x realtime. assuming 30 fps, that's 1080 fps decode performance.

what resolution?

I think your GPU's hardware decoder might be limiting this. which specific model is it?

you might not be doing that.

what container format? what video stream format? do you know what keyframes are?

I compare each frame to the frame 30 later (frame 1 to frame 31, and so on). So that would be (24 * 60 * 60 * 30) / 30 = 86,400 frames in 40 minutes. That would be 86,400 / (40 * 60) = 36 frames per second. My GPU is a GeForce RTX 3080.

If I do not skip frames it takes a lot longer overall. I also thought about keyframes, but since reading every frame takes much longer, that seems to indicate the bottleneck is in the reading/converting of frames.

When I test reading of the video files with this code:

import cv2
import os
import sys
import time
import concurrent.futures
import threading

# Parameters for frame skipping
frame_counter_value = 30

# Directory containing the video files
video_dir_path = sys.argv[1] if len(sys.argv) > 1 else '.'

# Shared variables for accumulating total frames processed
total_frames_processed = 0
start_time = time.time()
lock = threading.Lock()

# Function to continuously update and display the total FPS
def display_fps_continuously():
    global total_frames_processed, start_time
    while not fps_update_stop_event.is_set():
        with lock:
            elapsed_time = time.time() - start_time
            if elapsed_time > 0:
                current_total_fps = total_frames_processed / elapsed_time
                print(f"\rCurrent Total FPS: {current_total_fps:.2f}", end='')
        time.sleep(1)  # Update every second

# Function to process a single video file and count frames
def process_video(video_file_path):
    global total_frames_processed
    cap = cv2.VideoCapture(video_file_path)
    frame_count = 0

    while True:
        # Skip frames to simulate processing
        success = cap.set(cv2.CAP_PROP_POS_FRAMES, frame_count * frame_counter_value)
        if not success:
            break

        ret, _ = cap.read()
        if not ret:
            break  # Stop if no more frames are available

        with lock:
            total_frames_processed += 1
        frame_count += 1

    cap.release()

# Initialize the stop event for the FPS display thread
fps_update_stop_event = threading.Event()

if __name__ == "__main__":
    # Start the FPS display thread
    fps_display_thread = threading.Thread(target=display_fps_continuously)
    fps_display_thread.start()

    # Process videos in parallel
    video_files = [os.path.join(video_dir_path, f) for f in os.listdir(video_dir_path) if f.endswith('.mp4')]
    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.map(process_video, video_files)

    # Stop the FPS display thread
    fps_update_stop_event.set()
    fps_display_thread.join()

    print("\nFinished processing all files.")

With frame skipping I get about 30 fps.

Without frame skipping I get about 110 fps.

So not seeking speeds up reading by about 3.5x per frame, but because there are 30x more frames to decode, it is still slower overall: 2,592,000 frames at 110 fps is about 6.5 hours, versus 86,400 frames at 30 fps, which is about 48 minutes. That is why I skip the frames.

Which seems to indicate the reading of frames is the bottleneck, right?

For my whole program I also removed the frame skipping and it was about 6x as fast: with frame skipping about 30 fps, so 40 minutes for a day; without frame skipping about 185 fps. However, because there are 30x more frames to process, it would still be a lot slower overall.

So when I remove:

    # Skip to the next frame every framecounter frames
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_counter * frame_counter_value)

it speeds up per frame, but overall it still takes longer. What are other options?

check out pyav.

it's not able (or not easy to make it anyway) to use the GPU for decoding but the CPU is plenty fast (it really is!) AND it lets you jump around in the video a lot quicker than OpenCV does.

if you tell OpenCV to go to a particular timestamp (can't jump to indices, that's an illusion), it will go to the preceding keyframe and then walk (decode) to the target. for any seek, it does that. it can't do anything else.

with pyav, you can tell it to jump somewhere and not walk the rest of the way. it'll put you down on that keyframe.

your video likely has regular keyframes, probably not at 30 frame intervals. if you aren't particular about that stride, pick the stride that your video file has. or, well, don't actually "pick" a stride, just jump to places in whatever stride you want, and accept where it sets you down.

what I'm saying is that even with your frame skips, you notice that you don't get a 30x speedup. that's because you're still decoding frames you don't care about... with OpenCV anyway.

all this assumes that your video has keyframes and P/B-frames. if it's fully intra-coded, then every seek directly lands on a keyframe and there's nothing to improve.
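
a minimal sketch of the idea (assuming an mp4 with a single video stream; av is the PyAV package):

import av

container = av.open("video.mp4")
stream = container.streams.video[0]

# seek() takes an offset in stream.time_base units. with the default
# any_frame=False it puts you down on the nearest preceding keyframe
# and does NOT decode its way to the exact target.
target_seconds = 10.0
container.seek(int(target_seconds / stream.time_base), stream=stream)

# decode just the keyframe we landed on
for frame in container.decode(stream):
    img = frame.to_ndarray(format="bgr24")
    break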

maybe a different way of 'skipping' frames:

instead of seeking to a position,
you could try to iterate normally, but NOT decode unwanted frames:

That is, you call VideoCapture::grab() for each camera and after that call the slower method VideoCapture::retrieve() to decode and get frame from each camera. This way the overhead on demosaicing or motion jpeg decompression etc. is eliminated and the retrieved frames from different cameras will be closer in time.

skip = 5  # only process every Nth frame
counter = 0
while True:
    ok = cap.grab()  # advance to the next frame without decoding/converting it
    if not ok: break
    counter += 1
    if counter % skip == 0:  # only decompress the frames we need
        ret, frame = cap.retrieve()
        # process frame

Hi Berak,

Thanks for your input. For some reason I could not get it working, so I went ahead and bought the new CPU, an i7-14700K with DDR5 memory.

After installing the new CPU and motherboard I saw an increase of 2x: from about 30 frames per second to about 70 fps.

I tried again what you suggested, and for some reason I was now suddenly able to do it. I saw an increase from 70 to about 140 fps!

I am still glad I went ahead with buying the new CPU, as the improvement from it is still 2x. My CPU is still at 100%, but now the NVIDIA 3080 is also at about 95 percent, which means I have hit the end goal of my optimizations. Faster is not possible except by getting a new GPU, because the actual optical flow analysis, which runs on the GPU, now maxes out the GPU usage.

Thanks for your input!

I still have to verify that the actual frame comparison is done right, because in the graphs I saw some discrepancies between the two methods, but it should be working as intended, since I saw the increase in GPU utilization.

Now I am able to calculate optical flow for 24 hours of 1080p video, comparing every 30th frame, in 10 minutes, which I think is very fast.

If someone would be so kind as to check something for me in my code:

When skip is set to 30, does it then compare every 30th frame with the previous one? I think so, right?

So what I hope I have coded is the following:
Frame 1 is compared with frame 31.
Frame 31 is compared with frame 61.
Frame 61 is compared with frame 91, and so on.
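
As a sanity check, I could log which frame indices actually get compared, something like this (a sketch; it assumes prvs is refreshed after every processed frame, which is what I intend below):

cap = cv2.VideoCapture(video_file)
skip = 30
ret, frame = cap.read()  # frame 1 (index 0) becomes the first prvs
counter = 0
prev_index = 0
pairs = []
while True:
    counter += 1
    if not cap.grab():
        break
    if counter % skip == 0:
        # CAP_PROP_POS_FRAMES is the index of the NEXT frame, so subtract 1
        index = int(cap.get(cv2.CAP_PROP_POS_FRAMES)) - 1
        pairs.append((prev_index, index))
        prev_index = index
print(pairs[:3])  # hoping for [(0, 30), (30, 60), (60, 90)], i.e. frames 1 & 31, 31 & 61, 61 & 91

And here is the relevant part of my code: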

def calculate_movement(video_file):
    global pbar  # Access the global progress bar
    cap = cv2.VideoCapture(video_file)
    fps = cap.get(cv2.CAP_PROP_FPS)

    movement_percentages = []
    counter = 0

    # Get the first frame
    ret, frame = cap.read()
    if not ret:
        return [], fps

    frame_gpu = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    prvs = frame_gpu

    # Get the original size of the frames
    frame_size = (prvs.shape[1], prvs.shape[0])

    # Initialize optical flow object
    optical_flow = cv2.cuda_NvidiaOpticalFlow_2_0.create(
        frame_size, 
        cv2.cuda.NvidiaOpticalFlow_2_0_NV_OF_PERF_LEVEL_SLOW, 
        cv2.cuda.NvidiaOpticalFlow_2_0_NV_OF_OUTPUT_VECTOR_GRID_SIZE_4, 
        cv2.cuda.NvidiaOpticalFlow_2_0_NV_OF_HINT_VECTOR_GRID_SIZE_4, 
        False, 
        False, 
        False, 
        device_id
    )

    while True:
        counter += 1
        ok = cap.grab()
        if not ok:
            break

        # Only decode and process every skip-th frame
        if counter % skip == 0:
            ret, frame = cap.retrieve()
            if not ret:
                break