Detecting and tracking visual features of static objects in a scene

I have a task to detect and track static visual features (features that belong to static objects) in a video.

The camera that records the video isn't stable; it's footage from a drone. From my understanding, I need to do the following:

  1. First, detect all the visual features.
  2. Then skip a few frames and detect features again.
  3. Use a matcher to find corresponding features between the two frames.
  4. Pass the matched points from the previous frame and the next frame to findHomography; the resulting homography describes where the features should end up according to the camera motion.
  5. Also use optical flow to add precision, checking points not by descriptors but by pixels.
  6. Execute perspectiveTransform on the refined points to get warped points (the expected locations of static feature points).
  7. Take the difference between the refined points from the next frame and the expected points. Points with a large difference should, in theory, belong to moving objects (a condensed sketch of steps 4, 6 and 7 follows this list).
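
To make steps 4, 6 and 7 concrete, here is a minimal, self-contained sketch of the motion-compensation idea on made-up point correspondences (prev_pts/next_pts are toy arrays; in the real pipeline they come from the ORB matches):

import cv2
import numpy as np

# Toy correspondences: eight points seen in the previous and the next frame
prev_pts = np.float32([[10, 10], [200, 15], [210, 180], [15, 190],
                       [100, 100], [60, 40], [160, 130], [90, 170]])
next_pts = prev_pts + np.float32([5, 3])   # the whole scene shifts with the camera...
next_pts[4] += np.float32([25, -15])       # ...except one point, which moves on its own

# Step 4: homography describing the dominant (camera) motion
H, inliers = cv2.findHomography(prev_pts, next_pts, cv2.RANSAC, 5.0)

# Step 6: where each previous point is expected to land if it is static
expected = cv2.perspectiveTransform(prev_pts.reshape(-1, 1, 2), H).reshape(-1, 2)

# Step 7: residual motion; a large residual suggests an independently moving point
residuals = np.linalg.norm(next_pts - expected, axis=1)
print(residuals.round(1))   # near zero everywhere except the moved point

In the full script below, the same three calls (findHomography, perspectiveTransform, and the residual norm) operate on the matched ORB keypoints instead of toy arrays.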

Here is the Python implementation for the list above.

import cv2
import numpy as np

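# State shared between frames (updated by process_frame)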
frame_idx = 0
orb = cv2.ORB.create(nfeatures=1000)
prev_keypoints = None
prev_descriptors = None
prev_frame = None
detect_interval = 5  # detect new features every 5 frames
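# ORB produces binary descriptors, so they are matched with Hamming distance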
matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
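# Lucas-Kanade optical flow parameters: search window size, pyramid depth, termination criteria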
lk_params = dict(
    winSize=(15, 15),
    maxLevel=2,
    criteria=(cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 10, 0.03),
)

def process_frame(frame):
    """Process a single video frame."""
    global frame_idx
    global prev_keypoints
    global prev_descriptors
    global prev_frame

    next_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    if prev_frame is None:
        keypoints, descriptors = orb.detectAndCompute(next_frame, None)
        prev_keypoints = keypoints
        prev_descriptors = descriptors
        prev_frame = next_frame
        frame_idx += 1
        return

    if frame_idx % detect_interval == 0:
        next_keypoints, next_descriptors = orb.detectAndCompute(next_frame, None)
        matches = matcher.match(prev_descriptors, next_descriptors)
        # Sort them in the order of their distance.
        matches = sorted(matches, key=lambda x: x.distance)
        # Draw the first 300 matches.
        img3 = cv2.drawMatches(
            prev_frame.copy(),
            prev_keypoints,
            frame.copy(),
            next_keypoints,
            matches[:300],
            None,
            flags=cv2.DrawMatchesFlags_NOT_DRAW_SINGLE_POINTS,
        )
        cv2.imshow("Matches", img3)

        # Extract matched points
        prev_matched_points = np.array(
            [prev_keypoints[m.queryIdx].pt for m in matches], dtype=np.float32
        )
        next_matched_points = np.array(
            [next_keypoints[m.trainIdx].pt for m in matches], dtype=np.float32
        )

        # Step 4: Estimate the global camera motion with RANSAC
        homography, inliers = cv2.findHomography(
            prev_matched_points, next_matched_points, cv2.RANSAC, 5.0
        )
        if homography is None:
            # Not enough consistent matches to estimate the camera motion; skip this frame
            prev_frame = next_frame
            prev_descriptors = next_descriptors
            prev_keypoints = next_keypoints
            frame_idx += 1
            return
        # The inlier mask marks the matches that agree with the dominant (camera) motion
        inliers_mask = inliers.flatten().astype(bool)

        prev_inlier_points = prev_matched_points[inliers_mask]

        # Step 5: Refine the inlier points with optical flow
        refined_next_points, status, error = cv2.calcOpticalFlowPyrLK(
            prev_frame, next_frame, prev_inlier_points, None, **lk_params
        )
        valid_status = status.flatten() == 1
        refined_prev_points = prev_inlier_points[valid_status]
        refined_next_points = refined_next_points[valid_status]

        # Step 6: Compensate for camera motion
        # Warp the previous points using the estimated homography
        warped_points = cv2.perspectiveTransform(
            refined_prev_points.reshape(-1, 1, 2), homography
        ).reshape(-1, 2)

        # Compute residual motion (difference between actual and expected motion)
        residuals = np.linalg.norm(refined_next_points - warped_points, axis=1)

        # Step 7: Separate moving and static points

        # Median threshold
        threshold = np.percentile(residuals, 50)
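        # NOTE: a 50th-percentile threshold always labels roughly half of the
        # points as "moving", even when the whole scene is static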

        moving_points = refined_next_points[residuals > threshold]
        static_points = refined_next_points[residuals <= threshold]

        output_frame = frame.copy()

        # Draw moving points (in red)
        for pt in moving_points:
            x, y = pt.ravel()
            cv2.circle(output_frame, (int(x), int(y)), 5, (0, 0, 255), -1)

        # Draw static points (in green)
        for pt in static_points:
            x, y = pt.ravel()
            cv2.circle(output_frame, (int(x), int(y)), 5, (0, 255, 0), -1)

        cv2.imshow("Points", output_frame)

        prev_frame = next_frame
        prev_descriptors = next_descriptors
        prev_keypoints = next_keypoints

    frame_idx += 1


def main():
    cap = cv2.VideoCapture("1.mp4")  # Load your video

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        process_frame(frame)
        cv2.imshow("Feature Tracking", frame)

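        # Step to the next frame with 'k'; quit with 'q'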
        key = cv2.waitKey(0)
        while key not in [ord("q"), ord("k")]:
            key = cv2.waitKey(0)

        if key == ord("q"):
            break

    cap.release()
    cv2.destroyAllWindows()


if __name__ == "__main__":
    main()

Frankly speaking, a lot of this code is taken from tutorials, so I don't completely understand how each algorithm works; I just know its end purpose.

The results are not very good:


Red dots are moving features, and green dots are static ones.
Could you please point me to ways this approach could be improved?