I have a task to detect and track static visual features (features that belong to static objects) in a video. The camera that records the video is not stable, since the footage comes from a drone. From my understanding, I basically need to do the following:
- First, detect all the visual features in the current frame.
- Then skip a few frames and detect features again.
- Use a matcher to find correspondences between the two sets of features.
- Pass the matched points from the previous frame and the next frame to findHomography; the resulting homography describes where the points should end up according to the camera motion alone.
- Use optical flow to refine the matched points, so they are checked at the pixel level rather than only by descriptors.
- Run perspectiveTransform on the refined previous points to get the warped points (the expected locations of the feature points).
- Compute the difference between the refined points from the next frame and the expected points; points with a large difference should, in theory, belong to moving objects (see the small sketch after this list).
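In essence, the last three steps boil down to the check sketched below. The point coordinates and the pixel threshold here are made up purely for illustration; this is not part of my actual pipeline.

import cv2
import numpy as np

# Hypothetical matched point coordinates (previous frame -> next frame).
# The first seven points follow the same shift (camera motion), the last one does not.
prev_pts = np.float32([[100, 100], [200, 100], [100, 200], [200, 200],
                       [300, 150], [150, 300], [250, 250], [400, 400]])
next_pts = prev_pts + np.float32([5, -3])
next_pts[-1] = [430, 420]  # this point moved on its own

# Estimate the global camera motion between the two frames
H, _ = cv2.findHomography(prev_pts, next_pts, cv2.RANSAC, 5.0)

# Where each previous point should end up if it only moved with the camera
expected = cv2.perspectiveTransform(prev_pts.reshape(-1, 1, 2), H).reshape(-1, 2)

# Points that deviate from their expected location are likely on moving objects
residuals = np.linalg.norm(next_pts - expected, axis=1)
moving_mask = residuals > 3.0  # threshold in pixels, picked arbitrarily for this sketch
print(moving_mask)  # only the last point should come out as "moving"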
Here is my Python implementation of the steps above.
import cv2
import numpy as np
frame_idx = 0
orb = cv2.ORB_create(nfeatures=1000)
prev_keypoints = None
prev_descriptors = None
prev_frame = None
detect_interval = 5  # detect new features every 5 frames
matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
lk_params = dict(
    winSize=(15, 15),
    maxLevel=2,
    criteria=(cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 10, 0.03),
)

def process_frame(frame):
    """Process a single video frame."""
    global frame_idx
    global prev_keypoints
    global prev_descriptors
    global prev_frame
    next_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    if prev_frame is None:
        # First frame: nothing to match against yet, just store the detections
        keypoints, descriptors = orb.detectAndCompute(next_frame, None)
        prev_keypoints = keypoints
        prev_descriptors = descriptors
        prev_frame = next_frame
        frame_idx += 1
        return
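    # Every detect_interval frames: detect and match new features, estimate the
    # camera motion, and split the tracked points into static and moving ones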
    if frame_idx % detect_interval == 0:
        next_keypoints, next_descriptors = orb.detectAndCompute(next_frame, None)
        matches = matcher.match(prev_descriptors, next_descriptors)
        # Sort them in the order of their distance.
        matches = sorted(matches, key=lambda x: x.distance)
        # Draw the first 300 matches.
        img3 = cv2.drawMatches(
            prev_frame.copy(),
            prev_keypoints,
            frame.copy(),
            next_keypoints,
            matches[:300],
            None,
            flags=cv2.DrawMatchesFlags_NOT_DRAW_SINGLE_POINTS,
        )
        cv2.imshow("Matches", img3)
        # Extract matched points
        prev_matched_points = np.array(
            [prev_keypoints[m.queryIdx].pt for m in matches], dtype=np.float32
        )
        next_matched_points = np.array(
            [next_keypoints[m.trainIdx].pt for m in matches], dtype=np.float32
        )
        # Step 3: Estimate global motion using RANSAC
        homography, inliers = cv2.findHomography(
            prev_matched_points, next_matched_points, cv2.RANSAC, 5.0
        )
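        # "inliers" marks which matches are consistent with the estimated camera motion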
        if homography is None:
            # Homography estimation failed (e.g. too few good matches); skip this frame
            frame_idx += 1
            return
        inliers_mask = inliers.flatten().astype(bool)
        prev_inlier_points = prev_matched_points[inliers_mask]
        # Step 4: Refine inlier points with optical flow
        refined_next_points, status, error = cv2.calcOpticalFlowPyrLK(
            prev_frame, next_frame, prev_inlier_points, None, **lk_params
        )
        valid_status = status.flatten() == 1
        refined_prev_points = prev_inlier_points[valid_status]
        refined_next_points = refined_next_points[valid_status]
        # Step 5: Compensate for camera motion
        # Warp previous points using the estimated homography
        warped_points = cv2.perspectiveTransform(
            refined_prev_points.reshape(-1, 1, 2), homography
        ).reshape(-1, 2)
        # Compute residual motion (difference between actual and expected motion)
        residuals = np.linalg.norm(refined_next_points - warped_points, axis=1)
        # Step 6: Separate moving and static points, using the median as the threshold
        threshold = np.percentile(residuals, 50)
        moving_points = refined_next_points[residuals > threshold]
        static_points = refined_next_points[residuals <= threshold]
        output_frame = frame.copy()
        # Draw moving points (in red)
        for pt in moving_points:
            x, y = pt.ravel()
            cv2.circle(output_frame, (int(x), int(y)), 5, (0, 0, 255), -1)
        # Draw static points (in green)
        for pt in static_points:
            x, y = pt.ravel()
            cv2.circle(output_frame, (int(x), int(y)), 5, (0, 255, 0), -1)
        cv2.imshow("Points", output_frame)
        prev_frame = next_frame
        prev_descriptors = next_descriptors
        prev_keypoints = next_keypoints
    frame_idx += 1

def main():
    cap = cv2.VideoCapture("1.mp4")  # Load your video
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        process_frame(frame)
        cv2.imshow("Feature Tracking", frame)
        # Step through the video: 'k' advances to the next frame, 'q' quits
        key = cv2.waitKey(0)
        while key not in [ord("q"), ord("k")]:
            key = cv2.waitKey(0)
        if key == ord("q"):
            break
    cap.release()
    cv2.destroyAllWindows()


if __name__ == "__main__":
    main()
Frankly speaking, a lot of this code is taken from tutorials, so I don't completely understand how each algorithm works; I only know what each step is ultimately for.
The results are not very good: red dots are the features classified as moving, and green dots are the ones classified as static.
Could you please point me to how I could improve this approach?