Issues with aligning handheld images taken at different focal distances and varying exposures

I’ve been working on a project that involves aligning a series of handheld images taken at different focal lengths and exposures. My goal is to achieve pixel-perfect alignment to facilitate focus stacking, but I’ve encountered challenges with noticeable zoom differences and imperfect cropping among the images. Despite attempts to correct for these issues programmatically, some zoom discrepancies and misalignments persist.

Here is an overview of the steps I’ve taken:

  1. Feature Detection and Matching: Utilized SIFT for feature detection and FLANN for matching keypoints between each image and a reference image.
  2. Estimation of Scale and Rotation: Implemented a function to estimate average scale and rotation differences between matched keypoints, intending to correct zoom differences and slight rotations before alignment.
  3. Homography and Alignment: Applied a homography transform based on the good matches to align each image to the reference.
  4. Cropping to Common Area: Attempted to automatically crop all images to the largest common area post-alignment to address borders introduced by the alignment process.

Despite these steps, the final images exhibit misalignment, particularly in the edges, and the automatic cropping doesn’t perfectly isolate the common area across all images, likely due to residual scale and alignment discrepancies.

Key Challenges:

  • Zoom Differences: Even after scale and rotation correction, some images still don’t align perfectly, suggesting the corrections may not fully compensate for the zoom differences.
  • Cropping: The automatic cropping sometimes either cuts off more than necessary or leaves edges that should have been cropped out, indicating the alignment might not be as precise as needed.

Here is the code I am currently trying to use:

import cv2
import numpy as np
import os
import glob
import math

def estimate_scale_rotation(keypoints1, keypoints2, matches):
    """Estimate the median scale and rotation difference between matched keypoints.

    NOTE(review): distances and angles are measured relative to the image
    origin (0, 0), which is only a crude proxy for the true similarity
    transform — a robust estimator would fit cv2.estimateAffinePartial2D
    to the matched point pairs instead.

    Args:
        keypoints1: keypoints of the query (reference) image.
        keypoints2: keypoints of the train (target) image.
        matches: DMatch-like objects linking keypoints1 to keypoints2.

    Returns:
        (avg_scale, avg_rotation): median scale ratio (target / reference)
        and median rotation difference in radians. Returns (1.0, 0.0)
        — the identity transform — when no usable matches exist.
    """
    scales = []
    rotations = []
    for m in matches:
        p1 = keypoints1[m.queryIdx].pt
        p2 = keypoints2[m.trainIdx].pt

        # Ratio of distances from the origin approximates the scale change.
        norm1 = np.linalg.norm(np.array(p1))
        norm2 = np.linalg.norm(np.array(p2))
        if norm1 == 0:
            # A keypoint exactly at the origin would divide by zero; skip it.
            continue
        scales.append(norm2 / norm1)

        # Difference of polar angles approximates the rotation change.
        angle1 = math.atan2(p1[1], p1[0])
        angle2 = math.atan2(p2[1], p2[0])
        rotations.append(angle2 - angle1)

    # BUG FIX: the original loop never appended to `scales`/`rotations`,
    # so np.median was called on empty lists and returned nan.
    if not scales:
        return 1.0, 0.0

    avg_scale = float(np.median(scales))
    avg_rotation = float(np.median(rotations))
    return avg_scale, avg_rotation

def align_images(image_to_align, reference_image, keypoints1, descriptors1):
    """Align `image_to_align` onto `reference_image`.

    Two-stage alignment: (1) a coarse similarity correction using the
    median scale/rotation of SIFT matches, then (2) a RANSAC homography
    for the final pixel-level warp.

    Args:
        image_to_align: BGR image to be warped onto the reference.
        reference_image: BGR reference image.
        keypoints1: precomputed SIFT keypoints of the reference image.
        descriptors1: precomputed SIFT descriptors of the reference image.

    Returns:
        The aligned BGR image, sized like `reference_image`.

    Raises:
        ValueError: if no features are found or too few matches exist to
            estimate a homography.
    """
    # BUG FIX: this constant was referenced but never defined anywhere in
    # the script, so the function raised NameError on first call.
    FLANN_INDEX_KDTREE = 1

    # Convert image to align to grayscale for feature detection.
    gray2 = cv2.cvtColor(image_to_align, cv2.COLOR_BGR2GRAY)

    # FLANN matcher (KD-tree index suits SIFT's float descriptors).
    index_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=5)
    search_params = dict(checks=50)
    flann = cv2.FlannBasedMatcher(index_params, search_params)

    # Detect features in the image to align.
    sift = cv2.SIFT_create()
    keypoints2, descriptors2 = sift.detectAndCompute(gray2, None)
    if descriptors2 is None:
        raise ValueError("No features detected in the image to align")

    # Lowe's ratio test keeps only confident matches.
    matches = flann.knnMatch(descriptors1, descriptors2, k=2)
    good_matches = [m for m, n in matches if m.distance < 0.7 * n.distance]

    # Estimate the scale/rotation of the target relative to the reference.
    avg_scale, avg_rotation = estimate_scale_rotation(keypoints1, keypoints2, good_matches)

    # BUG FIX: avg_scale is target/reference, so to *undo* the zoom
    # difference we must scale by its inverse (1/avg_scale). The original
    # applied avg_scale directly, doubling the discrepancy instead of
    # cancelling it. The rotation angle convention of getRotationMatrix2D
    # (positive = counter-clockwise, top-left origin) already undoes the
    # measured atan2 difference, so its sign is unchanged.
    center = (gray2.shape[1] / 2, gray2.shape[0] / 2)
    correction = cv2.getRotationMatrix2D(
        center, math.degrees(avg_rotation), 1.0 / avg_scale if avg_scale else 1.0
    )
    scaled_rotated_image = cv2.warpAffine(
        image_to_align, correction, (gray2.shape[1], gray2.shape[0])
    )

    # Stage 2: homography from the coarsely-corrected image to the reference.
    gray_src = cv2.cvtColor(scaled_rotated_image, cv2.COLOR_BGR2GRAY)
    gray_ref = cv2.cvtColor(reference_image, cv2.COLOR_BGR2GRAY)

    # Re-detect features: the coarse warp invalidated the old keypoints.
    kp_src, desc_src = sift.detectAndCompute(gray_src, None)
    kp_ref, desc_ref = sift.detectAndCompute(gray_ref, None)

    matches = flann.knnMatch(desc_src, desc_ref, k=2)
    good_matches = [m for m, n in matches if m.distance < 0.7 * n.distance]
    if len(good_matches) < 4:
        # findHomography needs at least 4 point correspondences.
        raise ValueError("Not enough matches to estimate a homography")

    # Extract matched point coordinates (source -> reference).
    points1 = np.float32([kp_src[m.queryIdx].pt for m in good_matches])
    points2 = np.float32([kp_ref[m.trainIdx].pt for m in good_matches])

    # RANSAC rejects the outlier matches the ratio test let through.
    H, mask = cv2.findHomography(points1, points2, cv2.RANSAC)
    if H is None:
        # RANSAC can still fail to converge even with >= 4 matches.
        raise ValueError("Homography estimation failed")

    # Warp onto the reference frame so every output has the same geometry.
    height, width = reference_image.shape[:2]
    image_aligned = cv2.warpPerspective(scaled_rotated_image, H, (width, height))
    return image_aligned

def crop_to_common_area(images):
    """Crop already-aligned, same-size images to their shared content region.

    Warping introduces black borders; this finds each image's non-black
    bounding box and intersects them all, then crops every image to that
    intersection.

    Args:
        images: list of BGR images of identical dimensions.

    Returns:
        List of cropped images. An empty input yields an empty list; if
        the content regions do not overlap, the images are returned uncropped.
    """
    if not images:
        return []

    height, width = images[0].shape[:2]
    # Intersection rectangle tracked as [x0, y0, x1, y1] (exclusive ends).
    common_area = [0, 0, width, height]

    for image in images:
        # Threshold at 1 so any non-black pixel counts as content.
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        _, thresh = cv2.threshold(gray, 1, 255, cv2.THRESH_BINARY)
        contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        if contours:
            # BUG FIX: use the *largest* contour; contours[0] is an
            # arbitrary one and may be a tiny speck of noise.
            x, y, w, h = cv2.boundingRect(max(contours, key=cv2.contourArea))
            # BUG FIX: the original list literal was missing its closing
            # bracket — a syntax error that broke the whole script.
            common_area = [
                max(common_area[0], x),
                max(common_area[1], y),
                min(common_area[2], x + w),
                min(common_area[3], y + h),
            ]

    x0, y0, x1, y1 = common_area
    if x1 <= x0 or y1 <= y0:
        # Degenerate intersection: cropping would produce empty images.
        return list(images)

    # BUG FIX: the original built `cropped` but never appended it,
    # so the function always returned an empty list.
    return [image[y0:y1, x0:x1] for image in images]

# --- Script entry point: read, align, crop, and save the image stack ---

# Setup for reading and writing images
image_folder = '/Users/karan/Library/CloudStorage/Dropbox/Apps/rugStudio/11/ul'
result_folder = '/Users/karan/Library/CloudStorage/Dropbox/Apps/rugStudio/11/ul/results'
os.makedirs(result_folder, exist_ok=True)

# Read images (sorted so the stack order is deterministic across runs)
images = [cv2.imread(file) for file in sorted(glob.glob(f"{image_folder}/*.jpeg"))]
if not images:
    raise SystemExit(f"No .jpeg images found in {image_folder}")

# Use the first image as reference for feature detection
sift = cv2.SIFT_create()
keypoints1, descriptors1 = sift.detectAndCompute(cv2.cvtColor(images[0], cv2.COLOR_BGR2GRAY), None)

# Align and crop images
aligned_images = [images[0]]  # Include reference image as the first in the list
for image in images[1:]:
    # BUG FIX: the original computed the aligned image but never stored
    # it, so only the reference ever reached the cropping stage.
    aligned_images.append(align_images(image, images[0], keypoints1, descriptors1))

cropped_images = crop_to_common_area(aligned_images)

# Save results
for i, image in enumerate(cropped_images):
    cv2.imwrite(os.path.join(result_folder, f"aligned_{i}.jpeg"), image)

I suspect my methodology for estimating and correcting scale/rotation might be oversimplified or incorrectly applied, and my cropping logic may not adequately account for the nuances of post-alignment image borders.

Request for Assistance: Could anyone suggest improvements to my approach or recommend alternative methods to more accurately align images with varying zoom levels and ensure precise cropping? Any insights into handling such alignment challenges, especially with handheld images, would be greatly appreciated.

Thank you in advance for your time and help!