Panorama stitching using OpenCV in Python

I finally solved the problem by using the known orientation of the camera to project each cubemap onto the sphere and blending between the resulting images. For the blending, I simply used the distance to the edge of each cubemap as the weight.

def calculate_R(orientation, up=cp.array([0, -1, 0])):

    orientation = orientation / cp.linalg.norm(orientation)

    if cp.allclose(orientation, up):
        return cp.array([[1, 0, 0], [0, 0, 1], [0, -1, 0]])

    if cp.allclose(orientation, -up):
        return cp.array([[1, 0, 0], [0, 0, -1], [0, 1, 0]])

    x = cp.cross(up, orientation)
    x = x / cp.linalg.norm(x)

    y = cp.cross(orientation, x)
    y = y / cp.linalg.norm(y)

    R = cp.array([x, y, orientation])

    return R

def rotate_xyz(R, x, y, z):

    xyz = cp.stack((x.ravel(), y.ravel(), z.ravel()))
    x, y, z = cp.matmul(R, xyz).reshape(3, *x.shape)
    
    return x, y, z

def uv_pano(orientations, cubemap_shape, pano_shape):

    u_pano, v_pano = cp.meshgrid(cp.arange(pano_shape[0]), cp.arange(pano_shape[1]), indexing='ij')

    masks   = cp.zeros((len(orientations), pano_shape[0], pano_shape[1]), dtype=cp.float32)
    u_cubes = cp.empty((len(orientations), pano_shape[0], pano_shape[1]), dtype=cp.float32)
    v_cubes = cp.empty((len(orientations), pano_shape[0], pano_shape[1]), dtype=cp.float32)

    for i in range(len(orientations)):

        phi = (u_pano / pano_shape[0] * 1 + 0.5) * cp.pi
        theta = (v_pano / pano_shape[1] * 2 + 0.5) * cp.pi

        x = cp.cos(phi) * cp.cos(theta)
        y = cp.sin(phi)
        z = cp.cos(phi) * cp.sin(theta)

        R = calculate_R(orientations[i])
        x, y, z = rotate_xyz(R, x, y, z)

        condition = (abs(z) > abs(x)) & (abs(z) > abs(y)) & (z < 0)
        masks[i][condition] = 1
        a =  y
        b = -x
        max_abs_coord = abs(z)

        u_cubes[i][condition] = ((a[condition] / max_abs_coord[condition]) + 1.0) * 0.5 * (cubemap_shape[0] - 1)
        v_cubes[i][condition] = ((b[condition] / max_abs_coord[condition]) + 1.0) * 0.5 * (cubemap_shape[1] - 1)

        u_cubes = u_cubes.astype(int)
        v_cubes = v_cubes.astype(int)

    return masks, u_cubes, v_cubes

def transform(cubemaps):
    
    cubemap_shape = cubemaps[0].shape
    if len(cubemaps[0].shape) > 2:
        pano_shape = (int(cubemap_shape[0] * 2), int(cubemap_shape[1] * 4), cubemap_shape[2])
    else:
        pano_shape = (int(cubemap_shape[0] * 2), int(cubemap_shape[1] * 4))
    pano = cp.zeros(pano_shape, dtype=cp.float32)

    masks, u_cubes, v_cubes = uv_pano(orientations, cubemap_shape, pano_shape)

    for i in range(len(cubemaps)):
        masks[i] = ndimage.distance_transform_edt(masks[i])
        if cp.max(masks[i]) > 0:
            masks[i] = masks[i] / cp.max(masks[i])

    for i in range(len(cubemaps)):
        weight = cp.stack([masks[i][masks[i] > 0], masks[i][masks[i] > 0], masks[i][masks[i] > 0]], axis=1)
        pano[masks[i] > 0] += cubemaps[i][u_cubes[i][masks[i] > 0], v_cubes[i][masks[i] > 0]] * weight

    weight_sum = cp.stack([cp.sum(masks, axis=0), cp.sum(masks, axis=0), cp.sum(masks, axis=0)], axis=2)
    weight_sum = cp.where(weight_sum <= 0, 1, weight_sum)
    pano = pano / weight_sum

    pano = cp.clip(pano, 0, 255).astype(cp.uint8)

    return pano