Homography matrix

I have a multi-camera view of the same region.
For both cameras, I have the camera calibration parameters.
Using those, I computed the transformation matrix from the camera 1 view to the camera 2 view.

Now, using a deep learning algorithm, I have the bounding boxes of people in camera 1, and I want to get the corresponding bounding boxes of the same people in camera 2. When I transform the bbox coordinates from view 1 to view 2, all the coordinate values in view 2 are far outside the actual image size, i.e. I get a coordinate point like (840, 500), while my actual image size is (120, 480) (both cameras).

What should I do to solve this?

My Code

import xml.etree.ElementTree as ET

import cv2
import numpy as np


def parse_camera_parameters(xml_file):
    # Read one calibration XML and pull out the geometry, intrinsic and
    # extrinsic parameters, defaulting any missing attribute to 0.
    tree = ET.parse(xml_file)
    root = tree.getroot()

    geometry_elem = root.find('Geometry')
    intrinsic_elem = root.find('Intrinsic')
    extrinsic_elem = root.find('Extrinsic')

    camera_params = {
        'geometry': {
            'width': float(geometry_elem.attrib.get('width', '0')),
            'height': float(geometry_elem.attrib.get('height', '0'))
        },
        'intrinsic': {
            'focal': float(intrinsic_elem.get('focal', '0')),
            'kappa1': float(intrinsic_elem.get('kappa1', '0')),
            'cx': float(intrinsic_elem.get('cx', '0')),
            'cy': float(intrinsic_elem.get('cy', '0')),
            'sx': float(intrinsic_elem.get('sx', '0'))
        },
        'extrinsic': {
            'tx': float(extrinsic_elem.get('tx', '0')),
            'ty': float(extrinsic_elem.get('ty', '0')),
            'tz': float(extrinsic_elem.get('tz', '0')),
            'rx': float(extrinsic_elem.get('rx', '0')),
            'ry': float(extrinsic_elem.get('ry', '0')),
            'rz': float(extrinsic_elem.get('rz', '0'))
        }
    }

    return camera_params
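
For reference, the parser above assumes each calibration file is shaped roughly like this; the root tag name and the "..." values are placeholders, not my real calibration:

<CameraCalibration>
    <Geometry width="120" height="480"/>
    <Intrinsic focal="..." kappa1="..." cx="..." cy="..." sx="..."/>
    <Extrinsic tx="..." ty="..." tz="..." rx="..." ry="..." rz="..."/>
</CameraCalibration>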

def compute_homography(cam1_params, cam2_params):
    # Intrinsic matrix of camera 1
    K1 = np.array([
        [cam1_params['intrinsic']['focal'], 0, cam1_params['intrinsic']['cx']],
        [0, cam1_params['intrinsic']['focal'] * cam1_params['intrinsic']['sx'], cam1_params['intrinsic']['cy']],
        [0, 0, 1]
    ])

    # Rotation matrix of camera 1, composed from the Euler angles as Rx @ Ry @ Rz
    R1 = np.array([
        [1, 0, 0],
        [0, np.cos(cam1_params['extrinsic']['rx']), -np.sin(cam1_params['extrinsic']['rx'])],
        [0, np.sin(cam1_params['extrinsic']['rx']), np.cos(cam1_params['extrinsic']['rx'])]
    ]) @ np.array([
        [np.cos(cam1_params['extrinsic']['ry']), 0, np.sin(cam1_params['extrinsic']['ry'])],
        [0, 1, 0],
        [-np.sin(cam1_params['extrinsic']['ry']), 0, np.cos(cam1_params['extrinsic']['ry'])]
    ]) @ np.array([
        [np.cos(cam1_params['extrinsic']['rz']), -np.sin(cam1_params['extrinsic']['rz']), 0],
        [np.sin(cam1_params['extrinsic']['rz']), np.cos(cam1_params['extrinsic']['rz']), 0],
        [0, 0, 1]
    ])

    t1 = np.array([[cam1_params['extrinsic']['tx']], [cam1_params['extrinsic']['ty']], [cam1_params['extrinsic']['tz']]])

    # 3x4 extrinsic matrix [R1 | t1] of camera 1
    RT1 = np.hstack((R1, t1))

    # Intrinsic matrix of camera 2
    K2 = np.array([
        [cam2_params['intrinsic']['focal'], 0, cam2_params['intrinsic']['cx']],
        [0, cam2_params['intrinsic']['focal'] * cam2_params['intrinsic']['sx'], cam2_params['intrinsic']['cy']],
        [0, 0, 1]
    ])

    # Rotation matrix of camera 2, same composition as for camera 1
    R2 = np.array([
        [1, 0, 0],
        [0, np.cos(cam2_params['extrinsic']['rx']), -np.sin(cam2_params['extrinsic']['rx'])],
        [0, np.sin(cam2_params['extrinsic']['rx']), np.cos(cam2_params['extrinsic']['rx'])]
    ]) @ np.array([
        [np.cos(cam2_params['extrinsic']['ry']), 0, np.sin(cam2_params['extrinsic']['ry'])],
        [0, 1, 0],
        [-np.sin(cam2_params['extrinsic']['ry']), 0, np.cos(cam2_params['extrinsic']['ry'])]
    ]) @ np.array([
        [np.cos(cam2_params['extrinsic']['rz']), -np.sin(cam2_params['extrinsic']['rz']), 0],
        [np.sin(cam2_params['extrinsic']['rz']), np.cos(cam2_params['extrinsic']['rz']), 0],
        [0, 0, 1]
    ])

    t2 = np.array([[cam2_params['extrinsic']['tx']], [cam2_params['extrinsic']['ty']], [cam2_params['extrinsic']['tz']]])

    RT2 = np.hstack((R2, t2))

    # Attempted view-1 -> view-2 mapping via pseudo-inverses of the projections
    H = K2 @ RT2 @ np.linalg.pinv(RT1) @ np.linalg.pinv(K1)

    return H

Code

xml_file_cam1 = 'calibration_cam0.xml'
xml_file_cam2 = 'calibration_cam1.xml'

camera_params_cam1 = parse_camera_parameters(xml_file_cam1)
camera_params_cam2 = parse_camera_parameters(xml_file_cam2)

# Detect people in the camera-1 image
image_cam1 = cv2.imread('RLCAFTCONF-C0_100000.jpeg')
model1, model2, model3 = load_models()
bbox_cam1, labels_cam1, ids_cam1, confs_cam1, image_with_bboxes_cam1, final_bbs_cam1 = run_object_detection(image_cam1, model1, model2, model3)

H_cam1_to_cam2 = compute_homography(camera_params_cam1, camera_params_cam2)
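
# Sanity check on H itself: it is 3x3 (K2 3x3 @ RT2 3x4 @ pinv(RT1) 4x3 @
# pinv(K1) 3x3), so cv2.perspectiveTransform accepts it. Mapping the
# camera-1 image corners through it shows whether the matrix, rather than
# the bbox code below, is what sends everything out of frame.
w = camera_params_cam1['geometry']['width']
h = camera_params_cam1['geometry']['height']
corners = np.array([[[0, 0]], [[w, 0]], [[w, h]], [[0, h]]], dtype=np.float32)
print(cv2.perspectiveTransform(corners, H_cam1_to_cam2).reshape(-1, 2))
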
bbox_cam2 = []
for bid, bbox in enumerate(bbox_cam1):
    x1, y1, x2, y2 = bbox

    # Transform the top-left and bottom-right corners into the camera-2 view
    pt_tl = np.array([[x1, y1]], dtype=np.float32).reshape(-1, 1, 2)
    transformed_pt_tl = cv2.perspectiveTransform(pt_tl, H_cam1_to_cam2)

    pt_br = np.array([[x2, y2]], dtype=np.float32).reshape(-1, 1, 2)
    transformed_pt_br = cv2.perspectiveTransform(pt_br, H_cam1_to_cam2)

    x1_trans, y1_trans = transformed_pt_tl[0][0]
    x2_trans, y2_trans = transformed_pt_br[0][0]
    bbox_cam2.append([x1_trans, y1_trans, x2_trans, y2_trans])

image_cam2 = cv2.imread('RLCAFTCONF-C1_100000.jpeg')

# Draw the original detections on the camera-1 image
for bbox, id in zip(bbox_cam1, ids_cam1):
    x1, y1, x2, y2 = map(int, bbox)
    cv2.rectangle(image_cam1, (x1, y1), (x2, y2), (0, 255, 0), 2)
    cv2.putText(image_cam1, f'ID: {id}', (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

print(bbox_cam2)

# Draw the transformed boxes on the camera-2 image
for bbox, id in zip(bbox_cam2, ids_cam1):
    x1, y1, x2, y2 = map(int, bbox)
    cv2.rectangle(image_cam2, (x1, y1), (x2, y2), (0, 255, 0), 2)
    cv2.putText(image_cam2, f'ID: {id}', (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

comb = np.hstack((image_cam1, image_cam2))
cv2.imwrite('combined.png', comb)

Output image

Do you have distance/depth information for your targets, or do you just assume that they'll stand in a fixed plane for which that homography is valid?

I assume it's a fixed plane.
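
If everyone stands on one fixed plane (e.g. the ground), the mapping between the two views is the plane-induced homography H = K2 (R + t n^T / d) K1^-1, where R, t is the relative pose from camera 1 to camera 2 and n^T x = d describes the plane in camera-1 coordinates; chaining the two full projections through pseudo-inverses, as in compute_homography above, does not reduce to this. A minimal sketch, assuming the plane normal n and distance d are known in camera-1 coordinates (they are not given in the post and would have to come from the calibration):

def plane_induced_homography(K1, R1, t1, K2, R2, t2, n, d):
    # Relative pose taking camera-1 coordinates to camera-2 coordinates:
    # x2 = R @ x1 + t, with R = R2 @ R1.T and t = t2 - R @ t1.
    R = R2 @ R1.T
    t = t2 - R @ t1
    # Homography induced by the plane n^T x = d (n: 3x1 unit normal in the
    # camera-1 frame, d: distance from camera 1's center to the plane).
    H = K2 @ (R + (t @ n.T) / d) @ np.linalg.inv(K1)
    return H / H[2, 2]  # fix the projective scale

Note that such an H is only valid for points that actually lie on that plane, so it makes sense to transform the bottom edge of each box (the feet, which touch the ground) rather than all four corners.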