I have multi-camera views of the same region.
For both cameras, I have the camera calibration parameters.
Using those, I computed the transformation matrix from the camera 1 view to the camera 2 view.
Now, using a deep-learning detector, I have the bounding boxes of people in camera 1, and I want to get the corresponding bounding boxes of the same people in camera 2. When I transform the bbox coordinates from view 1 to view 2, all the coordinate values in view 2 are far outside the actual image size, i.e. I get a coordinate point like (840, 500) while my actual image size is (120, 480) (both cameras).
What should I do to fix this?
My Code
import xml.etree.ElementTree as ET

import cv2
import numpy as np


def parse_camera_parameters(xml_file):
    """Read the geometry, intrinsic, and extrinsic attributes from a calibration XML file."""
    tree = ET.parse(xml_file)
    root = tree.getroot()
    geometry_elem = root.find('Geometry')
    intrinsic_elem = root.find('Intrinsic')
    extrinsic_elem = root.find('Extrinsic')
    camera_params = {
        'geometry': {
            'width': float(geometry_elem.get('width', '0')),
            'height': float(geometry_elem.get('height', '0'))
        },
        'intrinsic': {
            'focal': float(intrinsic_elem.get('focal', '0')),
            'kappa1': float(intrinsic_elem.get('kappa1', '0')),
            'cx': float(intrinsic_elem.get('cx', '0')),
            'cy': float(intrinsic_elem.get('cy', '0')),
            'sx': float(intrinsic_elem.get('sx', '0'))
        },
        'extrinsic': {
            'tx': float(extrinsic_elem.get('tx', '0')),
            'ty': float(extrinsic_elem.get('ty', '0')),
            'tz': float(extrinsic_elem.get('tz', '0')),
            'rx': float(extrinsic_elem.get('rx', '0')),
            'ry': float(extrinsic_elem.get('ry', '0')),
            'rz': float(extrinsic_elem.get('rz', '0'))
        }
    }
    return camera_params
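For reference, this is a minimal sketch of the XML layout the parser expects; the element and attribute names come from the parser itself, while the root tag and all the values are placeholders I made up:

sample_xml = """
<Camera>
  <Geometry width="1920" height="1080"/>
  <Intrinsic focal="1000.0" kappa1="0.0" cx="960.0" cy="540.0" sx="1.0"/>
  <Extrinsic tx="0.0" ty="0.0" tz="2.5" rx="0.0" ry="0.0" rz="0.0"/>
</Camera>
"""

# Quick self-test of the parser using the placeholder file above.
import tempfile
with tempfile.NamedTemporaryFile('w', suffix='.xml', delete=False) as f:
    f.write(sample_xml)
print(parse_camera_parameters(f.name)['intrinsic'])
# {'focal': 1000.0, 'kappa1': 0.0, 'cx': 960.0, 'cy': 540.0, 'sx': 1.0}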
def rotation_matrix(rx, ry, rz):
    """Rotation matrix from Euler angles in radians, composed as Rx @ Ry @ Rz."""
    Rx = np.array([[1, 0, 0],
                   [0, np.cos(rx), -np.sin(rx)],
                   [0, np.sin(rx), np.cos(rx)]])
    Ry = np.array([[np.cos(ry), 0, np.sin(ry)],
                   [0, 1, 0],
                   [-np.sin(ry), 0, np.cos(ry)]])
    Rz = np.array([[np.cos(rz), -np.sin(rz), 0],
                   [np.sin(rz), np.cos(rz), 0],
                   [0, 0, 1]])
    return Rx @ Ry @ Rz


def intrinsic_matrix(cam_params):
    """3x3 camera matrix K built from the parsed intrinsic parameters."""
    intr = cam_params['intrinsic']
    return np.array([[intr['focal'], 0, intr['cx']],
                     [0, intr['focal'] * intr['sx'], intr['cy']],
                     [0, 0, 1]])


def compute_homography(cam1_params, cam2_params):
    K1 = intrinsic_matrix(cam1_params)
    K2 = intrinsic_matrix(cam2_params)
    ext1 = cam1_params['extrinsic']
    ext2 = cam2_params['extrinsic']
    R1 = rotation_matrix(ext1['rx'], ext1['ry'], ext1['rz'])
    R2 = rotation_matrix(ext2['rx'], ext2['ry'], ext2['rz'])
    t1 = np.array([[ext1['tx']], [ext1['ty']], [ext1['tz']]])
    t2 = np.array([[ext2['tx']], [ext2['ty']], [ext2['tz']]])
    RT1 = np.hstack((R1, t1))  # 3x4 [R|t] of camera 1
    RT2 = np.hstack((R2, t2))  # 3x4 [R|t] of camera 2
    # pinv because the 3x4 [R|t] matrices are not square
    H = K2 @ RT2 @ np.linalg.pinv(RT1) @ np.linalg.pinv(K1)
    return H
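A quick sanity check I can add here (a sketch, not part of my pipeline) is to push the four image corners through H and see where they land; cv2.perspectiveTransform divides by the homogeneous coordinate internally, so the overall scale of H does not matter for this:

def check_homography(H, width, height):
    # Map the four corners of the camera 1 image into camera 2.
    corners = np.array([[[0, 0]], [[width, 0]],
                        [[width, height]], [[0, height]]], dtype=np.float32)
    mapped = cv2.perspectiveTransform(corners, H)
    print(mapped.reshape(-1, 2))  # should land near the camera 2 image bounds

# e.g. check_homography(H_cam1_to_cam2, 120, 480) with the image size from above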
Driver code
xml_file_cam1 = 'calibration_cam0.xml'
xml_file_cam2 = 'calibration_cam1.xml'
camera_params_cam1 = parse_camera_parameters(xml_file_cam1)
camera_params_cam2 = parse_camera_parameters(xml_file_cam2)

image_cam1 = cv2.imread('RLCAFTCONF-C0_100000.jpeg')
model1, model2, model3 = load_models()
bbox_cam1, labels_cam1, ids_cam1, confs_cam1, image_with_bboxes_cam1, final_bbs_cam1 = run_object_detection(image_cam1, model1, model2, model3)

H_cam1_to_cam2 = compute_homography(camera_params_cam1, camera_params_cam2)

bbox_cam2 = []
for bid, bbox in enumerate(bbox_cam1):
    x1, y1, x2, y2 = bbox
    # Transform the top-left and bottom-right corners of each box.
    pt_tl = np.array([[x1, y1]], dtype=np.float32).reshape(-1, 1, 2)
    transformed_pt_tl = cv2.perspectiveTransform(pt_tl, H_cam1_to_cam2)
    pt_br = np.array([[x2, y2]], dtype=np.float32).reshape(-1, 1, 2)
    transformed_pt_br = cv2.perspectiveTransform(pt_br, H_cam1_to_cam2)
    x1_trans, y1_trans = transformed_pt_tl[0][0]
    x2_trans, y2_trans = transformed_pt_br[0][0]
    bbox_cam2.append([x1_trans, y1_trans, x2_trans, y2_trans])
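For completeness, a sketch of an alternative body for the loop above that maps all four box corners in one perspectiveTransform call and takes the axis-aligned bounds of the result (under a perspective warp a rectangle does not stay axis-aligned, so the two-corner version can skew the box):

    corners = np.array([[[x1, y1]], [[x2, y1]], [[x2, y2]], [[x1, y2]]],
                       dtype=np.float32)
    warped = cv2.perspectiveTransform(corners, H_cam1_to_cam2).reshape(-1, 2)
    x1_trans, y1_trans = warped.min(axis=0)
    x2_trans, y2_trans = warped.max(axis=0)
    bbox_cam2.append([x1_trans, y1_trans, x2_trans, y2_trans])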
image_cam2 = cv2.imread('RLCAFTCONF-C1_100000.jpeg')

# Draw the detections on the camera 1 image.
for bbox, track_id in zip(bbox_cam1, ids_cam1):
    x1, y1, x2, y2 = map(int, bbox)
    cv2.rectangle(image_cam1, (x1, y1), (x2, y2), (0, 255, 0), 2)
    cv2.putText(image_cam1, f'ID: {track_id}', (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

print(bbox_cam2)

# Draw the transformed boxes on the camera 2 image.
for bbox, track_id in zip(bbox_cam2, ids_cam1):
    x1, y1, x2, y2 = map(int, bbox)
    cv2.rectangle(image_cam2, (x1, y1), (x2, y2), (0, 255, 0), 2)
    cv2.putText(image_cam2, f'ID: {track_id}', (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

comb = np.hstack((image_cam1, image_cam2))
cv2.imwrite('combined.png', comb)
Output image
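One more detail I noticed while writing this up: the parser reads kappa1 (radial distortion), but nothing in the pipeline uses it. In case that matters, here is a sketch of undistorting points before transforming them; the assumption that kappa1 corresponds to OpenCV's k1 coefficient is mine, not something from the calibration format:

def undistort_points(pts, cam_params):
    # Same camera matrix construction as intrinsic_matrix() above.
    K = intrinsic_matrix(cam_params)
    # Assumption: kappa1 plays the role of OpenCV's k1; k2, p1, p2 left at 0.
    dist = np.array([cam_params['intrinsic']['kappa1'], 0, 0, 0], dtype=np.float64)
    pts = np.asarray(pts, dtype=np.float32).reshape(-1, 1, 2)
    # P=K maps the undistorted normalized coordinates back to pixel coordinates.
    return cv2.undistortPoints(pts, K, dist, P=K).reshape(-1, 2)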