If you want a roughly 20% performance improvement, you can do a coalesced write in this section of opencv_contrib/modules/cudaimgproc/src/cuda/connectedcomponents.cu (opencv/opencv_contrib on GitHub, commit 6b5142ff657ca676ab35233556b49a532e75e2b7), storing the label and the info value with a single 64-bit write whenever the next column exists:
// Write this position's label (labels_index + father_offset) and its `info`
// value into the labels image.
if (col + 1 < labels.cols) {
    // The next column exists, so the label and `info` go into two adjacent
    // elements and are emitted as ONE 64-bit store.
    //
    // The 64-bit value is packed arithmetically rather than by reading a local
    // uint32_t[2] through an int64_t*: such an array is only guaranteed 4-byte
    // alignment (an 8-byte read from it may be misaligned) and the punned read
    // violates strict aliasing.
    //
    // Layout: low 32 bits -> labels.data[labels_index], high 32 bits -> the
    // following element. This relies on a little-endian target and on
    // labels.data holding 32-bit elements.
    // NOTE(review): the destination must be 8-byte aligned (labels_index even
    // and labels.data itself 8-byte aligned) -- confirm against the caller.
    const uint32_t label_word = static_cast<uint32_t>(labels_index + father_offset);
    const uint32_t info_word = static_cast<uint32_t>(info);
    const uint64_t packed = (static_cast<uint64_t>(info_word) << 32) | label_word;
    *(reinterpret_cast<uint64_t*>(labels.data + labels_index)) = packed;
}
else {
    // No next column: store the label on its own.
    labels.data[labels_index] = labels_index + father_offset;
    if (row + 1 < labels.rows) {
        // A next row exists: redirect last_pixel to the first byte of the
        // element labels.step / labels.elem_size elements further on
        // (presumably the same column one row down -- verify the stride units).
        last_pixel = reinterpret_cast<unsigned char*>(labels.data + labels_index + labels.step / labels.elem_size);
    }
    // When neither a next column nor a next row exists, last_pixel keeps the
    // value it was given outside this snippet.
    *last_pixel = info;
}