CUDA Connected Components: 20% performance improvement

For roughly a 20% performance improvement, the label and the info value can be written with a single coalesced 64-bit store in this section of opencv_contrib/modules/cudaimgproc/src/cuda/connectedcomponents.cu (opencv/opencv_contrib on GitHub, commit 6b5142ff657ca676ab35233556b49a532e75e2b7):

        // Fragment from inside a kernel body: stores the label value
        // (labels_index + father_offset) and the `info` value for the current position.
        // All identifiers used here are declared outside this snippet.
        if (col + 1 < labels.cols) {
            // The right-hand neighbour (labels_index + 1) is still inside the row, so the
            // label and `info` can be emitted together as one 64-bit store instead of
            // two separate stores.
            uint32_t uint_buf[2];
            uint_buf[0] = labels_index + father_offset;  // goes to the slot at labels_index
            uint_buf[1] = info;                          // goes to the slot at labels_index + 1
            // NOTE(review): assumes labels elements are 32 bits wide and that
            // labels.data + labels_index is 8-byte aligned — confirm (e.g. for odd
            // strides or sub-matrix views). uint_buf is also read through an int64_t*,
            // which relies on the compiler tolerating this type punning; verify.
            *(reinterpret_cast<int64_t*>(labels.data + labels_index)) = *(reinterpret_cast<int64_t*>(uint_buf));
        }
        else {
            // No slot to the right: write the label on its own.
            labels.data[labels_index] = labels_index + father_offset;
            if (row + 1 < labels.rows) {
                // A row below exists: `info` goes to the element one row stride further
                // (labels.step / labels.elem_size — presumably the stride in elements;
                // TODO confirm).
                last_pixel = reinterpret_cast<unsigned char*>(labels.data + labels_index + labels.step / labels.elem_size);
            }
            // If neither neighbour exists, last_pixel keeps whatever value it was given
            // before this snippet (not visible here).
            *last_pixel = info;
        }