Statically cross-compile OpenCV with CUDA

Hey, I have been working on cross compiling OpenCV 4.5 (currently 4.5.0 for compatibility reasons but the goal is to upgrade it to 4.5.latest and 4.6).

Questions

  1. Is it possible to statically link libcudart while cross compiling?
  2. Is it possible to statically link libcublas while cross compiling?
  3. Does it make sense to statically link these libs or should I use only dynamic linking and load CUDA from a proper ARM64 device?

Description

I am using the following versions:

CUDA: 10.2
OpenCV: 4.5.0
cmake: 3.18
Ubuntu: 18.04

I have managed to cross compile it successfully. However, at the moment, if I want to use the generated binaries, I have to have CUDA available in my Docker container; otherwise I end up with:

/My_Test: error while loading shared libraries: libcudart.so.10.2: cannot open shared object file: No such file or directory

Steps Taken

# Configure a static (BUILD_SHARED_LIBS=OFF) Release build of OpenCV with CUDA
# enabled, using the aarch64 toolchain file. Options are grouped in this order:
# toolchain/build/install, CUDA, bundled third-party libraries, optional
# backends (WITH_*), module selection and language bindings, examples and tests.
# DESIRED_CUDA_VERSION and OPENCV_VERSION are expanded by the shell.
cmake .. \
    -DCMAKE_TOOLCHAIN_FILE=../cmake_aarch64.toolchain \
    -DCMAKE_FIND_ROOT_PATH=/opencv/opencv/cmake \
    -DCMAKE_BUILD_TYPE=Release \
    -DCMAKE_INSTALL_PREFIX=/output \
    -DBUILD_SHARED_LIBS=OFF \
    -DENABLE_NEON=ON \
    -DWITH_CUDA=ON \
    -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda \
    -DCUDA_INC_PATH=/usr/local/cuda-${DESIRED_CUDA_VERSION}/targets/aarch64-linux/include \
    -DCUDA_ARCH_BIN=7.2 \
    -DCUDA_ARCH_PTX="" \
    -DCUDA_NVCC_FLAGS="-D_FORCE_INLINES" \
    -DBUILD_ZLIB=ON \
    -DBUILD_PNG=ON \
    -DBUILD_JPEG=ON \
    -DBUILD_TIFF=ON \
    -DBUILD_JASPER=ON \
    -DBUILD_TBB=ON \
    -DWITH_TBB=ON \
    -DWITH_GTK=ON \
    -DWITH_FFMPEG=ON \
    -DWITH_OPENEXR=OFF \
    -DWITH_WEBP=OFF \
    -DWITH_OPENCL=OFF \
    -DWITH_1394=OFF \
    -DWITH_LAPACK=OFF \
    -DWITH_OPENMP=OFF \
    -DWITH_GSTREAMER=OFF \
    -DWITH_GSTREAMER_0_10=OFF \
    -DWITH_VTK=OFF \
    -DOPENCV_EXTRA_MODULES_PATH=/opencv/opencv/opencv_contrib-${OPENCV_VERSION}/modules \
    -DOPENCV_ENABLE_NONFREE=ON \
    -DBUILD_opencv_xobjdetect=OFF \
    -DBUILD_NEW_PYTHON_SUPPORT=ON \
    -DBUILD_opencv_python2=ON \
    -DBUILD_opencv_python3=ON \
    -DBUILD_JAVA=OFF \
    -DBUILD_EXAMPLES=OFF \
    -DINSTALL_C_EXAMPLES=ON \
    -DBUILD_PERF_TESTS=OFF \
    -DBUILD_TESTS=OFF \
    -DINSTALL_TESTS=OFF
cat ../cmake_aarch64.toolchain


# CMake toolchain file for cross compiling to aarch64 Linux with the
# aarch64-unknown-linux-gnueabi GCC toolchain installed under /usr/xcc.
#
# NOTE(review): the CUDA paths below read the CUDA_VERSION environment variable
# each time this file is processed. If it is unset they expand to
# "/usr/local/cuda-/targets/aarch64-linux/" - confirm it is exported (and
# matches the installed toolkit directory name) before running cmake.

# Target platform description.
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR aarch64)
set(TRT_PLATFORM_ID "aarch64")
set(CUDA_PLATFORM_ID "aarch64-linux")
set(CMAKE_CROSSCOMPILING TRUE)

# Cross compilers and the target they generate code for.
set(CMAKE_C_COMPILER_TARGET aarch64)
set(CMAKE_CXX_COMPILER_TARGET aarch64)
set(CMAKE_C_COMPILER /usr/xcc/aarch64-unknown-linux-gnueabi/bin/aarch64-unknown-linux-gnueabi-gcc)
set(CMAKE_CXX_COMPILER  /usr/xcc/aarch64-unknown-linux-gnueabi/bin/aarch64-unknown-linux-gnueabi-g++)
set(DISABLE_SWIG TRUE)

# CUDA locations: the aarch64 target tree (headers and libraries) and the
# toolkit root under /usr/local/cuda/.
set(CUDA_LIB_PATH /usr/local/cuda-$ENV{CUDA_VERSION}/targets/aarch64-linux/)
set(CUDA_BIN_PATH /usr/local/cuda/)
set(CUDA_TARGET_TRIPLET aarch64-linux)
set(CUDA_PATH /usr/local/cuda-$ENV{CUDA_VERSION}/targets/aarch64-linux/)

# Extra link flags for the target platform. CUDA_LIB_PATH is the target root
# (it ends in "aarch64-linux/"), so the library search directory is its "lib"
# subdirectory; pointing -L at the root itself would not let the linker find
# libcublas / libcudart.
set(ADDITIONAL_PLATFORM_LIB_FLAGS "-L${CUDA_LIB_PATH}lib" -lcublas -lcudart -lstdc++ -lm)

Running My_Test in the same container where I build OpenCV and my own My_Test binary works as expected.

When I run My_Test in a nvcr.io/nvidia/l4t-base:r32.4.3 docker container with CUDA, it works as expected.

It only fails when I run My_Test on a machine without CUDA.

The reason is related to libcudart.so.10.2 as mentioned above:

ldd ./My_Test
	libGCBase_gcc_v3_1_Basler_pylon.so => /usr/local/pylon/lib/libGCBase_gcc_v3_1_Basler_pylon.so (0x00000055022a3000)
	libpylonbase-6.2.0.so => /usr/local/pylon/lib/libpylonbase-6.2.0.so (0x00000055022d0000)
	libGenApi_gcc_v3_1_Basler_pylon.so => /usr/local/pylon/lib/libGenApi_gcc_v3_1_Basler_pylon.so (0x0000005503042000)
	libpylonutility-6.2.0.so => /usr/local/pylon/lib/libpylonutility-6.2.0.so (0x00000055033a3000)
	librt.so.1 => /lib/aarch64-linux-gnu/librt.so.1 (0x0000005503599000)
	libpthread.so.0 => /lib/aarch64-linux-gnu/libpthread.so.0 (0x00000055035b0000)
	libdl.so.2 => /lib/aarch64-linux-gnu/libdl.so.2 (0x00000055035de000)
	libcudart.so.10.2 => not found
	libm.so.6 => /lib/aarch64-linux-gnu/libm.so.6 (0x00000055035f3000)
	libc.so.6 => /lib/aarch64-linux-gnu/libc.so.6 (0x00000055036ac000)
	/lib/ld-linux-aarch64.so.1 (0x0000005500000000)
	libstdc++.so.6 => /usr/lib/aarch64-linux-gnu/libstdc++.so.6 (0x0000005503805000)
	libgcc_s.so.1 => /lib/aarch64-linux-gnu/libgcc_s.so.1 (0x000000550399b000)
	libLog_gcc_v3_1_Basler_pylon.so => /usr/local/pylon/lib/libLog_gcc_v3_1_Basler_pylon.so (0x00000055039bf000)
	libMathParser_gcc_v3_1_Basler_pylon.so => /usr/local/pylon/lib/libMathParser_gcc_v3_1_Basler_pylon.so (0x00000055039da000)
	libXmlParser_gcc_v3_1_Basler_pylon.so => /usr/local/pylon/lib/libXmlParser_gcc_v3_1_Basler_pylon.so (0x00000055039f5000)
	libNodeMapData_gcc_v3_1_Basler_pylon.so => /usr/local/pylon/lib/libNodeMapData_gcc_v3_1_Basler_pylon.so (0x0000005503b30000)

Also nm -e ./My_Test | grep -i cuda:

...
U __cudaPopCallConfiguration@@libcudart.so.10.2
U __cudaPushCallConfiguration@@libcudart.so.10.2
U __cudaRegisterFatBinary@@libcudart.so.10.2
U __cudaRegisterFatBinaryEnd@@libcudart.so.10.2
U __cudaRegisterFunction@@libcudart.so.10.2
U __cudaRegisterVar@@libcudart.so.10.2
U __cudaUnregisterFatBinary@@libcudart.so.10.2
U cudaDeviceReset@@libcudart.so.10.2
U cudaDeviceSynchronize@@libcudart.so.10.2
U cudaDriverGetVersion@@libcudart.so.10.2
U cudaEventCreateWithFlags@@libcudart.so.10.2
U cudaEventDestroy@@libcudart.so.10.2
U cudaEventElapsedTime@@libcudart.so.10.2
U cudaEventQuery@@libcudart.so.10.2
U cudaEventRecord@@libcudart.so.10.2
U cudaEventSynchronize@@libcudart.so.10.2
U cudaFree@@libcudart.so.10.2
U cudaFreeHost@@libcudart.so.10.2
U cudaGetDevice@@libcudart.so.10.2
U cudaGetDeviceCount@@libcudart.so.10.2
U cudaGetDeviceProperties@@libcudart.so.10.2
U cudaGetErrorString@@libcudart.so.10.2
U cudaGetLastError@@libcudart.so.10.2
U cudaHostAlloc@@libcudart.so.10.2
U cudaHostGetDevicePointer@@libcudart.so.10.2
U cudaHostRegister@@libcudart.so.10.2
U cudaHostUnregister@@libcudart.so.10.2
U cudaLaunchKernel@@libcudart.so.10.2
U cudaMalloc@@libcudart.so.10.2
U cudaMallocPitch@@libcudart.so.10.2
U cudaMemGetInfo@@libcudart.so.10.2
U cudaMemcpy2D@@libcudart.so.10.2
U cudaMemcpy2DAsync@@libcudart.so.10.2
U cudaMemset2D@@libcudart.so.10.2
U cudaMemset2DAsync@@libcudart.so.10.2
U cudaRuntimeGetVersion@@libcudart.so.10.2
U cudaSetDevice@@libcudart.so.10.2
U cudaStreamAddCallback@@libcudart.so.10.2
U cudaStreamCreate@@libcudart.so.10.2
U cudaStreamDestroy@@libcudart.so.10.2
U cudaStreamQuery@@libcudart.so.10.2
U cudaStreamSynchronize@@libcudart.so.10.2
U cudaStreamWaitEvent@@libcudart.so.10.2

While I cannot show the content of My_Test, it basically doesn't make any use of CUDA at the moment (it will in the future). However, it does use OpenCV with CUDA enabled, and therefore it fails.