From 17d2d2bcaf63405b10997e65d3f0b2c7bcabbbbe Mon Sep 17 00:00:00 2001
From: Starbeamrainbowlabs
Date: Thu, 14 Nov 2024 22:26:16 +0000
Subject: [PATCH] slurm/dlr: tensorflow is dumb

Workaround for this crash on Tensorflow 2.13:

Could not load library libcublasLt.so.12. Error: libcublasLt.so.12: cannot open shared object file: No such file or directory
---
 aimodel/slurm-TEST-deeplabv3p-rainfall.job | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/aimodel/slurm-TEST-deeplabv3p-rainfall.job b/aimodel/slurm-TEST-deeplabv3p-rainfall.job
index 52bedf6..7dafb48 100755
--- a/aimodel/slurm-TEST-deeplabv3p-rainfall.job
+++ b/aimodel/slurm-TEST-deeplabv3p-rainfall.job
@@ -21,6 +21,13 @@ command_exists() {
 
 #######################################################
 
+# Fix "Could not load library libcublasLt.so.12. Error: libcublasLt.so.12: cannot open shared object file: No such file or directory" error
+if [[ "${SLURM_CLUSTER_NAME}" != "cs-cluster" ]] && [[ -d "${HOME}/cuda" ]]; then
+	echo "[slurm_runner] csgpu cluster detected, sourcing extra CUDA setup script" >&2;
+	#shellcheck source=/dev/null
+	source "${HOME}/cuda/activate.sh";
+fi
+
 # No modules on the CS cluster
 if command_exists module && [[ "${SLURM_CLUSTER_NAME}" != "cs-cluster" ]]; then
 	module load utilities/multi