slurm/dlr: tensorflow is dumb

Workaround for this crash on TensorFlow 2.13:

Could not load library libcublasLt.so.12. Error: libcublasLt.so.12: cannot open shared object file: No such file or directory
This commit is contained in:
Starbeamrainbowlabs 2024-11-14 22:26:16 +00:00
parent 52af6f00ec
commit 17d2d2bcaf
Signed by: sbrl
GPG key ID: 1BE5172E637709C2

View file

@@ -21,6 +21,13 @@ command_exists() {
#######################################################
# Fix "Could not load library libcublasLt.so.12. Error: libcublasLt.so.12: cannot open shared object file: No such file or directory" error
# Source the user's extra CUDA environment (${HOME}/cuda/activate.sh) when the
# job runs on the CS cluster and that directory exists. This is intended to
# make libcublasLt.so.12 discoverable for TensorFlow.
#
# NOTE(review): this test used to be `!= "cs-cluster"`, which contradicted the
# "csgpu cluster detected" message printed below and meant the script was
# sourced on every cluster *except* the CS one. Presumably "cs-cluster" is the
# SLURM_CLUSTER_NAME reported by the csgpu cluster - confirm on the cluster
# itself (e.g. `echo "${SLURM_CLUSTER_NAME}"` inside a job).
#
# The `:-` default keeps the comparison safe when SLURM_CLUSTER_NAME is unset
# (running outside Slurm, or with `set -u` enabled): the block is then skipped.
if [[ "${SLURM_CLUSTER_NAME:-}" == "cs-cluster" && -d "${HOME}/cuda" ]]; then
	echo "[slurm_runner] csgpu cluster detected, sourcing extra CUDA setup script" >&2;
	# The script lives outside the repository, so shellcheck cannot follow it.
	#shellcheck source=/dev/null
	source "${HOME}/cuda/activate.sh";
fi
# No modules on the CS cluster
if command_exists module && [[ "${SLURM_CLUSTER_NAME}" != "cs-cluster" ]]; then
module load utilities/multi