I *hate* Tensorflow SO MUCH......

This commit is contained in:
Starbeamrainbowlabs 2024-11-14 22:38:27 +00:00
parent 7c4f3d325d
commit fe374560a1
Signed by: sbrl
GPG key ID: 1BE5172E637709C2

View file

@@ -21,11 +21,17 @@ command_exists() {
#######################################################

# CUDA workarounds for the CS GPU cluster.
#
# Runs only when Slurm reports the cluster name "cs-cluster"
# (SLURM_CLUSTER_NAME; treated as empty when unset so this is safe under
# `set -u`). Two things happen there:
#   1. If ${HOME}/cuda exists, its activate.sh is sourced into this shell.
#   2. XLA_FLAGS is exported so XLA looks for CUDA under /usr/lib/cuda.
# All progress messages go to stderr.
if [[ "${SLURM_CLUSTER_NAME:-}" == "cs-cluster" ]]; then
  echo "[slurm_runner] csgpu cluster detected, applying CUDA workarounds" >&2;

  # Fix "Could not load library libcublasLt.so.12. Error: libcublasLt.so.12: cannot open shared object file: No such file or directory" error
  if [[ -d "${HOME}/cuda" ]]; then
    echo "[slurm_runner] sourcing extra CUDA setup script" >&2;
    # The script lives in the user's home directory, so shellcheck cannot
    # follow it.
    #shellcheck source=/dev/null
    source "${HOME}/cuda/activate.sh";
  fi

  # weird... this wasn't needed before?
  # NOTE(review): the original comment ends at "Fixes" without naming the
  # error this flag addresses - confirm and record it here.
  export XLA_FLAGS="--xla_gpu_cuda_data_dir=/usr/lib/cuda";
  echo "[slurm_runner] set XLA_FLAGS=\"${XLA_FLAGS}\"" >&2;
fi

# No modules on the CS cluster