From 04ea305b7053349ad691a842c96793639d8ade2b Mon Sep 17 00:00:00 2001 From: Starbeamrainbowlabs Date: Fri, 8 Nov 2024 21:47:55 +0000 Subject: [PATCH 1/2] dlr/slurm: implement USE_CONDA, module command opt-support --- aimodel/slurm-TEST-deeplabv3p-rainfall.job | 42 +++++++++++++++++----- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/aimodel/slurm-TEST-deeplabv3p-rainfall.job b/aimodel/slurm-TEST-deeplabv3p-rainfall.job index 206a3cb..ceca681 100755 --- a/aimodel/slurm-TEST-deeplabv3p-rainfall.job +++ b/aimodel/slurm-TEST-deeplabv3p-rainfall.job @@ -12,13 +12,28 @@ # ---> in MiB # no-requeue: ref https://support.hull.ac.uk/tas/public/ssp/content/detail/incident?unid=652db7ac6e73485c9f7658db78b2b628 -module load utilities/multi -module load readline/7.0 -module load gcc/10.2.0 -module load cuda/11.5.0 -export XLA_FLAGS="--xla_gpu_cuda_data_dir=/home/ViperAppsFiles/cuda/11.5.0"; # weird... this wasn't needed before? +####################################################### + +command_exists() { + command -v "$1" >/dev/null 2>&1 + return $?; +} + +####################################################### + +if command_exists module; then + module load utilities/multi + module load readline/7.0 + module load gcc/10.2.0 + module load cuda/11.5.0; + export XLA_FLAGS="--xla_gpu_cuda_data_dir=/home/ViperAppsFiles/cuda/11.5.0"; # weird... this wasn't needed before? + + module load python/anaconda/4.6/miniconda/3.7 +else + echo "[bash/runner]: module command not present, not loading modules" >&2; +fi + -module load python/anaconda/4.6/miniconda/3.7 show_help() { @@ -28,6 +43,8 @@ show_help() { echo -e " sbatch slurm-TEST-deeplabv3p-rainfall.job" >&2; echo -e "" >&2; echo -e "....where:" >&2; + echo -e " USE_CONDA Optional. Set to any value to use conda when running the experiment. REQUIRED ON VIPER." >&2; + echo -e "" >&2; echo -e " IMAGE_SIZE=128 Optional. Sets the size of the 'images' that the DeepLabV3+ model will work with." >&2; echo -e " BATCH_SIZE=64 Optional. Sets the batch size to train the model with." >&2; echo -e " DIR_RAINFALLWATER The path to the directory the .tfrecord files containing the rainfall radar / water depth data." >&2; @@ -82,8 +99,17 @@ export PATH=$HOME/software/bin:$PATH; export IMAGE_SIZE BATCH_SIZE DIR_RAINFALLWATER PATH_HEIGHTMAP PATH_COLOURMAP STEPS_PER_EPOCH DIR_OUTPUT PATH_CHECKPOINT EPOCHS PREDICT_COUNT NO_REMOVE_ISOLATED_PIXELS LOSS LEARNING_RATE DICE_LOG_COSH WATER_THRESHOLD UPSAMPLE STEPS_PER_EXECUTION JIT_COMPILE RANDSEED PREDICT_AS_ONE SPLIT_VALIDATE SPLIT_TEST; echo ">>> Installing requirements"; -conda run -n py38 pip install -q -r requirements.txt; +if [[ -n "${USE_CONDA}" ]]; then + conda run -n py38 pip install -q -r requirements.txt; +else + echo "[bash/runner]: USE_CONDA env var NOT specified, not installing pip packages." >&2; +fi echo ">>> Training model"; #shellcheck disable=SC2016 -/usr/bin/env time -v conda run -n py38 bash -c 'src/deeplabv3_plus_test_rainfall.py >>"${DIR_OUTPUT}/experiment.${SLURM_JOB_ID}.out.log" 2>>"${DIR_OUTPUT}/experiment.${SLURM_JOB_ID}.err.log"; echo "[slurm_runner] EXIT_CODE: $?" >>"${DIR_OUTPUT}/experiment.${SLURM_JOB_ID}.out.log";'; +if [[ -n "${USE_CONDA}" ]]; then + /usr/bin/env time -v conda run -n py38 bash -c 'src/deeplabv3_plus_test_rainfall.py >>"${DIR_OUTPUT}/experiment.${SLURM_JOB_ID}.out.log" 2>>"${DIR_OUTPUT}/experiment.${SLURM_JOB_ID}.err.log"; echo "[slurm_runner] EXIT_CODE: $?" >>"${DIR_OUTPUT}/experiment.${SLURM_JOB_ID}.out.log";'; +else + /usr/bin/env time -v src/deeplabv3_plus_test_rainfall.py >>"${DIR_OUTPUT}/experiment.${SLURM_JOB_ID}.out.log" 2>>"${DIR_OUTPUT}/experiment.${SLURM_JOB_ID}.err.log"; + echo "[slurm_runner] EXIT_CODE: $?" >>"${DIR_OUTPUT}/experiment.${SLURM_JOB_ID}.out.log"; +fi echo ">>> exited with code $?"; From d0dc1b42802062edd03d2bbc7e08d89fa6262f2d Mon Sep 17 00:00:00 2001 From: Starbeamrainbowlabs Date: Fri, 8 Nov 2024 22:05:13 +0000 Subject: [PATCH 2/2] .gitignore: ignore some more backup files: draw.io, .bak.png --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index d8e9669..76571e2 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,9 @@ *.out *.log output +*.drawio.bkp +*.bak.png + # Created by https://www.toptal.com/developers/gitignore/api/python,node,git,visualstudiocode # Edit at https://www.toptal.com/developers/gitignore?templates=python,node,git,visualstudiocode