From 4ac7082754076bad75529cc67f66a1c74ecb5bcd Mon Sep 17 00:00:00 2001 From: Starbeamrainbowlabs Date: Thu, 14 Nov 2024 21:17:53 +0000 Subject: [PATCH 01/12] write commit info to file in DIR_OUTPUT --- aimodel/slurm-TEST-deeplabv3p-rainfall.job | 3 +++ aimodel/src/deeplabv3_plus_test_rainfall.py | 8 ++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/aimodel/slurm-TEST-deeplabv3p-rainfall.job b/aimodel/slurm-TEST-deeplabv3p-rainfall.job index 51298ea..7f5ba81 100755 --- a/aimodel/slurm-TEST-deeplabv3p-rainfall.job +++ b/aimodel/slurm-TEST-deeplabv3p-rainfall.job @@ -92,6 +92,9 @@ mkdir -p "${DIR_OUTPUT}"; echo -e ">>> NOW: $(date)"; echo -e ">>> DIR_OUTPUT: ${DIR_OUTPUT}"; echo -e ">>> Additional args: ${ARGS}"; +echo -e ">>> GIT COMMIT: $(git rev-parse HEAD)"; + +{ echo "*****"; git rev-parse HEAD; git status; git log -1 | cat; } >>"${DIR_OUTPUT}/commit.txt"; export PATH=$HOME/software/bin:$PATH; export IMAGE_SIZE BATCH_SIZE DIR_RAINFALLWATER PATH_HEIGHTMAP PATH_COLOURMAP STEPS_PER_EPOCH DIR_OUTPUT PATH_CHECKPOINT EPOCHS PREDICT_COUNT NO_REMOVE_ISOLATED_PIXELS LOSS LEARNING_RATE DICE_LOG_COSH WATER_THRESHOLD UPSAMPLE STEPS_PER_EXECUTION JIT_COMPILE RANDSEED PREDICT_AS_ONE SPLIT_VALIDATE SPLIT_TEST; diff --git a/aimodel/src/deeplabv3_plus_test_rainfall.py b/aimodel/src/deeplabv3_plus_test_rainfall.py index eeb472c..ff9d394 100755 --- a/aimodel/src/deeplabv3_plus_test_rainfall.py +++ b/aimodel/src/deeplabv3_plus_test_rainfall.py @@ -40,7 +40,7 @@ logger.info(f"Starting at {str(datetime.now().isoformat())}") # ███████ ██ ████ ████ ██ ██ ██ ██████ ██ ████ ██ ██ ███████ ██ ████ ██ IMAGE_SIZE = env.read("IMAGE_SIZE", int, 128) # was 512; 128 is the highest power of 2 that fits the data -BATCH_SIZE = env.read("BATCH_SIZE", int, 64) +BATCH_SIZE = env.read("BATCH_SIZE", int, 32) NUM_CLASSES = 2 DIR_RAINFALLWATER = env.read("DIR_RAINFALLWATER", str) PATH_HEIGHTMAP = env.read("PATH_HEIGHTMAP", str) @@ -48,10 +48,10 @@ PATH_COLOURMAP = 
env.read("PATH_COLOURMAP", str) PARALLEL_READS = env.read("PARALLEL_READS", float, 1.5) STEPS_PER_EPOCH = env.read("STEPS_PER_EPOCH", int, None) REMOVE_ISOLATED_PIXELS = env.read("NO_REMOVE_ISOLATED_PIXELS", bool, True) -EPOCHS = env.read("EPOCHS", int, 50) +EPOCHS = env.read("EPOCHS", int, 25) LOSS = env.read("LOSS", str, "cross-entropy-dice") # other possible values: cross-entropy DICE_LOG_COSH = env.read("DICE_LOG_COSH", bool, False) -LEARNING_RATE = env.read("LEARNING_RATE", float, 0.001) +LEARNING_RATE = env.read("LEARNING_RATE", float, 0.00001) WATER_THRESHOLD = env.read("WATER_THRESHOLD", float, 0.1) UPSAMPLE = env.read("UPSAMPLE", int, 2) SPLIT_VALIDATE = env.read("SPLIT_VALIDATE", float, 0.2) @@ -59,7 +59,7 @@ SPLIT_TEST = env.read("SPLIT_TEST", float, 0) # NOTE: RANDSEED is declared and handled in src/lib/dataset/primitives/shuffle.py STEPS_PER_EXECUTION = env.read("STEPS_PER_EXECUTION", int, 1) -JIT_COMPILE = env.read("JIT_COMPILE", bool, False) +JIT_COMPILE = env.read("JIT_COMPILE", bool, True) DIR_OUTPUT = env.read("DIR_OUTPUT", str, f"output/{datetime.utcnow().date().isoformat()}_deeplabv3plus_rainfall_TEST") PATH_CHECKPOINT = env.read("PATH_CHECKPOINT", str, None) PREDICT_COUNT = env.read("PREDICT_COUNT", int, 25) From bd2c6b1c3f2db052d3b56b58ae561b96052c0a07 Mon Sep 17 00:00:00 2001 From: Starbeamrainbowlabs Date: Thu, 14 Nov 2024 21:34:04 +0000 Subject: [PATCH 02/12] slurm/dlr: don't run module load ....
on csgpu cluster --- aimodel/slurm-TEST-deeplabv3p-rainfall.job | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/aimodel/slurm-TEST-deeplabv3p-rainfall.job b/aimodel/slurm-TEST-deeplabv3p-rainfall.job index 7f5ba81..3e1559c 100755 --- a/aimodel/slurm-TEST-deeplabv3p-rainfall.job +++ b/aimodel/slurm-TEST-deeplabv3p-rainfall.job @@ -21,7 +21,8 @@ command_exists() { ####################################################### -if command_exists module; then +# No modules on the CS cluster +if command_exists module && [[ "${SLURM_CLUSTER_NAME}" != "cs-cluster" ]]; then module load utilities/multi module load readline/7.0 module load gcc/10.2.0 From 2b69d2c4f274a414ded73d629384ebd2e401e581 Mon Sep 17 00:00:00 2001 From: Starbeamrainbowlabs Date: Thu, 14 Nov 2024 21:37:11 +0000 Subject: [PATCH 03/12] slurm/dlr: correct logging msg --- aimodel/slurm-TEST-deeplabv3p-rainfall.job | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aimodel/slurm-TEST-deeplabv3p-rainfall.job b/aimodel/slurm-TEST-deeplabv3p-rainfall.job index 3e1559c..e0842f5 100755 --- a/aimodel/slurm-TEST-deeplabv3p-rainfall.job +++ b/aimodel/slurm-TEST-deeplabv3p-rainfall.job @@ -31,7 +31,7 @@ if command_exists module && [[ "${SLURM_CLUSTER_NAME}" != "cs-cluster" ]]; then module load python/anaconda/4.6/miniconda/3.7 else - echo "[slurm_runner]: module command not present, not loading modules" >&2; + echo "[slurm_runner]: module command not present or csgpu cluster detected, not loading modules" >&2; fi From e83d90177993877c0ca5a9996e8eda26c45935a7 Mon Sep 17 00:00:00 2001 From: Starbeamrainbowlabs Date: Thu, 14 Nov 2024 21:49:03 +0000 Subject: [PATCH 04/12] env.py: fix crash --- aimodel/src/lib/primitives/env.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aimodel/src/lib/primitives/env.py b/aimodel/src/lib/primitives/env.py index 7ab0c3a..c90928e 100644 --- a/aimodel/src/lib/primitives/env.py +++ b/aimodel/src/lib/primitives/env.py @@ -8,6 +8,7 @@ 
import os ### ## Changelog: +# 2024-11-14: Fix crash on line #107 unterminated string literal # 2024-09-29: Create this changelog, prepare for reuse ############################################################################## @@ -104,8 +105,7 @@ def print_all(table=True): # Create the table format string - format_string = f"| {{:<{width_name}}} | {{:<{ - width_type}}} | {{:<{width_value}}} | {{:<{width_flags}}} |" + format_string = f"| {{:<{width_name}}} | {{:<{width_type}}} | {{:<{width_value}}} | {{:<{width_flags}}} |" # Calculate total width total_width = width_name + width_type + width_value + \ From a7ab5ee341e241abf418757b832631bb5d92f861 Mon Sep 17 00:00:00 2001 From: Starbeamrainbowlabs Date: Thu, 14 Nov 2024 21:49:54 +0000 Subject: [PATCH 05/12] =?UTF-8?q?slurm/dlr:=20cpu=20cores=2014=E2=86=929?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- aimodel/slurm-TEST-deeplabv3p-rainfall.job | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aimodel/slurm-TEST-deeplabv3p-rainfall.job b/aimodel/slurm-TEST-deeplabv3p-rainfall.job index e0842f5..d40dfb6 100755 --- a/aimodel/slurm-TEST-deeplabv3p-rainfall.job +++ b/aimodel/slurm-TEST-deeplabv3p-rainfall.job @@ -1,7 +1,7 @@ #!/usr/bin/env bash #SBATCH -J DeepRain #SBATCH -N 1 -#SBATCH -n 14 +#SBATCH -n 9 #SBATCH --gres=gpu:1 #SBATCH -o %j.%N.%a.deeplab-rainfall.out.log #SBATCH -e %j.%N.%a.deeplab-rainfall.err.log From 7be22222078ee4d3c2c15f931fd42ff6b7f12bb8 Mon Sep 17 00:00:00 2001 From: Starbeamrainbowlabs Date: Thu, 14 Nov 2024 21:52:41 +0000 Subject: [PATCH 06/12] dlr: import ...as env --- aimodel/src/deeplabv3_plus_test_rainfall.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aimodel/src/deeplabv3_plus_test_rainfall.py b/aimodel/src/deeplabv3_plus_test_rainfall.py index ff9d394..54d7bfb 100755 --- a/aimodel/src/deeplabv3_plus_test_rainfall.py +++ b/aimodel/src/deeplabv3_plus_test_rainfall.py @@ -20,7 +20,7 @@ import 
matplotlib.pyplot as plt import tensorflow as tf -import lib.primitives.env +import lib.primitives.env as env from lib.dataset.dataset_mono import dataset_mono, dataset_mono_predict from lib.ai.components.LossCrossEntropyDice import LossCrossEntropyDice from lib.ai.components.MetricDice import metric_dice_coefficient as dice_coefficient From 090ab784571b1b07a60a1a93e747c377c86292e1 Mon Sep 17 00:00:00 2001 From: Starbeamrainbowlabs Date: Thu, 14 Nov 2024 21:53:59 +0000 Subject: [PATCH 07/12] dlr: fix (another) crash --- aimodel/src/deeplabv3_plus_test_rainfall.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aimodel/src/deeplabv3_plus_test_rainfall.py b/aimodel/src/deeplabv3_plus_test_rainfall.py index 54d7bfb..926ca84 100755 --- a/aimodel/src/deeplabv3_plus_test_rainfall.py +++ b/aimodel/src/deeplabv3_plus_test_rainfall.py @@ -95,7 +95,7 @@ if not PREDICT_AS_ONE: do_remove_isolated_pixels=REMOVE_ISOLATED_PIXELS, parallel_reads_multiplier=PARALLEL_READS, percentage_validate=SPLIT_VALIDATE, - percentage_test=SPLIT_TESTs + percentage_test=SPLIT_TEST ) logger.info("Train Dataset:", dataset_train) From 159f8a4679e7506f78620f54eded8a8c780f23cf Mon Sep 17 00:00:00 2001 From: Starbeamrainbowlabs Date: Thu, 14 Nov 2024 21:59:32 +0000 Subject: [PATCH 08/12] =?UTF-8?q?slurm/dlr:=20DIR=5FRAINFALLWATER=20defaul?= =?UTF-8?q?t=20=E2=86=92=20~/data/....?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- aimodel/slurm-TEST-deeplabv3p-rainfall.job | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aimodel/slurm-TEST-deeplabv3p-rainfall.job b/aimodel/slurm-TEST-deeplabv3p-rainfall.job index d40dfb6..52bedf6 100755 --- a/aimodel/slurm-TEST-deeplabv3p-rainfall.job +++ b/aimodel/slurm-TEST-deeplabv3p-rainfall.job @@ -75,7 +75,7 @@ show_help() { exit; } -DIR_RAINFALLWATER="${DIR_RAINFALLWATER:-$HOME/rainfallwater_records_tfrecord}"; 
+DIR_RAINFALLWATER="${DIR_RAINFALLWATER:-$HOME/data/rainfallwater_records_tfrecord}"; PATH_HEIGHTMAP="${PATH_HEIGHTMAP:-$HOME/data/terrain50-nimrodsized.json.gz}"; PATH_COLOURMAP="${PATH_COLOURMAP:-$HOME/data/instance-level-human-parsing/instance-level_human_parsing/human_colormap.mat}"; From 52af6f00ec757d0271fdff320837abfe12ba6402 Mon Sep 17 00:00:00 2001 From: Starbeamrainbowlabs Date: Thu, 14 Nov 2024 22:03:22 +0000 Subject: [PATCH 09/12] dlr/dataset_mono: fix crash in new ssplit3 setup --- aimodel/src/lib/dataset/dataset_mono.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aimodel/src/lib/dataset/dataset_mono.py b/aimodel/src/lib/dataset/dataset_mono.py index 7e5e8b3..1a428be 100644 --- a/aimodel/src/lib/dataset/dataset_mono.py +++ b/aimodel/src/lib/dataset/dataset_mono.py @@ -165,7 +165,7 @@ def dataset_mono(dirpath_input, percentage_validate=0.2, percentage_test=0, **kw filepaths_count = len(filepaths) split_trainvalidate=math.floor(filepaths_count * (1-(percentage_validate+percentage_test))) - split_validatetest=math.floor(filepaths * (1 - percentage_test)) + split_validatetest=math.floor(filepaths_count * (1 - percentage_test)) filepaths_train = filepaths[:split_trainvalidate] From 17d2d2bcaf63405b10997e65d3f0b2c7bcabbbbe Mon Sep 17 00:00:00 2001 From: Starbeamrainbowlabs Date: Thu, 14 Nov 2024 22:26:16 +0000 Subject: [PATCH 10/12] slurm/dlr: tensorflow is dumb Workaround for this crash on Tensorflow 2.13: Could not load library libcublasLt.so.12. 
Error: libcublasLt.so.12: cannot open shared object file: No such file or directory --- aimodel/slurm-TEST-deeplabv3p-rainfall.job | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/aimodel/slurm-TEST-deeplabv3p-rainfall.job b/aimodel/slurm-TEST-deeplabv3p-rainfall.job index 52bedf6..7dafb48 100755 --- a/aimodel/slurm-TEST-deeplabv3p-rainfall.job +++ b/aimodel/slurm-TEST-deeplabv3p-rainfall.job @@ -21,6 +21,13 @@ command_exists() { ####################################################### +# Fix "Could not load library libcublasLt.so.12. Error: libcublasLt.so.12: cannot open shared object file: No such file or directory" error +if [[ "${SLURM_CLUSTER_NAME}" != "cs-cluster" ]] && [[ -d "${HOME}/cuda" ]]; then + echo "[slurm_runner] csgpu cluster detected, sourcing extra CUDA setup script" >&2; + #shellcheck source=/dev/null + source "${HOME}/cuda/activate.sh"; +fi + # No modules on the CS cluster if command_exists module && [[ "${SLURM_CLUSTER_NAME}" != "cs-cluster" ]]; then module load utilities/multi From 7c4f3d325d65dbfab29185ced93ff096dd1bf374 Mon Sep 17 00:00:00 2001 From: Starbeamrainbowlabs Date: Thu, 14 Nov 2024 22:28:39 +0000 Subject: [PATCH 11/12] slurm/dlr: fix workaround logic --- aimodel/slurm-TEST-deeplabv3p-rainfall.job | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aimodel/slurm-TEST-deeplabv3p-rainfall.job b/aimodel/slurm-TEST-deeplabv3p-rainfall.job index 7dafb48..103ce79 100755 --- a/aimodel/slurm-TEST-deeplabv3p-rainfall.job +++ b/aimodel/slurm-TEST-deeplabv3p-rainfall.job @@ -22,7 +22,7 @@ command_exists() { ####################################################### # Fix "Could not load library libcublasLt.so.12. 
Error: libcublasLt.so.12: cannot open shared object file: No such file or directory" error -if [[ "${SLURM_CLUSTER_NAME}" != "cs-cluster" ]] && [[ -d "${HOME}/cuda" ]]; then +if [[ "${SLURM_CLUSTER_NAME}" == "cs-cluster" ]] && [[ -d "${HOME}/cuda" ]]; then echo "[slurm_runner] csgpu cluster detected, sourcing extra CUDA setup script" >&2; #shellcheck source=/dev/null source "${HOME}/cuda/activate.sh"; From fe374560a1cf4c6b12e741d907319eb46673def3 Mon Sep 17 00:00:00 2001 From: Starbeamrainbowlabs Date: Thu, 14 Nov 2024 22:38:27 +0000 Subject: [PATCH 12/12] I *hate* Tensorflow SO MUCH...... --- aimodel/slurm-TEST-deeplabv3p-rainfall.job | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/aimodel/slurm-TEST-deeplabv3p-rainfall.job b/aimodel/slurm-TEST-deeplabv3p-rainfall.job index 103ce79..f794e60 100755 --- a/aimodel/slurm-TEST-deeplabv3p-rainfall.job +++ b/aimodel/slurm-TEST-deeplabv3p-rainfall.job @@ -21,11 +21,17 @@ command_exists() { ####################################################### -# Fix "Could not load library libcublasLt.so.12. Error: libcublasLt.so.12: cannot open shared object file: No such file or directory" error -if [[ "${SLURM_CLUSTER_NAME}" == "cs-cluster" ]] && [[ -d "${HOME}/cuda" ]]; then - echo "[slurm_runner] csgpu cluster detected, sourcing extra CUDA setup script" >&2; - #shellcheck source=/dev/null - source "${HOME}/cuda/activate.sh"; +if [[ "${SLURM_CLUSTER_NAME}" == "cs-cluster" ]]; then + echo "[slurm_runner] csgpu cluster detected, applying CUDA workarounds" >&2; + # Fix "Could not load library libcublasLt.so.12. Error: libcublasLt.so.12: cannot open shared object file: No such file or directory" error + if [[ -d "${HOME}/cuda" ]]; then + echo "[slurm_runner] sourcing extra CUDA setup script" >&2; + #shellcheck source=/dev/null + source "${HOME}/cuda/activate.sh"; + fi + + export XLA_FLAGS="--xla_gpu_cuda_data_dir=/usr/lib/cuda"; # weird... this wasn't needed before? 
Fixes + echo "[slurm_runner] set XLA_FLAGS=\"${XLA_FLAGS}\"" >&2; fi # No modules on the CS cluster