diff --git a/aimodel/slurm-TEST-deeplabv3p-rainfall.job b/aimodel/slurm-TEST-deeplabv3p-rainfall.job index 51298ea..f794e60 100755 --- a/aimodel/slurm-TEST-deeplabv3p-rainfall.job +++ b/aimodel/slurm-TEST-deeplabv3p-rainfall.job @@ -1,7 +1,7 @@ #!/usr/bin/env bash #SBATCH -J DeepRain #SBATCH -N 1 -#SBATCH -n 14 +#SBATCH -n 9 #SBATCH --gres=gpu:1 #SBATCH -o %j.%N.%a.deeplab-rainfall.out.log #SBATCH -e %j.%N.%a.deeplab-rainfall.err.log @@ -21,7 +21,21 @@ command_exists() { ####################################################### -if command_exists module; then +if [[ "${SLURM_CLUSTER_NAME}" == "cs-cluster" ]]; then + echo "[slurm_runner] csgpu cluster detected, applying CUDA workarounds" >&2; + # Fix "Could not load library libcublasLt.so.12. Error: libcublasLt.so.12: cannot open shared object file: No such file or directory" error + if [[ -d "${HOME}/cuda" ]]; then + echo "[slurm_runner] sourcing extra CUDA setup script" >&2; + #shellcheck source=/dev/null + source "${HOME}/cuda/activate.sh"; + fi + + export XLA_FLAGS="--xla_gpu_cuda_data_dir=/usr/lib/cuda"; # weird... this wasn't needed before? Fixes + echo "[slurm_runner] set XLA_FLAGS=\"${XLA_FLAGS}\"" >&2; +fi + +# No modules on the CS cluster +if command_exists module && [[ "${SLURM_CLUSTER_NAME}" != "cs-cluster" ]]; then module load utilities/multi module load readline/7.0 module load gcc/10.2.0 @@ -30,7 +44,7 @@ if command_exists module; then module load python/anaconda/4.6/miniconda/3.7 else - echo "[slurm_runner]: module command not present, not loading modules" >&2; + echo "[slurm_runner]: module command not present or csgpu cluster detected, not loading modules" >&2; fi @@ -74,7 +88,7 @@ show_help() { exit; } -DIR_RAINFALLWATER="${DIR_RAINFALLWATER:-$HOME/rainfallwater_records_tfrecord}"; +DIR_RAINFALLWATER="${DIR_RAINFALLWATER:-$HOME/data/rainfallwater_records_tfrecord}"; PATH_HEIGHTMAP="${PATH_HEIGHTMAP:-$HOME/data/terrain50-nimrodsized.json.gz}"; PATH_COLOURMAP="${PATH_COLOURMAP:-$HOME/data/instance-level-human-parsing/instance-level_human_parsing/human_colormap.mat}"; @@ -92,6 +106,9 @@ mkdir -p "${DIR_OUTPUT}"; echo -e ">>> NOW: $(date)"; echo -e ">>> DIR_OUTPUT: ${DIR_OUTPUT}"; echo -e ">>> Additional args: ${ARGS}"; +echo -e ">>> GIT COMMIT: $(git rev-parse HEAD)"; + +{ echo "*****"; git rev-parse HEAD; git status; git log -1 | cat; } >>"${DIR_OUTPUT}/commit.txt"; export PATH=$HOME/software/bin:$PATH; export IMAGE_SIZE BATCH_SIZE DIR_RAINFALLWATER PATH_HEIGHTMAP PATH_COLOURMAP STEPS_PER_EPOCH DIR_OUTPUT PATH_CHECKPOINT EPOCHS PREDICT_COUNT NO_REMOVE_ISOLATED_PIXELS LOSS LEARNING_RATE DICE_LOG_COSH WATER_THRESHOLD UPSAMPLE STEPS_PER_EXECUTION JIT_COMPILE RANDSEED PREDICT_AS_ONE SPLIT_VALIDATE SPLIT_TEST; diff --git a/aimodel/src/deeplabv3_plus_test_rainfall.py b/aimodel/src/deeplabv3_plus_test_rainfall.py index 8e0f3cb..5ba6640 100755 --- a/aimodel/src/deeplabv3_plus_test_rainfall.py +++ b/aimodel/src/deeplabv3_plus_test_rainfall.py @@ -20,7 +20,7 @@ import matplotlib.pyplot as plt import tensorflow as tf -import lib.primitives.env +import lib.primitives.env as env from lib.dataset.dataset_mono import dataset_mono, dataset_mono_predict from lib.ai.components.LossCrossEntropyDice import LossCrossEntropyDice from lib.ai.components.MetricDice import metric_dice_coefficient as dice_coefficient @@ -40,7 +40,7 @@ logger.info(f"Starting at {str(datetime.now().isoformat())}") # ███████ ██ ████ ████ ██ ██ ██ ██████ ██ ████ ██ ██ ███████ ██ ████ ██ IMAGE_SIZE = env.read("IMAGE_SIZE", int, 128) # was 512; 128 is the highest power of 2 that fits the data -BATCH_SIZE = env.read("BATCH_SIZE", int, 64) +BATCH_SIZE = env.read("BATCH_SIZE", int, 32) NUM_CLASSES = 2 DIR_RAINFALLWATER = env.read("DIR_RAINFALLWATER", str) PATH_HEIGHTMAP = env.read("PATH_HEIGHTMAP", str) @@ -48,10 +48,10 @@ PATH_COLOURMAP = env.read("PATH_COLOURMAP", str) PARALLEL_READS = env.read("PARALLEL_READS", float, 1.5) STEPS_PER_EPOCH = env.read("STEPS_PER_EPOCH", int, None) REMOVE_ISOLATED_PIXELS = env.read("NO_REMOVE_ISOLATED_PIXELS", bool, True) -EPOCHS = env.read("EPOCHS", int, 50) +EPOCHS = env.read("EPOCHS", int, 25) LOSS = env.read("LOSS", str, "cross-entropy-dice") # other possible values: cross-entropy DICE_LOG_COSH = env.read("DICE_LOG_COSH", bool, False) -LEARNING_RATE = env.read("LEARNING_RATE", float, 0.001) +LEARNING_RATE = env.read("LEARNING_RATE", float, 0.00001) WATER_THRESHOLD = env.read("WATER_THRESHOLD", float, 0.1) UPSAMPLE = env.read("UPSAMPLE", int, 2) SPLIT_VALIDATE = env.read("SPLIT_VALIDATE", float, 0.2) @@ -59,7 +59,7 @@ SPLIT_TEST = env.read("SPLIT_TEST", float, 0) # NOTE: RANDSEED is declared and handled in src/lib/dataset/primitives/shuffle.py STEPS_PER_EXECUTION = env.read("STEPS_PER_EXECUTION", int, 1) -JIT_COMPILE = env.read("JIT_COMPILE", bool, False) +JIT_COMPILE = env.read("JIT_COMPILE", bool, True) DIR_OUTPUT = env.read("DIR_OUTPUT", str, f"output/{datetime.utcnow().date().isoformat()}_deeplabv3plus_rainfall_TEST") PATH_CHECKPOINT = env.read("PATH_CHECKPOINT", str, None) PREDICT_COUNT = env.read("PREDICT_COUNT", int, 25) @@ -95,7 +95,7 @@ if not PREDICT_AS_ONE: do_remove_isolated_pixels=REMOVE_ISOLATED_PIXELS, parallel_reads_multiplier=PARALLEL_READS, percentage_validate=SPLIT_VALIDATE, - percentage_test=SPLIT_TESTs + percentage_test=SPLIT_TEST ) logger.info("Train Dataset:", dataset_train) diff --git a/aimodel/src/lib/dataset/dataset_mono.py b/aimodel/src/lib/dataset/dataset_mono.py index 7e5e8b3..1a428be 100644 --- a/aimodel/src/lib/dataset/dataset_mono.py +++ b/aimodel/src/lib/dataset/dataset_mono.py @@ -165,7 +165,7 @@ def dataset_mono(dirpath_input, percentage_validate=0.2, percentage_test=0, **kw filepaths_count = len(filepaths) split_trainvalidate=math.floor(filepaths_count * (1-(percentage_validate+percentage_test))) - split_validatetest=math.floor(filepaths * (1 - percentage_test)) + split_validatetest=math.floor(filepaths_count * (1 - percentage_test)) filepaths_train = filepaths[:split_trainvalidate] diff --git a/aimodel/src/lib/primitives/env.py b/aimodel/src/lib/primitives/env.py index 7ab0c3a..c90928e 100644 --- a/aimodel/src/lib/primitives/env.py +++ b/aimodel/src/lib/primitives/env.py @@ -8,6 +8,7 @@ import os ### ## Changelog: +# 2024-11-14: Fix crash on line #107 unterminated string literal # 2024-09-29: Create this changelog, prepare for reuse ############################################################################## @@ -104,8 +105,7 @@ def print_all(table=True): # Create the table format string - format_string = f"| {{:<{width_name}}} | {{:<{ - width_type}}} | {{:<{width_value}}} | {{:<{width_flags}}} |" + format_string = f"| {{:<{width_name}}} | {{:<{width_type}}} | {{:<{width_value}}} | {{:<{width_flags}}} |" # Calculate total width total_width = width_name + width_type + width_value + \