research-rainfallradar/aimodel/slurm-TEST-deeplabv3p-rainfall.job

#!/usr/bin/env bash
#SBATCH -J DeepRain
#SBATCH -N 1
#SBATCH -n 14
#SBATCH --gres=gpu:1
#SBATCH -o %j.%N.%a.deeplab-rainfall.out.log
#SBATCH -e %j.%N.%a.deeplab-rainfall.err.log
#SBATCH -p gpu
#SBATCH --no-requeue
#SBATCH --time=5-00:00:00
#SBATCH --mem=30000
# ---> in MiB
# no-requeue: ref https://support.hull.ac.uk/tas/public/ssp/content/detail/incident?unid=652db7ac6e73485c9f7658db78b2b628

# Load the toolchain modules one at a time, in the listed order.
for module_name in utilities/multi readline/7.0 gcc/10.2.0 cuda/11.5.0; do
	module load "${module_name}";
done
unset module_name;

# Point XLA at the CUDA 11.5.0 install directory.
# weird... this wasn't needed before?
XLA_FLAGS="--xla_gpu_cuda_data_dir=/home/ViperAppsFiles/cuda/11.5.0";
export XLA_FLAGS;

# Provides the `conda` command used further down to run pip and the experiment.
module load python/anaconda/4.6/miniconda/3.7


#######################################
# Prints the usage text for this job script and terminates the shell.
# Arguments: none
# Outputs:   the help text, written to stderr (nothing goes to stdout)
# Returns:   does not return; calls `exit` with no explicit status, so the
#            shell exits with the status of the preceding command.
#######################################
show_help() {
	# Quoted delimiter: the text below is emitted verbatim (no expansion).
	cat >&2 <<'HELP_TEXT'
Trains a TEST DeepLabv3+ model using rainfall radar and water depth data.

Usage:
    sbatch slurm-TEST-deeplabv3p-rainfall.job

....where:
    IMAGE_SIZE=128		Optional. Sets the size of the 'images' that the DeepLabV3+ model will work with.
    BATCH_SIZE=64		Optional. Sets the batch size to train the model with.
    DIR_RAINFALLWATER	The path to the directory the .tfrecord files containing the rainfall radar / water depth data.
    PATH_HEIGHTMAP 	The path to the heightmap jsonl file to read in.
    PATH_COLOURMAP 	The path to the colourmap for predictive purposes.
    PARALLEL_READS		Multiplier for the number of files to read in parallel. 1 = number of CPU cores available. Very useful on high-read-latency systems (e.g. HPC like Viper) to avoid starving the GPU of data. WILL MANGLE THE ORDERING OF DATA. Set to 0 to disable and read data sequentially. WILL ONLY NOT MANGLE DATA IF PREDICT_AS_ONE IS SET. Defaults to 1.5.
    STEPS_PER_EPOCH	The number of steps to consider an epoch. Defaults to None, which means use the entire dataset.
    NO_REMOVE_ISOLATED_PIXELS	Set to any value to avoid the engine from removing isolated pixels - that is, water pixels with no other surrounding pixels, either side to side to diagonally.
    EPOCHS				The number of epochs to train for.
    LOSS               The loss function to use. Default: cross-entropy (possible values: cross-entropy, cross-entropy-dice).
    DICE_LOG_COSH      When in cross-entropy-dice mode, in addition do loss = cel + log(cosh(dice_loss)) instead of just loss = cel + dice_loss.
    WATER_THRESHOLD    The threshold to cut water off at when training, in metres. Default: 0.1
    PATH_CHECKPOINT 	The path to a checkpoint to load. If specified, a model will be loaded instead of being trained.
    LEARNING_RATE      The learning rate to use. Default: 0.001.
    UPSAMPLE           How much to upsample by at the beginning of the model. A value of disables upscaling. Default: 2.
    SPLIT_VALIDATE     Percentage of the available files in the dataset to be allocated to the validation split. Default: 0.2
    SPLIT_TEST         Percentage of the available files in the dataset to be allocated to the test split. Default: 0.2
    STEPS_PER_EXECUTION How many steps to perform before surfacing from the GPU to e.g. do callbacks. Default: 16.
    RANDSEED           The random seed to use when shuffling filepaths. Default: unset, which means use a random value.
    JIT_COMPILE        Set to any value to compile the model with XLA.
    PREDICT_COUNT		The number of items from the (SCRAMBLED) dataset to make a prediction for.
    PREDICT_AS_ONE		[prediction only] Set to any value to avoid splitting the input dataset into training/validation and instead treat it as a single dataset. Default: False (treat it as training/validation)
    POSTFIX			Postfix to append to the output dir (auto calculated).
    ARGS				Optional. Any additional arguments to pass to the python program.

It is strongly advised that all filepaths do NOT contain spaces.

The code used to identify the run is taken automatically from the filename of the config file.
HELP_TEXT
	exit;
}

# ------------------------------------------------------------------------------
# Job body: resolve input/output locations, export the settings the python
# program reads from its environment, install requirements, then run the
# experiment inside the py38 conda environment.
#
# The script's exit status is the experiment's exit status, so a failed run is
# not reported to the scheduler as a success.
# ------------------------------------------------------------------------------

# Input locations. Each keeps the caller's value when set, else falls back to
# a default underneath $HOME.
DIR_RAINFALLWATER="${DIR_RAINFALLWATER:-$HOME/rainfallwater_records_tfrecord}";
PATH_HEIGHTMAP="${PATH_HEIGHTMAP:-$HOME/data/terrain50-nimrodsized.json.gz}";
PATH_COLOURMAP="${PATH_COLOURMAP:-$HOME/data/instance-level-human-parsing/instance-level_human_parsing/human_colormap.mat}";

# Run identifier; forms the tail of the default output directory name.
CODE="deeplabv3+_rainfall";

if [[ -n "${POSTFIX}" ]]; then
	echo -e ">>> Applying postfix of ${POSTFIX}" >&2;
	CODE="${CODE}_${POSTFIX}";
fi

# Default output directory: output/<UTC date>_<CODE>, unless DIR_OUTPUT is preset.
DIR_OUTPUT="${DIR_OUTPUT:-output/$(date -u --rfc-3339=date)_${CODE}}";

# The experiment's log redirections below need this directory, so a failure
# here is fatal rather than silently producing a run with no logs.
if ! mkdir -p "${DIR_OUTPUT}"; then
	echo ">>> ERROR: unable to create output directory ${DIR_OUTPUT}" >&2;
	exit 1;
fi

echo -e ">>> NOW: $(date)";
echo -e ">>> DIR_OUTPUT: ${DIR_OUTPUT}";
echo -e ">>> Additional args: ${ARGS}";

export PATH=$HOME/software/bin:$PATH;
# Settings handed to the child processes via the environment. PARALLEL_READS is
# documented in show_help and was missing from this list. ARGS is exported so
# the inner shell below can forward it to the python program.
export IMAGE_SIZE BATCH_SIZE DIR_RAINFALLWATER PATH_HEIGHTMAP PATH_COLOURMAP STEPS_PER_EPOCH DIR_OUTPUT PATH_CHECKPOINT EPOCHS PREDICT_COUNT NO_REMOVE_ISOLATED_PIXELS LOSS LEARNING_RATE DICE_LOG_COSH WATER_THRESHOLD UPSAMPLE STEPS_PER_EXECUTION JIT_COMPILE RANDSEED PREDICT_AS_ONE SPLIT_VALIDATE SPLIT_TEST PARALLEL_READS ARGS;

echo ">>> Installing requirements";
# Best-effort: a failure is reported but does not abort the job, because the
# py38 environment may already contain everything that is needed.
if ! conda run -n py38 pip install -q -r requirements.txt; then
	echo ">>> WARNING: installing requirements failed; continuing with the existing py38 environment" >&2;
fi

echo ">>> Training model";
# The inner command is single-quoted on purpose: its variables are expanded by
# the inner bash, not by this script. Inside it:
#   - ${ARGS} is intentionally unquoted so it word-splits into separate
#     arguments for the python program (show_help documents ARGS as such);
#   - stdout/stderr are appended (>>), not truncated, to per-job log files;
#   - the python exit status is saved before the logging echo overwrites $?,
#     and the inner shell exits with it.
# NOTE(review): assumes `time` and `conda run` pass the wrapped command's exit
# status through unchanged - confirm for the conda version on the cluster.
#shellcheck disable=SC2016
/usr/bin/env time -v conda run -n py38 bash -c 'src/deeplabv3_plus_test_rainfall.py ${ARGS} >>"${DIR_OUTPUT}/experiment.${SLURM_JOB_ID}.out.log" 2>>"${DIR_OUTPUT}/experiment.${SLURM_JOB_ID}.err.log"; exit_code=$?; echo "[slurm_runner] EXIT_CODE: ${exit_code}" >>"${DIR_OUTPUT}/experiment.${SLURM_JOB_ID}.out.log"; exit "${exit_code}";';
exit_code=$?;
echo ">>> exited with code ${exit_code}";
# Hand the real status to the scheduler instead of the status of the echo above.
exit "${exit_code}";
work on slurm for deeplabv3+ rainfall, but it's NOT FINISHED YET 2022-12-16 19:52:44 +00:00			`#!/usr/bin/env bash`
			`#SBATCH -J DeepRain`
			`#SBATCH -N 1`
			`#SBATCH -n 14`
			`#SBATCH --gres=gpu:1`
slur dlr: log file names correct 2023-01-05 19:47:51 +00:00			`#SBATCH -o %j.%N.%a.deeplab-rainfall.out.log`
			`#SBATCH -e %j.%N.%a.deeplab-rainfall.err.log`
annoying 2023-01-13 17:00:47 +00:00			`#SBATCH -p gpu`
dlr: add UPSAMPLE env var ...AND actually add the functionality this time! 2023-05-04 16:40:16 +00:00			`#SBATCH --no-requeue`
work on slurm for deeplabv3+ rainfall, but it's NOT FINISHED YET 2022-12-16 19:52:44 +00:00			`#SBATCH --time=5-00:00:00`
			`#SBATCH --mem=30000`
			`# ---> in MiB`
dlr: add no-requeue Ref https://support.hull.ac.uk/tas/public/ssp/content/detail/incident?unid=652db7ac6e73485c9f7658db78b2b628 2023-01-17 18:20:26 +00:00			`# no-requeue: ref https://support.hull.ac.uk/tas/public/ssp/content/detail/incident?unid=652db7ac6e73485c9f7658db78b2b628`
work on slurm for deeplabv3+ rainfall, but it's NOT FINISHED YET 2022-12-16 19:52:44 +00:00
			`module load utilities/multi`
			`module load readline/7.0`
			`module load gcc/10.2.0`
			`module load cuda/11.5.0`
weird, XLA_FLAGS cuda data dir wasn't needed before libdevice not found at ./libdevice.10.bc 2023-02-10 13:28:34 +00:00			`export XLA_FLAGS="--xla_gpu_cuda_data_dir=/home/ViperAppsFiles/cuda/11.5.0"; # weird... this wasn't needed before?`
work on slurm for deeplabv3+ rainfall, but it's NOT FINISHED YET 2022-12-16 19:52:44 +00:00
			`module load python/anaconda/4.6/miniconda/3.7`


			`show_help() {`
			`echo -e "Trains a TEST DeepLabv3+ model using rainfall radar and water depth data." >&2;`
			`echo -e "" >&2;`
			`echo -e "Usage:" >&2;`
			`echo -e " sbatch slurm-TEST-deeplabv3p-rainfall.job" >&2;`
			`echo -e "" >&2;`
			`echo -e "....where:" >&2;`
			`echo -e " IMAGE_SIZE=128 Optional. Sets the size of the 'images' that the DeepLabV3+ model will work with." >&2;`
			`echo -e " BATCH_SIZE=64 Optional. Sets the batch size to train the model with." >&2;`
slurm deeplab rainfall: fix variable naming 2023-01-05 17:08:57 +00:00			`echo -e " DIR_RAINFALLWATER The path to the directory the .tfrecord files containing the rainfall radar / water depth data." >&2;`
work on slurm for deeplabv3+ rainfall, but it's NOT FINISHED YET 2022-12-16 19:52:44 +00:00			`echo -e " PATH_HEIGHTMAP The path to the heightmap jsonl file to read in." >&2;`
			`echo -e " PATH_COLOURMAP The path to the colourmap for predictive purposes." >&2;`
dlr: add PARALLEL_READS env var, update docs 2023-11-30 16:33:22 +00:00			`echo -e " PARALLEL_READS Multiplier for the number of files to read in parallel. 1 = number of CPU cores available. Very useful on high-read-latency systems (e.g. HPC like Viper) to avoid starving the GPU of data. WILL MANGLE THE ORDERING OF DATA. Set to 0 to disable and read data sequentially. WILL ONLY NOT MANGLE DATA IF PREDICT_AS_ONE IS SET. Defaults to 1.5." >&2;`
work on slurm for deeplabv3+ rainfall, but it's NOT FINISHED YET 2022-12-16 19:52:44 +00:00			`echo -e " STEPS_PER_EPOCH The number of steps to consider an epoch. Defaults to None, which means use the entire dataset." >&2;`
dlr: Add support for stripping isolated water pixels That is, water pixels that have no other water pixels immediately adjacent thereto (diagonals count). 2023-01-13 16:57:26 +00:00			`echo -e " NO_REMOVE_ISOLATED_PIXELS Set to any value to avoid the engine from removing isolated pixels - that is, water pixels with no other surrounding pixels, either side to side to diagonally." >&2;`
add moar env vars 2023-01-12 18:54:39 +00:00			`echo -e " EPOCHS The number of epochs to train for." >&2;`
dlr: add cross-entropy + dice loss fn option 2023-01-13 17:58:00 +00:00			`echo -e " LOSS The loss function to use. Default: cross-entropy (possible values: cross-entropy, cross-entropy-dice)." >&2;`
dlr CHANGE: Add optional log(cosh(dice_loss)) Ref https://doi.org/10.1109/cibcb48159.2020.9277638 2023-03-10 20:24:13 +00:00			`echo -e " DICE_LOG_COSH When in cross-entropy-dice mode, in addition do loss = cel + log(cosh(dice_loss)) instead of just loss = cel + dice_loss." >&2;`
dlr: add env var for water thresholding 2023-03-14 20:18:39 +00:00			`echo -e " WATER_THRESHOLD The threshold to cut water off at when training, in metres. Default: 0.1" >&2;`
dlr: add PARALLEL_READS env var, update docs 2023-11-30 16:33:22 +00:00			`echo -e " PATH_CHECKPOINT The path to a checkpoint to load. If specified, a model will be loaded instead of being trained." >&2;`
slurm dlr: shell out in conda; redirect stderr & stdout to disk inside the experiments folder Also, if the job restarts, we still save the previous run's results because we append rather than overwrite 2023-01-16 17:32:22 +00:00			`echo -e " LEARNING_RATE The learning rate to use. Default: 0.001." >&2;`
dlr: add UPSAMPLE env var ...AND actually add the functionality this time! 2023-05-04 16:40:16 +00:00			`echo -e " UPSAMPLE How much to upsample by at the beginning of the model. A value of disables upscaling. Default: 2." >&2;`
Implement initial UNTESTED support for split_validation and split_test 2024-08-29 18:33:40 +00:00			`echo -e " SPLIT_VALIDATE Percentage of the available files in the dataset to be allocated to the validation split. Default: 0.2" >&2;`
			`echo -e " SPLIT_TEST Percentage of the available files in the dataset to be allocated to the test split. Default: 0.2" >&2;`
dlr: add JIT_COMPILE 2023-05-04 17:22:18 +00:00			`echo -e " STEPS_PER_EXECUTION How many steps to perform before surfacing from the GPU to e.g. do callbacks. Default: 16." >&2;`
dlr: add RANDSEED to slurm 2023-05-11 15:02:13 +00:00			`echo -e " RANDSEED The random seed to use when shuffling filepaths. Default: unset, which means use a random value." >&2;`
dlr: add JIT_COMPILE 2023-05-04 17:22:18 +00:00			`echo -e " JIT_COMPILE Set to any value to compile the model with XLA." >&2;`
add moar env vars 2023-01-12 18:54:39 +00:00			`echo -e " PREDICT_COUNT The number of items from the (SCRAMBLED) dataset to make a prediction for." >&2;`
dlr: add PREDICT_AS_ONE 2023-06-16 17:23:40 +00:00			`echo -e " PREDICT_AS_ONE [prediction only] Set to any value to avoid splitting the input dataset into training/validation and instead treat it as a single dataset. Default: False (treat it as training/validation)" >&2;`
work on slurm for deeplabv3+ rainfall, but it's NOT FINISHED YET 2022-12-16 19:52:44 +00:00			`echo -e " POSTFIX Postfix to append to the output dir (auto calculated)." >&2;`
			`echo -e " ARGS Optional. Any additional arguments to pass to the python program." >&2;`
			`echo -e "" >&2;`
			`echo -e "It is strongly advised that all filepaths do NOT contain spaces." >&2;`
			`echo -e "" >&2;`
			`echo -e "The code used to identify the run is taken automatically from the filename of the config file." >&2;`
			`exit;`
			`}`

slurm deeplab rainfall: fix variable naming 2023-01-05 17:08:57 +00:00			`DIR_RAINFALLWATER="${DIR_RAINFALLWATER:-$HOME/rainfallwater_records_tfrecord}";`
work on slurm for deeplabv3+ rainfall, but it's NOT FINISHED YET 2022-12-16 19:52:44 +00:00			`PATH_HEIGHTMAP="${PATH_HEIGHTMAP:-$HOME/data/terrain50-nimrodsized.json.gz}";`
slurm dlr: fix pathing 2023-01-05 19:35:56 +00:00			`PATH_COLOURMAP="${PATH_COLOURMAP:-$HOME/data/instance-level-human-parsing/instance-level_human_parsing/human_colormap.mat}";`
work on slurm for deeplabv3+ rainfall, but it's NOT FINISHED YET 2022-12-16 19:52:44 +00:00
slurm dlr: fix output dir 2023-01-05 19:42:42 +00:00			`CODE="deeplabv3+_rainfall";`
work on slurm for deeplabv3+ rainfall, but it's NOT FINISHED YET 2022-12-16 19:52:44 +00:00
			`if [[ -n "${POSTFIX}" ]]; then`
			`echo -e ">>> Applying postfix of ${POSTFIX}" >&2;`
			`CODE="${CODE}_${POSTFIX}";`
			`fi`

dlr eo: allow setting DIR_OUTPUT directly 2023-03-01 16:54:15 +00:00			`DIR_OUTPUT="${DIR_OUTPUT:-output/$(date -u --rfc-3339=date)_${CODE}}";`
work on slurm for deeplabv3+ rainfall, but it's NOT FINISHED YET 2022-12-16 19:52:44 +00:00
slurm dlr: shell out in conda; redirect stderr & stdout to disk inside the experiments folder Also, if the job restarts, we still save the previous run's results because we append rather than overwrite 2023-01-16 17:32:22 +00:00			`mkdir -p "${DIR_OUTPUT}";`

			`echo -e ">>> NOW: $(date)";`
			`echo -e ">>> DIR_OUTPUT: ${DIR_OUTPUT}";`
work on slurm for deeplabv3+ rainfall, but it's NOT FINISHED YET 2022-12-16 19:52:44 +00:00			`echo -e ">>> Additional args: ${ARGS}";`

			`export PATH=$HOME/software/bin:$PATH;`
Implement initial UNTESTED support for split_validation and split_test 2024-08-29 18:33:40 +00:00			`export IMAGE_SIZE BATCH_SIZE DIR_RAINFALLWATER PATH_HEIGHTMAP PATH_COLOURMAP STEPS_PER_EPOCH DIR_OUTPUT PATH_CHECKPOINT EPOCHS PREDICT_COUNT NO_REMOVE_ISOLATED_PIXELS LOSS LEARNING_RATE DICE_LOG_COSH WATER_THRESHOLD UPSAMPLE STEPS_PER_EXECUTION JIT_COMPILE RANDSEED PREDICT_AS_ONE SPLIT_VALIDATE SPLIT_TEST;`
work on slurm for deeplabv3+ rainfall, but it's NOT FINISHED YET 2022-12-16 19:52:44 +00:00
			`echo ">>> Installing requirements";`
slurm dlr: quiet, pip 2023-01-10 18:12:35 +00:00			`conda run -n py38 pip install -q -r requirements.txt;`
work on slurm for deeplabv3+ rainfall, but it's NOT FINISHED YET 2022-12-16 19:52:44 +00:00			`echo ">>> Training model";`
slurm dlr: shell out in conda; redirect stderr & stdout to disk inside the experiments folder Also, if the job restarts, we still save the previous run's results because we append rather than overwrite 2023-01-16 17:32:22 +00:00			`#shellcheck disable=SC2016`
slurm dlr: log exit code 2023-01-17 15:18:26 +00:00			`/usr/bin/env time -v conda run -n py38 bash -c 'src/deeplabv3_plus_test_rainfall.py >>"${DIR_OUTPUT}/experiment.${SLURM_JOB_ID}.out.log" 2>>"${DIR_OUTPUT}/experiment.${SLURM_JOB_ID}.err.log"; echo "[slurm_runner] EXIT_CODE: $?" >>"${DIR_OUTPUT}/experiment.${SLURM_JOB_ID}.out.log";';`
work on slurm for deeplabv3+ rainfall, but it's NOT FINISHED YET 2022-12-16 19:52:44 +00:00			`echo ">>> exited with code $?";`