research-rainfallradar/aimodel/slurm-TEST-deeplabv3p-rainfall.job

80 lines
3.9 KiB
Bash
Executable file
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
#SBATCH -J DeepRain
#SBATCH -N 1
#SBATCH -n 14
#SBATCH --gres=gpu:1
#SBATCH -o %j.%N.%a.deeplab-rainfall.out.log
#SBATCH -e %j.%N.%a.deeplab-rainfall.err.log
#SBATCH -p gpu
#SBATCH --no-requeue
#SBATCH --time=5-00:00:00
#SBATCH --mem=30000
# ---> in MiB
# no-requeue: ref https://support.hull.ac.uk/tas/public/ssp/content/detail/incident?unid=652db7ac6e73485c9f7658db78b2b628
module load utilities/multi
module load readline/7.0
module load gcc/10.2.0
module load cuda/11.5.0
export XLA_FLAGS="--xla_gpu_cuda_data_dir=/home/ViperAppsFiles/cuda/11.5.0"; # weird... this wasn't needed before?
module load python/anaconda/4.6/miniconda/3.7
show_help() {
echo -e "Trains a TEST DeepLabv3+ model using rainfall radar and water depth data." >&2;
echo -e "" >&2;
echo -e "Usage:" >&2;
echo -e " sbatch slurm-TEST-deeplabv3p-rainfall.job" >&2;
echo -e "" >&2;
echo -e "....where:" >&2;
echo -e " IMAGE_SIZE=128 Optional. Sets the size of the 'images' that the DeepLabV3+ model will work with." >&2;
echo -e " BATCH_SIZE=64 Optional. Sets the batch size to train the model with." >&2;
echo -e " DIR_RAINFALLWATER The path to the directory the .tfrecord files containing the rainfall radar / water depth data." >&2;
echo -e " PATH_HEIGHTMAP The path to the heightmap jsonl file to read in." >&2;
echo -e " PATH_COLOURMAP The path to the colourmap for predictive purposes." >&2;
echo -e " STEPS_PER_EPOCH The number of steps to consider an epoch. Defaults to None, which means use the entire dataset." >&2;
echo -e " NO_REMOVE_ISOLATED_PIXELS Set to any value to avoid the engine from removing isolated pixels - that is, water pixels with no other surrounding pixels, either side to side to diagonally." >&2;
echo -e " EPOCHS The number of epochs to train for." >&2;
echo -e " LOSS The loss function to use. Default: cross-entropy (possible values: cross-entropy, cross-entropy-dice)." >&2;
echo -e " PATH_CHECKPOINT The path to a checkcpoint to load. If specified, a model will be loaded instead of being trained." >&2;
echo -e " LEARNING_RATE The learning rate to use. Default: 0.001." >&2;
echo -e " PREDICT_COUNT The number of items from the (SCRAMBLED) dataset to make a prediction for." >&2;
echo -e " POSTFIX Postfix to append to the output dir (auto calculated)." >&2;
echo -e " ARGS Optional. Any additional arguments to pass to the python program." >&2;
echo -e "" >&2;
echo -e "It is strongly advised that all filepaths do NOT contain spaces." >&2;
echo -e "" >&2;
echo -e "The code used to identify the run is taken automatically from the filename of the config file." >&2;
exit;
}
DIR_RAINFALLWATER="${DIR_RAINFALLWATER:-$HOME/rainfallwater_records_tfrecord}";
PATH_HEIGHTMAP="${PATH_HEIGHTMAP:-$HOME/data/terrain50-nimrodsized.json.gz}";
PATH_COLOURMAP="${PATH_COLOURMAP:-$HOME/data/instance-level-human-parsing/instance-level_human_parsing/human_colormap.mat}";
CODE="deeplabv3+_rainfall";
if [[ -n "${POSTFIX}" ]]; then
echo -e ">>> Applying postfix of ${POSTFIX}" >&2;
CODE="${CODE}_${POSTFIX}";
fi
DIR_OUTPUT="${DIR_OUTPUT:-output/$(date -u --rfc-3339=date)_${CODE}}";
mkdir -p "${DIR_OUTPUT}";
echo -e ">>> NOW: $(date)";
echo -e ">>> DIR_OUTPUT: ${DIR_OUTPUT}";
echo -e ">>> Additional args: ${ARGS}";
export PATH=$HOME/software/bin:$PATH;
export IMAGE_SIZE BATCH_SIZE DIR_RAINFALLWATER PATH_HEIGHTMAP PATH_COLOURMAP STEPS_PER_EPOCH DIR_OUTPUT PATH_CHECKPOINT EPOCHS PREDICT_COUNT NO_REMOVE_ISOLATED_PIXELS LOSS LEARNING_RATE;
echo ">>> Installing requirements";
conda run -n py38 pip install -q -r requirements.txt;
echo ">>> Training model";
#shellcheck disable=SC2016
/usr/bin/env time -v conda run -n py38 bash -c 'src/deeplabv3_plus_test_rainfall.py >>"${DIR_OUTPUT}/experiment.${SLURM_JOB_ID}.out.log" 2>>"${DIR_OUTPUT}/experiment.${SLURM_JOB_ID}.err.log"; echo "[slurm_runner] EXIT_CODE: $?" >>"${DIR_OUTPUT}/experiment.${SLURM_JOB_ID}.out.log";';
echo ">>> exited with code $?";