#!/usr/bin/env bash
#SBATCH -J DeepRain
#SBATCH -N 1
#SBATCH -n 9
#SBATCH --gres=gpu:1
#SBATCH -o %j.%N.%a.deeplab-rainfall.out.log
#SBATCH -e %j.%N.%a.deeplab-rainfall.err.log
#SBATCH -p gpu
#SBATCH --no-requeue
#SBATCH --time=5-00:00:00
#SBATCH --mem=30000
# ---> in MiB
# no-requeue: ref https://support.hull.ac.uk/tas/public/ssp/content/detail/incident?unid=652db7ac6e73485c9f7658db78b2b628

# Slurm job script: trains a TEST DeepLabv3+ model on rainfall radar / water
# depth data. All configuration is passed in via environment variables.
#
# NOTE: the shebang must stay on the very first line, and nothing other than
# comments / blank lines may appear above the #SBATCH directives - sbatch stops
# reading directives at the first non-comment, non-blank line.
#
# Provenance: mirror of https://github.com/sbrl/research-rainfallradar
# (synced 2024-12-22 22:25:01 +00:00)

#######################################################

# Checks whether a command can be resolved by the current shell.
# Arguments: $1 - name of the command to look up
# Outputs:   nothing (stdout and stderr of the lookup are discarded)
# Returns:   the exit status of `command -v` - 0 if found, non-zero otherwise
command_exists() {
	# The status of the last command is the function's return value, so no
	# explicit `return $?` is needed.
	command -v "$1" >/dev/null 2>&1
}

#######################################################

# CUDA workarounds that are applied only when Slurm reports the cluster name
# "cs-cluster"; on any other cluster this section does nothing.
if [[ "${SLURM_CLUSTER_NAME}" == "cs-cluster" ]]; then
	echo "[slurm_runner] csgpu cluster detected, applying CUDA workarounds" >&2;
	# Fix "Could not load library libcublasLt.so.12. Error: libcublasLt.so.12: cannot open shared object file: No such file or directory" error
	if [[ -d "${HOME}/cuda" ]]; then
		echo "[slurm_runner] sourcing extra CUDA setup script" >&2;
		# Only the directory is checked above; if activate.sh itself is
		# missing, `source` prints an error and the script carries on.
		#shellcheck source=/dev/null
		source "${HOME}/cuda/activate.sh";
	fi
	
	# NOTE(review): the original trailing comment ended in a dangling "Fixes"
	# without saying which error this addresses - confirm with the author.
	export XLA_FLAGS="--xla_gpu_cuda_data_dir=/usr/lib/cuda"; # weird... this wasn't needed before?
	echo "[slurm_runner] set XLA_FLAGS=\"${XLA_FLAGS}\"" >&2;
fi

# Load the toolchain via environment modules, but only where a `module`
# command exists AND we are not on the CS cluster (no modules on the CS
# cluster). Relies on the command_exists helper defined earlier in this file.
if command_exists module && [[ "${SLURM_CLUSTER_NAME}" != "cs-cluster" ]]; then
	module load utilities/multi
	module load readline/7.0
	module load gcc/10.2.0
	module load cuda/11.5.0;
	# Point XLA at the CUDA install that matches the cuda/11.5.0 module above.
	export XLA_FLAGS="--xla_gpu_cuda_data_dir=/home/ViperAppsFiles/cuda/11.5.0"; # weird... this wasn't needed before?
	
	module load python/anaconda/4.6/miniconda/3.7
else
	echo "[slurm_runner]: module command not present or csgpu cluster detected, not loading modules" >&2;
fi


# Prints the usage text for this job script and terminates the shell.
# Arguments: none
# Outputs:   the help text, one entry per line, on stderr (nothing on stdout)
# Returns:   never returns - calls `exit`, so the calling shell ends here with
#            the status of the preceding print (0 when the write succeeds)
show_help() {
	# One array element per output line; an empty element is a blank line.
	# NOTE(review): the UPSAMPLE entry reads "A value of disables upscaling" -
	# a number appears to be missing; confirm the intended value with the author.
	# NOTE(review): "either side to side to diagonally" is presumably meant to
	# read "or diagonally". Both strings are left untouched here.
	local -a help_lines=(
		"Trains a TEST DeepLabv3+ model using rainfall radar and water depth data."
		""
		"Usage:"
		" sbatch slurm-TEST-deeplabv3p-rainfall.job"
		""
		"....where:"
		" USE_CONDA Optional. Set to any value to use conda when running the experiment. REQUIRED ON VIPER."
		""
		" IMAGE_SIZE=128 Optional. Sets the size of the 'images' that the DeepLabV3+ model will work with."
		" BATCH_SIZE=64 Optional. Sets the batch size to train the model with."
		" DIR_RAINFALLWATER The path to the directory the .tfrecord files containing the rainfall radar / water depth data."
		" PATH_HEIGHTMAP The path to the heightmap jsonl file to read in."
		" PATH_COLOURMAP The path to the colourmap for predictive purposes."
		" PARALLEL_READS Multiplier for the number of files to read in parallel. 1 = number of CPU cores available. Very useful on high-read-latency systems (e.g. HPC like Viper) to avoid starving the GPU of data. WILL MANGLE THE ORDERING OF DATA. Set to 0 to disable and read data sequentially. WILL ONLY NOT MANGLE DATA IF PREDICT_AS_ONE IS SET. Defaults to 1.5."
		" STEPS_PER_EPOCH The number of steps to consider an epoch. Defaults to None, which means use the entire dataset."
		" NO_REMOVE_ISOLATED_PIXELS Set to any value to avoid the engine from removing isolated pixels - that is, water pixels with no other surrounding pixels, either side to side to diagonally."
		" EPOCHS The number of epochs to train for."
		" LOSS The loss function to use. Default: cross-entropy (possible values: cross-entropy, cross-entropy-dice)."
		" DICE_LOG_COSH When in cross-entropy-dice mode, in addition do loss = cel + log(cosh(dice_loss)) instead of just loss = cel + dice_loss."
		" WATER_THRESHOLD The threshold to cut water off at when training, in metres. Default: 0.1"
		" PATH_CHECKPOINT The path to a checkpoint to load. If specified, a model will be loaded instead of being trained."
		" LEARNING_RATE The learning rate to use. Default: 0.001."
		" UPSAMPLE How much to upsample by at the beginning of the model. A value of disables upscaling. Default: 2."
		" SPLIT_VALIDATE Percentage of the available files in the dataset to be allocated to the validation split. Default: 0.2"
		" SPLIT_TEST Percentage of the available files in the dataset to be allocated to the test split. Default: 0.2"
		" STEPS_PER_EXECUTION How many steps to perform before surfacing from the GPU to e.g. do callbacks. Default: 16."
		" RANDSEED The random seed to use when shuffling filepaths. Default: unset, which means use a random value."
		" JIT_COMPILE Set to any value to compile the model with XLA."
		" PREDICT_COUNT The number of items from the (SCRAMBLED) dataset to make a prediction for."
		" PREDICT_AS_ONE [prediction only] Set to any value to avoid splitting the input dataset into training/validation and instead treat it as a single dataset. Default: False (treat it as training/validation)"
		" POSTFIX Postfix to append to the output dir (auto calculated)."
		" ARGS Optional. Any additional arguments to pass to the python program."
		""
		"It is strongly advised that all filepaths do NOT contain spaces."
		""
		"The code used to identify the run is taken automatically from the filename of the config file."
	);
	# A single printf replaces the per-line `echo -e` calls; the strings hold
	# no backslash escapes, so the bytes written are the same.
	printf '%s\n' "${help_lines[@]}" >&2;
	exit;
}

# ---- Run configuration --------------------------------------------------
# Input paths: values already present in the environment win, otherwise fall
# back to the defaults under $HOME/data.
DIR_RAINFALLWATER="${DIR_RAINFALLWATER:-$HOME/data/rainfallwater_records_tfrecord}";
PATH_HEIGHTMAP="${PATH_HEIGHTMAP:-$HOME/data/terrain50-nimrodsized.json.gz}";
PATH_COLOURMAP="${PATH_COLOURMAP:-$HOME/data/instance-level-human-parsing/instance-level_human_parsing/human_colormap.mat}";

# Identifier for this run; forms part of the default output directory name.
CODE="deeplabv3+_rainfall";

if [[ -n "${POSTFIX}" ]]; then
	echo -e ">>> Applying postfix of ${POSTFIX}" >&2;
	CODE="${CODE}_${POSTFIX}";
fi

# Default output directory: output/<UTC date>_<CODE>, relative to the cwd.
DIR_OUTPUT="${DIR_OUTPUT:-output/$(date -u --rfc-3339=date)_${CODE}}";

# Everything below writes into DIR_OUTPUT, so bail out early and loudly if it
# cannot be created rather than failing later on every log redirection.
if ! mkdir -p "${DIR_OUTPUT}"; then
	echo "[slurm_runner]: failed to create output directory ${DIR_OUTPUT}" >&2;
	exit 1;
fi

echo -e ">>> NOW: $(date)";
echo -e ">>> DIR_OUTPUT: ${DIR_OUTPUT}";
echo -e ">>> Additional args: ${ARGS}";
echo -e ">>> GIT COMMIT: $(git rev-parse HEAD)";

# Append the current commit and working tree state to commit.txt in the
# output directory. `| cat` keeps git log from invoking a pager.
{ echo "*****"; git rev-parse HEAD; git status; git log -1 | cat; } >>"${DIR_OUTPUT}/commit.txt";

export PATH="${HOME}/software/bin:${PATH}";
# Hand the settings to child processes via the environment. Names that are
# unset here are merely marked for export and stay unset.
export IMAGE_SIZE BATCH_SIZE DIR_RAINFALLWATER PATH_HEIGHTMAP PATH_COLOURMAP STEPS_PER_EPOCH DIR_OUTPUT PATH_CHECKPOINT EPOCHS PREDICT_COUNT NO_REMOVE_ISOLATED_PIXELS LOSS LEARNING_RATE DICE_LOG_COSH WATER_THRESHOLD UPSAMPLE STEPS_PER_EXECUTION JIT_COMPILE RANDSEED PREDICT_AS_ONE SPLIT_VALIDATE SPLIT_TEST;

# ---- Install requirements (conda only) -----------------------------------
echo ">>> Installing requirements";
if [[ -n "${USE_CONDA}" ]]; then
	conda run -n py38 pip install -q -r requirements.txt;
else
	echo "[slurm_runner]: USE_CONDA env var NOT specified, not installing pip packages." >&2;
fi

# ---- Run the training harness ---------------------------------------------
# The harness takes its settings from the exported environment variables.
# stdout / stderr are appended to per-job log files inside DIR_OUTPUT.
# NOTE(review): ARGS is echoed earlier in this script and described in the
# help text as being passed to the python program, but it is not passed to
# the harness in either branch below - confirm whether that is intentional.
echo ">>> Training model";

# Exit status of the harness. It is captured immediately after the harness
# finishes, because any later command (such as the echo that records it)
# would overwrite $?.
exit_code=0;
#shellcheck disable=SC2016
if [[ -n "${USE_CONDA}" ]]; then
	echo "[slurm_runner]: Running harness via conda as USE_CONDA env var was defined" >&2;
	# Single quotes are deliberate: the variables are expanded by the inner
	# bash, which inherits them from the environment. The inner shell re-exits
	# with the harness status so that it propagates outwards through
	# `conda run` and `time`.
	/usr/bin/env time -v conda run -n py38 bash -c 'src/deeplabv3_plus_test_rainfall.py >>"${DIR_OUTPUT}/experiment.${SLURM_JOB_ID}.out.log" 2>>"${DIR_OUTPUT}/experiment.${SLURM_JOB_ID}.err.log"; exit_code=$?; echo "[slurm_runner] EXIT_CODE: ${exit_code}" >>"${DIR_OUTPUT}/experiment.${SLURM_JOB_ID}.out.log"; exit "${exit_code}";';
	exit_code=$?;
else
	echo "[slurm_runner]: Ignoring conda and running harness directly as USE_CONDA env var was NOT defined" >&2;
	/usr/bin/env time -v src/deeplabv3_plus_test_rainfall.py >>"${DIR_OUTPUT}/experiment.${SLURM_JOB_ID}.out.log" 2>>"${DIR_OUTPUT}/experiment.${SLURM_JOB_ID}.err.log";
	exit_code=$?;
	echo "[slurm_runner] EXIT_CODE: ${exit_code}" >>"${DIR_OUTPUT}/experiment.${SLURM_JOB_ID}.out.log";
fi

# Report the harness status (previously this printed the status of the
# preceding echo, i.e. practically always 0) and finish the job with it so
# that Slurm records a failed run as failed.
echo ">>> exited with code ${exit_code}";
exit "${exit_code}";