From dbf929325aa4bff9da73ebfd0af6b0aa63492da4 Mon Sep 17 00:00:00 2001 From: Starbeamrainbowlabs Date: Wed, 31 Aug 2022 16:32:17 +0100 Subject: [PATCH] typo; add pretrain slurm job file --- aimodel/slurm-pretrain.job | 68 +++++++++++++++++++++++++++++ aimodel/src/subcommands/pretrain.py | 2 +- 2 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 aimodel/slurm-pretrain.job diff --git a/aimodel/slurm-pretrain.job b/aimodel/slurm-pretrain.job new file mode 100644 index 0000000..b559e0a --- /dev/null +++ b/aimodel/slurm-pretrain.job @@ -0,0 +1,68 @@ +#!/usr/bin/env bash +#SBATCH -J TweetAI +#SBATCH -N 1 +#SBATCH -n 4 +#SBATCH --gres=gpu:1 +#SBATCH -o %j.%N.%a.out +#SBATCH -e %j.%N.%a.err +#SBATCH -p gpu05 +#SBATCH --time=5-00:00:00 +#SBATCH --exclusive + +module load utilities/multi +module load readline/7.0 +module load gcc/10.2.0 +module load cuda/11.5.0 + +module load python/anaconda/4.6/miniconda/3.7 + + +show_help() { + echo -e "Usage:" >&2; + echo -e " [INPUT='\$HOME/rainfallwater_records_tfrecord'] [POSTFIX=''] sbatch slurm-pretrain.job" >&2; + echo -e "" >&2; + echo -e "....where:" >&2; + echo -e " INPUT The path to the directory containing the .tfrecord files to use as training data (see the rainfallwrangler for making these)" >&2; + echo -e " POSTFIX Optional. A suffix to apply to the run code name." >&2; + echo -e "" >&2; + echo -e "The code used to identify the run is taken automatically from the filename of the config file." >&2; + exit; +} + +CONFIG="${CONFIG:-configs/clip.toml}"; + +if [[ -z "${CONFIG}" ]]; then + echo -e "Error: No CONFIG environment variable specified.\n" >&2; + show_help; + exit 0; +fi + +if [[ ! 
-d "${INPUT}" ]]; then + echo -e "Error: The directory '${INPUT}' containing the input .tfrecord dataset either doesn't exist or isn't a directory."; + show_help; + exit 1; +fi + + +CODE="$(basename "${CONFIG}")"; +CODE="${CODE%.*}"; + +if [[ -n "${POSTFIX}" ]]; then + echo -e ">>> Applying postfix of ${POSTFIX}" >&2; + CODE="${CODE}_${POSTFIX}"; +fi + +echo -e ">>> Input dirpath: ${INPUT}" >&2; +echo -e ">>> Code: ${CODE}" >&2; +echo -e ">>> Additional args: ${ARGS}"; + +dir_output="output/$(date -u --rfc-3339=date)_${CODE}"; + +export PATH=$HOME/software/bin:$PATH; + +echo ">>> Installing requirements"; +conda run -n py38 pip install -r requirements.txt; +echo ">>> Training model"; +#shellcheck disable=SC2086 +/usr/bin/env time -v conda run -n py38 src/index.py --input "${CONFIG}" --output "${dir_output}" ${ARGS}; +echo ">>> exited with code $?"; diff --git a/aimodel/src/subcommands/pretrain.py b/aimodel/src/subcommands/pretrain.py index 16fdbc4..d21457b 100644 --- a/aimodel/src/subcommands/pretrain.py +++ b/aimodel/src/subcommands/pretrain.py @@ -15,7 +15,7 @@ def parse_args(): parser.add_argument("--output", "-o", help="Path to output directory to write output to (will be automatically created if it doesn't exist)", required=True) parser.add_argument("--feature-dim", help="The size of the output feature dimension of the model [default: 200].", type=int) parser.add_argument("--batch-size", help="Sets the batch size [default: 64].", type=int) - parser.add_argument("--reads-multiplier", help="Optional. The multiplier for the number of files we should read from at once. Defaults to 1.5, which means read ceil(NUMBER_OF_CORES * 1.5). Set to a higher number of systems with high read latency to avoid starving the GPU of data.") + parser.add_argument("--reads-multiplier", help="Optional. The multiplier for the number of files we should read from at once. Defaults to 1.5, which means read ceil(NUMBER_OF_CORES * 1.5) files at once. 
Set to a higher number on systems with high read latency to avoid starving the GPU of data.") return parser