typo; add pretrain slurm job file

2024-11-22 09:13:01 +00:00 · 2022-08-31 16:32:17 +01:00 · 2022-08-31 16:32:17 +01:00 · dbf929325a
commit dbf929325a
parent e0162bc70b
2 changed files with 69 additions and 1 deletions
--- a/aimodel/slurm-pretrain.job
+++ b/aimodel/slurm-pretrain.job
@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+#SBATCH -J TweetAI
+#SBATCH -N 1
+#SBATCH -n 4
+#SBATCH --gres=gpu:1
+#SBATCH -o %j.%N.%a.out
+#SBATCH -e %j.%N.%a.err
+#SBATCH -p gpu05
+#SBATCH --time=5-00:00:00
+#SBATCH --exclusive
+
+module load utilities/multi
+module load readline/7.0
+module load gcc/10.2.0
+module load cuda/11.5.0
+
+module load python/anaconda/4.6/miniconda/3.7
+
+
+show_help() {
+	echo -e "Usage:" >&2;
+	echo -e "    [INPUT='\$HOME/rainfallwater_records_tfrecord'] [POSTFIX='<string>'] sbatch slurm-pretrain.job" >&2;
+	echo -e "" >&2;
+	echo -e "....where:" >&2;
+	echo -e "    INPUT     The path to the directory containing the .tfrecord files to use as training data (see the rainfallwrangler for making these)" >&2;
+	echo -e "    POSTFIX   Optional. A suffix to apply to the run code name." >&2;
+	echo -e "" >&2;
+	echo -e "The code used to identify the run is taken automatically from the filename of the config file." >&2;
+	exit;
+}
+
+CONFIG="${CONFIG:-configs/clip.toml}";
+
+if [[ -z "${CONFIG}" ]]; then
+	echo -e "Error: No CONFIG environment variable specified.\n" >&2;
+	show_help;
+	exit 0;
+fi
+
+if [[ ! -d "${INPUT}" ]]; then
+	echo -e "Error: The directory '${INPUT}' containing the input .tfrecord dataset either doesn't exist or isn't a directory.";
+	show_help;
+	exit 1;
+fi
+
+
+# Run code = config filename without its directory and final extension,
+# followed by "_<POSTFIX>" when a non-empty POSTFIX was supplied.
+config_name="$(basename "${CONFIG}")";
+
+if [[ -n "${POSTFIX}" ]]; then echo -e ">>> Applying postfix of ${POSTFIX}" >&2; fi
+CODE="${config_name%.*}${POSTFIX:+_${POSTFIX}}";
+
+# Record the effective settings in the job logs.
+echo -e ">>> Input dirpath: ${INPUT}" >&2;
+echo -e ">>> Code: ${CODE}" >&2;
+echo -e ">>> Additional args: ${ARGS}";
+
+dir_output="output/$(date -u --rfc-3339=date)_${CODE}";
+
+export PATH=$HOME/software/bin:$PATH;
+
+echo ">>> Installing requirements";
+conda run -n py38 pip install -r requirements.txt;
+echo ">>> Training model";
+#shellcheck disable=SC2086
+/usr/bin/env time -v conda run -n py38 src/index.py --input "${CONFIG}" --output "${dir_output}" ${ARGS};
+echo ">>> exited with code $?";
--- a/aimodel/src/subcommands/pretrain.py
+++ b/aimodel/src/subcommands/pretrain.py
@ -15,7 +15,7 @@ def parse_args():
 	parser.add_argument("--output", "-o", help="Path to output directory to write output to (will be automatically created if it doesn't exist)", required=True)
 	parser.add_argument("--feature-dim", help="The size of the output feature dimension of the model [default: 200].", type=int)
 	parser.add_argument("--batch-size", help="Sets the batch size [default: 64].", type=int)
-	parser.add_argument("--reads-multiplier", help="Optional. The multiplier for the number of files we should read from at once. Defaults to 1.5, which means read ceil(NUMBER_OF_CORES * 1.5). Set to a higher number of systems with high read latency to avoid starving the GPU of data.")
+	parser.add_argument("--reads-multiplier", help="Optional. The multiplier for the number of files we should read from at once. Defaults to 1.5, which means read ceil(NUMBER_OF_CORES * 1.5) files at once. Set to a higher number of systems with high read latency to avoid starving the GPU of data.")
 	
 	return parser