diff --git a/aimodel/slurm-encoderonly-rainfall.job b/aimodel/slurm-encoderonly-rainfall.job index 2aab9f5..efac585 100755 --- a/aimodel/slurm-encoderonly-rainfall.job +++ b/aimodel/slurm-encoderonly-rainfall.job @@ -34,7 +34,8 @@ show_help() { echo -e " CHANNELS=8 The number of channels the input data has." >&2; echo -e " WINDOW_SIZE=33 The window size to use when convolving the input dataset for single pixel prediction." >&2; echo -e " STEPS_PER_EPOCH The number of steps to consider an epoch. Defaults to None, which means use the entire dataset." >&2; - echo -e " STEPS_PER_EXECUTION The number of steps to do before returning to do callbacks. High numbers boost performance. Defaults to 1." >&2; + echo -e " VAL_STEPS_PER_EPOCH The number of validation steps to consider an epoch. Defaults to None, which means use the entire dataset." >&2; + echo -e " STEPS_PER_EXECUTION The number of steps to do before returning to do callbacks. High numbers boost performance. Defaults to 1. If set then STEPS_PER_EPOCH and VAL_STEPS_PER_EPOCH must also be set." >&2; echo -e " EPOCHS=25 The number of epochs to train for." >&2; echo -e " LEARNING_RATE The learning rate to use. Default: 0.001." >&2; # echo -e " NO_REMOVE_ISOLATED_PIXELS Set to any value to avoid the engine from removing isolated pixels - that is, water pixels with no other surrounding pixels, either side to side to diagonally." >&2; @@ -67,7 +68,7 @@ echo -e ">>> DIR_OUTPUT: ${DIR_OUTPUT}"; echo -e ">>> Additional args: ${ARGS}"; export PATH=$HOME/software/bin:$PATH; -export BATCH_SIZE DIRPATH_RAINFALLWATER PATH_HEIGHTMAP STEPS_PER_EPOCH DIRPATH_OUTPUT PATH_CHECKPOINT CHANNELS WINDOW_SIZE EPOCHS LEARNING_RATE STEPS_PER_EXECUTION; +export BATCH_SIZE DIRPATH_RAINFALLWATER PATH_HEIGHTMAP STEPS_PER_EPOCH VAL_STEPS_PER_EPOCH DIRPATH_OUTPUT PATH_CHECKPOINT CHANNELS WINDOW_SIZE EPOCHS LEARNING_RATE STEPS_PER_EXECUTION; #LOSS ; echo ">>> Installing requirements"; diff --git a/aimodel/src/encoderonly_test_rainfall.py b/aimodel/src/encoderonly_test_rainfall.py index 6e6360b..fe35e32 100755 --- a/aimodel/src/encoderonly_test_rainfall.py +++ b/aimodel/src/encoderonly_test_rainfall.py @@ -26,6 +26,7 @@ EPOCHS = int(os.environ["EPOCHS"]) if "EPOCHS" in os.environ else 25 BATCH_SIZE = int(os.environ["BATCH_SIZE"]) if "BATCH_SIZE" in os.environ else 64 WINDOW_SIZE = int(os.environ["WINDOW_SIZE"]) if "WINDOW_SIZE" in os.environ else 33 STEPS_PER_EPOCH = int(os.environ["STEPS_PER_EPOCH"]) if "STEPS_PER_EPOCH" in os.environ else None +VAL_STEPS_PER_EPOCH = int(os.environ["VAL_STEPS_PER_EPOCH"]) if "VAL_STEPS_PER_EPOCH" in os.environ else None STEPS_PER_EXECUTION = int(os.environ["STEPS_PER_EXECUTION"]) if "STEPS_PER_EXECUTION" in os.environ else None LEARNING_RATE = float(os.environ["LEARNING_RATE"]) if "LEARNING_RATE" in os.environ else 0.001 JIT_COMPILE = True if "JIT_COMPILE" in os.environ else False @@ -126,7 +127,7 @@ summarywriter(model, os.path.join(DIRPATH_OUTPUT, "summary.txt")) history = model.fit(dataset_train, validation_data=dataset_validate, epochs=EPOCHS, - + callbacks=[ tf.keras.callbacks.CSVLogger( filename=os.path.join(DIRPATH_OUTPUT, "metrics.tsv"), @@ -143,6 +144,7 @@ history = model.fit(dataset_train, ), ], steps_per_epoch=STEPS_PER_EPOCH, + validation_steps=VAL_STEPS_PER_EPOCH ) logger.info(">>> Training complete")