Implement initial UNTESTED support for split_validation and split_test

This commit is contained in:
Starbeamrainbowlabs 2024-08-29 19:33:40 +01:00
parent b2b96ab636
commit e5f6e6394f
Signed by: sbrl
GPG key ID: 1BE5172E637709C2
3 changed files with 22 additions and 10 deletions

View file

@ -43,6 +43,8 @@ show_help() {
echo -e " PATH_CHECKPOINT The path to a checkpoint to load. If specified, a model will be loaded instead of being trained." >&2;
echo -e " LEARNING_RATE The learning rate to use. Default: 0.001." >&2;
echo -e " UPSAMPLE How much to upsample by at the beginning of the model. A value of disables upscaling. Default: 2." >&2;
echo -e " SPLIT_VALIDATE Percentage of the available files in the dataset to be allocated to the validation split. Default: 0.2" >&2;
echo -e " SPLIT_TEST Percentage of the available files in the dataset to be allocated to the test split. Default: 0 (no test split)." >&2; # keep in sync with the SPLIT_TEST default read by the Python entrypoint
echo -e " STEPS_PER_EXECUTION How many steps to perform before surfacing from the GPU to e.g. do callbacks. Default: 16." >&2;
echo -e " RANDSEED The random seed to use when shuffling filepaths. Default: unset, which means use a random value." >&2;
echo -e " JIT_COMPILE Set to any value to compile the model with XLA." >&2;
@ -77,7 +79,7 @@ echo -e ">>> DIR_OUTPUT: ${DIR_OUTPUT}";
echo -e ">>> Additional args: ${ARGS}";
export PATH=$HOME/software/bin:$PATH;
export IMAGE_SIZE BATCH_SIZE DIR_RAINFALLWATER PATH_HEIGHTMAP PATH_COLOURMAP STEPS_PER_EPOCH DIR_OUTPUT PATH_CHECKPOINT EPOCHS PREDICT_COUNT NO_REMOVE_ISOLATED_PIXELS LOSS LEARNING_RATE DICE_LOG_COSH WATER_THRESHOLD UPSAMPLE STEPS_PER_EXECUTION JIT_COMPILE RANDSEED PREDICT_AS_ONE;
export IMAGE_SIZE BATCH_SIZE DIR_RAINFALLWATER PATH_HEIGHTMAP PATH_COLOURMAP STEPS_PER_EPOCH DIR_OUTPUT PATH_CHECKPOINT EPOCHS PREDICT_COUNT NO_REMOVE_ISOLATED_PIXELS LOSS LEARNING_RATE DICE_LOG_COSH WATER_THRESHOLD UPSAMPLE STEPS_PER_EXECUTION JIT_COMPILE RANDSEED PREDICT_AS_ONE SPLIT_VALIDATE SPLIT_TEST;
echo ">>> Installing requirements";
conda run -n py38 pip install -q -r requirements.txt;

View file

@ -56,13 +56,13 @@ UPSAMPLE = env.read("UPSAMPLE", int, 2)
SPLIT_VALIDATE = env.read("SPLIT_VALIDATE", float, 0.2)
SPLIT_TEST = env.read("SPLIT_TEST", float, 0)
STEPS_PER_EXECUTION = env.read("STEPS_PER_EXECUTION", int, 1)
JIT_COMPILE = env.read("JIT_COMPILE", bool, False)
DIR_OUTPUT = env.read("DIR_OUTPUT", str, f"output/{datetime.utcnow().date().isoformat()}_deeplabv3plus_rainfall_TEST")
PATH_CHECKPOINT = env.read("PATH_CHECKPOINT", str, None)
PREDICT_COUNT = env.read("PREDICT_COUNT", int, 25)
PREDICT_AS_ONE = env.read("PREDICT_AS_ONE", bool, False)
# ~~~
env.val_dir_exists(os.path.join(DIR_OUTPUT, "checkpoints"), create=True)
@ -82,7 +82,7 @@ env.print_all(False)
# ██████ ██ ██ ██ ██ ██ ███████ ███████ ██
if not PREDICT_AS_ONE:
dataset_train, dataset_validate = dataset_mono(
dataset_train, dataset_validate, dataset_test = dataset_mono(
dirpath_input=DIR_RAINFALLWATER,
batch_size=BATCH_SIZE,
water_threshold=WATER_THRESHOLD,
@ -91,11 +91,14 @@ if not PREDICT_AS_ONE:
input_size="same",
filepath_heightmap=PATH_HEIGHTMAP,
do_remove_isolated_pixels=REMOVE_ISOLATED_PIXELS,
parallel_reads_multiplier=PARALLEL_READS
parallel_reads_multiplier=PARALLEL_READS,
percentage_validate=SPLIT_VALIDATE,
percentage_test=SPLIT_TEST  # fixed typo (was SPLIT_TESTs -> NameError); a value of 0 means no test dataset is built
)
# Build the complete message up front: extra positional arguments are format
# arguments to logging-style APIs and are not appended to the message text.
logger.info(f"Train Dataset: {dataset_train}")
logger.info(f"Validation Dataset: {dataset_validate}")
logger.info(f"Test Dataset: {dataset_test}")
else:
dataset_train = dataset_mono_predict(
dirpath_input=DIR_RAINFALLWATER,
@ -253,6 +256,7 @@ if PATH_CHECKPOINT is None:
logger.info(">>> Beginning training")
history = model.fit(dataset_train,
validation_data=dataset_validate,
# test_data=dataset_test, # Nope, it doesn't have a param like this so it's time to do this the *hard* way
epochs=EPOCHS,
callbacks=[
tf.keras.callbacks.CSVLogger(
@ -395,5 +399,12 @@ if not PREDICT_AS_ONE:
colormap,
model=model
)
if dataset_test is not None:
plot_predictions(
os.path.join(DIR_OUTPUT, "predict_test_$$.png"),
get_from_batched(dataset_test, PREDICT_COUNT),
colormap,
model=model
)
logger.info(f"Complete at {str(datetime.now().isoformat())}, elapsed {str((datetime.now() - time_start).total_seconds())} seconds")

View file

@ -160,19 +160,18 @@ def get_filepaths(dirpath_input, do_shuffle=True):
return result
# TODO refactor this to validate_percentage=0.2 and test_percentage=0, but DON'T FORGET TO CHECK ***ALL*** usages of this FIRST and update them afterwards!
def dataset_mono(dirpath_input, validate_percentage=0.2, test_percentage=0, **kwargs):
def dataset_mono(dirpath_input, percentage_validate=0.2, percentage_test=0, **kwargs):
filepaths = get_filepaths(dirpath_input)
filepaths_count = len(filepaths)
split_trainvalidate=math.floor(filepaths_count * (1-(validate_percentage+test_percentage)))
split_validatetest=math.floor(filepaths * (1 - test_percentage))
# Index at which the training slice ends and the validation slice begins.
split_trainvalidate=math.floor(filepaths_count * (1-(percentage_validate+percentage_test)))
# Index at which the validation slice ends and the test slice begins.
# Must be derived from the file COUNT: multiplying the filepaths list itself by a float raises TypeError.
split_validatetest=math.floor(filepaths_count * (1 - percentage_test))
filepaths_train = filepaths[:split_trainvalidate]
filepaths_validate = filepaths[split_trainvalidate:split_validatetest]
filepaths_test = []
if test_percentage > 0:
if percentage_test > 0:
filepaths_test = filepaths[split_validatetest:]
print("DEBUG:dataset_mono filepaths_train", filepaths_train, "filepaths_validate", filepaths_validate, "filepaths_test", filepaths_test)
@ -182,7 +181,7 @@ def dataset_mono(dirpath_input, validate_percentage=0.2, test_percentage=0, **kw
dataset_train = make_dataset(filepaths_train, metadata=metadata, **kwargs)
dataset_validate = make_dataset(filepaths_validate, metadata=metadata, **kwargs)
dataset_test = None
if test_percentage > 0:
if percentage_test > 0:
dataset_test = make_dataset(filepaths_test, metadata=metadata, **kwargs)
return dataset_train, dataset_validate, dataset_test #, filepaths