dlr: add PARALLEL_READS env var, update docs

Starbeamrainbowlabs 2023-11-30 16:33:22 +00:00
parent 60674fb6b3
commit 0f9f185983
Signed by: sbrl
GPG Key ID: 1BE5172E637709C2
3 changed files with 7 additions and 4 deletions

View File

@@ -33,13 +33,14 @@ show_help() {
echo -e " DIR_RAINFALLWATER The path to the directory the .tfrecord files containing the rainfall radar / water depth data." >&2;
echo -e " PATH_HEIGHTMAP The path to the heightmap jsonl file to read in." >&2;
echo -e " PATH_COLOURMAP The path to the colourmap for predictive purposes." >&2;
echo -e " PARALLEL_READS Multiplier for the number of files to read in parallel. 1 = number of CPU cores available. Very useful on high-read-latency systems (e.g. HPC like Viper) to avoid starving the GPU of data. WILL MANGLE THE ORDERING OF DATA. Set to 0 to disable and read data sequentially. WILL ONLY NOT MANGLE DATA IF PREDICT_AS_ONE IS SET. Defaults to 1.5." >&2;
echo -e " STEPS_PER_EPOCH The number of steps to consider an epoch. Defaults to None, which means use the entire dataset." >&2;
echo -e " NO_REMOVE_ISOLATED_PIXELS Set to any value to avoid the engine from removing isolated pixels - that is, water pixels with no other surrounding pixels, either side to side to diagonally." >&2;
echo -e " EPOCHS The number of epochs to train for." >&2;
echo -e " LOSS The loss function to use. Default: cross-entropy (possible values: cross-entropy, cross-entropy-dice)." >&2;
echo -e " DICE_LOG_COSH When in cross-entropy-dice mode, in addition do loss = cel + log(cosh(dice_loss)) instead of just loss = cel + dice_loss." >&2;
echo -e " WATER_THRESHOLD The threshold to cut water off at when training, in metres. Default: 0.1" >&2;
echo -e " PATH_CHECKPOINT The path to a checkcpoint to load. If specified, a model will be loaded instead of being trained." >&2;
echo -e " PATH_CHECKPOINT The path to a checkpoint to load. If specified, a model will be loaded instead of being trained." >&2;
echo -e " LEARNING_RATE The learning rate to use. Default: 0.001." >&2;
echo -e " UPSAMPLE How much to upsample by at the beginning of the model. A value of disables upscaling. Default: 2." >&2;
echo -e " STEPS_PER_EXECUTION How many steps to perform before surfacing from the GPU to e.g. do callbacks. Default: 16." >&2;

View File

@@ -43,6 +43,7 @@ NUM_CLASSES = 2
DIR_RAINFALLWATER = os.environ["DIR_RAINFALLWATER"]
PATH_HEIGHTMAP = os.environ["PATH_HEIGHTMAP"]
PATH_COLOURMAP = os.environ["PATH_COLOURMAP"]
PARALLEL_READS = float(os.environ["PARALLEL_READS"]) if "PARALLEL_READS" in os.environ else 1.5
STEPS_PER_EPOCH = int(os.environ["STEPS_PER_EPOCH"]) if "STEPS_PER_EPOCH" in os.environ else None
REMOVE_ISOLATED_PIXELS = False if "NO_REMOVE_ISOLATED_PIXELS" in os.environ else True
EPOCHS = int(os.environ["EPOCHS"]) if "EPOCHS" in os.environ else 50
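As an aside, the defaults-with-a-cast pattern above can also be written with os.environ.get, avoiding the double lookup; the None default is the exception, since int(None) would fail. A stylistic sketch, not part of this commit:

import os

PARALLEL_READS = float(os.environ.get("PARALLEL_READS", 1.5))
# STEPS_PER_EPOCH keeps the conditional form: its default is None,
# which can't be passed through int().
STEPS_PER_EPOCH = int(os.environ["STEPS_PER_EPOCH"]) if "STEPS_PER_EPOCH" in os.environ else None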
@@ -69,7 +70,7 @@ if not os.path.exists(DIR_OUTPUT):
# ~~~
logger.info("DeepLabV3+ rainfall radar TEST")
for env_name in [ "BATCH_SIZE","NUM_CLASSES", "DIR_RAINFALLWATER", "PATH_HEIGHTMAP", "PATH_COLOURMAP", "STEPS_PER_EPOCH", "REMOVE_ISOLATED_PIXELS", "EPOCHS", "LOSS", "LEARNING_RATE", "DIR_OUTPUT", "PATH_CHECKPOINT", "PREDICT_COUNT", "DICE_LOG_COSH", "WATER_THRESHOLD", "UPSAMPLE", "STEPS_PER_EXECUTION", "JIT_COMPILE", "PREDICT_AS_ONE" ]:
for env_name in [ "BATCH_SIZE","NUM_CLASSES", "DIR_RAINFALLWATER", "PATH_HEIGHTMAP", "PATH_COLOURMAP", "STEPS_PER_EPOCH", "PARALLEL_READS", "REMOVE_ISOLATED_PIXELS", "EPOCHS", "LOSS", "LEARNING_RATE", "DIR_OUTPUT", "PATH_CHECKPOINT", "PREDICT_COUNT", "DICE_LOG_COSH", "WATER_THRESHOLD", "UPSAMPLE", "STEPS_PER_EXECUTION", "JIT_COMPILE", "PREDICT_AS_ONE" ]:
logger.info(f"> {env_name} {str(globals()[env_name])}")
@@ -88,7 +89,8 @@ if not PREDICT_AS_ONE:
output_size=IMAGE_SIZE,
input_size="same",
filepath_heightmap=PATH_HEIGHTMAP,
do_remove_isolated_pixels=REMOVE_ISOLATED_PIXELS
do_remove_isolated_pixels=REMOVE_ISOLATED_PIXELS,
parallel_reads_multiplier=PARALLEL_READS
)
logger.info("Train Dataset:", dataset_train)

View File

@@ -178,7 +178,7 @@ def dataset_mono_predict(dirpath_input, batch_size=64, **kwargs):
filepaths=filepaths,
metadata=read_metadata(dirpath_input),
batch_size=batch_size, # WAS None
shuffle=False, #even with shuffle=False we're not gonna get them all in the same order since we're reading in parallel
shuffle=False, #even with shuffle=False we're not gonna get them all in the same order since we're reading in parallel by default
**kwargs
)
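If predictions ever needed to line up with file order without setting PARALLEL_READS to 0, tf.data can trade throughput for determinism instead. A hedged sketch, with a toy dataset standing in for the loader's output:

import tensorflow as tf

# Toy dataset standing in for whatever the loader returned.
dataset = tf.data.Dataset.from_tensor_slices(list(range(8)))

# Force parallel stages to produce elements in deterministic order,
# at some cost to throughput.
options = tf.data.Options()
options.deterministic = True
dataset = dataset.with_options(options)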