datasets: add shuffle=True/False to get_filepaths.

This is important because otherwise it SCRAMBLES the filenames, which is a disaster for making predictions in the right order!
2024-11-22 09:13:01 +00:00 · 2022-10-19 16:52:07 +01:00 · 2022-10-19 16:52:07 +01:00 · 63e909d9fc
commit 63e909d9fc
parent fe43ddfbf9
2 changed files with 19 additions and 8 deletions
--- a/aimodel/src/lib/dataset/dataset.py
+++ b/aimodel/src/lib/dataset/dataset.py
@ -66,11 +66,17 @@ def make_dataset(filepaths, metadata, shape_watch_desired=[100,100], compression
 	return dataset


-def get_filepaths(dirpath_input):
-	return shuffle(list(filter(
+def get_filepaths(dirpath_input, shuffle=True):
+	"""Lists the .tfrecord.gz files in dirpath_input: shuffled if shuffle is truthy, otherwise sorted by the integer before the first dot of each filename."""
+	result = list(filter(
 		lambda filepath: str(filepath).endswith(".tfrecord.gz"),
 		[ file.path for file in os.scandir(dirpath_input) ] # .path on a DirEntry object yields the absolute filepath
-	)))
+	))
+	if shuffle:
+		return globals()["shuffle"](result) # The `shuffle` argument shadows the module-level shuffle() function, so look the function up explicitly
+	
+	# Each entry includes the directory part, so the numeric sort key must be parsed from the basename only
+	return sorted(result, key=lambda filepath: int(os.path.basename(filepath).split(".", 1)[0]))

 def dataset(dirpath_input, batch_size=64, train_percentage=0.8, parallel_reads_multiplier=1.5):
 	filepaths = get_filepaths(dirpath_input)
@ -99,7 +105,7 @@ def dataset_predict(dirpath_input, parallel_reads_multiplier=1.5, prefetch=True)
 	Returns:
 		tf.data.Dataset: A tensorflow Dataset for the given input files.
 	"""
-	filepaths = get_filepaths(dirpath_input) if os.path.isdir(dirpath_input) else [ dirpath_input ]
+	filepaths = get_filepaths(dirpath_input, shuffle=False) if os.path.isdir(dirpath_input) else [ dirpath_input ]
 	
 	return make_dataset(
 		filepaths=filepaths,
--- a/aimodel/src/lib/dataset/dataset_segmenter.py
+++ b/aimodel/src/lib/dataset/dataset_segmenter.py
@ -63,11 +63,16 @@ def make_dataset(filepaths, metadata, shape_water_desired=[100,100], water_thres
 	return dataset


-def get_filepaths(dirpath_input):
-	return shuffle(list(filter(
+def get_filepaths(dirpath_input, shuffle=True):
+	"""Lists the .tfrecord.gz files in dirpath_input: shuffled if shuffle is truthy, otherwise sorted by the integer before the first dot of each filename."""
+	result = list(filter(
 		lambda filepath: str(filepath).endswith(".tfrecord.gz"),
 		[ file.path for file in os.scandir(dirpath_input) ] # .path on a DirEntry object yields the absolute filepath
-	)))
+	))
+	if shuffle:
+		return globals()["shuffle"](result) # The `shuffle` argument shadows the module-level shuffle() function, so look the function up explicitly
+	# Each entry includes the directory part, so the numeric sort key must be parsed from the basename only
+	return sorted(result, key=lambda filepath: int(os.path.basename(filepath).split(".", 1)[0]))

 def dataset_segmenter(dirpath_input, batch_size=64, train_percentage=0.8, parallel_reads_multiplier=1.5, water_threshold=0.1, shape_water_desired=[100,100]):
 	filepaths = get_filepaths(dirpath_input)
@ -97,7 +102,7 @@ def dataset_predict(dirpath_input, parallel_reads_multiplier=1.5, prefetch=True,
 	Returns:
 		tf.data.Dataset: A tensorflow Dataset for the given input files.
 	"""
-	filepaths = get_filepaths(dirpath_input) if os.path.isdir(dirpath_input) else [ dirpath_input ]
+	filepaths = get_filepaths(dirpath_input, shuffle=False) if os.path.isdir(dirpath_input) else [ dirpath_input ]
 	
 	return make_dataset(
 		filepaths=filepaths,