datasets: add shuffle=True/False to get_filepaths.

This is important because otherwise it SCRAMBLES the filenames, which is a disaster for making predictions in the right order!
This commit is contained in:
Starbeamrainbowlabs 2022-10-19 16:52:07 +01:00
parent fe43ddfbf9
commit 63e909d9fc
Signed by: sbrl
GPG key ID: 1BE5172E637709C2
2 changed files with 19 additions and 8 deletions

View file

@ -66,11 +66,17 @@ def make_dataset(filepaths, metadata, shape_watch_desired=[100,100], compression
return dataset
def _shuffle_filepaths(filepaths):
    """Pass filepaths through the module-level shuffle() function."""
    # This lives outside get_filepaths() because that function's `shuffle`
    # keyword argument shadows the module-level shuffle() of the same name;
    # calling shuffle(...) in there would try to call a bool.
    return shuffle(filepaths)


def get_filepaths(dirpath_input, shuffle=True):
    """List the .tfrecord.gz files directly inside a directory.

    Args:
        dirpath_input: Directory to scan (not recursive).
        shuffle: If True, the filepaths are passed through the module-level
            shuffle() function. If False, they are sorted numerically by the
            part of the filename before the first dot, so that predictions
            are made in a stable order.

    Returns:
        list: The matching filepaths, as given by DirEntry.path.

    Raises:
        ValueError: If shuffle is False and a matching filename does not
            start with an integer (e.g. "abc.tfrecord.gz").
    """
    # The with-block closes the scandir iterator promptly.
    with os.scandir(dirpath_input) as entries:
        result = [
            entry.path  # dirpath_input joined with the entry's name
            for entry in entries
            if str(entry.path).endswith(".tfrecord.gz")
        ]
    if shuffle:
        return _shuffle_filepaths(result)
    # Key on the basename only: the directory part of the path is not numeric
    # (and may itself contain dots), so int() on the full path would fail.
    return sorted(
        result,
        key=lambda filepath: int(os.path.basename(filepath).split(".", 1)[0]),
    )
def dataset(dirpath_input, batch_size=64, train_percentage=0.8, parallel_reads_multiplier=1.5):
filepaths = get_filepaths(dirpath_input)
@ -99,7 +105,7 @@ def dataset_predict(dirpath_input, parallel_reads_multiplier=1.5, prefetch=True)
Returns:
tf.data.Dataset: A tensorflow Dataset for the given input files.
"""
filepaths = get_filepaths(dirpath_input) if os.path.isdir(dirpath_input) else [ dirpath_input ]
filepaths = get_filepaths(dirpath_input, shuffle=False) if os.path.isdir(dirpath_input) else [ dirpath_input ]
return make_dataset(
filepaths=filepaths,

View file

@ -63,11 +63,16 @@ def make_dataset(filepaths, metadata, shape_water_desired=[100,100], water_thres
return dataset
def _shuffle_filepaths(filepaths):
    """Pass filepaths through the module-level shuffle() function."""
    # Kept separate from get_filepaths(): inside that function the name
    # `shuffle` refers to its boolean keyword argument, not to the
    # module-level shuffle() function.
    return shuffle(filepaths)


def get_filepaths(dirpath_input, shuffle=True):
    """Find the .tfrecord.gz files directly inside a directory.

    Args:
        dirpath_input: Directory to scan (not recursive).
        shuffle: If True, the filepaths are passed through the module-level
            shuffle() function. If False, they are sorted numerically by the
            part of the filename before the first dot, giving a stable order
            for making predictions.

    Returns:
        list: The matching filepaths, as given by DirEntry.path.

    Raises:
        ValueError: If shuffle is False and a matching filename does not
            start with an integer.
    """
    # The with-block closes the scandir iterator promptly.
    with os.scandir(dirpath_input) as entries:
        result = [
            entry.path  # dirpath_input joined with the entry's name
            for entry in entries
            if str(entry.path).endswith(".tfrecord.gz")
        ]
    if shuffle:
        return _shuffle_filepaths(result)
    # Sort on the basename: int() on the whole path (directory included)
    # would raise ValueError.
    return sorted(
        result,
        key=lambda filepath: int(os.path.basename(filepath).split(".", 1)[0]),
    )
def dataset_segmenter(dirpath_input, batch_size=64, train_percentage=0.8, parallel_reads_multiplier=1.5, water_threshold=0.1, shape_water_desired=[100,100]):
filepaths = get_filepaths(dirpath_input)
@ -97,7 +102,7 @@ def dataset_predict(dirpath_input, parallel_reads_multiplier=1.5, prefetch=True,
Returns:
tf.data.Dataset: A tensorflow Dataset for the given input files.
"""
filepaths = get_filepaths(dirpath_input) if os.path.isdir(dirpath_input) else [ dirpath_input ]
filepaths = get_filepaths(dirpath_input, shuffle=False) if os.path.isdir(dirpath_input) else [ dirpath_input ]
return make_dataset(
filepaths=filepaths,