From 3e4128c0a8171365ababa08af3cbe2a3f579b610 Mon Sep 17 00:00:00 2001
From: Starbeamrainbowlabs
Date: Thu, 1 Sep 2022 18:47:07 +0100
Subject: [PATCH] resize rainfall to be 1/2 size of current

---
 aimodel/src/lib/dataset/dataset.py  | 55 +++++++++++--------
 .../src/lib/python/json2tfrecord.py |  2 +
 2 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/aimodel/src/lib/dataset/dataset.py b/aimodel/src/lib/dataset/dataset.py
index cd7f26a..3ef9677 100644
--- a/aimodel/src/lib/dataset/dataset.py
+++ b/aimodel/src/lib/dataset/dataset.py
@@ -6,40 +6,47 @@ from loguru import logger
 
 import tensorflow as tf
 
+from ..io.readfile import readfile
 from .shuffle import shuffle
 
 
-# TO PARSE:
-@tf.function
-def parse_item(item):
-	parsed = tf.io.parse_single_example(item, features={
-		"rainfallradar": tf.io.FixedLenFeature([], tf.string),
-		"waterdepth": tf.io.FixedLenFeature([], tf.string)
-	})
-	rainfall = tf.io.parse_tensor(parsed["rainfallradar"], out_type=tf.float32)
-	water = tf.io.parse_tensor(parsed["waterdepth"], out_type=tf.float32)
-	
-	# [channels, width, height] → [width, height, channels] - ref ConvNeXt does not support data_format=channels_first
-	rainfall = tf.transpose(rainfall, [1, 2, 0])
-	# [width, height] → [width, height, channels]
-	water = tf.expand_dims(water, axis=-1)
-	
-	# TODO: The shape of the resulting tensor can't be statically determined, so we need to reshape here
-	print("DEBUG:dataset ITEM rainfall:shape", rainfall.shape, "water:shape", water.shape)
-	# TODO: Any other additional parsing here, since multiple .map() calls are not optimal
-	return ((rainfall, water), tf.ones(1))
+# TO PARSE:
+def parse_item(metadata):
+	def parse_item_inner(item):
+		parsed = tf.io.parse_single_example(item, features={
+			"rainfallradar": tf.io.FixedLenFeature([], tf.string),
+			"waterdepth": tf.io.FixedLenFeature([], tf.string)
+		})
+		rainfall = tf.io.parse_tensor(parsed["rainfallradar"], out_type=tf.float32)
+		water = tf.io.parse_tensor(parsed["waterdepth"], out_type=tf.float32)
+		
+		# [channels, width, height] → [width, height, channels] - ref ConvNeXt does not support data_format=channels_first
+		rainfall = tf.transpose(rainfall, [1, 2, 0])
+		# [width, height] → [width, height, channels]
+		water = tf.expand_dims(water, axis=-1)
+		
+		rainfall = tf.image.resize(rainfall, tf.constant(metadata["waterdepth"]))
+		
+		# TODO: The shape of the resulting tensor can't be statically determined, so we need to reshape here
+		print("DEBUG:dataset ITEM rainfall:shape", rainfall.shape, "water:shape", water.shape)
+		# TODO: Any other additional parsing here, since multiple .map() calls are not optimal
+		return ((rainfall, water), tf.ones(1))
+	
+	return tf.function(parse_item_inner)
 
-def make_dataset(filenames, compression_type="GZIP", parallel_reads_multiplier=1.5, shuffle_buffer_size=128, batch_size=64):
+def make_dataset(filenames, metadata, compression_type="GZIP", parallel_reads_multiplier=1.5, shuffle_buffer_size=128, batch_size=64):
 	return tf.data.TFRecordDataset(filenames,
 		compression_type=compression_type,
 		num_parallel_reads=math.ceil(os.cpu_count() * parallel_reads_multiplier)
 	).shuffle(shuffle_buffer_size) \
-		.map(parse_item, num_parallel_calls=tf.data.AUTOTUNE) \
+		.map(parse_item(metadata), num_parallel_calls=tf.data.AUTOTUNE) \
 		.batch(batch_size) \
 		.prefetch(tf.data.AUTOTUNE)
 
 
 def dataset(dirpath_input, batch_size=64, train_percentage=0.8, parallel_reads_multiplier=1.5):
+	filepath_meta = os.path.join(dirpath_input, "metadata.json")
+	
 	filepaths = shuffle(list(filter(
 		lambda filepath: str(filepath).endswith(".tfrecord.gz"),
 		[ file.path for file in os.scandir(dirpath_input) ] # .path on a DirEntry object yields the absolute filepath
@@ -50,8 +57,10 @@ def dataset(dirpath_input, batch_size=64, train_percentage=0.8, parallel_reads_m
 	filepaths_train = filepaths[:dataset_splitpoint]
 	filepaths_validate = filepaths[dataset_splitpoint:]
 	
-	dataset_train = make_dataset(filepaths_train, batch_size=batch_size, parallel_reads_multiplier=parallel_reads_multiplier)
-	dataset_validate = make_dataset(filepaths_validate, batch_size=batch_size, parallel_reads_multiplier=parallel_reads_multiplier)
+	metadata = json.loads(readfile(filepath_meta))
+	
+	dataset_train = make_dataset(filepaths_train, metadata, batch_size=batch_size, parallel_reads_multiplier=parallel_reads_multiplier)
+	dataset_validate = make_dataset(filepaths_validate, metadata, batch_size=batch_size, parallel_reads_multiplier=parallel_reads_multiplier)
 	
 	return dataset_train, dataset_validate #, filepaths
 
diff --git a/rainfallwrangler/src/lib/python/json2tfrecord.py b/rainfallwrangler/src/lib/python/json2tfrecord.py
index 27dbdd1..0fc41af 100755
--- a/rainfallwrangler/src/lib/python/json2tfrecord.py
+++ b/rainfallwrangler/src/lib/python/json2tfrecord.py
@@ -37,6 +37,8 @@ def convert(filepath_in, filepath_out):
 	rainfall = tf.constant(obj["rainfallradar"], dtype=tf.float32)
 	water = tf.constant(obj["waterdepth"], dtype=tf.float32)
 	
+	# TODO: cast float32 → divide by max_value → clip 0-1 (or -1 to +1? I don't know)
+	
 	###
 	## 3: Print shape definitions (required when parsing)
 	###
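-- 

Note: json.loads() returns a plain dict, so the shape recorded in metadata.json must be read with subscript syntax - metadata["waterdepth"], as in parse_item_inner() above - rather than attribute access (metadata.waterdepth would raise AttributeError at trace time). dataset.py also needs json imported at the top, which this hunk does not show. A minimal sketch of the assumed flow, where the key names match the patch but the example shape values are hypothetical:

	import json
	import tensorflow as tf
	
	# Hypothetical metadata.json, sitting alongside the .tfrecord.gz files:
	#   { "rainfallradar": [24, 174, 348], "waterdepth": [174, 348] }
	with open("metadata.json", "r") as handle:
		metadata = json.loads(handle.read())
	
	# tf.image.resize() takes size=[new_height, new_width], so the recorded
	# waterdepth shape can be passed straight through as the target size.
	rainfall = tf.random.uniform([348, 696, 24])  # placeholder [height, width, channels] tensor
	rainfall = tf.image.resize(rainfall, tf.constant(metadata["waterdepth"]))
	print(rainfall.shape)  # (174, 348, 24)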
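The TODO added to json2tfrecord.py sketches a normalisation pass: cast to float32, divide by some maximum value, then clip into a fixed range. One possible reading of it, where normalise() and max_value are hypothetical names and 0-1 is only one of the two ranges the TODO floats:

	import tensorflow as tf
	
	def normalise(tensor, max_value):
		tensor = tf.cast(tensor, tf.float32)       # cast float32
		tensor = tensor / max_value                # divide by max_value
		return tf.clip_by_value(tensor, 0.0, 1.0)  # clip 0-1
	
	# e.g. water = normalise(water, max_value=10.0), with the maximum computed
	# (or configured) per dataset; for the -1 to +1 alternative, rescale with
	# tensor * 2.0 - 1.0 and clip to (-1.0, 1.0) instead.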