From 15a3519107d88c8584e5565e4db86e031b32b26b Mon Sep 17 00:00:00 2001
From: Starbeamrainbowlabs
Date: Thu, 11 Aug 2022 18:26:28 +0100
Subject: [PATCH] ai: the best thing about implementing a model is that you
 don't have to test it on the same day :P

---
 aimodel/src/lib/ai/RainfallWaterContraster.py | 28 ++++--
 aimodel/src/lib/dataset/dataset.py            |  4 +-
 aimodel/src/lib/dataset/read_metadata.py      | 10 +++
 aimodel/src/lib/io/find_paramsjson.py         | 17 ++++
 aimodel/src/lib/vis/embeddings.py             | 49 +++++++++++
 aimodel/src/subcommands/__init__.py           |  0
 aimodel/src/subcommands/pretrain.py           | 63 ++++++++++++
 aimodel/src/subcommands/pretrain_predict.py   | 89 ++++++++++++++++++++
 8 files changed, 253 insertions(+), 7 deletions(-)
 create mode 100644 aimodel/src/lib/dataset/read_metadata.py
 create mode 100644 aimodel/src/lib/io/find_paramsjson.py
 create mode 100644 aimodel/src/lib/vis/embeddings.py
 create mode 100644 aimodel/src/subcommands/__init__.py
 create mode 100644 aimodel/src/subcommands/pretrain.py
 create mode 100644 aimodel/src/subcommands/pretrain_predict.py

diff --git a/aimodel/src/lib/ai/RainfallWaterContraster.py b/aimodel/src/lib/ai/RainfallWaterContraster.py
index a233c5b..b452799 100644
--- a/aimodel/src/lib/ai/RainfallWaterContraster.py
+++ b/aimodel/src/lib/ai/RainfallWaterContraster.py
@@ -6,6 +6,7 @@ import json
 
 import tensorflow as tf
 
+from ..io.find_paramsjson import find_paramsjson
 from ..io.readfile import readfile
 from ..io.writefile import writefile
 
@@ -36,20 +37,32 @@ class RainfallWaterContraster(object):
 			self.filepath_summary = os.path.join(self.dir_output, "summary.txt")
 			
 			summarywriter(self.model, self.filepath_summary)
-			writefile(os.path.join(self.dir_output, "params.json"), json.dumps(self.model.get_config()))
+			# Serialise the hyperparameters to JSON before writing them to disk
+			writefile(os.path.join(self.dir_output, "params.json"), json.dumps(self.get_config()))
 		else:
 			self.model = self.load_model(filepath_checkpoint)
 	
+	def get_config(self):
+		return {
+			"epochs": self.epochs,
+			"batch_size": self.batch_size,
+			**self.kwargs
+		}
 	
 	@staticmethod
-	def from_checkpoint(filepath_checkpoint, filepath_hyperparams):
+	def from_checkpoint(filepath_checkpoint, filepath_hyperparams=None):
+		# If no hyperparameters file was specified, look for a params.json near the checkpoint
+		if filepath_hyperparams is None:
+			filepath_hyperparams = find_paramsjson(filepath_checkpoint)
 		hyperparams = json.loads(readfile(filepath_hyperparams))
 		return RainfallWaterContraster(filepath_checkpoint=filepath_checkpoint, **hyperparams)
 	
+	
 	def make_model(self):
 		model = model_rainfallwater_contrastive(batch_size=self.batch_size, **self.kwargs)
 		return model
 	
+	
 	def load_model(self, filepath_checkpoint):
 		"""
 		Loads a saved model from the given filename.
@@ -77,9 +90,12 @@ class RainfallWaterContraster(object):
 		for batch in dataset:
 			i_batch += 1
 			result_batch = self.model(batch[0])
-			# Currently, the left and right should be the same
-			left, _ = tf.unstack(result_batch, axis=-2)
-			result_batch = tf.unstack(left, axis=0)
-			result.extend(result_batch)
+			# Split the result into the rainfall and water halves of each embedding pair
+			rainfall, water = tf.unstack(result_batch, axis=-2)
+			
+			rainfall = tf.unstack(rainfall, axis=0)
+			water = tf.unstack(water, axis=0)
+			
+			result.extend(zip(rainfall, water))
 		
 		return result
\ No newline at end of file
diff --git a/aimodel/src/lib/dataset/dataset.py b/aimodel/src/lib/dataset/dataset.py
index 88593c3..f44d655 100644
--- a/aimodel/src/lib/dataset/dataset.py
+++ b/aimodel/src/lib/dataset/dataset.py
@@ -49,8 +49,10 @@ def dataset(dirpath_input, batch_size=64, train_percentage=0.8, parallel_reads_multiplier=1.5):
 	dataset_train = make_dataset(filepaths_train, batch_size=batch_size, parallel_reads_multiplier=parallel_reads_multiplier)
 	dataset_validate = make_dataset(filepaths_validate, batch_size=batch_size, parallel_reads_multiplier=parallel_reads_multiplier)
 	
-	return dataset_train, dataset_validate
+	return dataset_train, dataset_validate, filepaths
 
+def dataset_predict():
+	raise NotImplementedError("Not implemented yet")
 
 if __name__ == "__main__":
 	
diff --git a/aimodel/src/lib/dataset/read_metadata.py b/aimodel/src/lib/dataset/read_metadata.py
new file mode 100644
index 0000000..2a1dc5f
--- /dev/null
+++ b/aimodel/src/lib/dataset/read_metadata.py
@@ -0,0 +1,10 @@
+import os
+import json
+
+
+from ..io.readfile import readfile
+
+def read_metadata(dirpath_dataset):
+	filepath_metadata = os.path.join(dirpath_dataset, "metadata.json")
+	
+	return json.loads(readfile(filepath_metadata))
\ No newline at end of file
diff --git a/aimodel/src/lib/io/find_paramsjson.py b/aimodel/src/lib/io/find_paramsjson.py
new file mode 100644
index 0000000..01863db
--- /dev/null
+++ b/aimodel/src/lib/io/find_paramsjson.py
@@ -0,0 +1,17 @@
+import os
+
+def find_paramsjson(filepath_checkpoint):
+	filepath_stem = os.path.splitext(filepath_checkpoint)[0]
+	dirpath_container = os.path.dirname(filepath_checkpoint)
+	dirpath_parent = os.path.dirname(dirpath_container)
+	
+	options = [
+		f"{filepath_stem}.json",
+		os.path.join(dirpath_container, "params.json"),
+		os.path.join(dirpath_parent, "params.json")
+	]
+	for candidate in options:
+		if os.path.exists(candidate):
+			return candidate
+	
+	return None
\ No newline at end of file
diff --git a/aimodel/src/lib/vis/embeddings.py b/aimodel/src/lib/vis/embeddings.py
new file mode 100644
index 0000000..e926f1f
--- /dev/null
+++ b/aimodel/src/lib/vis/embeddings.py
@@ -0,0 +1,49 @@
+import os
+
+import umap
+import umap.plot
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas
+
+def vis_embeddings(filepath_output, features, labels=None):
+	dimreducer = umap.UMAP(min_dist=0.05).fit(features)
+	
+	px = 1 / plt.rcParams['figure.dpi'] # matplotlib sizes are in inches :-( :-( :-(
+	width = 1920
+	height = 768
+	
+	plt.rc("font", size=20)
+	plt.rc("font", family="Ubuntu")
+	figure = plt.figure(figsize=(width*px, height*px))
+	figure.add_subplot(1, 2, 1)
+	
+	# 1: UMAP
+	umap.plot.points(dimreducer,
+		labels=labels,
+		color_key_cmap="brg", # color_key_cmap="jet",
+		ax=figure.get_axes()[0]
+	)
+	plt.title("UMAP Dimensionality Reduction", fontsize=20)
+	
+	# 2: Parallel coordinates
+	figure.add_subplot(1, 2, 2)
+	# CHEESE: This won't produce a terribly accurate result, as we're just ignoring most of CLIP's embedded features.
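+	# (Aside: parallel coordinates draws one vertical axis per feature dimension, so
+	# with an embedding hundreds of dimensions wide most axes become unreadable; an
+	# untested alternative would be to plot just a slice, e.g. features[:, :10].)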
+	dataframe = pandas.DataFrame(features)
+	dataframe["Label"] = labels if labels is not None else range(len(features))
+	pandas.plotting.parallel_coordinates(
+		dataframe,
+		"Label",
+		ax=figure.get_axes()[1],
+		use_columns=False,
+		axvlines=False,
+		sort_labels=True
+	)
+	
+	plt.title("Parallel coordinates plot", fontsize=20)
+	
+	plt.suptitle(f"ContrastiveE1 embeddings | ResNetV2 | {len(features)} items", fontsize=28, weight="bold")
+	plt.savefig(filepath_output)
\ No newline at end of file
diff --git a/aimodel/src/subcommands/__init__.py b/aimodel/src/subcommands/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/aimodel/src/subcommands/pretrain.py b/aimodel/src/subcommands/pretrain.py
new file mode 100644
index 0000000..16fdbc4
--- /dev/null
+++ b/aimodel/src/subcommands/pretrain.py
@@ -0,0 +1,63 @@
+import sys
+import argparse
+
+import tensorflow as tf
+
+from lib.ai.RainfallWaterContraster import RainfallWaterContraster
+from lib.dataset.dataset import dataset
+from lib.dataset.read_metadata import read_metadata
+
+def parse_args():
+	parser = argparse.ArgumentParser(description="Pretrain a contrastive learning model on a directory of rainfall+water .tfrecord.gz files.")
+	# parser.add_argument("--config", "-c", help="Filepath to the TOML config file to load.", required=True)
+	parser.add_argument("--input", "-i", help="Path to the input directory containing the .tfrecord.gz files to pretrain with.", required=True)
+	parser.add_argument("--output", "-o", help="Path to the output directory to write output to (will be created automatically if it doesn't exist).", required=True)
+	parser.add_argument("--feature-dim", help="The size of the output feature dimension of the model [default: 200].", type=int)
+	parser.add_argument("--batch-size", help="Sets the batch size [default: 64].", type=int)
+	parser.add_argument("--reads-multiplier", help="Optional. The multiplier for the number of files we should read from at once. Defaults to 1.5, which means read ceil(NUMBER_OF_CORES * 1.5) files at once. Set to a higher number on systems with high read latency to avoid starving the GPU of data.", type=float)
+	
+	return parser
+
+def run(args):
+	# Fill in defaults for any arguments that weren't specified
+	if (not hasattr(args, "batch_size")) or args.batch_size is None:
+		args.batch_size = 64
+	if (not hasattr(args, "feature_dim")) or args.feature_dim is None:
+		args.feature_dim = 200
+	if (not hasattr(args, "reads_multiplier")) or args.reads_multiplier is None:
+		args.reads_multiplier = 1.5
+	
+	
+	# TODO: Validate args here.
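+	# A minimal sketch of what that validation could look like (untested; assumes
+	# an `import os` up top):
+	#   if not os.path.isdir(args.input):
+	#       raise Exception(f"Error: The input directory '{args.input}' doesn't exist.")
+	#   os.makedirs(args.output, exist_ok=True)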
+	
+	sys.stderr.write(f"\n\n>>> This is TensorFlow {tf.__version__}\n\n\n")
+	
+	dataset_train, dataset_validate, _filepaths = dataset(
+		dirpath_input=args.input,
+		batch_size=args.batch_size,
+		parallel_reads_multiplier=args.reads_multiplier
+	)
+	dataset_metadata = read_metadata(args.input)
+	
+	# for items in dataset_train.repeat(10):
+	# 	print("ITEMS", len(items))
+	# 	print("LEFT", [ item.shape for item in items[0] ])
+	# 	print("ITEMS DONE")
+	# exit(0)
+	
+	ai = RainfallWaterContraster(
+		dir_output=args.output,
+		batch_size=args.batch_size,
+		feature_dim=args.feature_dim,
+		
+		shape_rainfall=dataset_metadata["rainfallradar"],
+		shape_water=dataset_metadata["waterdepth"]
+	)
+	
+	ai.train(dataset_train, dataset_validate)
+	
\ No newline at end of file
diff --git a/aimodel/src/subcommands/pretrain_predict.py b/aimodel/src/subcommands/pretrain_predict.py
new file mode 100644
index 0000000..f6e1ce8
--- /dev/null
+++ b/aimodel/src/subcommands/pretrain_predict.py
@@ -0,0 +1,89 @@
+import io
+import json
+import os
+import sys
+import argparse
+
+import tensorflow as tf
+import numpy as np
+
+from lib.ai.RainfallWaterContraster import RainfallWaterContraster
+from lib.dataset.dataset import dataset_predict
+from lib.io.find_paramsjson import find_paramsjson
+from lib.io.readfile import readfile
+from lib.vis.embeddings import vis_embeddings
+
+def parse_args():
+	parser = argparse.ArgumentParser(description="Output feature maps using a given pretrained contrastive model.")
+	# parser.add_argument("--config", "-c", help="Filepath to the TOML config file to load.", required=True)
+	parser.add_argument("--input", "-i", help="Path to input directory containing the images to predict for.", required=True)
+	parser.add_argument("--output", "-o", help="Path to output file to write output to. Defaults to stdout; note that the UMAP visualisation is only produced when an output filepath is specified.")
+	parser.add_argument("--checkpoint", "-c", help="Checkpoint file to load model weights from.", required=True)
+	parser.add_argument("--params", "-p", help="Optional. The file containing the model hyperparameters (usually called 'params.json'). If not specified, its location will be determined automatically.")
+	parser.add_argument("--reads-multiplier", help="Optional. The multiplier for the number of files we should read from at once. Defaults to 1.5, which means read ceil(NUMBER_OF_CORES * 1.5) files at once. Set to a higher number on systems with high read latency to avoid starving the GPU of data.", type=float)
+	parser.add_argument("--no-vis",
+		help="Don't also plot a visualisation of the resulting embeddings.", action="store_true")
+	parser.add_argument("--only-gpu",
+		help="If the GPU is not available, exit with an error (useful on shared HPC systems to avoid running out of memory & affecting other users).", action="store_true")
+	
+	return parser
+
+def run(args):
+	# Note that we do NOT check to see if the checkpoint file exists, because Tensorflow/Keras requires that we pass the stem instead of the actual index file..... :-/
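+	# For example, weights saved in the TensorFlow checkpoint format live on disk as
+	# something like "checkpoint.ckpt.index" plus "checkpoint.ckpt.data-00000-of-00001",
+	# but load_weights() wants the bare "checkpoint.ckpt" stem, which os.path.exists()
+	# would report as missing. (Filenames here are illustrative.)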
+	if (not hasattr(args, "params")) or args.params is None:
+		args.params = find_paramsjson(args.checkpoint)
+	if (not hasattr(args, "reads_multiplier")) or args.reads_multiplier is None:
+		args.reads_multiplier = 1.5
+	
+	if args.params is None or not os.path.exists(args.params):
+		raise Exception(f"Error: The specified filepath to the params.json hyperparameters ('{args.params}') does not exist.")
+	
+	
+	filepath_output = args.output if hasattr(args, "output") and args.output is not None else "-"
+	
+	
+	ai = RainfallWaterContraster.from_checkpoint(args.checkpoint, args.params)
+	
+	sys.stderr.write(f"\n\n>>> This is TensorFlow {tf.__version__}\n\n\n")
+	
+	dataset_train, filepaths, filepaths_length = dataset_predict(
+		dirpath_input=args.input,
+		batch_size=ai.batch_size,
+		parallel_reads_multiplier=args.reads_multiplier
+	)
+	filepaths = filepaths[0:filepaths_length]
+	
+	# for items in dataset_train.repeat(10):
+	# 	print("ITEMS", len(items))
+	# 	print("LEFT", [ item.shape for item in items[0] ])
+	# 	print("ITEMS DONE")
+	# exit(0)
+	
+	handle = sys.stdout
+	if filepath_output != "-":
+		handle = io.open(filepath_output, "w")
+	
+	embeddings = ai.embed(dataset_train)[0:filepaths_length] # Trim off the padding
+	result = list(zip(filepaths, embeddings))
+	for filepath, embedding in result:
+		# embed() returns a (rainfall, water) pair of tensors for each item
+		handle.write(json.dumps({
+			"filepath": filepath,
+			"embedding": [ tensor.numpy().tolist() for tensor in embedding ]
+		}, separators=(',', ':'))+"\n") # Ref https://stackoverflow.com/a/64710892/1460422
+	
+	if filepath_output != "-":
+		handle.close()
+	
+	if filepath_output != "-" and not args.no_vis:
+		sys.stderr.write(">>> Plotting with UMAP\n")
+		filepath_output_umap = os.path.splitext(filepath_output)[0]+'.png'
+		labels = [ os.path.basename(os.path.dirname(filepath)) for filepath in filepaths ]
+		# Visualise the rainfall half of each embedding pair
+		vis_embeddings(filepath_output_umap, np.array([ rainfall.numpy() for rainfall, _water in embeddings ]), np.array(labels))
+	
+	sys.stderr.write(">>> Complete\n")
\ No newline at end of file
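
A quick note on the output format: pretrain_predict.py emits newline-delimited JSON, one object per input file, with each embedding stored as a [rainfall, water] pair of vectors. A minimal sketch of reading it back, assuming a hypothetical embeddings.jsonl output path (the filename is illustrative, not something this commit creates):

```python
import json

import numpy as np

with open("embeddings.jsonl", "r") as handle:
	for line in handle:
		record = json.loads(line)
		# Each embedding is a [rainfall, water] pair of vectors
		rainfall, water = (np.array(half) for half in record["embedding"])
		print(record["filepath"], rainfall.shape, water.shape)
```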