From 15a3519107d88c8584e5565e4db86e031b32b26b Mon Sep 17 00:00:00 2001
From: Starbeamrainbowlabs
Date: Thu, 11 Aug 2022 18:26:28 +0100
Subject: [PATCH] ai: the best thing about implementing a model is that you
 don't have to test it on the same day :P

---
 aimodel/src/lib/ai/RainfallWaterContraster.py | 28 ++++--
 aimodel/src/lib/dataset/dataset.py            |  4 +-
 aimodel/src/lib/dataset/read_metadata.py      | 10 +++
 aimodel/src/lib/io/find_paramsjson.py         | 17 ++++
 aimodel/src/lib/vis/embeddings.py             | 49 +++++++++++
 aimodel/src/subcommands/__init__.py           |  0
 aimodel/src/subcommands/pretrain.py           | 63 ++++++++++++
 aimodel/src/subcommands/pretrain_predict.py   | 89 ++++++++++++++++++++
 8 files changed, 253 insertions(+), 7 deletions(-)
 create mode 100644 aimodel/src/lib/dataset/read_metadata.py
 create mode 100644 aimodel/src/lib/io/find_paramsjson.py
 create mode 100644 aimodel/src/lib/vis/embeddings.py
 create mode 100644 aimodel/src/subcommands/__init__.py
 create mode 100644 aimodel/src/subcommands/pretrain.py
 create mode 100644 aimodel/src/subcommands/pretrain_predict.py

diff --git a/aimodel/src/lib/ai/RainfallWaterContraster.py b/aimodel/src/lib/ai/RainfallWaterContraster.py
index a233c5b..b452799 100644
--- a/aimodel/src/lib/ai/RainfallWaterContraster.py
+++ b/aimodel/src/lib/ai/RainfallWaterContraster.py
@@ -6,6 +6,7 @@ import json
 
 import tensorflow as tf
 
+from ..io.find_paramsjson import find_paramsjson
 from ..io.readfile import readfile
 from ..io.writefile import writefile
 
@@ -36,20 +37,32 @@ class RainfallWaterContraster(object):
 			self.filepath_summary = os.path.join(self.dir_output, "summary.txt")
 			
 			summarywriter(self.model, self.filepath_summary)
-			writefile(os.path.join(self.dir_output, "params.json"), json.dumps(self.model.get_config()))
+			# Serialise the hyperparameters to JSON before writing them to disk
+			writefile(os.path.join(self.dir_output, "params.json"), json.dumps(self.get_config()))
 		else:
 			self.model = self.load_model(filepath_checkpoint)
 	
+	def get_config(self):
+		return {
+			"epochs": self.epochs,
+			"batch_size": self.batch_size,
+			**self.kwargs
+		}
 	
 	@staticmethod
-	def from_checkpoint(filepath_checkpoint, filepath_hyperparams):
+	def from_checkpoint(filepath_checkpoint, filepath_hyperparams=None):
+		# If no hyperparameters file was specified, look for a params.json near the checkpoint
+		if filepath_hyperparams is None:
+			filepath_hyperparams = find_paramsjson(filepath_checkpoint)
 		hyperparams = json.loads(readfile(filepath_hyperparams))
 		return RainfallWaterContraster(filepath_checkpoint=filepath_checkpoint, **hyperparams)
 	
+	
 	def make_model(self):
 		model = model_rainfallwater_contrastive(batch_size=self.batch_size, **self.kwargs)
 		return model
 	
+	
 	def load_model(self, filepath_checkpoint):
 		"""
 		Loads a saved model from the given filename.
@@ -77,9 +90,12 @@ class RainfallWaterContraster(object):
 		for batch in dataset:
 			i_batch += 1
 			result_batch = self.model(batch[0])
-			# Currently, the left and right should be the same
-			left, _ = tf.unstack(result_batch, axis=-2)
-			result_batch = tf.unstack(left, axis=0)
-			result.extend(result_batch)
+			# Split the result into the rainfall and water halves of each embedding pair
+			rainfall, water = tf.unstack(result_batch, axis=-2)
+			
+			rainfall = tf.unstack(rainfall, axis=0)
+			water = tf.unstack(water, axis=0)
+			
+			result.extend(zip(rainfall, water))
 		
 		return result
\ No newline at end of file
diff --git a/aimodel/src/lib/dataset/dataset.py b/aimodel/src/lib/dataset/dataset.py
index 88593c3..f44d655 100644
--- a/aimodel/src/lib/dataset/dataset.py
+++ b/aimodel/src/lib/dataset/dataset.py
@@ -49,8 +49,10 @@ def dataset(dirpath_input, batch_size=64, train_percentage=0.8, parallel_reads_multiplier=1.5):
 	dataset_train = make_dataset(filepaths_train, batch_size=batch_size, parallel_reads_multiplier=parallel_reads_multiplier)
 	dataset_validate = make_dataset(filepaths_validate, batch_size=batch_size, parallel_reads_multiplier=parallel_reads_multiplier)
 	
-	return dataset_train, dataset_validate
+	return dataset_train, dataset_validate, filepaths
 
+def dataset_predict():
+	raise NotImplementedError("Not implemented yet")
 
 if __name__ == "__main__":
 	
diff --git a/aimodel/src/lib/dataset/read_metadata.py b/aimodel/src/lib/dataset/read_metadata.py
new file mode 100644
index 0000000..2a1dc5f
--- /dev/null
+++ b/aimodel/src/lib/dataset/read_metadata.py
@@ -0,0 +1,10 @@
+import os
+import json
+
+
+from ..io.readfile import readfile
+
+def read_metadata(dirpath_dataset):
+	filepath_metadata = os.path.join(dirpath_dataset, "metadata.json")
+	
+	return json.loads(readfile(filepath_metadata))
\ No newline at end of file
diff --git a/aimodel/src/lib/io/find_paramsjson.py b/aimodel/src/lib/io/find_paramsjson.py
new file mode 100644
index 0000000..01863db
--- /dev/null
+++ b/aimodel/src/lib/io/find_paramsjson.py
@@ -0,0 +1,17 @@
+import os
+
+def find_paramsjson(filepath_checkpoint):
+	filepath_stem = os.path.splitext(filepath_checkpoint)[0]
+	dirpath_container = os.path.dirname(filepath_checkpoint)
+	dirpath_parent = os.path.dirname(dirpath_container)
+	
+	options = [
+		f"{filepath_stem}.json",
+		os.path.join(dirpath_container, "params.json"),
+		os.path.join(dirpath_parent, "params.json")
+	]
+	for candidate in options:
+		if os.path.exists(candidate):
+			return candidate
+	
+	return None
\ No newline at end of file
diff --git a/aimodel/src/lib/vis/embeddings.py b/aimodel/src/lib/vis/embeddings.py
new file mode 100644
index 0000000..e926f1f
--- /dev/null
+++ b/aimodel/src/lib/vis/embeddings.py
@@ -0,0 +1,49 @@
+import os
+
+import umap
+import umap.plot
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas
+
+def vis_embeddings(filepath_output, features, labels=None):
+	dimreducer = umap.UMAP(min_dist=0.05).fit(features)
+	
+	px = 1 / plt.rcParams['figure.dpi'] # matplotlib sizes are in inches :-( :-( :-(
+	width = 1920
+	height = 768
+	
+	plt.rc("font", size=20)
+	plt.rc("font", family="Ubuntu")
+	figure = plt.figure(figsize=(width*px, height*px))
+	figure.add_subplot(1, 2, 1)
+	
+	# 1: UMAP
+	umap.plot.points(dimreducer,
+		labels=labels,
+		color_key_cmap="brg", # color_key_cmap="jet",
+		ax=figure.get_axes()[0]
+	)
+	plt.title("UMAP Dimensionality Reduction", fontsize=20)
+	
+	# 2: Parallel coordinates
+	figure.add_subplot(1, 2, 2)
+	# CHEESE: This won't produce a terribly accurate result, as we're just ignoring most of CLIP's embedded features.
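+	# (Aside: parallel coordinates draws one vertical axis per feature dimension, so
+	# with an embedding hundreds of dimensions wide most axes become unreadable; an
+	# untested alternative would be to plot just a slice, e.g. features[:, :10].)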
+	dataframe = pandas.DataFrame(features)
+	dataframe["Label"] = labels if labels is not None else range(len(features))
+	pandas.plotting.parallel_coordinates(
+		dataframe,
+		"Label",
+		ax=figure.get_axes()[1],
+		use_columns=False,
+		axvlines=False,
+		sort_labels=True
+	)
+	
+	plt.title("Parallel coordinates plot", fontsize=20)
+	
+	plt.suptitle(f"ContrastiveE1 embeddings | ResNetV2 | {len(features)} items", fontsize=28, weight="bold")
+	plt.savefig(filepath_output)
\ No newline at end of file
diff --git a/aimodel/src/subcommands/__init__.py b/aimodel/src/subcommands/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/aimodel/src/subcommands/pretrain.py b/aimodel/src/subcommands/pretrain.py
new file mode 100644
index 0000000..16fdbc4
--- /dev/null
+++ b/aimodel/src/subcommands/pretrain.py
@@ -0,0 +1,63 @@
+import sys
+import argparse
+
+import tensorflow as tf
+
+from lib.ai.RainfallWaterContraster import RainfallWaterContraster
+from lib.dataset.dataset import dataset
+from lib.dataset.read_metadata import read_metadata
+
+def parse_args():
+	parser = argparse.ArgumentParser(description="Pretrain a contrastive learning model on a directory of rainfall+water .tfrecord.gz files.")
+	# parser.add_argument("--config", "-c", help="Filepath to the TOML config file to load.", required=True)
+	parser.add_argument("--input", "-i", help="Path to the input directory containing the .tfrecord.gz files to pretrain with.", required=True)
+	parser.add_argument("--output", "-o", help="Path to the output directory to write output to (will be created automatically if it doesn't exist).", required=True)
+	parser.add_argument("--feature-dim", help="The size of the output feature dimension of the model [default: 200].", type=int)
+	parser.add_argument("--batch-size", help="Sets the batch size [default: 64].", type=int)
+	parser.add_argument("--reads-multiplier", help="Optional. The multiplier for the number of files we should read from at once. Defaults to 1.5, which means read ceil(NUMBER_OF_CORES * 1.5) files at once. Set to a higher number on systems with high read latency to avoid starving the GPU of data.", type=float)
+	
+	return parser
+
+def run(args):
+	# Fill in defaults for any arguments that weren't specified
+	if (not hasattr(args, "batch_size")) or args.batch_size is None:
+		args.batch_size = 64
+	if (not hasattr(args, "feature_dim")) or args.feature_dim is None:
+		args.feature_dim = 200
+	if (not hasattr(args, "reads_multiplier")) or args.reads_multiplier is None:
+		args.reads_multiplier = 1.5
+	
+	
+	# TODO: Validate args here.
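+	# A minimal sketch of what that validation could look like (untested; assumes
+	# an `import os` up top):
+	#   if not os.path.isdir(args.input):
+	#       raise Exception(f"Error: The input directory '{args.input}' doesn't exist.")
+	#   os.makedirs(args.output, exist_ok=True)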
+	
+	sys.stderr.write(f"\n\n>>> This is TensorFlow {tf.__version__}\n\n\n")
+	
+	dataset_train, dataset_validate, _filepaths = dataset(
+		dirpath_input=args.input,
+		batch_size=args.batch_size,
+		parallel_reads_multiplier=args.reads_multiplier
+	)
+	dataset_metadata = read_metadata(args.input)
+	
+	# for items in dataset_train.repeat(10):
+	# 	print("ITEMS", len(items))
+	# 	print("LEFT", [ item.shape for item in items[0] ])
+	# 	print("ITEMS DONE")
+	# exit(0)
+	
+	ai = RainfallWaterContraster(
+		dir_output=args.output,
+		batch_size=args.batch_size,
+		feature_dim=args.feature_dim,
+		
+		shape_rainfall=dataset_metadata["rainfallradar"],
+		shape_water=dataset_metadata["waterdepth"]
+	)
+	
+	ai.train(dataset_train, dataset_validate)
+	
\ No newline at end of file
diff --git a/aimodel/src/subcommands/pretrain_predict.py b/aimodel/src/subcommands/pretrain_predict.py
new file mode 100644
index 0000000..f6e1ce8
--- /dev/null
+++ b/aimodel/src/subcommands/pretrain_predict.py
@@ -0,0 +1,89 @@
+import io
+import json
+import os
+import sys
+import argparse
+
+import tensorflow as tf
+import numpy as np
+
+from lib.ai.RainfallWaterContraster import RainfallWaterContraster
+from lib.dataset.dataset import dataset_predict
+from lib.io.find_paramsjson import find_paramsjson
+from lib.io.readfile import readfile
+from lib.vis.embeddings import vis_embeddings
+
+def parse_args():
+	parser = argparse.ArgumentParser(description="Output feature maps using a given pretrained contrastive model.")
+	# parser.add_argument("--config", "-c", help="Filepath to the TOML config file to load.", required=True)
+	parser.add_argument("--input", "-i", help="Path to input directory containing the images to predict for.", required=True)
+	parser.add_argument("--output", "-o", help="Path to output file to write output to. Defaults to stdout; note that the UMAP visualisation is only produced when an output filepath is specified.")
+	parser.add_argument("--checkpoint", "-c", help="Checkpoint file to load model weights from.", required=True)
+	parser.add_argument("--params", "-p", help="Optional. The file containing the model hyperparameters (usually called 'params.json'). If not specified, its location will be determined automatically.")
+	parser.add_argument("--reads-multiplier", help="Optional. The multiplier for the number of files we should read from at once. Defaults to 1.5, which means read ceil(NUMBER_OF_CORES * 1.5) files at once. Set to a higher number on systems with high read latency to avoid starving the GPU of data.", type=float)
+	parser.add_argument("--no-vis",
+		help="Don't also plot a visualisation of the resulting embeddings.", action="store_true")
+	parser.add_argument("--only-gpu",
+		help="If the GPU is not available, exit with an error (useful on shared HPC systems to avoid running out of memory & affecting other users).", action="store_true")
+	
+	return parser
+
+def run(args):
+	# Note that we do NOT check to see if the checkpoint file exists, because Tensorflow/Keras requires that we pass the stem instead of the actual index file..... :-/
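+	# For example, weights saved in the TensorFlow checkpoint format live on disk as
+	# something like "checkpoint.ckpt.index" plus "checkpoint.ckpt.data-00000-of-00001",
+	# but load_weights() wants the bare "checkpoint.ckpt" stem, which os.path.exists()
+	# would report as missing. (Filenames here are illustrative.)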
+	if (not hasattr(args, "params")) or args.params is None:
+		args.params = find_paramsjson(args.checkpoint)
+	if (not hasattr(args, "reads_multiplier")) or args.reads_multiplier is None:
+		args.reads_multiplier = 1.5
+	
+	if args.params is None or not os.path.exists(args.params):
+		raise Exception(f"Error: The specified filepath to the params.json hyperparameters ('{args.params}') does not exist.")
+	
+	
+	filepath_output = args.output if hasattr(args, "output") and args.output is not None else "-"
+	
+	
+	ai = RainfallWaterContraster.from_checkpoint(args.checkpoint, args.params)
+	
+	sys.stderr.write(f"\n\n>>> This is TensorFlow {tf.__version__}\n\n\n")
+	
+	dataset_train, filepaths, filepaths_length = dataset_predict(
+		dirpath_input=args.input,
+		batch_size=ai.batch_size,
+		parallel_reads_multiplier=args.reads_multiplier
+	)
+	filepaths = filepaths[0:filepaths_length]
+	
+	# for items in dataset_train.repeat(10):
+	# 	print("ITEMS", len(items))
+	# 	print("LEFT", [ item.shape for item in items[0] ])
+	# 	print("ITEMS DONE")
+	# exit(0)
+	
+	handle = sys.stdout
+	if filepath_output != "-":
+		handle = io.open(filepath_output, "w")
+	
+	embeddings = ai.embed(dataset_train)[0:filepaths_length] # Trim off the padding
+	result = list(zip(filepaths, embeddings))
+	for filepath, embedding in result:
+		# embed() returns a (rainfall, water) pair of tensors for each item
+		handle.write(json.dumps({
+			"filepath": filepath,
+			"embedding": [ tensor.numpy().tolist() for tensor in embedding ]
+		}, separators=(',', ':'))+"\n") # Ref https://stackoverflow.com/a/64710892/1460422
+	
+	if filepath_output != "-":
+		handle.close()
+	
+	if filepath_output != "-" and not args.no_vis:
+		sys.stderr.write(">>> Plotting with UMAP\n")
+		filepath_output_umap = os.path.splitext(filepath_output)[0]+'.png'
+		labels = [ os.path.basename(os.path.dirname(filepath)) for filepath in filepaths ]
+		# Visualise the rainfall half of each embedding pair
+		vis_embeddings(filepath_output_umap, np.array([ rainfall.numpy() for rainfall, _water in embeddings ]), np.array(labels))
+	
+	sys.stderr.write(">>> Complete\n")
\ No newline at end of file
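
A quick note on the output format: pretrain_predict.py emits newline-delimited JSON, one object per input file, with each embedding stored as a [rainfall, water] pair of vectors. A minimal sketch of reading it back, assuming a hypothetical embeddings.jsonl output path (the filename is illustrative, not something this commit creates):

```python
import json

import numpy as np

with open("embeddings.jsonl", "r") as handle:
	for line in handle:
		record = json.loads(line)
		# Each embedding is a [rainfall, water] pair of vectors
		rainfall, water = (np.array(half) for half in record["embedding"])
		print(record["filepath"], rainfall.shape, water.shape)
```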