ai: the best thing about implementing a model is that you don't have to test it on the same day :P

2025-02-18 22:14:56 +00:00 · 2022-08-11 18:26:28 +01:00 · 2022-08-11 18:26:28 +01:00 · 15a3519107
commit 15a3519107
parent 28bcdf2192
8 changed files with 238 additions and 7 deletions
--- a/aimodel/src/lib/ai/RainfallWaterContraster.py
+++ b/aimodel/src/lib/ai/RainfallWaterContraster.py
@ -6,6 +6,7 @@ import json
 import tensorflow as tf
 from ..io.find_paramsjson import find_paramsjson
 from ..io.readfile import readfile
 from ..io.writefile import writefile
@ -36,20 +37,30 @@ class RainfallWaterContraster(object):
 			self.filepath_summary = os.path.join(self.dir_output, "summary.txt")
 			summarywriter(self.model, self.filepath_summary)
-			writefile(os.path.join(self.dir_output, "params.json"), json.dumps(self.model.get_config()))
+			writefile(os.path.join(self.dir_output, "params.json"), self.get_config())
 		else:	
 			self.model = self.load_model(filepath_checkpoint)
 	def get_config(self):
 		"""
 		Collects this instance's hyperparameters into a single dictionary.
 		Returns: A new dict holding "epochs", "batch_size", and then every entry of self.kwargs.
 		"""
 		config = { "epochs": self.epochs, "batch_size": self.batch_size }
 		# Entries from kwargs are applied last, so they win if a key clashes
 		config.update(self.kwargs)
 		return config
 	@staticmethod
 	def from_checkpoint(filepath_checkpoint, filepath_hyperparams=None):
 		"""
 		Creates a new instance from a saved checkpoint and its hyperparameters file.
 		filepath_checkpoint: Path to the checkpoint to load.
 		filepath_hyperparams: Optional. Path to the JSON file holding the hyperparameters. If None, find_paramsjson() is used to look for it based on the checkpoint path.
 		Returns: A new RainfallWaterContraster built from the checkpoint path and the loaded hyperparameters.
 		Raises: FileNotFoundError if no hyperparameters file was given and none could be located.
 		"""
 		# It's the hyperparameters file that is optional and needs locating, not the checkpoint
 		if filepath_hyperparams is None:
 			filepath_hyperparams = find_paramsjson(filepath_checkpoint)
 		if filepath_hyperparams is None:
 			raise FileNotFoundError(f"Error: Could not locate a hyperparameters file for checkpoint '{filepath_checkpoint}'.")
 		hyperparams = json.loads(readfile(filepath_hyperparams))
 		return RainfallWaterContraster(filepath_checkpoint=filepath_checkpoint, **hyperparams)
 	def make_model(self):
 		"""
 		Builds a new model via model_rainfallwater_contrastive().
 		Returns: Whatever model_rainfallwater_contrastive() returns for this instance's batch size and every entry of self.kwargs.
 		"""
 		return model_rainfallwater_contrastive(batch_size=self.batch_size, **self.kwargs)
 	def load_model(self, filepath_checkpoint):
 		"""
 		Loads a saved model from the given filename.
@ -77,9 +88,11 @@ class RainfallWaterContraster(object):
 		for batch in dataset:
 			i_batch += 1
 			result_batch = self.model(batch[0])
-			# Currently, the left and right should be the same
+			rainfall, water = tf.unstack(result_batch, axis=-2)
-			left, _ = tf.unstack(result_batch, axis=-2)
+			
-			result_batch = tf.unstack(left, axis=0)
+			rainfall = tf.unstack(rainfall, axis=0)
-			result.extend(result_batch)
+			water = tf.unstack(water, axis=0)
 			result.extend(zip(rainfall, water))
 		return result
--- a/aimodel/src/lib/dataset/dataset.py
+++ b/aimodel/src/lib/dataset/dataset.py
@ -49,8 +49,10 @@ def dataset(dirpath_input, batch_size=64, train_percentage=0.8, parallel_reads_m
 	dataset_train = make_dataset(filepaths_train, batch_size=batch_size, parallel_reads_multiplier=parallel_reads_multiplier)
 	dataset_validate = make_dataset(filepaths_validate, batch_size=batch_size, parallel_reads_multiplier=parallel_reads_multiplier)
-	return dataset_train, dataset_validate
+	return dataset_train, dataset_validate, filepaths
 def dataset_predict(dirpath_input=None, batch_size=64, parallel_reads_multiplier=1.5):
 	"""
 	Placeholder for building a dataset to make predictions with. Not implemented yet.
 	dirpath_input: Path to the input directory (currently unused).
 	batch_size: The batch size (currently unused).
 	parallel_reads_multiplier: The parallel reads multiplier (currently unused).
 	Raises: NotImplementedError, always. The parameters are accepted so that keyword-argument callers get this error rather than a TypeError.
 	"""
 	raise NotImplementedError("Not implemented yet")
 if __name__ == "__main__":
--- a/aimodel/src/lib/dataset/read_metadata.py
+++ b/aimodel/src/lib/dataset/read_metadata.py
@ -0,0 +1,10 @@
 import os
 import json
 from ..io.readfile import readfile
 def read_metadata(dirpath_dataset):
 	"""
 	Reads and parses the metadata.json file inside the given dataset directory.
 	dirpath_dataset: Path to the directory containing metadata.json.
 	Returns: The parsed JSON content of that file.
 	"""
 	content = readfile(os.path.join(dirpath_dataset, "metadata.json"))
 	return json.loads(content)
--- a/aimodel/src/lib/io/find_paramsjson.py
+++ b/aimodel/src/lib/io/find_paramsjson.py
@ -0,0 +1,17 @@
 import os
 def find_paramsjson(filepath_checkpoint):
 	"""
 	Looks for the hyperparameters JSON file belonging to a checkpoint.
 	Candidates are tried in this order, and the first that exists wins:
 	1. The checkpoint path with its extension swapped for .json
 	2. params.json in the same directory as the checkpoint
 	3. params.json in the parent of that directory
 	filepath_checkpoint: Path to the checkpoint file.
 	Returns: The path to the first candidate that exists, or None if none of them do.
 	"""
 	dirpath_checkpoint = os.path.dirname(filepath_checkpoint)
 	candidates = (
 		os.path.splitext(filepath_checkpoint)[0] + ".json",
 		os.path.join(dirpath_checkpoint, "params.json"),
 		os.path.join(os.path.dirname(dirpath_checkpoint), "params.json"),
 	)
 	return next((path for path in candidates if os.path.exists(path)), None)
--- a/aimodel/src/lib/vis/embeddings.py
+++ b/aimodel/src/lib/vis/embeddings.py
@ -0,0 +1,45 @@
 import os
 import umap
 import umap.plot
 import numpy as np
 import matplotlib.pylab as plt
 import pandas
 def vis_embeddings(filepath_output, features, labels=None):
 	"""
 	Plots a set of embeddings as a UMAP scatter plot next to a parallel coordinates plot, and saves the figure to disk.
 	filepath_output: Path to write the resulting image to.
 	features: The embeddings to plot, one row per item. Passed to umap.UMAP().fit() and pandas.DataFrame().
 	labels: Optional. One label per item, passed to umap.plot.points() to colour the points. If None, the points are not coloured by label.
 	"""
 	dimreducer = umap.UMAP(min_dist=0.05).fit(features)
 	px = 1 / plt.rcParams['figure.dpi'] # matplotlib sizes are in inches :-( :-( :-(
 	width = 1920
 	height = 768
 	plt.rc("font", size=20)
 	plt.rc("font", family="Ubuntu")
 	figure = plt.figure(figsize=(width*px, height*px))
 	try:
 		# 1: UMAP
 		axes_umap = figure.add_subplot(1, 2, 1)
 		umap.plot.points(dimreducer,
 			labels=labels,
 			color_key_cmap="brg", # color_key_cmap="jet",
 			ax=axes_umap
 		)
 		axes_umap.set_title("UMAP Dimensionality Reduction", fontsize=20)
 		# 2: Parallel coordinates
 		axes_parallel = figure.add_subplot(1, 2, 2)
 		# CHEESE: This won't produce a terribly accurate result.
 		# NOTE(review): the original comment blamed this on ignoring most of CLIP's embedded features - confirm that still applies to these embeddings.
 		dataframe = pandas.DataFrame(features)
 		# Every row gets its own class value, so each item is drawn as a separate line
 		dataframe["Label"] = range(len(features))
 		pandas.plotting.parallel_coordinates(
 			dataframe,
 			"Label",
 			ax=axes_parallel,
 			use_columns=False,
 			axvlines=False,
 			sort_labels=True
 		)
 		axes_parallel.set_title("Parallel coordinates plot", fontsize=20)
 		figure.suptitle(f"ContrastiveE1 embeddings | ResNetV2 | {len(features)} items", fontsize=28, weight="bold")
 		figure.savefig(filepath_output)
 	finally:
 		# Release the figure, otherwise pyplot keeps it alive across calls
 		plt.close(figure)
--- a/aimodel/src/subcommands/init.py
+++ b/aimodel/src/subcommands/init.py
--- a/aimodel/src/subcommands/pretrain.py
+++ b/aimodel/src/subcommands/pretrain.py
@ -0,0 +1,58 @@
 import sys
 import argparse
 from asyncio.log import logger
 import tensorflow as tf
 from lib.ai.RainfallWaterContraster import RainfallWaterContraster
 from lib.dataset.dataset import dataset
 from lib.dataset.read_metadata import read_metadata
 def parse_args():
 	"""
 	Builds the command-line argument parser for the pretrain subcommand.
 	Returns: The configured argparse.ArgumentParser (NOT the parsed arguments).
 	"""
 	parser = argparse.ArgumentParser(description="Pretrain a contrastive learning model on a directory of rainfall+water .tfrecord.gz files.")
 	# parser.add_argument("--config", "-c", help="Filepath to the TOML config file to load.", required=True)
 	parser.add_argument("--input", "-i", help="Path to input directory containing the .tfrecord.gz files to pretrain with", required=True)
 	parser.add_argument("--output", "-o", help="Path to output directory to write output to (will be automatically created if it doesn't exist)", required=True)
 	parser.add_argument("--feature-dim", help="The size of the output feature dimension of the model [default: 200].", type=int, default=200)
 	parser.add_argument("--batch-size", help="Sets the batch size [default: 64].", type=int, default=64)
 	# Parsed as a float, as it's a multiplier - without a type argparse would hand back a string
 	parser.add_argument("--reads-multiplier", help="Optional. The multiplier for the number of files we should read from at once. Defaults to 1.5, which means read ceil(NUMBER_OF_CORES * 1.5). Set to a higher number of systems with high read latency to avoid starving the GPU of data.", type=float, default=1.5)
 	return parser
 def run(args):
 	"""
 	Pretrains a RainfallWaterContraster model on the dataset in args.input, writing output to args.output.
 	args: The parsed command-line arguments. Reads input, output, batch_size, feature_dim and reads_multiplier. If the last three are missing or None, they fall back to 64, 200 and 1.5 respectively.
 	"""
 	if getattr(args, "batch_size", None) is None:
 		args.batch_size = 64
 	if getattr(args, "feature_dim", None) is None:
 		args.feature_dim = 200
 	# argparse stores --reads-multiplier as reads_multiplier. The read_multiplier spelling is still honoured as a fallback, and is the attribute that gets set.
 	reads_multiplier = getattr(args, "reads_multiplier", None)
 	if reads_multiplier is None:
 		reads_multiplier = getattr(args, "read_multiplier", None)
 	args.read_multiplier = 1.5 if reads_multiplier is None else float(reads_multiplier)
 	# TODO: Validate args here.
 	sys.stderr.write(f"\n\n>>> This is TensorFlow {tf.__version__}\n\n\n")
 	# dataset() may return extra items after the 2 datasets (e.g. the list of filepaths), which aren't needed here
 	dataset_train, dataset_validate, *_ = dataset(
 		dirpath_input=args.input,
 		batch_size=args.batch_size,
 		parallel_reads_multiplier=args.read_multiplier
 	)
 	dataset_metadata = read_metadata(args.input)
 	ai = RainfallWaterContraster(
 		dir_output=args.output,
 		batch_size=args.batch_size,
 		feature_dim=args.feature_dim,
 		shape_rainfall=dataset_metadata["rainfallradar"],
 		shape_water=dataset_metadata["waterdepth"]
 	)
 	ai.train(dataset_train, dataset_validate)
--- a/aimodel/src/subcommands/pretrain_predict.py
+++ b/aimodel/src/subcommands/pretrain_predict.py
@ -0,0 +1,86 @@
 import io
 import json
 import os
 import sys
 import argparse
 import re
 from loguru import logger
 import tensorflow as tf
 import numpy as np
 from lib.ai.RainfallWaterContraster import RainfallWaterContraster
 from lib.dataset.dataset import dataset_predict
 from lib.io.find_paramsjson import find_paramsjson
 from lib.io.readfile import readfile
 from lib.vis.embeddings import vis_embeddings
 def parse_args():
 	"""
 	Builds the command-line argument parser for the pretrain-predict subcommand.
 	Returns: The configured argparse.ArgumentParser (NOT the parsed arguments).
 	"""
 	parser = argparse.ArgumentParser(description="Output feature maps using a given pretrained contrastive model.")
 	# parser.add_argument("--config", "-c", help="Filepath to the TOML config file to load.", required=True)
 	parser.add_argument("--input", "-i", help="Path to input directory containing the images to predict for.", required=True)
 	# The UMAP graph's filename is derived from the output filename, so no graph can be produced when writing to stdout
 	parser.add_argument("--output", "-o", help="Path to output file to write output to. Defaults to stdout, in which case a UMAP graph will NOT be produced.")
 	parser.add_argument("--checkpoint", "-c", help="Checkpoint file to load model weights from.", required=True)
 	parser.add_argument("--params", "-p", help="Optional. The file containing the model hyperparameters (usually called 'params.json'). If not specified, it's location will be determined automatically.")
 	# Parsed as a float, as it's a multiplier - without a type argparse would hand back a string
 	parser.add_argument("--reads-multiplier", help="Optional. The multiplier for the number of files we should read from at once. Defaults to 1.5, which means read ceil(NUMBER_OF_CORES * 1.5). Set to a higher number of systems with high read latency to avoid starving the GPU of data.", type=float, default=1.5)
 	parser.add_argument("--no-vis",
 		help="Don't also plot a visualisation of the resulting embeddings.", action="store_true")
 	parser.add_argument("--only-gpu",
 		help="If the GPU is not available, exit with an error (useful on shared HPC systems to avoid running out of memory & affecting other users)", action="store_true")
 	return parser
 def run(args):
 	"""
 	Embeds every item of the dataset in args.input with a pretrained model, writing 1 JSON object per line ("filepath" + "embedding") to args.output, or to stdout if no output was given.
 	When writing to a file and --no-vis was not passed, a plot of the embeddings is also written next to the output file, with its extension swapped for .png.
 	args: The parsed command-line arguments. Reads input, output, checkpoint, params, reads_multiplier, no_vis and only_gpu.
 	Raises: Exception if --only-gpu was passed without a GPU present, or if the hyperparameters file or checkpoint can't be found.
 	"""
 	if getattr(args, "only_gpu", False) and len(tf.config.list_physical_devices("GPU")) == 0:
 		raise Exception("Error: --only-gpu was specified, but no GPU is available.")
 	if getattr(args, "params", None) is None:
 		args.params = find_paramsjson(args.checkpoint)
 	# argparse stores --reads-multiplier as reads_multiplier. The read_multiplier spelling is still honoured as a fallback, and is the attribute that gets set.
 	reads_multiplier = getattr(args, "reads_multiplier", None)
 	if reads_multiplier is None:
 		reads_multiplier = getattr(args, "read_multiplier", None)
 	args.read_multiplier = 1.5 if reads_multiplier is None else float(reads_multiplier)
 	# find_paramsjson() returns None when it finds nothing, which os.path.exists() can't handle
 	if args.params is None or not os.path.exists(args.params):
 		raise Exception(f"Error: The specified filepath params.json hyperparameters ('{args.params}') does not exist.")
 	# NOTE(review): The original comment here said the checkpoint is deliberately NOT checked for existence, because Tensorflow/Keras wants the stem rather than the actual index file - yet the check below has always been present. Confirm which of the 2 is intended.
 	if not os.path.exists(args.checkpoint):
 		raise Exception(f"Error: The specified filepath to the checkpoint to load ('{args.checkpoint}') does not exist.")
 	filepath_output = args.output if getattr(args, "output", None) is not None else "-"
 	# Pass the resolved hyperparameters file along, so that --params is actually honoured
 	ai = RainfallWaterContraster.from_checkpoint(args.checkpoint, args.params)
 	sys.stderr.write(f"\n\n>>> This is TensorFlow {tf.__version__}\n\n\n")
 	dataset_train, filepaths, filepaths_length = dataset_predict(
 		dirpath_input=args.input,
 		batch_size=ai.batch_size,
 		parallel_reads_multiplier=args.read_multiplier
 	)
 	filepaths = filepaths[0:filepaths_length]
 	# NOTE(review): The code below assumes that ai.embed() returns 1 tensor per item - verify against RainfallWaterContraster.
 	embeddings = ai.embed(dataset_train)[0:filepaths_length] # Trim off the padding
 	handle = sys.stdout if filepath_output == "-" else io.open(filepath_output, "w")
 	try:
 		for filepath, embedding in zip(filepaths, embeddings):
 			handle.write(json.dumps({
 				"filepath": filepath,
 				"embedding": embedding.numpy().tolist()
 			}, separators=(',', ':'))+"\n") # Ref https://stackoverflow.com/a/64710892/1460422
 	finally:
 		# Only close handles we opened ourselves - stdout is not ours to close
 		if handle is not sys.stdout:
 			handle.close()
 	if filepath_output != "-" and not getattr(args, "no_vis", False):
 		sys.stderr.write(">>> Plotting with UMAP\n")
 		filepath_output_umap = os.path.splitext(filepath_output)[0]+'.png'
 		# Label each item with the name of the directory that contains it
 		labels = [ os.path.basename(os.path.dirname(filepath)) for filepath in filepaths ]
 		# NOTE(review): This passes labels as a 3rd argument - verify that vis_embeddings() accepts it.
 		vis_embeddings(filepath_output_umap, np.array([ embedding.numpy() for embedding in embeddings ]), np.array(labels))
 	sys.stderr.write(">>> Complete\n")