Source code for aweSOM.run_som

## Script to initialize and train SOM network with generic (1D flattened) data

import h5py as h5
import sys
import argparse

from sklearn.preprocessing import MinMaxScaler
import numpy as np

import pickle


def batch_separator(data: np.ndarray, number_of_batches: int) -> np.ndarray:
    """Split a dataset into equally sized, contiguous batches.

    Rows are assigned in order: batch ``i`` holds rows ``i * (N // b)`` up to
    (but excluding) ``(i + 1) * (N // b)``. When N is not divisible by b, the
    trailing ``N % b`` rows are not included in any batch.

    Args:
        data (np.ndarray): N x f dataset, N is the number of data points and
            f is the number of features
        number_of_batches (int): number of batches to create (b)

    Returns:
        np.ndarray: b x N//b x f float array of batches (a copy of the input
            rows, not a view)

    Raises:
        ValueError: if number_of_batches is smaller than 1.
    """
    if number_of_batches < 1:
        raise ValueError(
            f"number_of_batches must be a positive integer, got {number_of_batches}"
        )

    N = data.shape[0]
    f = data.shape[1]
    batch_size = N // number_of_batches

    # Drop the remainder rows so the data splits evenly, then copy into a
    # float array so the result never aliases the caller's data.
    usable_rows = data[: number_of_batches * batch_size]
    return np.array(usable_rows, dtype=float).reshape(
        number_of_batches, batch_size, f
    )
def number_of_nodes(N: int, f: int) -> int:
    """Estimate how many nodes the SOM lattice should contain.

    The estimate is ``5 * sqrt(N * f) / 6``, truncated to an integer.

    Args:
        N (int): number of data points
        f (int): number of features

    Returns:
        int: number of nodes in the lattice
    """
    total_values = N * f
    estimate = 5 * np.sqrt(total_values) / 6
    return int(estimate)
def initialize_lattice(data: np.ndarray, ratio: float) -> list[int]:
    """Derive SOM lattice dimensions from the dataset size and an aspect ratio.

    The total node count comes from ``number_of_nodes``; it is then split so
    that ``ydim / xdim`` is approximately ``ratio``.

    Args:
        data (np.ndarray): N x f dataset, N is the number of data points and
            f is the number of features
        ratio (float): height to width ratio of the lattice, between 0 and 1.
            Must be strictly positive.

    Returns:
        list[int]: [xdim, ydim] dimensions of the lattice, each at least 1

    Raises:
        ValueError: if ratio is not strictly positive.
    """
    if ratio <= 0:
        raise ValueError(f"ratio must be strictly positive, got {ratio}")

    N = data.shape[0]
    f = data.shape[1]
    nodes = number_of_nodes(N, f)

    # Very small datasets (or large ratios) can truncate a dimension to 0,
    # which would divide by zero below; keep at least a 1 x 1 lattice.
    xdim = max(1, int(np.sqrt(nodes / ratio)))
    ydim = max(1, int(nodes / xdim))

    return [xdim, ydim]
def manual_scaling(data: np.ndarray, bulk_range: float = 1.0) -> np.ndarray:
    """Center each feature on 0 and map its +/- 2 sigma interval to +/- bulk_range.

    For roughly normally distributed features this places about 95% of the
    values inside ``[-bulk_range, bulk_range]``. Features with zero spread
    (constant columns) are mapped to 0 instead of producing NaN/inf.

    Args:
        data (np.ndarray): 2d array of data (N x f)
        bulk_range (float, optional): The extent to which 95% of the data
            resides in. Defaults to 1..

    Returns:
        np.ndarray: scaled data
    """
    center = np.mean(data, axis=0)
    two_sigma = 2.0 * np.std(data, axis=0)

    # A constant feature has two_sigma == 0; dividing by it would yield
    # 0/0 = NaN. Its centered values are all 0, so any non-zero divisor
    # gives the intended result.
    safe_two_sigma = np.where(two_sigma == 0, 1.0, two_sigma)

    return (data - center) / safe_two_sigma * bulk_range
def inv_manual_scaling(
    normed_data: np.ndarray, ori_data: np.ndarray, bulk_range: float = 1.0
) -> np.ndarray:
    """Undo ``manual_scaling``: map scaled values back to the original units.

    ``manual_scaling`` computes ``(x - mean) / two_sigma * bulk_range``, so the
    inverse is ``normed * two_sigma / bulk_range + mean``, with mean and
    two_sigma taken per feature from ``ori_data``.

    Args:
        normed_data (np.ndarray): 2d array of data (M x f)
        ori_data (np.ndarray): 2d array of original data (N x f)
        bulk_range (float, optional): The extent to which 95% of the data
            resides in; must match the value used for scaling.
            Defaults to 1..

    Returns:
        np.ndarray: unscaled data

    Raises:
        ValueError: if bulk_range is 0 (the forward scaling is then not
            invertible).
    """
    if bulk_range == 0:
        raise ValueError("bulk_range must be non-zero to invert the scaling")

    two_sigma = 2.0 * np.std(ori_data, axis=0)
    # Divide (not multiply) by bulk_range: the forward transform multiplied.
    return normed_data * two_sigma / bulk_range + np.mean(ori_data, axis=0)
def save_som_object(
    som: "Lattice",
    xdim: int,
    ydim: int,
    alpha_0: float,
    train: int,
    batch: int = 1,
    initial: str = "s",
    name_of_dataset: str = "",
):
    """Pickle the SOM object into the current working directory.

    The file is named
    ``som_object.<name_of_dataset>-<xdim>-<ydim>-<alpha_0>-<train>-<batch><initial>.pkl``;
    every argument other than ``som`` is used only to build that name.

    Args:
        som (aweSOM.Lattice): The SOM object to be saved.
        xdim (int): The x-dimension of the SOM lattice.
        ydim (int): The y-dimension of the SOM lattice.
        alpha_0 (float): The initial learning rate of the SOM.
        train (int): The number of training iterations.
        batch (int, optional): The batch value of the run. Defaults to 1.
        initial (str, optional): Tag for the type of initial weights.
            Defaults to "s".
        name_of_dataset (str, optional): The name of the dataset.
            Defaults to "".
    """
    run_tag = f"{name_of_dataset}-{xdim}-{ydim}-{alpha_0}-{train}-{batch}{initial}"
    target = f"som_object.{run_tag}.pkl"

    with open(target, "wb") as handle:
        pickle.dump(som, handle)

    print(f"SOM object saved to {target}")
def save_cluster_labels(
    som_labels: np.ndarray,
    xdim: int,
    ydim: int,
    alpha_0: float,
    train: int,
    batch: int = 1,
    initial: str = "s",
    name_of_dataset: str = "",
):
    """Write the cluster labels to a ``.npy`` file in the working directory.

    The file is named
    ``labels.<name_of_dataset>-<xdim>-<ydim>-<alpha_0>-<train>-<batch><initial>.npy``;
    every argument other than ``som_labels`` is used only to build that name.

    Args:
        som_labels (np.ndarray): The cluster labels to be saved.
        xdim (int): The x-dimension of the SOM grid.
        ydim (int): The y-dimension of the SOM grid.
        alpha_0 (float): The initial learning rate.
        train (int): The number of training iterations.
        batch (int, optional): The batch value of the run. Defaults to 1.
        initial (str, optional): Tag for the type of initialization.
            Defaults to "s".
        name_of_dataset (str, optional): The name of the dataset.
            Defaults to "".
    """
    run_tag = f"{name_of_dataset}-{xdim}-{ydim}-{alpha_0}-{train}-{batch}{initial}"
    target = f"labels.{run_tag}.npy"

    np.save(target, som_labels)

    print(f"Cluster labels saved to {target}")
def parse_args():
    """Build the run_som.py command-line parser and parse ``sys.argv``.

    Returns:
        argparse.Namespace: parsed options; each attribute is named after its
            flag without the leading dashes (e.g. ``--xdim`` -> ``xdim``).
    """
    # (flag, add_argument keyword arguments), in the order shown by --help.
    # ``dest`` is left to argparse, which derives it from the flag name, and
    # every option is optional by default.
    option_specs = (
        (
            "--features_path",
            {"type": str, "default": "/mnt/ceph/users/tha10/SOM-tests/hr-d3x640/"},
        ),
        ("--file", {"type": str, "default": "features_4j1b1e_2800.h5"}),
        (
            "--init_lattice",
            {
                "type": str,
                "default": "uniform",
                "help": "Initial values of lattice. uniform or sampling",
            },
        ),
        (
            "--xdim",
            {"type": int, "default": None, "help": "X dimension of the lattice"},
        ),
        (
            "--ydim",
            {"type": int, "default": None, "help": "Y dimension of the lattice"},
        ),
        (
            "--ratio",
            {
                "type": float,
                "default": 0.7,
                "help": "Height to width ratio of the lattice",
            },
        ),
        (
            "--alpha",
            {"type": float, "default": 0.5, "help": "Initial learning parameter"},
        ),
        (
            "--train",
            {"type": int, "default": None, "help": "Number of training steps"},
        ),
        ("--batch", {"type": int, "default": 1, "help": "Number of batches"}),
        (
            "--pretrained",
            {
                "action": "store_true",
                "help": "Pass this argument if supplying a pre-trained model",
            },
        ),
        (
            "--lattice_path",
            {
                "type": str,
                "default": None,
                "help": "Path to file containing lattice values",
            },
        ),
        (
            "--threshold",
            {
                "type": float,
                "default": 0.2,
                "help": "Threshold for merging clusters",
            },
        ),
    )

    parser = argparse.ArgumentParser(description="SOM code")
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)

    return parser.parse_args()
if __name__ == "__main__":
    # NOTE(review): a relative import only resolves when this file is run as
    # part of its package (presumably `python -m aweSOM.run_som`) — confirm.
    from .som import Lattice

    args = parse_args()

    # --------------------------------------------------
    # A pre-trained run cannot proceed without a lattice file.
    if args.pretrained and args.lattice_path is None:
        sys.exit("Cannot run, no lattice provided.")

    # CLI arguments
    features_path = args.features_path
    file_name = args.file
    init_lattice = args.init_lattice
    xdim = args.xdim
    ydim = args.ydim
    ratio = args.ratio
    alpha_0 = args.alpha
    train = args.train
    batch = args.batch
    pretrained = args.pretrained
    # The parser defines --lattice_path; reading args.neurons_path raised
    # AttributeError on every run.
    # NOTE(review): pretrained / lattice_path are not consumed further down
    # in this script — confirm whether loading the lattice is still TODO.
    lattice_path = args.lattice_path
    threshold = args.threshold

    # Third "_"-separated token of the file name, without the ".h5" suffix
    # (e.g. "features_4j1b1e_2800.h5" -> "2800").
    name_of_dataset = file_name.split("_")[2].split(".h5")[
        0
    ]  # all the data laps to process

    # load data
    with h5.File(features_path + file_name, "r") as f5:
        x = f5["features"][()]
        feature_list = f5["names"][()]
        feature_list = [n.decode("utf-8") for n in feature_list]

    # figure out the number of training steps
    if train is None:
        train = len(x)
        print(
            f"Training steps not provided, defaulting to # steps = # data points = {train}",
            flush=True,
        )

    # initialize lattice
    if xdim is None or ydim is None:
        # NOTE: try PCA here to figure out the ratio of the map
        print(
            "No lattice dimensions provided, initializing lattice based on Kohonen's advice",
            flush=True,
        )
        xdim, ydim = initialize_lattice(x, ratio)
        print(f"Initialized lattice dimensions: {xdim}x{ydim}", flush=True)

    print(
        f"File loaded, parameters: {name_of_dataset}-{xdim}-{ydim}-{alpha_0}-{train}-{batch}",
        flush=True,
    )

    # normalize data
    # scale_method is fixed to "manual" here; the MinMaxScaler branch is kept
    # as an alternative that can be enabled by editing this value.
    scale_method = "manual"
    if scale_method == "manual":
        data_transformed = manual_scaling(x)
    else:
        scaler = MinMaxScaler()
        data_transformed = scaler.fit_transform(x)
        scale_method = "MinMaxScaler"
    print(f"Data scaled with {scale_method}", flush=True)

    # initialize SOM lattice
    som = Lattice(
        xdim, ydim, alpha_0, train, alpha_type="decay", sampling_type=init_lattice
    )

    # train SOM
    if batch == 1:
        som.train_lattice(
            data_transformed,
            feature_list,
        )
    else:
        print(f"Training batch 1/{batch}", flush=True)
        data_by_batch = batch_separator(data_transformed, batch)

        som.train_lattice(
            data_by_batch[0], feature_list, number_of_steps=data_by_batch[0].shape[0]
        )
        lattice_weights = som.lattice

        # Each later batch restarts from the lattice left by the previous one.
        for i in range(1, batch):
            print(f"Training batch {i+1}/{batch}", flush=True)
            som.train_lattice(
                data_by_batch[i],
                feature_list,
                number_of_steps=data_by_batch[i].shape[0],
                restart_lattice=lattice_weights,
            )
            lattice_weights = som.lattice

    print(f"Random seed: {som.seed}", flush=True)

    # map data to lattice
    som.data_array = data_transformed  # recover the full dataset instead of the batch
    projection_2d = som.map_data_to_lattice()

    # assign cluster ids to the lattice
    clusters = som.assign_cluster_to_lattice(smoothing=None, merge_cost=threshold)

    # assign cluster ids to the data
    som_labels = som.assign_cluster_to_data(projection_2d, clusters)

    # One-letter tag for the output file names: "s" for sampling, "u" otherwise.
    initial = "s" if init_lattice == "sampling" else "u"

    # save cluster ids
    save_cluster_labels(
        som_labels, xdim, ydim, alpha_0, train, batch, initial, name_of_dataset
    )

    # save som object
    save_som_object(som, xdim, ydim, alpha_0, train, batch, initial, name_of_dataset)