diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a180fa1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +build/** +__pycache__/** +ElM2D.egg-info/** +ElM2D/.ipynb_checkpoints/** +ElM2D/__pycache__/** diff --git a/ElM2D/ElM2D.py b/ElM2D/ElM2D.py index c608504..397c7ed 100644 --- a/ElM2D/ElM2D.py +++ b/ElM2D/ElM2D.py @@ -1,7 +1,5 @@ """ -A class to construct an ElM2D plot of a list of inorganic compostions based on -the Element Movers Distance Between These. - +Construct ElM2D plot of a list of inorganic compostions via Element Movers Distance. Copyright (C) 2021 Cameron Hargreaves @@ -29,22 +27,21 @@ Network simplex source modified to use numba from https://networkx.github.io/documentation/networkx-1.10/_modules/networkx/algorithms/flow/networksimplex.html#network_simplex -Requies umap which may be installed via -conda install -c conda-forge umap-learn +Requires umap which may be installed via: + conda install -c conda-forge umap-learn """ -import re, json, csv - -from multiprocessing import Pool, cpu_count +import os +from operator import attrgetter +from importlib import reload +from numba import cuda -from copy import deepcopy -from collections import Counter +from multiprocessing import cpu_count, freeze_support import numpy as np import pandas as pd -import pickle as pk +import pickle as pk from scipy.spatial.distance import squareform -from numba import njit import umap @@ -52,69 +49,130 @@ import plotly.io as pio from tqdm import tqdm -from tqdm.contrib.concurrent import process_map -# from matminer.datasets import load_dataset + +# from tqdm.contrib.concurrent import process_map from ElMD import ElMD, EMD +from njit_dist_matrix import dist_matrix as njit_dist_matrix + +# number of columns of U and V must be set as env var before import +n_elements = len(ElMD().periodic_tab["mod_petti"]) +os.environ["COLUMNS"] = str(n_elements) + +# other environment variables (set before importing cuda_dist_matrix) +os.environ["USE_64"] = "0" +os.environ["INLINE"] = "never" +os.environ["FASTMATH"] = "1" +# os.environ["TARGET"] = "cuda" + +import cuda_dist_matrix # noqa + +# to overwrite env vars (source: https://stackoverflow.com/a/1254379/13697228) # noqa +reload(cuda_dist_matrix) +cuda_dist_matrix = cuda_dist_matrix.dist_matrix + + def main(): - df = load_dataset("matbench_expt_gap").head(1001) + """ + Perform a test of basic features such as intersection, sorting, and featurization. + + Returns + ------- + None. + """ + df = pd.read_csv("train-debug.csv") df_1 = df.head(500) df_2 = df.tail(500) mapper = ElM2D(metric="mod_petti") mapper.intersect(df_1["composition"], df_2["composition"]) sorted_comps = mapper.sort(df["composition"]) sorted_comps, sorted_inds = mapper.sort(df["composition"], return_inds=True) - fts = mapper.featurize() + mapper.featurize() print() -class ElM2D(): - ''' - This class takes in a list of compound formula and creates the intercompound - distance matrix wrt EMD and a two dimensional embedding using either PCA or - UMAP - ''' - def __init__(self, formula_list=None, - n_proc=None, - n_components=2, - verbose=True, - metric="mod_petti", - chunksize=1, - umap_kwargs={}): + +class ElM2D: + """ + Create intercompound EMD distance matrix and embedding via list of formulas. + + Embedding types are: + - PCA + - UMAP + """ + + def __init__( + self, + formula_list=None, + n_proc=None, + n_components=2, + verbose=True, + metric="mod_petti", + chunksize=1, + umap_kwargs={}, + emd_algorithm="wasserstein", + target=None, + ): self.verbose = verbose if n_proc is None: self.n_proc = cpu_count() - else: + else: self.n_proc = n_proc - self.formula_list = formula_list # Input formulae + self.formula_list = formula_list # Input formulae self.metric = metric - self.chunksize=chunksize + self.chunksize = chunksize self.umap_kwargs = umap_kwargs self.umap_kwargs["n_components"] = n_components self.umap_kwargs["metric"] = "precomputed" - self.input_mat = None # Pettifor vector representation of formula - self.embedder = None # For accessing UMAP object - self.embedding = None # Stores the last embedded coordinates - self.dm = None # Stores distance matrix - + self.input_mat = None # Pettifor vector representation of formula + self.embedder = None # For accessing UMAP object + self.embedding = None # Stores the last embedded coordinates + self.dm = None # Stores distance matrix + self.emd_algorithm = emd_algorithm + self.target = target # "cuda" or "cpu" def save(self, filepath): - # Save all variables except for the distance matrix + """ + Save all variables except for the distance matrix. + + Parameters + ---------- + filepath : str + Filepath for which to save the pickle. + + Returns + ------- + None. + + """ save_dict = {k: v for k, v in self.__dict__.items()} - f_handle = open(filepath + ".pk", 'wb') + f_handle = open(filepath + ".pk", "wb") pk.dump(save_dict, f_handle) f_handle.close() - + def load(self, filepath): - f_handle = open(filepath + ".pk", 'rb') + """ + Load variables from pickle file. + + Parameters + ---------- + filepath : str + Filepath for which to load the pickle. + + Returns + ------- + None. + + """ + f_handle = open(filepath + ".pk", "rb") load_dict = pk.load(f_handle) f_handle.close() @@ -122,31 +180,106 @@ def load(self, filepath): self.__dict__[k] = v def plot(self, fp=None, color=None, embedding=None): + """ + Generate plots based on embedding dimension. + + Parameters + ---------- + fp : str, optional + Filepath for which to write plotly html. The default is None. + color : str, optional + Color to use for scatter points. The default is None. + embedding : 2D array, optional + From self.embedding if not specified. The default is None. + + Returns + ------- + fig : plotly figure + Handle to the plotly figure. + + """ if self.embedding is None: - print("No embedding in memory, call transform() first.") - return + print("No embedding in memory, call transform() first.") + return if embedding is None: embedding = self.embedding + x = embedding[:, 0] + y = embedding[:, 1] if embedding.shape[1] == 2: if color is None: - df = pd.DataFrame({"x": embedding[:, 0], "y": embedding[:, 1], "formula": self.formula_list}) - fig = px.scatter(df, x="x", y="y", hover_name="formula", hover_data={"x":False, "y":False}) + df = pd.DataFrame({"x": x, "y": y, "formula": self.formula_list}) + fig = px.scatter( + df, + x="x", + y="y", + hover_name="formula", + hover_data={"x": False, "y": False}, + ) else: - df = pd.DataFrame({"x": embedding[:, 0], "y": embedding[:, 1], "formula": self.formula_list, color.name: color.to_numpy()}) - fig = px.scatter(df, x="x", y="y",color=color.name, hover_data={"formula": True, color.name: True, "x":False, "y":False}) + df = pd.DataFrame( + { + "x": x, + "y": y, + "formula": self.formula_list, + color.name: color.to_numpy(), + } + ) + fig = px.scatter( + df, + x="x", + y="y", + color=color.name, + hover_data={ + "formula": True, + color.name: True, + "x": False, + "y": False, + }, + ) elif embedding.shape[1] == 3: + z = embedding[:, 2] if color is None: - df = pd.DataFrame({"x": embedding[:, 0], "y": embedding[:, 1], "z": embedding[:, 2], "formula": self.formula_list}) - fig = px.scatter_3d(df, x="x", y="y", z="z", hover_name="formula", hover_data={"x":False, "y":False, "z":False}) + df = pd.DataFrame( + {"x": x, "y": y, "z": z, "formula": self.formula_list} + ) + fig = px.scatter_3d( + df, + x="x", + y="y", + z="z", + hover_name="formula", + hover_data={"x": False, "y": False, "z": False}, + ) else: - df = pd.DataFrame({"x": embedding[:, 0], "y": embedding[:, 1], "z": embedding[:, 2], "formula": self.formula_list, color.name: color.to_numpy()}) - fig = px.scatter_3d(df, x="x", y="y", z="z", color=color.name, hover_data={"formula": True, color.name: True, "x":False, "y":False, "z":False}) - + df = pd.DataFrame( + { + "x": x, + "y": y, + "z": z, + "formula": self.formula_list, + color.name: color.to_numpy(), + } + ) + fig = px.scatter_3d( + df, + x="x", + y="y", + z="z", + color=color.name, + hover_data={ + "formula": True, + color.name: True, + "x": False, + "y": False, + "z": False, + }, + ) + elif embedding.shape[1] > 3: print("Too many dimensions to plot directly, using first three components") fig = self.plot(fp=fp, color=color, embedding=embedding[:, :3]) @@ -157,20 +290,31 @@ def plot(self, fp=None, color=None, embedding=None): return fig - def fit(self, X): - ''' + def fit(self, X, target=None): + """ + Construct and store an ElMD distance matrix. + Take an input vector, either of a precomputed distance matrix, or an iterable of strings of composition formula, construct an ElMD distance matrix and store to self.dm. - Input - X - A list of compound formula strings, or a precomputed distance matrix - (ensure self.metric = "precomputed") - ''' + Parameters + ---------- + X : list of str OR 2D array + A list of compound formula strings, or a precomputed distance matrix. If + using a precomputed distance matrix, ensure self.metric == "precomputed" + + + Returns + ------- + None. + + """ self.formula_list = X n = len(X) - if self.verbose: print(f"Fitting {self.metric} kernel matrix") + if self.verbose: + print(f"Fitting {self.metric} kernel matrix") if self.metric == "precomputed": self.dm = X @@ -182,90 +326,100 @@ def fit(self, X): x = ElMD(X[i], metric=self.metric) for j in range(i + 1, n): distances.append(x.elmd(X[j])) - + dist_vec = np.array(distances) self.dm = squareform(dist_vec) else: - if self.verbose: print("Constructing distances") - dist_vec = self._process_list(X, n_proc=self.n_proc) - self.dm = squareform(dist_vec) - - def fit_transform(self, X, y=None, how="UMAP", n_components=2): + if self.verbose: + print("Constructing distances") + if self.emd_algorithm == "network_simplex": + dist_vec = self._process_list(X, n_proc=self.n_proc) + self.dm = squareform(dist_vec) + elif self.emd_algorithm == "wasserstein": + self.dm = self.EM2D(X, X, target=target) + + def fit_transform(self, X, y=None, how="UMAP", n_components=2, target=None): """ - Successively call fit and transform + Successively call fit and transform. + + Parameters + ---------- + X : list of str + Compositions to embed. + y : 1D numerical array, optional + Target values to use for supervised UMAP embedding. The default is None. + how : str, optional + How to perform embedding ("UMAP" or "PCA"). The default is "UMAP". + n_components : int, optional + Number of dimensions to embed to. The default is 2. + + Returns + ------- + embedding : TYPE + DESCRIPTION. - Parameters: - X - List of compositions to embed - how - "UMAP" or "PCA", the embedding technique to use - n_components - The number of dimensions to embed to """ - self.fit(X) - embedding = self.transform(how=how, n_components=self.umap_kwargs["n_components"], y=y) + self.fit(X, target=target) + embedding = self.transform( + how=how, n_components=self.umap_kwargs["n_components"], y=y + ) return embedding def transform(self, how="UMAP", n_components=2, y=None): """ - Call the selected embedding method (UMAP or PCA) and embed to - n_components dimensions. + Call the selected embedding method (UMAP or PCA) and embed. + + Parameters + ---------- + how : str, optional + How to perform embedding ("UMAP" or "PCA"). The default is "UMAP". + The default is "UMAP". + n_components : int, optional + Number of dimensions to embed to. The default is 2. + y : 1D numerical array, optional + Target values to use for supervised UMAP embedding. The default is None. + + Returns + ------- + 2D array + UMAP or PCA embedding. + + """ + """ + """ if self.dm is None: print("No distance matrix computed, run fit() first") - return + return + n = self.umap_kwargs["n_components"] if how == "UMAP": if y is None: - if self.verbose: print(f"Constructing UMAP Embedding to {self.umap_kwargs['n_components']} dimensions") + if self.verbose: + print(f"Constructing UMAP Embedding to {n} dimensions") self.embedder = umap.UMAP(**self.umap_kwargs) self.embedding = self.embedder.fit_transform(self.dm) else: y = y.to_numpy(dtype=float) - if self.verbose: print(f"Constructing UMAP Embedding to {self.umap_kwargs['n_components']} dimensions, with a targetted embedding") + if self.verbose: + print( + f"Constructing UMAP Embedding to {n} dimensions, with \ + a targeted embedding" + ) self.embedder = umap.UMAP(**self.umap_kwargs) self.embedding = self.embedder.fit_transform(self.dm, y) elif how == "PCA": - if self.verbose: print(f"Constructing PCA Embedding to {self.umap_kwargs['n_components']} dimensions") + if self.verbose: + print(f"Constructing PCA Embedding to {n} dimensions") self.embedding = self.PCA(n_components=self.umap_kwargs["n_components"]) - if self.verbose: print(f"Finished Embedding") + if self.verbose: + print("Finished Embedding") return self.embedding - def PCA(self, n_components=5): - """ - Multidimensional Scaling - Given a matrix of interpoint distances, - find a set of low dimensional points that have similar interpoint - distances. - https://github.com/stober/mds/blob/master/src/mds.py - """ - - if self.dm is None: - raise Exception("No distance matrix computed, call fit_transform with a list of compositions, or load a saved matrix with load_dm()") - - - (n,n) = self.dm.shape - - if self.verbose: print(f"Constructing {n}x{n_components} Gram matrix") - E = (-0.5 * self.dm**2) - - # Use this matrix to get column and row means - Er = np.mat(np.mean(E,1)) - Es = np.mat(np.mean(E,0)) - - # From Principles of Multivariate Analysis: A User's Perspective (page 107). - F = np.array(E - np.transpose(Er) - Es + np.mean(E)) - - if self.verbose: print(f"Computing Eigen Decomposition") - [U, S, V] = np.linalg.svd(F) - - Y = U * np.sqrt(S) - - if self.verbose: print(f"PCA Projected Points Computed") - self.mds_points = Y - - return Y[:, :n_components] - def sort(self, formula_list=None): """ Sorts compositions based on their ElMD similarity. @@ -278,15 +432,18 @@ def sort(self, formula_list=None): sorted_indices = mapper.sort() sorted_comps = mapper.sorted_comps """ - if formula_list is None and self.formula_list is None: - raise Exception("Must input a list of compositions or fit a list of compositions first") # TODO Exceptions? + # TODO Exceptions? + raise Exception( + "Must input a list of compositions or fit a list of compositions first" + ) elif formula_list is None: formula_list = self.formula_list elif self.formula_list is None: - formula_list = process_map(ElMD, formula_list, chunksize=self.chunksize) + E = ElMD + formula_list = map(E, formula_list, chunksize=self.chunksize) self.formula_list = formula_list sorted_comps = sorted(formula_list) @@ -294,23 +451,23 @@ def sort(self, formula_list=None): return sorted_comps - - def cross_validate(self, y=None, X=None, k=5, shuffle=True, seed=42): """ - Implementation of cross validation with K-Folds. - - Splits the formula_list into k equal sized partitions and returns five - tuples of training and test sets. Returns a list of length k, each item - containing 2 (4 with target data) numpy arrays of formulae of + Cross validate with K-Folds. + + Splits the formula_list into k equal sized partitions and returns five + tuples of training and test sets. Returns a list of length k, each item + containing 2 (4 with target data) numpy arrays of formulae of length n - n/k and n/k. - Parameters: + Parameters + ---------- y=None: (optional) a numpy array of target properties to cross validate k=5: Number of k-folds shuffle=True: whether to shuffle the input formulae or not - Usage: + Usage + ----- cvs = mapper.cross_validate() for i, (X_train, X_test) in enumerate(cvs): sub_mapper = ElM2D() @@ -326,29 +483,29 @@ def cross_validate(self, y=None, X=None, k=5, shuffle=True, seed=42): errors.append(mae(y_pred, y_test)) print(np.mean(errors)) """ - inds = np.arange(len(self.formula_list)) # TODO Exception + inds = np.arange(len(self.formula_list)) # TODO Exception if shuffle: - np.random.seed(seed) + np.random.seed(seed) np.random.shuffle(inds) if X is None: formulas = self.formula_list.to_numpy(str)[inds] - + else: formulas = X - + splits = np.array_split(formulas, k) X_ret = [] - + for i in range(k): train_splits = np.delete(np.arange(k), i) X_train = splits[train_splits[0]] for index in train_splits[1:]: X_train = np.concatenate((X_train, splits[index])) - + X_test = splits[i] X_ret.append((X_train, X_test)) @@ -365,26 +522,40 @@ def cross_validate(self, y=None, X=None, k=5, shuffle=True, seed=42): for index in train_splits[1:]: y_train = np.concatenate((y_train, y_splits[index])) - + y_test = y_splits[i] y_ret.append((y_train, y_test)) return [(X_ret[i][0], X_ret[i][1], y_ret[i][0], y_ret[i][1]) for i in range(k)] def _process_list(self, formula_list, n_proc): - ''' - Given an iterable list of formulas in composition form - use multiple processes to convert these to pettifor ratio - vector form, and then calculate the distance between these - pairings, returning this as a condensed distance vector - ''' + """ + Process a list of formulas into a pairwise distance matrix. + + Parameters + ---------- + formula_list : list of str + Chemical formulas. + n_proc : int + number of processors (i.e. CPU cores). + + Returns + ------- + 2D array + 2D distance matrix. + + Given an iterable list of formulas in composition form use multiple processes + to convert these to pettifor ratio vector form, and then calculate the + distance between these pairings, returning this as a condensed distance vector. + """ pool_list = [] - n_elements = len(ElMD().periodic_tab[self.metric]) - self.input_mat = np.ndarray(shape=(len(formula_list), n_elements), dtype=np.float64) + self.input_mat = np.ndarray( + shape=(len(formula_list), n_elements), dtype=np.float64 + ) - if self.verbose: + if self.verbose: print("Parsing Formula") for i, formula in tqdm(list(enumerate(formula_list))): self.input_mat[i] = ElMD(formula, metric=self.metric).ratio_vector @@ -393,7 +564,7 @@ def _process_list(self, formula_list, n_proc): self.input_mat[i] = ElMD(formula, metric=self.metric).ratio_vector # Create input pairings - if self.verbose: + if self.verbose: print("Constructing joint compositional pairings") for i in tqdm(range(len(formula_list) - 1)): sublist = [(i, j) for j in range(i + 1, len(formula_list))] @@ -404,55 +575,222 @@ def _process_list(self, formula_list, n_proc): pool_list.append(sublist) # Distribute amongst processes - if self.verbose: print("Creating Process Pool\nScattering compositions between processes and computing distances") - - scores = process_map(self._pool_ElMD, pool_list, chunksize=self.chunksize) - - if self.verbose: print("Distances computed closing processes") + if self.verbose: + print( + "Creating Process Pool\nScattering compositions between processes \ + and computing distances" + ) + + # scores = process_map(self._pool_ElMD, pool_list, chunksize=self.chunksize) + scores = map(self._pool_ElMD, pool_list) - if self.verbose: print("Flattening sublists") + if self.verbose: + print("Distances computed closing processes") + + if self.verbose: + print("Flattening sublists") # Flattens list of lists to single list - distances = [dist for sublist in scores for dist in sublist] - - return np.array(distances, dtype=np.float64) + distances = np.array( + [dist for sublist in scores for dist in sublist], dtype=np.float64 + ) + + return distances + + def EM2D(self, formulas, formulas2=None, target=None): + """ + Earth Mover's 2D distances. See also EMD. + + Parameters + ---------- + formulas : list of str + First list of formulas for which to compute distances. If only formulas + is specified, then a `pdist`-like array is returned, i.e. pairwise + distances within a single set. + formulas2 : list of str, optional + Second list of formulas, which if specified, causes `cdist`-like + behavior (i.e. pairwise distances between two sets). + + Returns + ------- + 2D array + Pairwise distances. + + """ + isXY = formulas2 is None + E = ElMD() + + def gen_ratio_vector(comp): + """Create a numpy array from a composition dictionary.""" + if isinstance(comp, str): + comp = E._parse_formula(comp) + comp = E._normalise_composition(comp) + + sorted_keys = sorted(comp.keys()) + comp_labels = [E._get_position(k) for k in sorted_keys] + comp_ratios = [comp[k] for k in sorted_keys] + + indices = np.array(comp_labels, dtype=np.int64) + ratios = np.array(comp_ratios, dtype=np.float64) + + numeric = np.zeros(shape=len(E.periodic_tab[E.metric]), dtype=np.float64) + numeric[indices] = ratios + + return numeric + + def gen_ratio_vectors(comps): + return np.array([gen_ratio_vector(comp) for comp in comps]) + + U_weights = gen_ratio_vectors(formulas) + if isXY: + V_weights = gen_ratio_vectors(formulas2) + + lookup, periodic_tab, metric = attrgetter("lookup", "periodic_tab", "metric")(E) + ptab_metric = periodic_tab[metric] + + def get_mod_petti(x): + return [ptab_metric[lookup[a]] if b > 0 else 0 for a, b in enumerate(x)] + + def get_mod_pettis(X): + return np.array([get_mod_petti(x) for x in X]) + + U = get_mod_pettis(U_weights) + if isXY: + V = get_mod_pettis(V_weights) + + # decide whether to use cpu or cuda version + if target is None: + if self.target is None or not cuda.is_available(): + target = "cpu" + elif self.target == "cuda" or cuda.is_available(): + target = "cuda" + + # if target == "cpu": + # dist_matrix = njit_dist_matrix + # elif target == "cuda": + # dist_matrix = cuda_dist_matrix + + if isXY: + if target == "cpu": + distances = njit_dist_matrix( + U, + V=V, + U_weights=U_weights, + V_weights=V_weights, + metric="wasserstein", + ) + elif target == "cuda": + distances = cuda_dist_matrix( + U, + V=V, + U_weights=U_weights, + V_weights=V_weights, + metric="wasserstein", + ) + else: + if target == "cpu": + distances = njit_dist_matrix( + U, U_weights=U_weights, metric="wasserstein" + ) + elif target == "cuda": + distances = cuda_dist_matrix( + U, U_weights=U_weights, metric="wasserstein" + ) + + # package + self.U = U + self.U_weights = U_weights + + if isXY: + self.V = V + self.V_weights = V_weights + + return distances + + def PCA(self, n_components=5): + """ + Perform multidimensional scaling (MDS) on a matrix of interpoint distances. + + This finds a set of low dimensional points that have similar interpoint + distances. + Source: https://github.com/stober/mds/blob/master/src/mds.py + """ + if self.dm == []: + raise Exception( + "No distance matrix computed, call fit_transform with a list of \ + compositions, or load a saved matrix with load_dm()" + ) + + (n, n) = self.dm.shape + + if self.verbose: + print(f"Constructing {n}x{n_components} Gram matrix") + E = -0.5 * self.dm ** 2 + + # Use this matrix to get column and row means + Er = np.mat(np.mean(E, 1)) + Es = np.mat(np.mean(E, 0)) + + # From Principles of Multivariate Analysis: A User's Perspective (page 107). + F = np.array(E - np.transpose(Er) - Es + np.mean(E)) + + if self.verbose: + print("Computing Eigen Decomposition") + [U, S, V] = np.linalg.svd(F) + + Y = U * np.sqrt(S) + + if self.verbose: + print("PCA Projected Points Computed") + self.mds_points = Y + + return Y[:, :n_components] def _pool_ElMD(self, input_tuple): - ''' - Uses multiprocessing module to call the numba compiled EMD function - ''' + """Use multiprocessing module to call the numba compiled EMD function.""" distances = np.ndarray(len(input_tuple)) elmd_obj = ElMD(metric=self.metric) - + for i, (input_1, input_2) in enumerate(input_tuple): - distances[i] = EMD(self.input_mat[input_1], - self.input_mat[input_2], - elmd_obj.lookup, - elmd_obj.periodic_tab[self.metric]) + distances[i] = EMD( + self.input_mat[input_1], + self.input_mat[input_2], + elmd_obj.lookup, + elmd_obj.periodic_tab[self.metric], + ) return distances def __repr__(self): + """Summary of ElM2D object: length, diversity, and max distance if dm exists.""" if self.dm is not None: - return f"ElM2D(size={len(self.formula_list)}, chemical_diversity={np.mean(self.dm)} +/- {np.std(self.dm)}, maximal_distance={np.max(self.dm)})" + return f"ElM2D(size={len(self.formula_list)}, \ + chemical_diversity={np.mean(self.dm)} +/- {np.std(self.dm)}, \ + maximal_distance={np.max(self.dm)})" else: - return f"ElM2D()" + return "ElM2D()" def export_dm(self, path): + """Export distance matrix as .csv to path.""" np.savetxt(path, self.dm, delimiter=",") - + def import_dm(self, path): + """Import distance matrix from .csv file located at path.""" self.dm = np.loadtxt(path, delimiter=",") def export_embedding(self, path): + """Export embedding as .csv file to path.""" np.savetxt(path, self.embedding, delimiter=",") - + def import_embedding(self, path): + """Import embedding from .csv file located at path.""" self.embedding = np.loadtxt(path, delimiter=",") def _pool_featurize(self, comp): + """Extract the feature vector for a given composition (comp).""" return ElMD(comp, metric=self.metric).feature_vector def featurize(self, formula_list=None, how="mean"): + """Featurize a list of formulas.""" if formula_list is None and self.formula_list is None: raise Exception("You must enter a list of compositions first") @@ -462,15 +800,25 @@ def featurize(self, formula_list=None, how="mean"): elif self.formula_list is None: self.formula_list = formula_list - elmd_obj = ElMD(metric=self.metric) + # elmd_obj = ElMD(metric=self.metric) # if type(elmd_obj.periodic_tab[self.metric]["H"]) is int: - # vectors = np.ndarray((len(compositions), len(elmd_obj.periodic_tab[self.metric]))) + # vectors = np.ndarray( + # (len(compositions), len(elmd_obj.periodic_tab[self.metric])) + # ) # else: - # vectors = np.ndarray((len(compositions), len(elmd_obj.periodic_tab[self.metric]["H"]))) - - print(f"Constructing compositionally weighted {self.metric} feature vectors for each composition") - vectors = process_map(self._pool_featurize, formula_list, chunksize=self.chunksize) + # vectors = np.ndarray( + # (len(compositions), len(elmd_obj.periodic_tab[self.metric]["H"])) + # ) + + print( + f"Constructing compositionally weighted {self.metric} feature vectors \ + for each composition" + ) + # vectors = process_map( + # self._pool_featurize, formula_list, chunksize=self.chunksize + # ) + vectors = map(self._pool_featurize, formula_list) print("Complete") @@ -478,9 +826,11 @@ def featurize(self, formula_list=None, how="mean"): def intersect(self, y=None, X=None): """ - Takes in a second formula list, y, and computes the intersectional distance + Find intersectional distance matrix between two formula lists. + + Takes in a second formula list, y, and computes the intersectional distance matrix between the two under the given metric. If a two formula lists - are given the intersection between the two is computed, returning a + are given the intersection between the two is computed, returning a distance matrix of the form: X_0 X_1 X_2 ... @@ -490,63 +840,78 @@ def intersect(self, y=None, X=None): ... """ if X is None and y is None: - raise Exception("Must enter two lists of formula or fit a list of formula and enter an intersecting list of formula") + raise Exception( + "Must enter two lists of formula or fit a list of formula and enter \ + an intersecting list of formula" + ) if X is None: X = self.formula_list - elmd = ElMD() - dm = [] - process_pool = Pool(cpu_count()) + # elmd = ElMD() + # dm = [] - for comp_1 in tqdm(X): - ionic = process_pool.starmap(elmd.elmd, ((comp_1, comp_2) for comp_2 in y)) - dm.append(ionic) - - distance_matrix = np.array(dm) + # with Pool(cpu_count()) as process_pool: + # for comp_1 in tqdm(X): + # ionic = process_pool.starmap( + # elmd.elmd, ((comp_1, comp_2) for comp_2 in y) + # ) + # dm.append(ionic) - return distance_matrix + # distance_matrix = np.array(dm) + + distances = self.EM2D(X, formulas2=y) + + return distances # Not working currently, might be faster... # intersection_dm = self._process_intersection(X, y, self.n_proc) # return intersection_dm - def _process_intersection(self, X, y, n_proc): - ''' - Compute the intersection of two lists of compositions - ''' + """Compute the intersection of two lists of compositions.""" pool_list = [] n_elements = len(ElMD().periodic_tab[self.metric]) X_mat = np.ndarray(shape=(len(X), n_elements), dtype=np.float64) y_mat = np.ndarray(shape=(len(y), n_elements), dtype=np.float64) - if self.verbose: print("Parsing X Formula") + if self.verbose: + print("Parsing X Formula") for i, formula in tqdm(list(enumerate(X))): X_mat[i] = ElMD(formula, metric=self.metric).ratio_vector - if self.verbose: print("Parsing Y Formula") + if self.verbose: + print("Parsing Y Formula") for i, formula in tqdm(list(enumerate(y))): y_mat[i] = ElMD(formula, metric=self.metric).ratio_vector # Create input pairings - if self.verbose: print("Constructing joint compositional pairings") + if self.verbose: + print("Constructing joint compositional pairings") for y in tqdm(range(len(y_mat))): sublist = [(y, x) for x in range(len(X_mat))] pool_list.append(sublist) - + # Distribute amongst processes - if self.verbose: print("Creating Process Pool\nScattering compositions between processes and computing distances") - distances = process_map(self._pool_ElMD, pool_list, chunksize=self.chunksize) + if self.verbose: + print( + "Creating Process Pool\nScattering compositions between processes \ + and computing distances" + ) + # distances = process_map(self._pool_ElMD, pool_list, chunksize=self.chunksize) + distances = map(self._pool_ElMD, pool_list) - if self.verbose: print("Distances computed closing processes") + if self.verbose: + print("Distances computed closing processes") # if self.verbose: print("Flattening sublists") # Flattens list of lists to single list # distances = [dist for sublist in scores for dist in sublist] - + return np.array(distances, dtype=np.float64) + if __name__ == "__main__": + freeze_support() main() diff --git a/ElM2D/benchmark_njit.py b/ElM2D/benchmark_njit.py new file mode 100644 index 0000000..1a9ca97 --- /dev/null +++ b/ElM2D/benchmark_njit.py @@ -0,0 +1,55 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Sep 8 01:36:19 2021 + +@author: sterg +""" +from scipy.stats import wasserstein_distance as scipy_wasserstein_distance +from scipy.spatial.distance import cdist + +# %% example runs +pairtest = np.array([(0, 1), (1, 2), (2, 3)]) +Utest = U[0:6] +Vtest = V[0:6] +Uwtest = U_weights[0:6] +Vwtest = V_weights[0:6] + +one_set = dist_matrix(Utest, Uw=Uwtest, metric="wasserstein") +print(one_set) + +two_set = dist_matrix(Utest, V=Vtest, Uw=Uwtest, Vw=Vwtest, metric="wasserstein") +print(two_set) + +pairs = np.array([(0, 1), (1, 2), (2, 3)]) + +one_set_sparse = dist_matrix(Utest, Uw=Uwtest, pairs=pairtest, metric="wasserstein") +print(one_set_sparse) + +two_set_sparse = dist_matrix( + Utest, V=Vtest, Uw=Uwtest, Vw=Vwtest, pairs=pairtest, metric="wasserstein" +) +print(two_set_sparse) + +# %% timing of large distance matrices + + +# for compilation purposes, maybe just once is necessary? +dist_matrix(U, Uw=U_weights, metric="wasserstein") +dist_matrix(Utest, V=Vtest, Uw=Uwtest, Vw=Vwtest, metric="wasserstein") +dist_matrix(Utest, Uw=Uwtest, pairs=pairtest, metric="wasserstein") +dist_matrix(Utest, V=Vtest, Uw=Uwtest, Vw=Vwtest, pairs=pairtest, metric="wasserstein") + +with Timer("cdist Euclidean"): + d = cdist(U, V) + +with Timer("two-set dist_matrix Euclidean"): + d = dist_matrix(U, V=V, Uw=U_weights, Vw=V_weights, metric="euclidean") + +with Timer("cdist SciPy Wasserstein"): + d = cdist(U_Uw, V_Vw, metric=scipy_wasserstein_distance) + +with Timer("cdist Wasserstein"): + d = cdist(U_Uw, V_Vw, metric=my_wasserstein_distance) + +with Timer("two-set dist_matrix Wasserstein"): + d = dist_matrix(U, V=V, Uw=U_weights, Vw=V_weights, metric="wasserstein") diff --git a/ElM2D/cuda_dist_matrix.py b/ElM2D/cuda_dist_matrix.py new file mode 100644 index 0000000..c2e07c9 --- /dev/null +++ b/ElM2D/cuda_dist_matrix.py @@ -0,0 +1,544 @@ +""" +Compute a distance matrix for various cases. + +Cases: + within a single set of vectors (like pdist) + between two sets of vectors (like cdist) + between prespecified pairs (i.e. sparse) for either one or two sets of + vectors. + +Various distance metrics are available. +""" +import os +import numpy as np +from math import ceil +from metrics import wasserstein_distance, euclidean_distance + +# CUDA Simulator not working.. +# os.environ["NUMBA_ENABLE_CUDASIM"] = "1" +# os.environ["NUMBA_CUDA_DEBUGINFO"] = "1" + +from numba import cuda, jit # noqa +from numba.types import int32, float32, int64, float64 # noqa + +inline = os.environ.get("INLINE", "never") +fastmath = bool(os.environ.get("FASTMATH", "1")) +cols = os.environ.get("COLUMNS") # 121 for ElM2D repo +USE_64 = bool(os.environ.get("USE_64", "0")) +target = os.environ.get("TARGET", "cuda") + +if USE_64 is None: + USE_64 = False +if USE_64: + bits = 64 + nb_float = float64 + nb_int = int64 + np_float = np.float64 + np_int = np.int64 +else: + bits = 32 + nb_float = float32 + nb_int = int32 + np_float = np.float32 + np_int = np.int32 + +if cols is not None: + cols = int(cols) + cols_plus_1 = cols + 1 + tot_cols = cols * 2 + tot_cols_minus_1 = tot_cols - 1 +else: + raise KeyError( + "For performance reasons and architecture constraints " + "the number of columns of U (which is the same as V) " + "must be defined as the environment variable, COLUMNS, " + 'via e.g. `os.environ["COLUMNS"] = "100"`.' + ) + +# override types +if target == "cpu": + nb_float = np_float + nb_int = np_int + +# threads-per-block +TPB = 16 + + +@jit(inline=inline) +def compute_distance(u, v, u_weights, v_weights, metric_num): + """ + Calculate weighted distance between two vectors, u and v. + + Parameters + ---------- + u : 1D array of float + First vector. + v : 1D array of float + Second vector. + u_weights : 1D array of float + Weights for u. + v_weights : 1D array of float + Weights for v. + metric_num : int + Which metric to use (0 == "euclidean", 1=="wasserstein"). + + Raises + ------ + NotImplementedError + "Specified metric is mispelled or has not been implemented yet. + If not implemented, consider submitting a pull request." + + Returns + ------- + d : float + Weighted distance between u and v. + + """ + if metric_num == 0: + d = euclidean_distance(u, v) + elif metric_num == 1: + # assume u and v are presorted, and weights are sorted by u and v + d = wasserstein_distance(u, v, u_weights, v_weights, True, True, True) + else: + raise NotImplementedError( + "Specified metric is mispelled or has not been implemented yet. " + "If not implemented, consider submitting a pull request." + ) + return d + + +@cuda.jit( + "void(float{0}[:,:], float{0}[:,:], float{0}[:,:], float{0}[:,:], " + "int{0}[:,:], float{0}[:], int{0})".format(bits), + fastmath=fastmath, +) +def sparse_distance_matrix(U, V, U_weights, V_weights, pairs, out, metric_num): + """ + Calculate sparse pairwise distances between two sets of vectors for pairs. + + Parameters + ---------- + mat : numeric cuda array + First set of vectors for which to compute a single pairwise distance. + mat2 : numeric cuda array + Second set of vectors for which to compute a single pairwise distance. + pairs : cuda array of 2-tuples + All pairs for which distances are to be computed. + out : numeric cuda array + The initialized array which will be populated with distances. + + Raises + ------ + ValueError + Both matrices should have the same number of columns. + + Returns + ------- + None. + + """ + k = cuda.grid(1) + + npairs = pairs.shape[0] + + if k < npairs: + pair = pairs[k] + # extract indices of the pair for which the distance will be computed + i, j = pair + + u = U[i] + v = V[j] + uw = U_weights[i] + vw = V_weights[j] + + out[k] = compute_distance(u, v, uw, vw, metric_num) + + +@cuda.jit( + "void(float{0}[:,:], float{0}[:,:], float{0}[:,:], int{0})".format(bits), + fastmath=fastmath, +) +def one_set_distance_matrix(U, U_weights, out, metric_num): + """ + Calculate pairwise distances within single set of vectors. + + Parameters + ---------- + U : 2D array of float + Vertically stacked vectors. + U_weights : 2D array of float + Vertically stacked weight vectors. + out : 2D array of float + Initialized matrix to populate with pairwise distances. + metric_num : int + Which metric to use (0 == "euclidean", 1=="wasserstein"). + + Returns + ------- + None. + + """ + i, j = cuda.grid(2) + + dm_rows = U.shape[0] + dm_cols = U.shape[0] + + if i < j and i < dm_rows and j < dm_cols and i != j: + u = U[i] + v = U[j] + uw = U_weights[i] + vw = U_weights[j] + d = compute_distance(u, v, uw, vw, metric_num) + out[i, j] = d + out[j, i] = d + + +# faster compilation *and* runtimes with explicit signature +@cuda.jit( + "void(float{0}[:,:], float{0}[:,:], float{0}[:,:], float{0}[:,:], " + "float{0}[:,:], int{0})".format(bits), + fastmath=fastmath, +) +def two_set_distance_matrix(U, V, U_weights, V_weights, out, metric_num): + """ + Calculate pairwise distances between two sets of vectors. + + Parameters + ---------- + U : 2D array of float + Vertically stacked vectors. + V : 2D array of float + Vertically stacked vectors. + U_weights : 2D array of float + Vertically stacked weight vectors. + V_weights : 2D array of float + Vertically stacked weight vectors. + out : 2D array of float + Pairwise distance matrix between two sets of vectors. + metric_num : int + Distance metric number {"euclidean": 0, "wasserstein": 1}. + + Returns + ------- + None. + + """ + i, j = cuda.grid(2) + + # distance matrix shape + dm_rows = U.shape[0] + dm_cols = V.shape[0] + + if i < dm_rows and j < dm_cols: + u = U[i] + v = V[j] + uw = U_weights[i] + vw = V_weights[j] + d = compute_distance(u, v, uw, vw, metric_num) + out[i, j] = d + + +def dist_matrix( + U, V=None, U_weights=None, V_weights=None, pairs=None, metric="euclidean" +): # noqa + """ + Compute distance matrices using Numba/CUDA. + + Parameters + ---------- + mat : array + First set of vectors for which to compute pairwise distances. + + mat2 : array, optional + Second set of vectors for which to compute pairwise distances. + If not specified, then mat2 is a copy of mat. + + pairs : array, optional + List of 2-tuples which contain the indices for which to compute + distances. If mat2 was specified, then the second index accesses + mat2 instead of mat. If not specified, then the pairs are + auto-generated. If mat2 was specified, all combinations of the two + vector sets are used. If mat2 isn't specified, then only the upper + triangle (minus diagonal) pairs are computed. + + metric : str, optional + Possible options are 'euclidean', 'wasserstein'. + Defaults to Euclidean distance. These are converted to integers + internally due to Numba's lack of support for string arguments + (2021-08-14). See compute_distance() for other keys. + + For example: + 0 - 'euclidean' + 1 - 'wasserstein' + target : str, optional + Which target to use: "cuda" or "cpu". Default is "cuda". + inline : str, optional + Whether to inline functions: "always" or "never". Default is "never". + fastmath : bool, optional + Whether to use fastmath or not. The default is True. + USE_64 : bool, optional + Whether to use 64 bit or 34 bit types. The default is False. + + Raises + ------ + ValueError + DESCRIPTION. + NotImplementedError + DESCRIPTION. + + Returns + ------- + out : array + A pairwise distance matrix, or if pairs are specified, then a + vector of distances corresponding to the pairs. + + """ + rows, cols_check = U.shape + + if cols_check != cols: + raise KeyError( + "For performance reasons and architecture constraints " + "the number of columns of U (which is the same as V) " + "must be defined as the environment variable: COLUMNS. " + 'However, os.environ["COLUMNS"] does not match the ' + "number of columns in U. Reset the environment variable " + 'via e.g. `os.environ["COLUMNS"] = str(U.shape[0])` ' + "defined at the top of your script and make sure that " + "dist_matrix is reloaded. You may also need to restart " + "Python." + ) + + if V is not None and cols is not V.shape[1]: + raise ValueError("The number of columns for U and V should match") + + # is it a distance matrix between two sets of vectors? + # (rather than within a single set) + isXY = V is not None + + # were pairs specified? (useful for sparse matrix generation) + pairQ = pairs is not None + + # block, grid, and out shape + if pairQ: + block_dim = TPB + npairs = pairs.shape[0] + grid_dim = ceil(npairs / block_dim) + else: + block_dim = (TPB, TPB) + blockspergrid_x = ceil(rows / block_dim[0]) + if isXY: + blockspergrid_y = ceil(V.shape[0] / block_dim[1]) + else: + blockspergrid_y = ceil(rows / block_dim[1]) + grid_dim = (blockspergrid_x, blockspergrid_y) + + # CUDA setup + stream = cuda.stream() + + # sorting and cumulative weights + if metric == "wasserstein": + # presort values (and weights by sorted value indices) + U_sorter = np.argsort(U) + U = np.take_along_axis(U, U_sorter, axis=-1) + U_weights = np.take_along_axis(U_weights, U_sorter, axis=-1) + + # calculate cumulative weights + U_weights = np.cumsum(U_weights, axis=1) + + # prepend a column of zeros + zero = np.zeros((U_weights.shape[0], 1)) + U_weights = np.column_stack((zero, U_weights)) + + # do the same for V and V_weights + if isXY: + V_sorter = np.argsort(V) + V = np.take_along_axis(V, V_sorter, axis=-1) + V_weights = np.take_along_axis(V_weights, V_sorter, axis=-1) + V_weights = np.cumsum(V_weights, axis=1) + V_weights = np.column_stack((zero, V_weights)) + + # assign dummy arrays + if V is None: + V = np.zeros((2, 2)) + + if U_weights is None: + U_weights = np.zeros((2, 2)) + + if V_weights is None: + V_weights = np.zeros((2, 2)) + + if pairs is None: + pairs = np.zeros((2, 2)) + + # assign metric_num based on specified metric (no string support) + metric_dict = {"euclidean": 0, "wasserstein": 1} + metric_num = metric_dict[metric] + + # # same for target_num + # target_dict = {"cuda": 0, "cpu": 1} + # target_num = target_dict[target] + + m = U.shape[0] + + if isXY: + m2 = V.shape[0] + else: + m2 = m + + if pairQ: + shape = (npairs,) + else: + shape = (m, m2) + + # values should be initialized instead of using cuda.device_array + out = np.zeros(shape) + + if target == "cuda": + # copying/allocating to GPU + cU = cuda.to_device(np.asarray(U, dtype=np_float), stream=stream) + + cU_weights = cuda.to_device( + np.asarray(U_weights, dtype=np_float), stream=stream + ) + + if isXY or pairQ: + cV = cuda.to_device(np.asarray(V, dtype=np_float), stream=stream) + cV_weights = cuda.to_device( + np.asarray(V_weights, dtype=np_float), stream=stream + ) + + if pairQ: + cpairs = cuda.to_device(np.asarray(pairs, dtype=np_int), stream=stream) + + cuda_out = cuda.to_device(np.asarray(out, dtype=np_float), stream=stream) + + elif target == "cpu": + cU = U + cV = V + cU_weights = U_weights + cV_weights = V_weights + cpairs = pairs + cuda_out = out + + # distance matrix between two sets of vectors + if isXY and not pairQ: + fn = two_set_distance_matrix + if target == "cuda": + fn = fn[grid_dim, block_dim] + fn(cU, cV, cU_weights, cV_weights, cuda_out, metric_num) + + # specified pairwise distances within single set of vectors + elif not isXY and pairQ: + fn = sparse_distance_matrix + if target == "cuda": + fn = fn[grid_dim, block_dim] + fn(cU, cU, cU_weights, cU_weights, cpairs, cuda_out, metric_num) + + # distance matrix within single set of vectors + elif not isXY and not pairQ: + fn = one_set_distance_matrix + if target == "cuda": + fn = fn[grid_dim, block_dim] + fn(cU, cU_weights, cuda_out, metric_num) + + # specified pairwise distances within single set of vectors + elif isXY and pairQ: + fn = sparse_distance_matrix + if target == "cuda": + fn = fn[grid_dim, block_dim] + fn(cU, cV, cU_weights, cV_weights, cpairs, cuda_out, metric_num) + + out = cuda_out.copy_to_host(stream=stream) + + return out + + +# %% Code Graveyard + +# u_stack = cuda.local.array(cols, nb_int) +# v_stack = cuda.local.array(cols, nb_int) + +# copy(u_sorted, utmp) +# copy(v_sorted, vtmp) + +# copy(u_weights, u_sorted_weights) +# copy(v_weights, v_sorted_weights) + +# stack = [0] * size +# ids = list(range(len(arr))) + +# u_sorted_weights = cuda.local.array(cols, nb_float) +# v_sorted_weights = cuda.local.array(cols, nb_float) + +# sorting stack +# uv_stack = cuda.local.array(tot_cols, nb_int) + +# def sort_cum_prepend(X, X_weights): +# # presort values (and weights by sorted value indices) +# X_sorter = np.argsort(X) +# X = np.take_along_axis(X, X_sorter, axis=-1) +# X_weights = np.take_along_axis(X_weights, X_sorter, axis=-1) +# # calculate cumulative weights +# X_weights = np.cumsum(X_weights) +# # prepend a column of zeros +# zero = np.zeros((X_weights.shape[0], 1)) +# X_weights = np.column_stack((zero, X_weights)) + +# def mean_squared_error(y, y_pred, squared=False): +# """ +# Return mean squared error (MSE) without using sklearn. + +# If squared == True, then return root mean squared error (RMSE). + +# Parameters +# ---------- +# y : 1D numeric array +# "True" or first set of values. +# y_pred : 1D numeric array +# "Predicted" or second set of values. + +# Returns +# ------- +# rmse : numeric scalar +# DESCRIPTION. + +# """ +# mse = np.mean((y - y_pred)**2) +# if squared is True: +# rmse = np.sqrt(mse) +# return rmse +# else: +# return mse +# return mse + +# opt = False +# os.environ["NUMBA_CUDA_DEBUGINFO"] = "0" +# opt = True +# os.environ["NUMBA_BOUNDSCHECK"] = "0" +# os.environ["OPT"] = "0" + + +# HACK +# a "hacky" way to get compatibility between @njit and @cuda.jit +# myjit = get_myjit(target=target, inline=inline) +# mycudajit = get_myjit(device=False, target=target, inline=inline) + +# local array shapes needs to be defined globally due to lack of dynamic +# array allocation support. Don't wrap with np.int32, etc. see +# https://github.com/numba/numba/issues/7314 + +# np.savetxt('100-zeros.csv', +# np.zeros(tmp, dtype=np.int32), +# delimiter=",") +# OK to take cols from .shape +# cols = np.genfromtxt('100-zeros.csv', +# dtype=np.int32, +# delimiter=",").shape[0] + + +# HACK from numba.cuda.kernels.device.myjit import get_myjit +# from numba.cuda.cudadrv.driver import driver +# TPB = driver.get_device().MAX_THREADS_PER_BLOCK # max TPB causes crash.. + + +# os.environ["MACHINE_BITS"] = str(bits) diff --git a/ElM2D/helper.py b/ElM2D/helper.py new file mode 100644 index 0000000..3ff059b --- /dev/null +++ b/ElM2D/helper.py @@ -0,0 +1,355 @@ +"""Helper functions for distance matrix computations.""" +import os +from math import sqrt +from numba import jit +from time import time + +inline = os.environ.get("INLINE", "never") + + +class Timer(object): + """ + Simple timer class. + + https://stackoverflow.com/a/5849861/13697228 + Usage + ----- + with Timer("description"): + # do stuff + """ + + def __init__(self, name=None): + self.name = name + + def __enter__(self): + """Enter the timer.""" + self.tstart = time() + + def __exit__(self, type, value, traceback): + """Exit the timer.""" + if self.name: + print("[%s]" % self.name,) + print(("Elapsed: {}\n").format(round((time() - self.tstart), 5))) + + +@jit(inline=inline) +def copy(a, out): + """ + Copy a into out. + + Parameters + ---------- + a : vector + values to copy. + out : vector + values to copy into. + + Returns + ------- + None. + + """ + for i in range(len(a)): + i = int(i) + out[i] = a[i] + + +@jit(inline=inline) +def insertionSort(arr): + """ + Perform insertion sorting on a vector. + + Source: https://www.geeksforgeeks.org/insertion-sort/ + + Parameters + ---------- + arr : 1D array + Array to be sorted in-place. + + Returns + ------- + None. + + """ + for i in range(1, len(arr)): + key = arr[i] + + # Move elements of arr[0..i-1], that are greater than key, to one + # position ahead of their current position + j = i - 1 + while j >= 0 and key < arr[j]: + arr[j + 1] = arr[j] + j -= 1 + arr[j + 1] = key + + +@jit(inline=inline) +def insertionArgSort(arr, ids): + """ + Perform insertion sorting on a vector and track the sorting indices. + + Source: https://www.geeksforgeeks.org/insertion-sort/ + + Parameters + ---------- + arr : 1D array + Array to be sorted in-place. + + ids : 1D array + Initialized array to fill with sorting indices. + + Returns + ------- + None. + + """ + # fill ids + for i in range(len(arr)): + ids[i] = i + + for i in range(1, len(arr)): + key = arr[i] + id_key = ids[i] + + # Move elements of arr[0..i-1], that are greater than key, to one + # position ahead of their current position + j = i - 1 + while j >= 0 and key < arr[j]: + arr[j + 1] = arr[j] + ids[j + 1] = ids[j] + j -= 1 + arr[j + 1] = key + ids[j + 1] = id_key + + +@jit(inline=inline) +def concatenate(vec, vec2, out): + """ + Concatenate two vectors. + + It also reassigns the values of out back into vec and vec2 to prevent odd + roundoff issues. + + Parameters + ---------- + vec : vector + First vector to concatenate. + vec2 : vector + Second vector to concatenate. + out : vector + Initialization of concatenated vector. + + Returns + ------- + None. + + """ + # number of elements + n = len(vec) + n2 = len(vec2) + # populate out + for i in range(n): + out[i] = vec[i] + for i in range(n2): + out[i + n] = vec2[i] + + +@jit(inline=inline) +def diff(vec, out): + """ + Compute gradient. + + Parameters + ---------- + vec : vector of floats + The vector for which to calculate the gradient. + out : vector of floats + Initialization of output vector (one less element than vec) which + contains the gradient. + + Returns + ------- + None. + + """ + # number of elements + n = len(vec) - 1 + # progressively add next elements + for i in range(n): + out[i] = vec[i + 1] - vec[i] + +# maybe there's a faster implementation somewhere + + +@jit(inline=inline) +def bisect_right(a, v, ids): + """ + Return indices where to insert items in v in list a, assuming a is sorted. + + Parameters + ---------- + arr : 1D array + Array for which to perform a binary search. + v : 1D array + Values to perform the binary search for. + ids : 1D array + Initialized list of indices, same size as v. + + Returns + ------- + None. + + The return value i is such that all e in a[:i] have e <= x, and all e in + a[i:] have e > x. So if x already appears in the list, a.insert(x) will + insert just after the rightmost x already there. Optional args lo + (default 0) and hi (default len(a)) bound the slice of a to be searched. + + Source: modified from + https://github.com/python/cpython/blob/43bab0537ceb6e2ca3597f8f3a3c79733b897434/Lib/bisect.py#L15-L35 # noqa + """ + n = len(a) + n2 = len(v) + for i in range(n2): + lo = 0 + mid = 0 + hi = n + x = v[i] + while lo < hi: + mid = (lo + hi) // 2 + # Use __lt__ to match the logic in list.sort() and in heapq + if x < a[mid]: + hi = mid + else: + lo = mid + 1 + ids[i] = lo + + +@jit(inline=inline) +def sort_by_indices(v, ids, out): + """ + Sort v by ids and assign to out, as in out = v[ids]. + + Parameters + ---------- + v : vector + values to sort. + ids : vector + indices to use for sorting. + out : vector + preallocated vector to put sorted values into. + + Returns + ------- + None. + + """ + for i in range(len(ids)): + idx = ids[i] + out[i] = v[idx] + + +@jit(inline=inline) +def cumsum(vec, out): + """ + Return the cumulative sum of the elements in a vector. + + Referenced https://stackoverflow.com/a/15889203/13697228 + + Parameters + ---------- + vec : 1D array + Vector for which to compute the cumulative sum. + + out: 1D array + Initialized vector which stores the cumulative sum. + + Returns + ------- + out : 1D array + The cumulative sum of elements in vec (same shape as vec). + + """ + # number of elements + n = len(vec) + # initialize + total = 0.0 + # progressively add next elements + for i in range(n): + total += vec[i] + out[i] = total + + +@jit(inline=inline) +def divide(v, b, out): + """ + Divide a vector by a scalar. + + Parameters + ---------- + v : vector + vector numerator. + b : float + scalar denominator. + out : vector + initialized vector to populate with divided values. + + Returns + ------- + None. + + """ + for i in range(len(v)): + out[i] = v[i] / b + + +@jit(inline=inline) +def integrate(u_cdf, v_cdf, deltas, p): + """ + Integrate between two vectors using a p-norm. + + Parameters + ---------- + u_cdf : numeric vector + First CDF. + v_cdf : numeric vector + Second CDF of same length as a. + deltas : numeric vector + The differences between the original vectors a and b. + p : int, optional + The finite power for which to compute the p-norm. The default is 2. + + If p == 1 or p == 2, use of power is avoided, which introduces an overhead + # of about 15% + Source: https://github.com/scipy/scipy/blob/47bb6febaa10658c72962b9615d5d5aa2513fa3a/scipy/stats/stats.py#L8404-L8488 # noqa + However, without support for math.square or np.square (2021-08-19), + math.pow is required for p == 2. + + Returns + ------- + out : numeric scalar + The p-norm of a and b. + + """ + n = len(u_cdf) + out = 0.0 + if p == 1: + for i in range(n): + out += abs(u_cdf[i] - v_cdf[i]) * deltas[i] + elif p == 2: + for i in range(n): + out += ((u_cdf[i] - v_cdf[i]) ** p) * deltas[i] + out = sqrt(out) + elif p > 2: + for i in range(n): + out += (u_cdf[i] - v_cdf[i]) ** p * deltas[i] + out = out ** (1 / p) + return out + + +# %% CODE GRAVEYARD +# from numba.core import config +# config_keys = dir(config) +# if "INLINE" in config_keys: +# inline = config.INLINE +# else: +# inline = "never" diff --git a/ElM2D/metrics.py b/ElM2D/metrics.py new file mode 100644 index 0000000..ef945e9 --- /dev/null +++ b/ElM2D/metrics.py @@ -0,0 +1,291 @@ +""" +Numba/CUDA-compatible distance metrics. + +Created on Wed Sep 8 14:47:43 2021 + +@author: sterg +""" +import os +import numpy as np +import helper as hp +from math import sqrt +from numba import cuda, jit # noqa +from numba.types import int32, float32, int64, float64 # noqa + +inline = os.environ.get("INLINE", "never") +fastmath = bool(os.environ.get("FASTMATH", "1")) +cols = os.environ.get("COLUMNS") # 121 for ElM2D repo +USE_64 = bool(os.environ.get("USE_64", "0")) +target = os.environ.get("TARGET", "cuda") + + +if USE_64 is None: + USE_64 = False +if USE_64: + bits = 64 + nb_float = float64 + nb_int = int64 + np_float = np.float64 + np_int = np.int64 +else: + bits = 32 + nb_float = float32 + nb_int = int32 + np_float = np.float32 + np_int = np.int32 + +if cols is not None: + cols = int(cols) + cols_plus_1 = cols + 1 + tot_cols = cols * 2 + tot_cols_minus_1 = tot_cols - 1 +else: + raise KeyError( + "For performance reasons and architecture constraints " + "the number of columns of U (which is the same as V) " + "must be defined as the environment variable, COLUMNS, " + 'via e.g. `os.environ["COLUMNS"] = "100"`.' + ) + + +@jit(inline=inline) +def cdf_distance( + u, v, u_weights, v_weights, p, presorted, cumweighted, prepended +): # noqa + r""" # noqa + Compute distance between two 1D distributions :math:`u` and :math:`v`. + + The respective CDFs are :math:`U` and :math:`V`, and the + statistical distance is defined as: + .. math:: + l_p(u, v) = \left( \int_{-\infty}^{+\infty} |U-V|^p \right)^{1/p} + p is a positive parameter; p = 1 gives the Wasserstein distance, + p = 2 gives the energy distance. + + Parameters + ---------- + u_values, v_values : array_like + Values observed in the (empirical) distribution. + u_weights, v_weights : array_like + Weight for each value. + `u_weights` (resp. `v_weights`) must have the same length as + `u_values` (resp. `v_values`). If the weight sum differs from + 1, it must still be positive and finite so that the weights can + be normalized to sum to 1. + p : scalar + positive parameter that determines the type of cdf distance. + presorted : bool + Whether u and v have been sorted already *and* u_weights and + v_weights have been sorted using the same indices used to sort + u and v, respectively. + cumweighted : bool + Whether u_weights and v_weights have been converted to their + cumulative weights via e.g. np.cumsum(). + prepended : bool + Whether a zero has been prepended to accumated, sorted + u_weights and v_weights. + + By setting presorted, cumweighted, *and* prepended to False, the + computationproceeds proceeds in the same fashion as _cdf_distance + from scipy.stats. + + Returns + ------- + distance : float + The computed distance between the distributions. + + Notes + ----- + The input distributions can be empirical, therefore coming from + samples whose values are effectively inputs of the function, or + they can be seen as generalized functions, in which case they are + weighted sums of Dirac delta functions located at the specified + values. + + References + ---------- + .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, + Hoyer, Munos "The Cramer Distance as a Solution to Biased + Wasserstein Gradients" (2017). :arXiv:`1705.10743`. + """ + # allocate local float arrays + # combined vector + uv = cuda.local.array(tot_cols, nb_float) + uv_deltas = cuda.local.array(tot_cols_minus_1, nb_float) + + # CDFs + u_cdf = cuda.local.array(tot_cols_minus_1, nb_float) + v_cdf = cuda.local.array(tot_cols_minus_1, nb_float) + + # allocate local int arrays + # CDF indices via binary search + u_cdf_indices = cuda.local.array(tot_cols_minus_1, nb_int) + v_cdf_indices = cuda.local.array(tot_cols_minus_1, nb_int) + + u_cdf_sorted_cumweights = cuda.local.array(tot_cols_minus_1, nb_float) + v_cdf_sorted_cumweights = cuda.local.array(tot_cols_minus_1, nb_float) + + # short-circuit + if presorted and cumweighted and prepended: + u_sorted = u + v_sorted = v + + u_0_cumweights = u_weights + v_0_cumweights = v_weights + + # sorting, accumulating, and prepending (for compatibility) + else: + # check arguments + if not presorted and (cumweighted or prepended): + raise ValueError( + "if cumweighted or prepended are True, then presorted cannot be False" + ) # noqa + + if (not presorted or not cumweighted) and prepended: + raise ValueError( + "if prepended is True, then presorted and cumweighted must both be True" + ) # noqa + + # sorting + if not presorted: + # local arrays + u_sorted = cuda.local.array(cols, np_float) + v_sorted = cuda.local.array(cols, np_float) + + u_sorter = cuda.local.array(cols, nb_int) + v_sorter = cuda.local.array(cols, nb_int) + + u_sorted_weights = cuda.local.array(cols, nb_float) + v_sorted_weights = cuda.local.array(cols, nb_float) + + # local copy since quickArgSortIterative sorts in-place + hp.copy(u, u_sorted) + hp.copy(v, v_sorted) + + # sorting + hp.insertionArgSort(u_sorted, u_sorter) + hp.insertionArgSort(v_sorted, v_sorter) + + # inplace to avoid extra cuda local array + hp.sort_by_indices(u_weights, u_sorter, u_sorted_weights) + hp.sort_by_indices(u_weights, u_sorter, v_sorted_weights) + + # cumulative weights + if not cumweighted: + # local arrays + u_cumweights = cuda.local.array(cols, nb_float) + v_cumweights = cuda.local.array(cols, nb_float) + # accumulate + hp.cumsum(u_sorted_weights, u_cumweights) + hp.cumsum(v_sorted_weights, v_cumweights) + + # prepend weights with zero + if not prepended: + zero = cuda.local.array(1, nb_float) + + u_0_cumweights = cuda.local.array(cols_plus_1, nb_float) + v_0_cumweights = cuda.local.array(cols_plus_1, nb_float) + + hp.concatenate(zero, u_cumweights, u_0_cumweights) + hp.concatenate(zero, v_cumweights, v_0_cumweights) + + # concatenate u and v into uv + hp.concatenate(u_sorted, v_sorted, uv) + + # sorting + # quickSortIterative(uv, uv_stack) + hp.insertionSort(uv) + + # Get the respective positions of the values of u and v among the + # values of both distributions. See also np.searchsorted + hp.bisect_right(u_sorted, uv[:-1], u_cdf_indices) + hp.bisect_right(v_sorted, uv[:-1], v_cdf_indices) + + # empirical CDFs + hp.sort_by_indices(u_0_cumweights, u_cdf_indices, u_cdf_sorted_cumweights) + hp.divide(u_cdf_sorted_cumweights, u_0_cumweights[-1], u_cdf) + + hp.sort_by_indices(v_0_cumweights, v_cdf_indices, v_cdf_sorted_cumweights) + hp.divide(v_cdf_sorted_cumweights, v_0_cumweights[-1], v_cdf) + + # # Integration + hp.diff(uv, uv_deltas) # See also np.diff + + out = hp.integrate(u_cdf, v_cdf, uv_deltas, p) + + return out + + +@jit(inline=inline) +def wasserstein_distance( + u, v, u_weights, v_weights, presorted, cumweighted, prepended +): # noqa + r""" + Compute the first Wasserstein distance between two 1D distributions. + + This distance is also known as the earth mover's distance, since it can be + seen as the minimum amount of "work" required to transform :math:`u` into + :math:`v`, where "work" is measured as the amount of distribution weight + that must be moved, multiplied by the distance it has to be moved. + + Source + ------ + https://github.com/scipy/scipy/blob/47bb6febaa10658c72962b9615d5d5aa2513fa3a/scipy/stats/stats.py#L8245-L8319 # noqa + + Parameters + ---------- + u_values, v_values : array_like + Values observed in the (empirical) distribution. + u_weights, v_weights : array_like, optional + Weight for each value. If unspecified, each value is assigned the same + weight. + `u_weights` (resp. `v_weights`) must have the same length as + `u_values` (resp. `v_values`). If the weight sum differs from 1, it + must still be positive and finite so that the weights can be normalized + to sum to 1. + + Returns + ------- + distance : float + The computed distance between the distributions. + + Notes + ----- + The input distributions can be empirical, therefore coming from samples + whose values are effectively inputs of the function, or they can be seen as + generalized functions, in which case they are weighted sums of Dirac delta + functions located at the specified values. + + References + ---------- + .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, Hoyer, + Munos "The Cramer Distance as a Solution to Biased Wasserstein + Gradients" (2017). :arXiv:`1705.10743`. + """ + return cdf_distance( + u, v, u_weights, v_weights, np_int(1), presorted, cumweighted, prepended + ) # noqa + + +@jit(inline=inline) +def euclidean_distance(a, b): + """ + Calculate Euclidean distance between vectors a and b. + + Parameters + ---------- + a : 1D array + First vector. + b : 1D array + Second vector. + + Returns + ------- + d : numeric scalar + Euclidean distance between vectors a and b. + """ + d = 0 + for i in range(len(a)): + d += (b[i] - a[i]) ** 2 + d = sqrt(d) + return d diff --git a/ElM2D/njit_dist_matrix.py b/ElM2D/njit_dist_matrix.py new file mode 100644 index 0000000..06d8561 --- /dev/null +++ b/ElM2D/njit_dist_matrix.py @@ -0,0 +1,369 @@ +""" +Nopython version of dist_matrix. + +Author: Sterling Baird +""" +from numba import prange, njit +from numba.types import float32, int32 +import numpy as np + +np_int = np.int32 +np_float = np.float32 + +nb_int = int32 +nb_float = float32 + +fastmath = True +parallel = True +debug = False + + +@njit(fastmath=fastmath, debug=debug) +def wasserstein_distance( + u_values, v_values, u_weights=None, v_weights=None, p=1, presorted=False +): + r""" + Compute first Wasserstein distance. + + Compute, between two one-dimensional distributions :math:`u` and + :math:`v`, whose respective CDFs are :math:`U` and :math:`V`, the + statistical distance that is defined as: + .. math:: + l_p(u, v) = \left( \int_{-\infty}^{+\infty} |U-V|^p \right)^{1/p} + p is a positive parameter; p = 1 gives the Wasserstein distance, p = 2 + gives the energy distance. + + Source: + https://github.com/scipy/scipy/blob/47bb6febaa10658c72962b9615d5d5aa2513fa3a/scipy/stats/stats.py#L8404 + + Parameters + ---------- + u_values, v_values : array_like + Values observed in the (empirical) distribution. + u_weights, v_weights : array_like, optional + Weight for each value. If unspecified, each value is assigned the same + weight. + `u_weights` (resp. `v_weights`) must have the same length as + `u_values` (resp. `v_values`). If the weight sum differs from 1, it + must still be positive and finite so that the weights can be normalized + to sum to 1. + + Returns + ------- + distance : float + The computed distance between the distributions. + + Notes + ----- + The input distributions can be empirical, therefore coming from samples + whose values are effectively inputs of the function, or they can be seen as + generalized functions, in which case they are weighted sums of Dirac delta + functions located at the specified values. + + References + ---------- + .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, Hoyer, + Munos "The Cramer Distance as a Solution to Biased Wasserstein + Gradients" (2017). :arXiv:`1705.10743`. + """ + if not presorted: + u_sorter = np.argsort(u_values) + v_sorter = np.argsort(v_values) + + u_values = u_values[u_sorter] + v_values = v_values[v_sorter] + + u_weights = u_weights[u_sorter] + v_weights = v_weights[v_sorter] + + all_values = np.concatenate((u_values, v_values)) + # all_values.sort(kind='mergesort') + all_values.sort() + + # Compute the differences between pairs of successive values of u and v. + deltas = np.diff(all_values) + + # Get the respective positions of the values of u and v among the values of + # both distributions. + u_cdf_indices = np.searchsorted(u_values, all_values[:-1], side="right") + v_cdf_indices = np.searchsorted(v_values, all_values[:-1], side="right") + + zero = np.array([0]) + # Calculate the CDFs of u and v using their weights, if specified. + if u_weights is None: + u_cdf = u_cdf_indices / u_values.size + else: + uw_cumsum = np.cumsum(u_weights) + u_sorted_cumweights = np.concatenate((zero, uw_cumsum)) + u_cdf = u_sorted_cumweights[u_cdf_indices] / u_sorted_cumweights[-1] + + if v_weights is None: + v_cdf = v_cdf_indices / v_values.size + else: + vw_cumsum = np.cumsum(v_weights) + v_sorted_cumweights = np.concatenate((zero, vw_cumsum)) + v_cdf = v_sorted_cumweights[v_cdf_indices] / v_sorted_cumweights[-1] + + # Compute the value of the integral based on the CDFs. + # If p = 1 or p = 2, we avoid using np.power, which introduces an overhead + # of about 15%. + if p == 1: + return np.sum(np.multiply(np.abs(u_cdf - v_cdf), deltas)) + if p == 2: + return np.sqrt(np.sum(np.multiply(np.power(u_cdf - v_cdf, 2), deltas))) + return np.power( + np.sum(np.multiply(np.power(np.abs(u_cdf - v_cdf), p), deltas)), 1 / p + ) + + +@njit(fastmath=fastmath, debug=debug) +def euclidean_distance(a, b): + """ + Calculate Euclidean distance between vectors a and b. + + Parameters + ---------- + a : 1D array + First vector. + b : 1D array + Second vector. + + Returns + ------- + d : numeric scalar + Euclidean distance between vectors a and b. + """ + d = 0 + for i in range(len(a)): + d += (b[i] - a[i]) ** 2 + d = np.sqrt(d) + return d + + +@njit(fastmath=fastmath, debug=debug) +def compute_distance(u, v, u_weights, v_weights, metric_num): + """ + Calculate weighted distance between two vectors, u and v. + + Parameters + ---------- + u : 1D array of float + First vector. + v : 1D array of float + Second vector. + u_weights : 1D array of float + Weights for u. + v_weights : 1D array of float + Weights for v. + metric_num : int + Which metric to use (0 == "euclidean", 1=="wasserstein"). + + Raises + ------ + NotImplementedError + "Specified metric is mispelled or has not been implemented yet. + If not implemented, consider submitting a pull request." + + Returns + ------- + d : float + Weighted distance between u and v. + + """ + if metric_num == 0: + # d = np.linalg.norm(vec - vec2) + d = euclidean_distance(u, v) + elif metric_num == 1: + # d = my_wasserstein_distance(vec, vec2) + d = wasserstein_distance( + u, v, u_weights=u_weights, v_weights=v_weights, p=1, presorted=True + ) + else: + raise NotImplementedError( + "Specified metric is mispelled or has not been implemented yet. \ + If not implemented, consider submitting a pull request." + ) + return d + + +@njit(fastmath=fastmath, parallel=parallel, debug=debug) +def sparse_distance_matrix(U, V, U_weights, V_weights, pairs, out, isXY, metric_num): + """ + Calculate sparse pairwise distances between two sets of vectors for pairs. + + Parameters + ---------- + mat : numeric cuda array + First set of vectors for which to compute a single pairwise distance. + mat2 : numeric cuda array + Second set of vectors for which to compute a single pairwise distance. + pairs : cuda array of 2-tuples + All pairs for which distances are to be computed. + out : numeric cuda array + The initialized array which will be populated with distances. + + Raises + ------ + ValueError + Both matrices should have the same number of columns. + + Returns + ------- + None. + + """ + npairs = pairs.shape[0] + + for k in prange(npairs): + pair = pairs[k] + i, j = pair + + u = U[i] + v = V[j] + uw = U_weights[i] + vw = V_weights[j] + + d = compute_distance(u, v, uw, vw, metric_num) + out[k] = d + + +@njit(fastmath=fastmath, parallel=parallel, debug=debug) +def one_set_distance_matrix(U, U_weights, out, metric_num): + """ + Calculate pairwise distances within single set of vectors. + + Parameters + ---------- + U : 2D array of float + Vertically stacked vectors. + U_weights : 2D array of float + Vertically stacked weight vectors. + out : 2D array of float + Initialized matrix to populate with pairwise distances. + metric_num : int + Which metric to use (0 == "euclidean", 1=="wasserstein"). + + Returns + ------- + None. + + """ + dm_rows = U.shape[0] + dm_cols = U.shape[0] + + for i in prange(dm_rows): + for j in range(dm_cols): + if i < j: + u = U[i] + v = U[j] + uw = U_weights[i] + vw = U_weights[j] + d = compute_distance(u, v, uw, vw, metric_num) + out[i, j] = d + out[j, i] = d + + +@njit(fastmath=fastmath, parallel=parallel, debug=debug) +def two_set_distance_matrix(U, V, U_weights, V_weights, out, metric_num): + """Calculate distance matrix between two sets of vectors.""" + dm_rows = U.shape[0] + dm_cols = V.shape[0] + + for i in prange(dm_rows): + for j in range(dm_cols): + u = U[i] + v = V[j] + uw = U_weights[i] + vw = V_weights[j] + d = compute_distance(u, v, uw, vw, metric_num) + out[i, j] = d + + +def dist_matrix( + U, V=None, U_weights=None, V_weights=None, pairs=None, metric="euclidean" +): + """ + Compute pairwise distances using Numba/CUDA. + + Parameters + ---------- + mat : array + First set of vectors for which to compute pairwise distances. + + mat2 : array, optional + Second set of vectors for which to compute pairwise distances. If not specified, + then mat2 is a copy of mat. + + pairs : array, optional + List of 2-tuples which contain the indices for which to compute distances for. + If mat2 was specified, then the second index accesses mat2 instead of mat. + If not specified, then the pairs are auto-generated. If mat2 was specified, + all combinations of the two vector sets are used. If mat2 isn't specified, + then only the upper triangle (minus diagonal) pairs are computed. + + metric : str, optional + Possible options are 'euclidean', 'wasserstein'. + Defaults to Euclidean distance. These are converted to integers internally + due to Numba's lack of support for string arguments (2021-08-14). + See compute_distance() for other keys. For example, 0 corresponds to Euclidean + distance and 1 corresponds to Wasserstein distance. + + Returns + ------- + out : array + A pairwise distance matrix, or if pairs are specified, then a vector of + distances corresponding to the pairs. + + """ + # is it distance matrix between two sets of vectors rather than within a single set? + isXY = V is not None + + # were pairs specified? (useful for sparse matrix generation) + pairQ = pairs is not None + + # assign metric_num based on specified metric (Numba doesn't support strings) + metric_dict = {"euclidean": 0, "wasserstein": 1} + metric_num = metric_dict[metric] + + m = U.shape[0] + + if isXY: + m2 = V.shape[0] + else: + m2 = m + + if pairQ: + npairs = pairs.shape[0] + shape = (npairs, 1) + else: + shape = (m, m2) + + if metric == "wasserstein": + U_sorter = np.argsort(U) + U = np.take_along_axis(U, U_sorter, axis=-1) + U_weights = np.take_along_axis(U_weights, U_sorter, axis=-1) + + if isXY: + V_sorter = np.argsort(V) + V = np.take_along_axis(V, V_sorter, axis=-1) + V_weights = np.take_along_axis(V_weights, V_sorter, axis=-1) + + out = np.zeros(shape, dtype=np_float) + + if isXY and not pairQ: + # distance matrix between two sets of vectors + two_set_distance_matrix(U, V, U_weights, V_weights, out, metric_num) + + elif not isXY and pairQ: + # specified pairwise distances within single set of vectors + sparse_distance_matrix(U, U, U_weights, U_weights, pairs, out, isXY, metric_num) + + elif not isXY and not pairQ: + # distance matrix within single set of vectors + one_set_distance_matrix(U, U_weights, out, metric_num) + + elif isXY and pairQ: + # specified pairwise distances between two sets of vectors + sparse_distance_matrix(U, V, U_weights, V_weights, pairs, out, isXY, metric_num) + + return out diff --git a/ElM2D/setup.cfg b/ElM2D/setup.cfg new file mode 100644 index 0000000..08aedd7 --- /dev/null +++ b/ElM2D/setup.cfg @@ -0,0 +1,2 @@ +[metadata] +description_file = README.md diff --git a/ElM2D/test_ElM2D.py b/ElM2D/test_ElM2D.py new file mode 100644 index 0000000..e5daca6 --- /dev/null +++ b/ElM2D/test_ElM2D.py @@ -0,0 +1,55 @@ +""" +Test Element Mover's 2D Distance Matrix via network simplex and "wasserstein" methods. + +This test ensures that the fast implementation "wasserstein" produces "close" values +to the original network simplex method. + +Note: "network_simplex" emd_algorithm produces a "freeze_support()" error in +Spyder IPython and Spyder external terminal without the if __name == "__main__" on +Windows. This is likely due to the way Pool is used internally within ElM2D. + +Created on Mon Sep 6 22:02:36 2021 + +@author: sterg +""" +import unittest +from numpy.testing import assert_allclose +from helper import Timer +import pandas as pd + + +class Testing(unittest.TestCase): + def test_dm_close(self): + from ElM2D import ElM2D + + mapper = ElM2D() + + # df = pd.read_csv("train-debug.csv") + df = pd.read_csv("stable-mp.csv") + formulas = df["formula"] + sub_formulas = formulas[0:500] + with Timer("fit-cuda-wasserstein"): + mapper.fit(sub_formulas, target="cuda") + dm_wasserstein = mapper.dm + + with Timer("fit-cpu-wasserstein"): + mapper.fit(sub_formulas) + dm_wasserstein = mapper.dm + + mapper2 = ElM2D(emd_algorithm="network_simplex") + + with Timer("fit-network_simplex"): + mapper2.fit(sub_formulas) + dm_network = mapper2.dm + assert_allclose( + dm_wasserstein, + dm_network, + atol=1e-3, + err_msg="wasserstein did not match network simplex.", + ) + + +if __name__ == "__main__": + unittest.main() +# network_simplex gives a strange error (freeze_support() on Spyder IPython); +# however dist_matrix does not give this error diff --git a/ElM2D/test_dist_matrix.py b/ElM2D/test_dist_matrix.py new file mode 100644 index 0000000..ade816e --- /dev/null +++ b/ElM2D/test_dist_matrix.py @@ -0,0 +1,291 @@ +# -*- coding: utf-8 -*- +"""Test distance matrix calculations using CUDA/Numba.""" +from scipy.spatial.distance import euclidean, cdist +from scipy.stats import wasserstein_distance as scipy_wasserstein_distance +from numpy.testing import assert_allclose +import numpy as np +import os +from importlib import reload + +# os.environ["NUMBA_DISABLE_JIT"] = "1" + +# number of columns of U and V must be set as env var before import dist_matrix +cols = 100 +os.environ["COLUMNS"] = str(cols) + +# other environment variables (set before importing dist_matrix) +os.environ["USE_64"] = "0" +os.environ["INLINE"] = "never" +os.environ["FASTMATH"] = "1" +os.environ["TARGET"] = "cuda" + +from helper import Timer # noqa +from numba.cuda.testing import unittest, CUDATestCase # noqa +import cuda_dist_matrix # noqa + +# to overwrite env vars (source: https://stackoverflow.com/a/1254379/13697228) +reload(cuda_dist_matrix) +dist_matrix = cuda_dist_matrix.dist_matrix + +verbose_test = True + + +class TestDistMat(CUDATestCase): + """Test distance matrix calculations on GPU for various metrics.""" + + def test_dist_matrix(self): + """ + Loop through distance metrics and perform unit tests. + + The four test cases are: + pairwise distances within a single set of vectors + pairwise distances between two sets of vectors + sparse pairwise distances within a single set of vectors + sparse pairwise distances between two sets of vectors + + The ground truth for Euclidean comes from cdist. + The ground truth for Earth Mover's distance (1-Wasserstein) is via + a scipy.stats function. + + Helper functions are used to generate test data and support the use of + Wasserstein distances in cdist. + + Returns + ------- + None. + + """ + + def test_data(rows, cols): + """ + Generate seeded test values and weights for two distributions. + + Returns + ------- + U : 2D array + Values of first distribution. + V : 2D array + Values of second distribution. + U_weights : 2D array + Weights of first distribution. + V_weights : 2D array + Weights of second distribution. + + """ + np.random.seed(42) + [U, V, U_weights, V_weights] = [ + np.random.rand(rows, cols) for i in range(4) + ] + return U, V, U_weights, V_weights + + def my_wasserstein_distance(u_uw, v_vw): + """ + Return Earth Mover's distance using concatenated values and weights. + + Parameters + ---------- + u_uw : 1D numeric array + Horizontally stacked values and weights of first distribution. + v_vw : TYPE + Horizontally stacked values and weights of second distribution. + + Returns + ------- + distance : numeric scalar + Earth Mover's distance given two distributions. + + """ + # split into values and weights + n = len(u_uw) + i = n // 2 + u = u_uw[0:i] + uw = u_uw[i:n] + v = v_vw[0:i] + vw = v_vw[i:n] + # calculate distance + distance = scipy_wasserstein_distance(u, v, u_weights=uw, v_weights=vw) + return distance + + def join_wasserstein(U, V, Uw, Vw): + """ + Horizontally stack values and weights for each distribution. + + Weights are added as additional columns to values. + + Example: + u_uw, v_vw = join_wasserstein(u, v, uw, vw) + d = my_wasserstein_distance(u_uw, v_vw) + cdist(u_uw, v_vw, metric=my_wasserstein_distance) + + Parameters + ---------- + u : 1D or 2D numeric array + First set of distribution values. + v : 1D or 2D numeric array + Second set of values of distribution values. + uw : 1D or 2D numeric array + Weights for first distribution. + vw : 1D or 2D numeric array + Weights for second distribution. + + Returns + ------- + u_uw : 1D or 2D numeric array + Horizontally stacked values and weights of first distribution. + v_vw : TYPE + Horizontally stacked values and weights of second distribution. + + """ + U_Uw = np.concatenate((U, Uw), axis=1) + V_Vw = np.concatenate((V, Vw), axis=1) + return U_Uw, V_Vw + + tol = 1e-5 + + # test and production data + pairs = np.array([(0, 1), (1, 2), (2, 3)]) + i, j = pairs[0] + + for testQ in [True, False]: + # large # of rows with testQ == True is slow due to use of + # cdist and non-jitted scipy-wasserstein + if testQ: + rows = 6 + else: + rows = 5000 + print("====[PARTIAL TEST WITH {} ROWS]====".format(rows)) + U, V, U_weights, V_weights = test_data(rows, cols) + Utest, Vtest, Uwtest, Vwtest = [ + x[0:6] for x in [U, V, U_weights, V_weights] + ] # noqa + + for target in ["cuda"]: # "cpu" not implemented + for metric in ["euclidean", "wasserstein"]: + print("[" + target.upper() + "_" + metric.upper() + "]") + + if testQ: + # compile + dist_matrix(Utest, U_weights=Uwtest, metric=metric) + with Timer("one set"): + one_set = dist_matrix( + U, U_weights=U_weights, metric=metric + ) # noqa + if verbose_test: + print(one_set, "\n") + + # compile + dist_matrix( + Utest, + V=Vtest, + U_weights=Uwtest, + V_weights=Vwtest, + metric=metric, + ) + with Timer("two set"): + two_set = dist_matrix( + U, + V=V, + U_weights=U_weights, + V_weights=V_weights, + metric=metric, + ) # noqa + if testQ and verbose_test: + print(two_set, "\n") + + one_set_sparse = dist_matrix( + U, U_weights=U_weights, pairs=pairs, metric=metric + ) # noqa + if testQ and verbose_test: + print(one_set_sparse, "\n") + + two_set_sparse = dist_matrix( + U, + V=V, + U_weights=U_weights, + V_weights=V_weights, + pairs=pairs, + metric=metric, + ) + if testQ and verbose_test: + print(two_set_sparse, "\n") + + if testQ: + if metric == "euclidean": + with Timer("one set check (cdist)"): + one_set_check = cdist(U, U) + with Timer("two set check (cdist)"): + two_set_check = cdist(U, V) + + one_sparse_check = [euclidean(U[i], U[j]) for i, j in pairs] + two_sparse_check = [euclidean(U[i], V[j]) for i, j in pairs] + + elif metric == "wasserstein": + U_Uw, V_Vw = join_wasserstein(U, V, U_weights, V_weights) + + with Timer("one set check (cdist)"): + one_set_check = cdist( + U_Uw, U_Uw, metric=my_wasserstein_distance + ) + with Timer("two set check (cdist)"): + two_set_check = cdist( + U_Uw, V_Vw, metric=my_wasserstein_distance + ) + + one_sparse_check = [ + my_wasserstein_distance(U_Uw[i], U_Uw[j]) # noqa + for i, j in pairs + ] + two_sparse_check = [ + my_wasserstein_distance(U_Uw[i], V_Vw[j]) # noqa + for i, j in pairs + ] + + # check results + assert_allclose( + one_set.ravel(), + one_set_check.ravel(), + rtol=tol, + err_msg="one set {} {} distance matrix inaccurate".format( + target, metric + ), + ) # noqa + assert_allclose( + two_set.ravel(), + two_set_check.ravel(), + rtol=tol, + err_msg="two set {} {} distance matrix inaccurate".format( + target, metric + ), + ) # noqa + assert_allclose( + one_set_sparse, + one_sparse_check, + rtol=tol, + err_msg="one set {} {} sparse distance matrix \ + inaccurate".format( + target, metric + ), + ) # noqa + assert_allclose( + two_set_sparse, + two_sparse_check, + rtol=tol, + err_msg="two set {} {} distance matrix inaccurate".format( + target, metric + ), + ) # noqa + elif metric == "euclidean": + with Timer("two set check (cdist)"): + two_set_check = cdist(U, V) + + +if __name__ == "__main__": + unittest.main() + + +# TF = [ +# np.allclose(one_set, one_set_check), +# np.allclose(two_set, two_set_check), +# np.allclose(one_set_sparse, one_sparse_check), +# np.allclose(two_set_sparse, two_sparse_check) +# ] diff --git a/ElM2D/test_helper.py b/ElM2D/test_helper.py new file mode 100644 index 0000000..67383ca --- /dev/null +++ b/ElM2D/test_helper.py @@ -0,0 +1,83 @@ +""" +Test helper functions for distance matrix calculations. +""" +import os +import numpy as np +from numba.cuda.testing import unittest, CUDATestCase +import numba.cuda.kernels.device.helper as hp +from numpy.testing import assert_allclose, assert_equal + +bits = int(os.environ.get("MACHINE_BITS", "32")) + +if bits == 32: + np_float = np.float32 + np_int = np.float32 +elif bits == 64: + np_float = np.float64 + np_int = np.float64 + +tol = 1e-5 + + +class TestHelperFunctions(CUDATestCase): + def test_concatenate(self): + vec = np.random.rand(5) + vec2 = np.random.rand(10) + check = np.concatenate((vec, vec2)) + out = np.zeros(len(vec) + len(vec2), dtype=np_float) + hp.concatenate(vec, vec2, out) + + assert_allclose(out, check) + vec[0] = 100.0 + assert_allclose(out, check) + + def test_diff(self): + vec = np.random.rand(10) + out = np.zeros(len(vec) - 1) + hp.diff(vec, out) + check = np.diff(vec) + assert_equal(out, check) + + def test_bisect_right(self): + a = np.random.rand(10) + a.sort() + v = np.random.rand(5) + ids = np.zeros_like(v[:-1], dtype=np_int) + hp.bisect_right(a, v[:-1], ids) + check = np.searchsorted(a, v[:-1], side="right") + assert_equal(ids, check) + + def test_sort_by_indices(self): + v = np.random.rand(10) + ids = np.arange(len(v)) + np.random.shuffle(ids) + out = np.zeros_like(v) + hp.sort_by_indices(v, ids, out) + check = v[ids] + assert_equal(out, check) + + v = np.array([0.0, 0.5528931, 1.1455898, 1.5933731]) + ids = np.array([1, 2, 3, 3, 3]) + out = np.zeros_like(ids, dtype=np_float) + hp.sort_by_indices(v, ids, out) + check = v[ids] + assert_allclose(out, check) + + def test_cumsum(self): + vec = np.random.rand(10) + out = np.zeros_like(vec, dtype=np_float) + hp.cumsum(vec, out) + check = np.cumsum(vec) + assert_allclose(out, check) + + def test_divide(self): + v = np.random.rand(10) + b = 4.62 + out = np.zeros_like(v, dtype=np_float) + hp.divide(v, b, out) + check = v / b + assert_allclose(out, check) + + +if __name__ == '__main__': + unittest.main() diff --git a/ElM2D/test_njit.py b/ElM2D/test_njit.py new file mode 100644 index 0000000..924f0ab --- /dev/null +++ b/ElM2D/test_njit.py @@ -0,0 +1,220 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Sep 8 01:36:51 2021 + +@author: sterg +""" + +from scipy.stats import wasserstein_distance as scipy_wasserstein_distance +from scipy.spatial.distance import cdist + +# generate test data +np.random.seed(42) +rows = 200 +cols = 100 +[U, V, U_weights, V_weights] = [np.random.rand(rows, cols) for i in range(4)] + +testpairs = np.array([(1, 2), (2, 3), (3, 4)]) + +# %% unit tests +one_set_check = scipy_wasserstein_distance( + U[0], U[1], u_weights=U_weights[0], v_weights=U_weights[1] +) +two_set_check = scipy_wasserstein_distance( + U[0], V[0], u_weights=U_weights[0], v_weights=V_weights[0] +) +i, j = pairs[0] +one_sparse_check = scipy_wasserstein_distance( + U[i], U[j], u_weights=U_weights[i], v_weights=U_weights[j] +) +two_sparse_check = scipy_wasserstein_distance( + U[i], V[j], u_weights=U_weights[i], v_weights=V_weights[j] +) + +tol = 1e-6 +print( + abs(one_set[0, 1] - one_set_check) < tol, + abs(two_set[0, 0] - two_set_check) < tol, + abs(one_set_sparse[0, 0] - one_sparse_check) < tol, + abs(two_set_sparse[0, 0] - two_sparse_check) < tol, +) + + +# %% wasserstein helper functions +def my_wasserstein_distance(u_uw, v_vw): + """ + Return Earth Mover's distance using concatenated values and weights. + + Parameters + ---------- + u_uw : 1D numeric array + Horizontally stacked values and weights of first distribution. + v_vw : TYPE + Horizontally stacked values and weights of second distribution. + + Returns + ------- + distance : numeric scalar + Earth Mover's distance given two distributions. + + """ + # split into values and weights + n = len(u_uw) + i = n // 2 + u = u_uw[0:i] + uw = u_uw[i:n] + v = v_vw[0:i] + vw = v_vw[i:n] + # calculate distance + distance = wasserstein_distance(u, v, u_weights=uw, v_weights=vw) + return distance + + +def join_wasserstein(U, V, Uw, Vw): + """ + Horizontally stack values and weights for each distribution. + + Weights are added as additional columns to values. + + Example: + u_uw, v_vw = join_wasserstein(u, v, uw, vw) + d = my_wasserstein_distance(u_uw, v_vw) + cdist(u_uw, v_vw, metric=my_wasserstein_distance) + + Parameters + ---------- + u : 1D or 2D numeric array + First set of distribution values. + v : 1D or 2D numeric array + Second set of values of distribution values. + uw : 1D or 2D numeric array + Weights for first distribution. + vw : 1D or 2D numeric array + Weights for second distribution. + + Returns + ------- + u_uw : 1D or 2D numeric array + Horizontally stacked values and weights of first distribution. + v_vw : TYPE + Horizontally stacked values and weights of second distribution. + + """ + U_Uw = np.concatenate((U, Uw), axis=1) + V_Vw = np.concatenate((V, Vw), axis=1) + return U_Uw, V_Vw + + +@njit(fastmath=fastmath, debug=debug) +def wasserstein_distance_check(u_values, v_values, u_weights=None, v_weights=None, p=1): + r""" + Compute first + + Compute, between two one-dimensional distributions :math:`u` and + :math:`v`, whose respective CDFs are :math:`U` and :math:`V`, the + statistical distance that is defined as: + .. math:: + l_p(u, v) = \left( \int_{-\infty}^{+\infty} |U-V|^p \right)^{1/p} + p is a positive parameter; p = 1 gives the Wasserstein distance, p = 2 + gives the energy distance. + + Parameters + ---------- + u_values, v_values : array_like + Values observed in the (empirical) distribution. + u_weights, v_weights : array_like, optional + Weight for each value. If unspecified, each value is assigned the same + weight. + `u_weights` (resp. `v_weights`) must have the same length as + `u_values` (resp. `v_values`). If the weight sum differs from 1, it + must still be positive and finite so that the weights can be normalized + to sum to 1. + + Returns + ------- + distance : float + The computed distance between the distributions. + + Notes + ----- + The input distributions can be empirical, therefore coming from samples + whose values are effectively inputs of the function, or they can be seen as + generalized functions, in which case they are weighted sums of Dirac delta + functions located at the specified values. + + References + ---------- + .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, Hoyer, + Munos "The Cramer Distance as a Solution to Biased Wasserstein + Gradients" (2017). :arXiv:`1705.10743`. + """ + u_sorter = np.argsort(u_values) + v_sorter = np.argsort(v_values) + + all_values = np.concatenate((u_values, v_values)) + all_values.sort(kind="mergesort") + + # Compute the differences between pairs of successive values of u and v. + deltas = np.diff(all_values) + + # Get the respective positions of the values of u and v among the values of + # both distributions. + u_cdf_indices = u_values[u_sorter].searchsorted(all_values[:-1], "right") + v_cdf_indices = v_values[v_sorter].searchsorted(all_values[:-1], "right") + + # Calculate the CDFs of u and v using their weights, if specified. + if u_weights is None: + u_cdf = u_cdf_indices / u_values.size + else: + u_sorted_cumweights = np.concatenate(([0], np.cumsum(u_weights[u_sorter]))) + u_cdf = u_sorted_cumweights[u_cdf_indices] / u_sorted_cumweights[-1] + + if v_weights is None: + v_cdf = v_cdf_indices / v_values.size + else: + v_sorted_cumweights = np.concatenate(([0], np.cumsum(v_weights[v_sorter]))) + v_cdf = v_sorted_cumweights[v_cdf_indices] / v_sorted_cumweights[-1] + + # Compute the value of the integral based on the CDFs. + # If p = 1 or p = 2, we avoid using np.power, which introduces an overhead + # of about 15%. + if p == 1: + return np.sum(np.multiply(np.abs(u_cdf - v_cdf), deltas)) + if p == 2: + return np.sqrt(np.sum(np.multiply(np.square(u_cdf - v_cdf), deltas))) + return np.power( + np.sum(np.multiply(np.power(np.abs(u_cdf - v_cdf), p), deltas)), 1 / p + ) + + +U_Uw, V_Vw = join_wasserstein(U, V, U_weights, V_weights) + + +# %% CODE GRAVEYARD +# def setdiff(a, b): +# """ +# Find the rows in a which are not in b. + +# Source: modified from https://stackoverflow.com/a/11903368/13697228 +# See also: https://www.mathworks.com/help/matlab/ref/double.setdiff.html + +# Parameters +# ---------- +# a : 2D array +# Set of vectors. +# b : 2D array +# Set of vectors. + +# Returns +# ------- +# out : 2D array +# Set of vectors in a that are not in b. + +# """ +# a_rows = a.view([("", a.dtype)] * a.shape[1]) +# b_rows = b.view([("", b.dtype)] * b.shape[1]) +# out = np.setdiff1d(a_rows, b_rows).view(a.dtype).reshape(-1, a.shape[1]) +# return out + +# n_neighbors = 3 +# n_neighbors2 = 1 diff --git a/ElM2D/train-debug.csv b/ElM2D/train-debug.csv new file mode 100644 index 0000000..e4de4c6 --- /dev/null +++ b/ElM2D/train-debug.csv @@ -0,0 +1,99 @@ +cif_file,formula,target +Zr1_ICSD_164572.cif,Zr1,999 +K2Mg5Sn3_ICSD_421342.cif,K2Mg5Sn3,999 +C1In1La3_ICSD_80956.cif,C1In1La3,999 +Al3B2Ru4_ICSD_43844.cif,Al3B2Ru4,999 +Au1Ho1Pb1_ICSD_58487.cif,Au1Ho1Pb1,999 +F2Xe1_ICSD_260950.cif,F2Xe1,999 +Dy1S2_ICSD_630195.cif,Dy1S2,999 +Nb3Te4_ICSD_16606.cif,Nb3Te4,999 +Rb2Te1_ICSD_182743.cif,Rb2Te1,999 +Sb2Tb1_ICSD_651606.cif,Sb2Tb1,999 +Hf1Pd5_ICSD_168289.cif,Hf1Pd5,999 +O3Pb1Zr1_ICSD_262104.cif,O3Pb1Zr1,999 +Ag1Al1Se2_ICSD_28745.cif,Ag1Al1Se2,999 +Pd1Sb1Tb1_ICSD_648792.cif,Pd1Sb1Tb1,999 +Ga1La1Zn1_ICSD_634520.cif,Ga1La1Zn1,999 +Ag1As1S1_ICSD_604740.cif,Ag1As1S1,999 +Ca1Pd1_ICSD_58926.cif,Ca1Pd1,999 +Cd2Yb1_ICSD_620601.cif,Cd2Yb1,999 +K1Li1Se1_ICSD_290451.cif,K1Li1Se1,999 +Al2N1Nb3_ICSD_60644.cif,Al2N1Nb3,999 +Pt7Sb1_ICSD_54340.cif,Pt7Sb1,999 +Ag1Er1_ICSD_167893.cif,Ag1Er1,999 +Ca2O4Pb1_ICSD_36629.cif,Ca2O4Pb1,999 +Cl4K2Pd1_ICSD_23997.cif,Cl4K2Pd1,999 +P1Ru1Zr1_ICSD_648037.cif,P1Ru1Zr1,999 +As1Li1Mg1_ICSD_107954.cif,As1Li1Mg1,999 +Au1Ga1Zr1_ICSD_156264.cif,Au1Ga1Zr1,999 +In2Li1Rh1_ICSD_106806.cif,In2Li1Rh1,999 +Ca3Ge13Ir4_ICSD_619308.cif,Ca3Ge13Ir4,999 +Ba1Mg4Si3_ICSD_165630.cif,Ba1Mg4Si3,999 +Mo3P1_ICSD_43238.cif,Mo3P1,999 +Ni1Zr2_ICSD_102805.cif,Ni1Zr2,999 +As1Hf1Ru1_ICSD_610654.cif,As1Hf1Ru1,999 +Co2Ge1Zn1_ICSD_52994.cif,Co2Ge1Zn1,999 +N1Ni2W3_ICSD_86170.cif,N1Ni2W3,999 +B6K1_ICSD_166487.cif,B6K1,999 +Lu1Pb2_ICSD_104811.cif,Lu1Pb2,999 +Au1Dy1_ICSD_58439.cif,Au1Dy1,999 +Pt3Sn1_ICSD_105795.cif,Pt3Sn1,999 +Sn1Zr3_ICSD_106108.cif,Sn1Zr3,999 +Bi1_ICSD_51674.cif,Bi1,999 +In1P1Pd5_ICSD_640193.cif,In1P1Pd5,999 +In2O3_ICSD_14387.cif,In2O3,999 +C1Sc3Tl1_ICSD_50164.cif,C1Sc3Tl1,999 +Al1Pd1Y1_ICSD_156923.cif,Al1Pd1Y1,999 +Ho1In3_ICSD_104411.cif,Ho1In3,999 +Dy1Rh2Si2_ICSD_108427.cif,Dy1Rh2Si2,999 +Al2Y1_ICSD_186476.cif,Al2Y1,999 +Pu1Se1_ICSD_649958.cif,Pu1Se1,999 +Co1Y1_ICSD_20954.cif,Co1Y1,999 +Al4Er1Ni1_ICSD_57779.cif,Al4Er1Ni1,999 +Er5Si3_ICSD_57268.cif,Er5Si3,999 +As1Hf1_ICSD_42915.cif,As1Hf1,999 +Be2Re1_ICSD_58732.cif,Be2Re1,999 +Al1Ge1Sc1_ICSD_608009.cif,Al1Ge1Sc1,999 +C1Pb1Pd3_ICSD_108178.cif,C1Pb1Pd3,999 +Cu2In1Y1_ICSD_103023.cif,Cu2In1Y1,999 +Hf1Pt1_ICSD_104257.cif,Hf1Pt1,999 +As3La4_ICSD_610771.cif,As3La4,999 +Al16Hf6Pd7_ICSD_608137.cif,Al16Hf6Pd7,999 +Ba1Os2P2_ICSD_602106.cif,Ba1Os2P2,999 +B6Os1Y2_ICSD_603494.cif,B6Os1Y2,999 +Ho1In1_ICSD_104410.cif,Ho1In1,999 +Er1Ru2Si2_ICSD_106608.cif,Er1Ru2Si2,999 +Au1Be5_ICSD_150580.cif,Au1Be5,999 +Rh2Si2Y1_ICSD_602257.cif,Rh2Si2Y1,999 +Ce1Ga1Ni1_ICSD_102171.cif,Ce1Ga1Ni1,999 +Au1Ca1Cd1_ICSD_420574.cif,Au1Ca1Cd1,999 +Ga12Lu4Pd1_ICSD_634587.cif,Ga12Lu4Pd1,999 +O6Os2Rb1_ICSD_161127.cif,O6Os2Rb1,999 +Dy1Ga1Pt1_ICSD_629731.cif,Dy1Ga1Pt1,999 +Pt1Sb2_ICSD_43105.cif,Pt1Sb2,999 +La1Rh2_ICSD_104705.cif,La1Rh2,999 +Er1Ge2Rh2_ICSD_630646.cif,Er1Ge2Rh2,999 +F6K2Mn1_ICSD_47213.cif,F6K2Mn1,999 +Ho2Rh3Sn5_ICSD_639639.cif,Ho2Rh3Sn5,999 +Cl6Cs2Te1_ICSD_26704.cif,Cl6Cs2Te1,999 +Ni2Si2Zr1_ICSD_20317.cif,Ni2Si2Zr1,999 +P1Rh2_ICSD_38356.cif,P1Rh2,999 +Au1Pb4Rb3_ICSD_107447.cif,Au1Pb4Rb3,999 +N1Ru2Zr4_ICSD_644658.cif,N1Ru2Zr4,999 +Ca1O3Ti1_ICSD_153173.cif,Ca1O3Ti1,999 +As1Fe1Nb1_ICSD_610502.cif,As1Fe1Nb1,999 +Au1Cu4Tb1_ICSD_611769.cif,Au1Cu4Tb1,999 +Ir5Th1_ICSD_104573.cif,Ir5Th1,999 +P2Zn3_ICSD_24487.cif,P2Zn3,999 +In1Pd2_ICSD_417907.cif,In1Pd2,999 +As2Cu1Er1_ICSD_412553.cif,As2Cu1Er1,999 +Al1F3_ICSD_130021.cif,Al1F3,999 +C2Ir1_ICSD_181489.cif,C2Ir1,999 +Cu4O3_ICSD_100566.cif,Cu4O3,999 +Ir2S3Sn3_ICSD_640953.cif,Ir2S3Sn3,999 +Si136_ICSD_56721.cif,Si136,999 +Ce2Pd21Si6_ICSD_174079.cif,Ce2Pd21Si6,999 +O4Sb1V1_ICSD_84567.cif,O4Sb1V1,999 +In1Pt2Tb1_ICSD_640312.cif,In1Pt2Tb1,999 +Sr1Tl2_ICSD_106111.cif,Sr1Tl2,999 +Ca1O3Rh1_ICSD_164775.cif,Ca1O3Rh1,999