diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a180fa1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+build/**
+__pycache__/**
+ElM2D.egg-info/**
+ElM2D/.ipynb_checkpoints/**
+ElM2D/__pycache__/**
diff --git a/ElM2D/ElM2D.py b/ElM2D/ElM2D.py
index c608504..397c7ed 100644
--- a/ElM2D/ElM2D.py
+++ b/ElM2D/ElM2D.py
@@ -1,7 +1,5 @@
 """
-A class to construct an ElM2D plot of a list of inorganic compostions based on
-the Element Movers Distance Between These.
-
+Construct ElM2D plot of a list of inorganic compostions via Element Movers Distance.
 
 Copyright (C) 2021  Cameron Hargreaves
 
@@ -29,22 +27,21 @@
 Network simplex source modified to use numba from
 https://networkx.github.io/documentation/networkx-1.10/_modules/networkx/algorithms/flow/networksimplex.html#network_simplex
 
-Requies umap which may be installed via
-conda install -c conda-forge umap-learn
+Requires umap which may be installed via:
+    conda install -c conda-forge umap-learn
 """
-import re, json, csv
-
-from multiprocessing import Pool, cpu_count
+import os
+from operator import attrgetter
+from importlib import reload
+from numba import cuda
 
-from copy import deepcopy
-from collections import Counter
+from multiprocessing import cpu_count, freeze_support
 
 import numpy as np
 import pandas as pd
-import pickle as pk 
+import pickle as pk
 
 from scipy.spatial.distance import squareform
-from numba import njit 
 
 import umap
 
@@ -52,69 +49,130 @@
 import plotly.io as pio
 
 from tqdm import tqdm
-from tqdm.contrib.concurrent import process_map  
-# from matminer.datasets import load_dataset
+
+# from tqdm.contrib.concurrent import process_map
 
 from ElMD import ElMD, EMD
 
+from njit_dist_matrix import dist_matrix as njit_dist_matrix
+
+# number of columns of U and V must be set as env var before import
+n_elements = len(ElMD().periodic_tab["mod_petti"])
+os.environ["COLUMNS"] = str(n_elements)
+
+# other environment variables (set before importing cuda_dist_matrix)
+os.environ["USE_64"] = "0"
+os.environ["INLINE"] = "never"
+os.environ["FASTMATH"] = "1"
+# os.environ["TARGET"] = "cuda"
+
+import cuda_dist_matrix  # noqa
+
+# to overwrite env vars (source: https://stackoverflow.com/a/1254379/13697228) # noqa
+reload(cuda_dist_matrix)
+cuda_dist_matrix = cuda_dist_matrix.dist_matrix
+
+
 def main():
-    df = load_dataset("matbench_expt_gap").head(1001)
+    """
+    Perform a test of basic features such as intersection, sorting, and featurization.
+
+    Returns
+    -------
+    None.
 
+    """
+    df = pd.read_csv("train-debug.csv")
     df_1 = df.head(500)
     df_2 = df.tail(500)
     mapper = ElM2D(metric="mod_petti")
     mapper.intersect(df_1["composition"], df_2["composition"])
     sorted_comps = mapper.sort(df["composition"])
     sorted_comps, sorted_inds = mapper.sort(df["composition"], return_inds=True)
-    fts =  mapper.featurize()
+    mapper.featurize()
     print()
 
-class ElM2D():
-    '''
-    This class takes in a list of compound formula and creates the intercompound
-    distance matrix wrt EMD and a two dimensional embedding using either PCA or 
-    UMAP
-    '''
-    def __init__(self, formula_list=None,
-                       n_proc=None,
-                       n_components=2,
-                       verbose=True,
-                       metric="mod_petti",
-                       chunksize=1,
-                       umap_kwargs={}):
+
+class ElM2D:
+    """
+    Create intercompound EMD distance matrix and embedding via list of formulas.
+
+    Embedding types are:
+        - PCA
+        - UMAP
+    """
+
+    def __init__(
+        self,
+        formula_list=None,
+        n_proc=None,
+        n_components=2,
+        verbose=True,
+        metric="mod_petti",
+        chunksize=1,
+        umap_kwargs={},
+        emd_algorithm="wasserstein",
+        target=None,
+    ):
 
         self.verbose = verbose
 
         if n_proc is None:
             self.n_proc = cpu_count()
-        else: 
+        else:
             self.n_proc = n_proc
 
-        self.formula_list = formula_list # Input formulae
+        self.formula_list = formula_list  # Input formulae
 
         self.metric = metric
-        self.chunksize=chunksize
+        self.chunksize = chunksize
 
         self.umap_kwargs = umap_kwargs
 
         self.umap_kwargs["n_components"] = n_components
         self.umap_kwargs["metric"] = "precomputed"
 
-        self.input_mat = None    # Pettifor vector representation of formula
-        self.embedder = None     # For accessing UMAP object
-        self.embedding = None    # Stores the last embedded coordinates
-        self.dm = None           # Stores distance matrix
-
+        self.input_mat = None  # Pettifor vector representation of formula
+        self.embedder = None  # For accessing UMAP object
+        self.embedding = None  # Stores the last embedded coordinates
+        self.dm = None  # Stores distance matrix
+        self.emd_algorithm = emd_algorithm
+        self.target = target  # "cuda" or "cpu"
 
     def save(self, filepath):
-        # Save all variables except for the distance matrix
+        """
+        Save all variables except for the distance matrix.
+
+        Parameters
+        ----------
+        filepath : str
+            Filepath for which to save the pickle.
+
+        Returns
+        -------
+        None.
+
+        """
         save_dict = {k: v for k, v in self.__dict__.items()}
-        f_handle = open(filepath + ".pk", 'wb')
+        f_handle = open(filepath + ".pk", "wb")
         pk.dump(save_dict, f_handle)
         f_handle.close()
-        
+
     def load(self, filepath):
-        f_handle = open(filepath + ".pk", 'rb')
+        """
+        Load variables from pickle file.
+
+        Parameters
+        ----------
+        filepath : str
+            Filepath for which to load the pickle.
+
+        Returns
+        -------
+        None.
+
+        """
+        f_handle = open(filepath + ".pk", "rb")
         load_dict = pk.load(f_handle)
         f_handle.close()
 
@@ -122,31 +180,106 @@ def load(self, filepath):
             self.__dict__[k] = v
 
     def plot(self, fp=None, color=None, embedding=None):
+        """
+        Generate plots based on embedding dimension.
+
+        Parameters
+        ----------
+        fp : str, optional
+            Filepath for which to write plotly html. The default is None.
+        color : str, optional
+            Color to use for scatter points. The default is None.
+        embedding : 2D array, optional
+            From self.embedding if not specified. The default is None.
+
+        Returns
+        -------
+        fig : plotly figure
+            Handle to the plotly figure.
+
+        """
         if self.embedding is None:
-            print("No embedding in memory, call transform() first.")    
-            return 
+            print("No embedding in memory, call transform() first.")
+            return
 
         if embedding is None:
             embedding = self.embedding
 
+        x = embedding[:, 0]
+        y = embedding[:, 1]
         if embedding.shape[1] == 2:
             if color is None:
-                df = pd.DataFrame({"x": embedding[:, 0], "y": embedding[:, 1], "formula": self.formula_list})
-                fig = px.scatter(df, x="x", y="y", hover_name="formula", hover_data={"x":False, "y":False})
+                df = pd.DataFrame({"x": x, "y": y, "formula": self.formula_list})
+                fig = px.scatter(
+                    df,
+                    x="x",
+                    y="y",
+                    hover_name="formula",
+                    hover_data={"x": False, "y": False},
+                )
 
             else:
-                df = pd.DataFrame({"x": embedding[:, 0], "y": embedding[:, 1], "formula": self.formula_list, color.name: color.to_numpy()})
-                fig = px.scatter(df, x="x", y="y",color=color.name, hover_data={"formula": True, color.name: True, "x":False, "y":False})
+                df = pd.DataFrame(
+                    {
+                        "x": x,
+                        "y": y,
+                        "formula": self.formula_list,
+                        color.name: color.to_numpy(),
+                    }
+                )
+                fig = px.scatter(
+                    df,
+                    x="x",
+                    y="y",
+                    color=color.name,
+                    hover_data={
+                        "formula": True,
+                        color.name: True,
+                        "x": False,
+                        "y": False,
+                    },
+                )
 
         elif embedding.shape[1] == 3:
+            z = embedding[:, 2]
             if color is None:
-                df = pd.DataFrame({"x": embedding[:, 0], "y": embedding[:, 1], "z": embedding[:, 2], "formula": self.formula_list})
-                fig = px.scatter_3d(df, x="x", y="y", z="z", hover_name="formula", hover_data={"x":False, "y":False, "z":False})
+                df = pd.DataFrame(
+                    {"x": x, "y": y, "z": z, "formula": self.formula_list}
+                )
+                fig = px.scatter_3d(
+                    df,
+                    x="x",
+                    y="y",
+                    z="z",
+                    hover_name="formula",
+                    hover_data={"x": False, "y": False, "z": False},
+                )
 
             else:
-                df = pd.DataFrame({"x": embedding[:, 0], "y": embedding[:, 1], "z": embedding[:, 2], "formula": self.formula_list, color.name: color.to_numpy()})
-                fig = px.scatter_3d(df, x="x", y="y", z="z", color=color.name, hover_data={"formula": True, color.name: True, "x":False, "y":False, "z":False})
-        
+                df = pd.DataFrame(
+                    {
+                        "x": x,
+                        "y": y,
+                        "z": z,
+                        "formula": self.formula_list,
+                        color.name: color.to_numpy(),
+                    }
+                )
+                fig = px.scatter_3d(
+                    df,
+                    x="x",
+                    y="y",
+                    z="z",
+                    color=color.name,
+                    hover_data={
+                        "formula": True,
+                        color.name: True,
+                        "x": False,
+                        "y": False,
+                        "z": False,
+                    },
+                )
+
         elif embedding.shape[1] > 3:
             print("Too many dimensions to plot directly, using first three components")
             fig = self.plot(fp=fp, color=color, embedding=embedding[:, :3])
@@ -157,20 +290,31 @@ def plot(self, fp=None, color=None, embedding=None):
 
         return fig
 
-    def fit(self, X):
-        '''
+    def fit(self, X, target=None):
+        """
+        Construct and store an ElMD distance matrix.
+
         Take an input vector, either of a precomputed distance matrix, or
         an iterable of strings of composition formula, construct an ElMD distance
         matrix and store to self.dm.
 
-        Input
-        X - A list of compound formula strings, or a precomputed distance matrix
-        (ensure self.metric = "precomputed")
-        '''
+        Parameters
+        ----------
+        X : list of str OR 2D array
+            A list of compound formula strings, or a precomputed distance matrix. If
+            using a precomputed distance matrix, ensure self.metric == "precomputed"
+
+
+        Returns
+        -------
+        None.
+
+        """
         self.formula_list = X
         n = len(X)
 
-        if self.verbose: print(f"Fitting {self.metric} kernel matrix")
+        if self.verbose:
+            print(f"Fitting {self.metric} kernel matrix")
         if self.metric == "precomputed":
             self.dm = X
 
@@ -182,90 +326,100 @@ def fit(self, X):
                 x = ElMD(X[i], metric=self.metric)
                 for j in range(i + 1, n):
                     distances.append(x.elmd(X[j]))
-            
+
             dist_vec = np.array(distances)
             self.dm = squareform(dist_vec)
 
         else:
-            if self.verbose: print("Constructing distances")
-            dist_vec = self._process_list(X, n_proc=self.n_proc)
-            self.dm = squareform(dist_vec)
-
-    def fit_transform(self, X, y=None, how="UMAP", n_components=2):
+            if self.verbose:
+                print("Constructing distances")
+            if self.emd_algorithm == "network_simplex":
+                dist_vec = self._process_list(X, n_proc=self.n_proc)
+                self.dm = squareform(dist_vec)
+            elif self.emd_algorithm == "wasserstein":
+                self.dm = self.EM2D(X, X, target=target)
+
+    def fit_transform(self, X, y=None, how="UMAP", n_components=2, target=None):
         """
-        Successively call fit and transform
+        Successively call fit and transform.
+
+        Parameters
+        ----------
+        X : list of str
+            Compositions to embed.
+        y : 1D numerical array, optional
+            Target values to use for supervised UMAP embedding. The default is None.
+        how : str, optional
+            How to perform embedding ("UMAP" or "PCA"). The default is "UMAP".
+        n_components : int, optional
+            Number of dimensions to embed to. The default is 2.
+
+        Returns
+        -------
+        embedding : TYPE
+            DESCRIPTION.
 
-        Parameters:
-        X - List of compositions to embed 
-        how - "UMAP" or "PCA", the embedding technique to use
-        n_components - The number of dimensions to embed to
         """
-        self.fit(X)
-        embedding = self.transform(how=how, n_components=self.umap_kwargs["n_components"], y=y)
+        self.fit(X, target=target)
+        embedding = self.transform(
+            how=how, n_components=self.umap_kwargs["n_components"], y=y
+        )
         return embedding
 
     def transform(self, how="UMAP", n_components=2, y=None):
         """
-        Call the selected embedding method (UMAP or PCA) and embed to 
-        n_components dimensions.
+        Call the selected embedding method (UMAP or PCA) and embed.
+
+        Parameters
+        ----------
+        how : str, optional
+            How to perform embedding ("UMAP" or "PCA"). The default is "UMAP".
+            The default is "UMAP".
+        n_components : int, optional
+            Number of dimensions to embed to. The default is 2.
+        y : 1D numerical array, optional
+            Target values to use for supervised UMAP embedding. The default is None.
+
+        Returns
+        -------
+        2D array
+            UMAP or PCA embedding.
+
+        """
+        """
+
         """
         if self.dm is None:
             print("No distance matrix computed, run fit() first")
-            return 
+            return
 
+        n = self.umap_kwargs["n_components"]
         if how == "UMAP":
             if y is None:
-                if self.verbose: print(f"Constructing UMAP Embedding to {self.umap_kwargs['n_components']} dimensions")
+                if self.verbose:
+                    print(f"Constructing UMAP Embedding to {n} dimensions")
                 self.embedder = umap.UMAP(**self.umap_kwargs)
                 self.embedding = self.embedder.fit_transform(self.dm)
 
             else:
                 y = y.to_numpy(dtype=float)
-                if self.verbose: print(f"Constructing UMAP Embedding to {self.umap_kwargs['n_components']} dimensions, with a targetted embedding")
+                if self.verbose:
+                    print(
+                        f"Constructing UMAP Embedding to {n} dimensions, with \
+                            a targeted embedding"
+                    )
                 self.embedder = umap.UMAP(**self.umap_kwargs)
                 self.embedding = self.embedder.fit_transform(self.dm, y)
 
         elif how == "PCA":
-            if self.verbose: print(f"Constructing PCA Embedding to {self.umap_kwargs['n_components']} dimensions")
+            if self.verbose:
+                print(f"Constructing PCA Embedding to {n} dimensions")
             self.embedding = self.PCA(n_components=self.umap_kwargs["n_components"])
-            if self.verbose: print(f"Finished Embedding")
+            if self.verbose:
+                print("Finished Embedding")
 
         return self.embedding
 
-    def PCA(self, n_components=5):
-        """
-        Multidimensional Scaling - Given a matrix of interpoint distances,
-        find a set of low dimensional points that have similar interpoint
-        distances.
-        https://github.com/stober/mds/blob/master/src/mds.py
-        """
-
-        if self.dm is None:
-            raise Exception("No distance matrix computed, call fit_transform with a list of compositions, or load a saved matrix with load_dm()")
- 
-
-        (n,n) = self.dm.shape
-
-        if self.verbose: print(f"Constructing {n}x{n_components} Gram matrix")
-        E = (-0.5 * self.dm**2)
-
-        # Use this matrix to get column and row means
-        Er = np.mat(np.mean(E,1))
-        Es = np.mat(np.mean(E,0))
-
-        # From Principles of Multivariate Analysis: A User's Perspective (page 107).
-        F = np.array(E - np.transpose(Er) - Es + np.mean(E))
-
-        if self.verbose: print(f"Computing Eigen Decomposition")
-        [U, S, V] = np.linalg.svd(F)
-
-        Y = U * np.sqrt(S)
-
-        if self.verbose: print(f"PCA Projected Points Computed")
-        self.mds_points = Y
-
-        return Y[:, :n_components]
-
     def sort(self, formula_list=None):
         """
         Sorts compositions based on their ElMD similarity.
@@ -278,15 +432,18 @@ def sort(self, formula_list=None):
         sorted_indices = mapper.sort()
         sorted_comps = mapper.sorted_comps
         """
-
         if formula_list is None and self.formula_list is None:
-            raise Exception("Must input a list of compositions or fit a list of compositions first") # TODO Exceptions?
+            # TODO Exceptions?
+            raise Exception(
+                "Must input a list of compositions or fit a list of compositions first"
+            )
 
         elif formula_list is None:
             formula_list = self.formula_list
 
         elif self.formula_list is None:
-            formula_list = process_map(ElMD, formula_list, chunksize=self.chunksize)
+            E = ElMD
+            formula_list = map(E, formula_list, chunksize=self.chunksize)
             self.formula_list = formula_list
 
         sorted_comps = sorted(formula_list)
@@ -294,23 +451,23 @@ def sort(self, formula_list=None):
 
         return sorted_comps
 
-
-
     def cross_validate(self, y=None, X=None, k=5, shuffle=True, seed=42):
         """
-        Implementation of cross validation with K-Folds.
-        
-        Splits the formula_list into k equal sized partitions and returns five 
-        tuples of training and test sets. Returns a list of length k, each item 
-        containing 2 (4 with target data) numpy arrays of formulae of 
+        Cross validate with K-Folds.
+
+        Splits the formula_list into k equal sized partitions and returns five
+        tuples of training and test sets. Returns a list of length k, each item
+        containing 2 (4 with target data) numpy arrays of formulae of
         length n - n/k and n/k.
 
-        Parameters:
+        Parameters
+        ----------
             y=None: (optional) a numpy array of target properties to cross validate
             k=5: Number of k-folds
             shuffle=True: whether to shuffle the input formulae or not
 
-        Usage:
+        Usage
+        -----
             cvs = mapper.cross_validate()
             for i, (X_train, X_test) in enumerate(cvs):
                 sub_mapper = ElM2D()
@@ -326,29 +483,29 @@ def cross_validate(self, y=None, X=None, k=5, shuffle=True, seed=42):
                 errors.append(mae(y_pred, y_test))
             print(np.mean(errors))
         """
-        inds = np.arange(len(self.formula_list)) # TODO Exception
+        inds = np.arange(len(self.formula_list))  # TODO Exception
 
         if shuffle:
-            np.random.seed(seed) 
+            np.random.seed(seed)
             np.random.shuffle(inds)
 
         if X is None:
             formulas = self.formula_list.to_numpy(str)[inds]
-        
+
         else:
             formulas = X
-        
+
         splits = np.array_split(formulas, k)
 
         X_ret = []
-        
+
         for i in range(k):
             train_splits = np.delete(np.arange(k), i)
             X_train = splits[train_splits[0]]
 
             for index in train_splits[1:]:
                 X_train = np.concatenate((X_train, splits[index]))
-            
+
             X_test = splits[i]
             X_ret.append((X_train, X_test))
 
@@ -365,26 +522,40 @@ def cross_validate(self, y=None, X=None, k=5, shuffle=True, seed=42):
 
             for index in train_splits[1:]:
                 y_train = np.concatenate((y_train, y_splits[index]))
-            
+
             y_test = y_splits[i]
             y_ret.append((y_train, y_test))
 
         return [(X_ret[i][0], X_ret[i][1], y_ret[i][0], y_ret[i][1]) for i in range(k)]
 
     def _process_list(self, formula_list, n_proc):
-        '''
-        Given an iterable list of formulas in composition form
-        use multiple processes to convert these to pettifor ratio
-        vector form, and then calculate the distance between these
-        pairings, returning this as a condensed distance vector
-        '''
+        """
+        Process a list of formulas into a pairwise distance matrix.
+
+        Parameters
+        ----------
+        formula_list : list of str
+            Chemical formulas.
+        n_proc : int
+            number of processors (i.e. CPU cores).
+
+        Returns
+        -------
+        2D array
+            2D distance matrix.
+
+        Given an iterable list of formulas in composition form use multiple processes
+        to convert these to pettifor ratio vector form, and then calculate the
+        distance between these pairings, returning this as a condensed distance vector.
+        """
         pool_list = []
 
-
         n_elements = len(ElMD().periodic_tab[self.metric])
-        self.input_mat = np.ndarray(shape=(len(formula_list), n_elements), dtype=np.float64)
+        self.input_mat = np.ndarray(
+            shape=(len(formula_list), n_elements), dtype=np.float64
+        )
 
-        if self.verbose: 
+        if self.verbose:
             print("Parsing Formula")
             for i, formula in tqdm(list(enumerate(formula_list))):
                 self.input_mat[i] = ElMD(formula, metric=self.metric).ratio_vector
@@ -393,7 +564,7 @@ def _process_list(self, formula_list, n_proc):
                 self.input_mat[i] = ElMD(formula, metric=self.metric).ratio_vector
 
         # Create input pairings
-        if self.verbose: 
+        if self.verbose:
             print("Constructing joint compositional pairings")
             for i in tqdm(range(len(formula_list) - 1)):
                 sublist = [(i, j) for j in range(i + 1, len(formula_list))]
@@ -404,55 +575,222 @@ def _process_list(self, formula_list, n_proc):
                 pool_list.append(sublist)
 
         # Distribute amongst processes
-        if self.verbose: print("Creating Process Pool\nScattering compositions between processes and computing distances")
-        
-        scores = process_map(self._pool_ElMD, pool_list, chunksize=self.chunksize)
-        
-        if self.verbose: print("Distances computed closing processes")
+        if self.verbose:
+            print(
+                "Creating Process Pool\nScattering compositions between processes \
+                    and computing distances"
+            )
+
+        # scores = process_map(self._pool_ElMD, pool_list, chunksize=self.chunksize)
+        scores = map(self._pool_ElMD, pool_list)
 
-        if self.verbose: print("Flattening sublists")
+        if self.verbose:
+            print("Distances computed closing processes")
+
+        if self.verbose:
+            print("Flattening sublists")
         # Flattens list of lists to single list
-        distances = [dist for sublist in scores for dist in sublist]
-       
-        return np.array(distances, dtype=np.float64)
+        distances = np.array(
+            [dist for sublist in scores for dist in sublist], dtype=np.float64
+        )
+
+        return distances
+
+    def EM2D(self, formulas, formulas2=None, target=None):
+        """
+        Earth Mover's 2D distances. See also EMD.
+
+        Parameters
+        ----------
+        formulas : list of str
+            First list of formulas for which to compute distances. If only formulas
+            is specified, then a `pdist`-like array is returned, i.e. pairwise
+            distances within a single set.
+        formulas2 : list of str, optional
+                Second list of formulas, which if specified, causes `cdist`-like
+                behavior (i.e. pairwise distances between two sets).
+
+        Returns
+        -------
+        2D array
+            Pairwise distances.
+
+        """
+        isXY = formulas2 is None
+        E = ElMD()
+
+        def gen_ratio_vector(comp):
+            """Create a numpy array from a composition dictionary."""
+            if isinstance(comp, str):
+                comp = E._parse_formula(comp)
+                comp = E._normalise_composition(comp)
+
+            sorted_keys = sorted(comp.keys())
+            comp_labels = [E._get_position(k) for k in sorted_keys]
+            comp_ratios = [comp[k] for k in sorted_keys]
+
+            indices = np.array(comp_labels, dtype=np.int64)
+            ratios = np.array(comp_ratios, dtype=np.float64)
+
+            numeric = np.zeros(shape=len(E.periodic_tab[E.metric]), dtype=np.float64)
+            numeric[indices] = ratios
+
+            return numeric
+
+        def gen_ratio_vectors(comps):
+            return np.array([gen_ratio_vector(comp) for comp in comps])
+
+        U_weights = gen_ratio_vectors(formulas)
+        if isXY:
+            V_weights = gen_ratio_vectors(formulas2)
+
+        lookup, periodic_tab, metric = attrgetter("lookup", "periodic_tab", "metric")(E)
+        ptab_metric = periodic_tab[metric]
+
+        def get_mod_petti(x):
+            return [ptab_metric[lookup[a]] if b > 0 else 0 for a, b in enumerate(x)]
+
+        def get_mod_pettis(X):
+            return np.array([get_mod_petti(x) for x in X])
+
+        U = get_mod_pettis(U_weights)
+        if isXY:
+            V = get_mod_pettis(V_weights)
+
+        # decide whether to use cpu or cuda version
+        if target is None:
+            if self.target is None or not cuda.is_available():
+                target = "cpu"
+            elif self.target == "cuda" or cuda.is_available():
+                target = "cuda"
+
+        # if target == "cpu":
+        #     dist_matrix = njit_dist_matrix
+        # elif target == "cuda":
+        #     dist_matrix = cuda_dist_matrix
+
+        if isXY:
+            if target == "cpu":
+                distances = njit_dist_matrix(
+                    U,
+                    V=V,
+                    U_weights=U_weights,
+                    V_weights=V_weights,
+                    metric="wasserstein",
+                )
+            elif target == "cuda":
+                distances = cuda_dist_matrix(
+                    U,
+                    V=V,
+                    U_weights=U_weights,
+                    V_weights=V_weights,
+                    metric="wasserstein",
+                )
+        else:
+            if target == "cpu":
+                distances = njit_dist_matrix(
+                    U, U_weights=U_weights, metric="wasserstein"
+                )
+            elif target == "cuda":
+                distances = cuda_dist_matrix(
+                    U, U_weights=U_weights, metric="wasserstein"
+                )
+
+        # package
+        self.U = U
+        self.U_weights = U_weights
+
+        if isXY:
+            self.V = V
+            self.V_weights = V_weights
+
+        return distances
+
+    def PCA(self, n_components=5):
+        """
+        Perform multidimensional scaling (MDS) on a matrix of interpoint distances.
+
+        This finds a set of low dimensional points that have similar interpoint
+        distances.
+        Source: https://github.com/stober/mds/blob/master/src/mds.py
+        """
+        if self.dm == []:
+            raise Exception(
+                "No distance matrix computed, call fit_transform with a list of \
+                    compositions, or load a saved matrix with load_dm()"
+            )
+
+        (n, n) = self.dm.shape
+
+        if self.verbose:
+            print(f"Constructing {n}x{n_components} Gram matrix")
+        E = -0.5 * self.dm ** 2
+
+        # Use this matrix to get column and row means
+        Er = np.mat(np.mean(E, 1))
+        Es = np.mat(np.mean(E, 0))
+
+        # From Principles of Multivariate Analysis: A User's Perspective (page 107).
+        F = np.array(E - np.transpose(Er) - Es + np.mean(E))
+
+        if self.verbose:
+            print("Computing Eigen Decomposition")
+        [U, S, V] = np.linalg.svd(F)
+
+        Y = U * np.sqrt(S)
+
+        if self.verbose:
+            print("PCA Projected Points Computed")
+        self.mds_points = Y
+
+        return Y[:, :n_components]
 
     def _pool_ElMD(self, input_tuple):
-        '''
-        Uses multiprocessing module to call the numba compiled EMD function
-        '''
+        """Use multiprocessing module to call the numba compiled EMD function."""
         distances = np.ndarray(len(input_tuple))
         elmd_obj = ElMD(metric=self.metric)
-        
+
         for i, (input_1, input_2) in enumerate(input_tuple):
-            distances[i] = EMD(self.input_mat[input_1], 
-                               self.input_mat[input_2],
-                               elmd_obj.lookup,
-                               elmd_obj.periodic_tab[self.metric])
+            distances[i] = EMD(
+                self.input_mat[input_1],
+                self.input_mat[input_2],
+                elmd_obj.lookup,
+                elmd_obj.periodic_tab[self.metric],
+            )
 
         return distances
 
     def __repr__(self):
+        """Summary of ElM2D object: length, diversity, and max distance if dm exists."""
         if self.dm is not None:
-            return f"ElM2D(size={len(self.formula_list)},  chemical_diversity={np.mean(self.dm)} +/- {np.std(self.dm)}, maximal_distance={np.max(self.dm)})"
+            return f"ElM2D(size={len(self.formula_list)},  \
+                chemical_diversity={np.mean(self.dm)} +/- {np.std(self.dm)}, \
+                    maximal_distance={np.max(self.dm)})"
         else:
-            return f"ElM2D()"
+            return "ElM2D()"
 
     def export_dm(self, path):
+        """Export distance matrix as .csv to path."""
         np.savetxt(path, self.dm, delimiter=",")
-        
+
     def import_dm(self, path):
+        """Import distance matrix from .csv file located at path."""
         self.dm = np.loadtxt(path, delimiter=",")
 
     def export_embedding(self, path):
+        """Export embedding as .csv file to path."""
         np.savetxt(path, self.embedding, delimiter=",")
-        
+
     def import_embedding(self, path):
+        """Import embedding from .csv file located at path."""
         self.embedding = np.loadtxt(path, delimiter=",")
 
     def _pool_featurize(self, comp):
+        """Extract the feature vector for a given composition (comp)."""
         return ElMD(comp, metric=self.metric).feature_vector
 
     def featurize(self, formula_list=None, how="mean"):
+        """Featurize a list of formulas."""
         if formula_list is None and self.formula_list is None:
             raise Exception("You must enter a list of compositions first")
 
@@ -462,15 +800,25 @@ def featurize(self, formula_list=None, how="mean"):
         elif self.formula_list is None:
             self.formula_list = formula_list
 
-        elmd_obj = ElMD(metric=self.metric)
+        # elmd_obj = ElMD(metric=self.metric)
 
         # if type(elmd_obj.periodic_tab[self.metric]["H"]) is int:
-        #     vectors = np.ndarray((len(compositions), len(elmd_obj.periodic_tab[self.metric])))
+        #     vectors = np.ndarray(
+        #         (len(compositions), len(elmd_obj.periodic_tab[self.metric]))
+        #     )
         # else:
-        #     vectors = np.ndarray((len(compositions), len(elmd_obj.periodic_tab[self.metric]["H"])))
-
-        print(f"Constructing compositionally weighted {self.metric} feature vectors for each composition")
-        vectors = process_map(self._pool_featurize, formula_list, chunksize=self.chunksize)
+        #     vectors = np.ndarray(
+        #         (len(compositions), len(elmd_obj.periodic_tab[self.metric]["H"]))
+        #     )
+
+        print(
+            f"Constructing compositionally weighted {self.metric} feature vectors \
+                for each composition"
+        )
+        # vectors = process_map(
+        #     self._pool_featurize, formula_list, chunksize=self.chunksize
+        # )
+        vectors = map(self._pool_featurize, formula_list)
 
         print("Complete")
 
@@ -478,9 +826,11 @@ def featurize(self, formula_list=None, how="mean"):
 
     def intersect(self, y=None, X=None):
         """
-        Takes in a second formula list, y, and computes the intersectional distance 
+        Find intersectional distance matrix between two formula lists.
+
+        Takes in a second formula list, y, and computes the intersectional distance
         matrix between the two under the given metric. If a two formula lists
-        are given the intersection between the two is computed, returning a 
+        are given the intersection between the two is computed, returning a
         distance matrix of the form:
 
               X_0             X_1            X_2            ...
@@ -490,63 +840,78 @@ def intersect(self, y=None, X=None):
         ...
         """
         if X is None and y is None:
-            raise Exception("Must enter two lists of formula or fit a list of formula and enter an intersecting list of formula")
+            raise Exception(
+                "Must enter two lists of formula or fit a list of formula and enter \
+                    an intersecting list of formula"
+            )
         if X is None:
             X = self.formula_list
 
-        elmd = ElMD()
-        dm = []
-        process_pool = Pool(cpu_count())
+        # elmd = ElMD()
+        # dm = []
 
-        for comp_1 in tqdm(X):
-            ionic = process_pool.starmap(elmd.elmd, ((comp_1, comp_2) for comp_2 in y))
-            dm.append(ionic)
-        
-        distance_matrix = np.array(dm)
+        # with Pool(cpu_count()) as process_pool:
+        #     for comp_1 in tqdm(X):
+        #         ionic = process_pool.starmap(
+        #             elmd.elmd, ((comp_1, comp_2) for comp_2 in y)
+        #         )
+        #         dm.append(ionic)
 
-        return distance_matrix
+        # distance_matrix = np.array(dm)
+
+        distances = self.EM2D(X, formulas2=y)
+
+        return distances
 
         # Not working currently, might be faster...
         # intersection_dm = self._process_intersection(X, y, self.n_proc)
 
         # return intersection_dm
-        
 
     def _process_intersection(self, X, y, n_proc):
-        '''
-        Compute the intersection of two lists of compositions
-        '''
+        """Compute the intersection of two lists of compositions."""
         pool_list = []
 
         n_elements = len(ElMD().periodic_tab[self.metric])
         X_mat = np.ndarray(shape=(len(X), n_elements), dtype=np.float64)
         y_mat = np.ndarray(shape=(len(y), n_elements), dtype=np.float64)
 
-        if self.verbose: print("Parsing X Formula")
+        if self.verbose:
+            print("Parsing X Formula")
         for i, formula in tqdm(list(enumerate(X))):
             X_mat[i] = ElMD(formula, metric=self.metric).ratio_vector
 
-        if self.verbose: print("Parsing Y Formula")
+        if self.verbose:
+            print("Parsing Y Formula")
         for i, formula in tqdm(list(enumerate(y))):
             y_mat[i] = ElMD(formula, metric=self.metric).ratio_vector
 
         # Create input pairings
-        if self.verbose: print("Constructing joint compositional pairings")
+        if self.verbose:
+            print("Constructing joint compositional pairings")
         for y in tqdm(range(len(y_mat))):
             sublist = [(y, x) for x in range(len(X_mat))]
             pool_list.append(sublist)
-        
+
         # Distribute amongst processes
-        if self.verbose: print("Creating Process Pool\nScattering compositions between processes and computing distances")
-        distances = process_map(self._pool_ElMD, pool_list, chunksize=self.chunksize)
+        if self.verbose:
+            print(
+                "Creating Process Pool\nScattering compositions between processes \
+                    and computing distances"
+            )
+        # distances = process_map(self._pool_ElMD, pool_list, chunksize=self.chunksize)
+        distances = map(self._pool_ElMD, pool_list)
 
-        if self.verbose: print("Distances computed closing processes")
+        if self.verbose:
+            print("Distances computed closing processes")
 
         # if self.verbose: print("Flattening sublists")
         # Flattens list of lists to single list
         # distances = [dist for sublist in scores for dist in sublist]
-       
+
         return np.array(distances, dtype=np.float64)
 
+
 if __name__ == "__main__":
+    freeze_support()
     main()
diff --git a/ElM2D/benchmark_njit.py b/ElM2D/benchmark_njit.py
new file mode 100644
index 0000000..1a9ca97
--- /dev/null
+++ b/ElM2D/benchmark_njit.py
@@ -0,0 +1,55 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep  8 01:36:19 2021
+
+@author: sterg
+"""
+from scipy.stats import wasserstein_distance as scipy_wasserstein_distance
+from scipy.spatial.distance import cdist
+
+# %% example runs
+pairtest = np.array([(0, 1), (1, 2), (2, 3)])
+Utest = U[0:6]
+Vtest = V[0:6]
+Uwtest = U_weights[0:6]
+Vwtest = V_weights[0:6]
+
+one_set = dist_matrix(Utest, Uw=Uwtest, metric="wasserstein")
+print(one_set)
+
+two_set = dist_matrix(Utest, V=Vtest, Uw=Uwtest, Vw=Vwtest, metric="wasserstein")
+print(two_set)
+
+pairs = np.array([(0, 1), (1, 2), (2, 3)])
+
+one_set_sparse = dist_matrix(Utest, Uw=Uwtest, pairs=pairtest, metric="wasserstein")
+print(one_set_sparse)
+
+two_set_sparse = dist_matrix(
+    Utest, V=Vtest, Uw=Uwtest, Vw=Vwtest, pairs=pairtest, metric="wasserstein"
+)
+print(two_set_sparse)
+
+# %% timing of large distance matrices
+
+
+# for compilation purposes, maybe just once is necessary?
+dist_matrix(U, Uw=U_weights, metric="wasserstein")
+dist_matrix(Utest, V=Vtest, Uw=Uwtest, Vw=Vwtest, metric="wasserstein")
+dist_matrix(Utest, Uw=Uwtest, pairs=pairtest, metric="wasserstein")
+dist_matrix(Utest, V=Vtest, Uw=Uwtest, Vw=Vwtest, pairs=pairtest, metric="wasserstein")
+
+with Timer("cdist Euclidean"):
+    d = cdist(U, V)
+
+with Timer("two-set dist_matrix Euclidean"):
+    d = dist_matrix(U, V=V, Uw=U_weights, Vw=V_weights, metric="euclidean")
+
+with Timer("cdist SciPy Wasserstein"):
+    d = cdist(U_Uw, V_Vw, metric=scipy_wasserstein_distance)
+
+with Timer("cdist Wasserstein"):
+    d = cdist(U_Uw, V_Vw, metric=my_wasserstein_distance)
+
+with Timer("two-set dist_matrix Wasserstein"):
+    d = dist_matrix(U, V=V, Uw=U_weights, Vw=V_weights, metric="wasserstein")
diff --git a/ElM2D/cuda_dist_matrix.py b/ElM2D/cuda_dist_matrix.py
new file mode 100644
index 0000000..c2e07c9
--- /dev/null
+++ b/ElM2D/cuda_dist_matrix.py
@@ -0,0 +1,544 @@
+"""
+Compute a distance matrix for various cases.
+
+Cases:
+    within a single set of vectors (like pdist)
+    between two sets of vectors (like cdist)
+    between prespecified pairs (i.e. sparse) for either one or two sets of
+        vectors.
+
+Various distance metrics are available.
+"""
+import os
+import numpy as np
+from math import ceil
+from metrics import wasserstein_distance, euclidean_distance
+
+# CUDA Simulator not working..
+# os.environ["NUMBA_ENABLE_CUDASIM"] = "1"
+# os.environ["NUMBA_CUDA_DEBUGINFO"] = "1"
+
+from numba import cuda, jit  # noqa
+from numba.types import int32, float32, int64, float64  # noqa
+
+inline = os.environ.get("INLINE", "never")
+fastmath = bool(os.environ.get("FASTMATH", "1"))
+cols = os.environ.get("COLUMNS")  # 121 for ElM2D repo
+USE_64 = bool(os.environ.get("USE_64", "0"))
+target = os.environ.get("TARGET", "cuda")
+
+if USE_64 is None:
+    USE_64 = False
+if USE_64:
+    bits = 64
+    nb_float = float64
+    nb_int = int64
+    np_float = np.float64
+    np_int = np.int64
+else:
+    bits = 32
+    nb_float = float32
+    nb_int = int32
+    np_float = np.float32
+    np_int = np.int32
+
+if cols is not None:
+    cols = int(cols)
+    cols_plus_1 = cols + 1
+    tot_cols = cols * 2
+    tot_cols_minus_1 = tot_cols - 1
+else:
+    raise KeyError(
+        "For performance reasons and architecture constraints "
+        "the number of columns of U (which is the same as V) "
+        "must be defined as the environment variable, COLUMNS, "
+        'via e.g. `os.environ["COLUMNS"] = "100"`.'
+    )
+
+# override types
+if target == "cpu":
+    nb_float = np_float
+    nb_int = np_int
+
+# threads-per-block
+TPB = 16
+
+
+@jit(inline=inline)
+def compute_distance(u, v, u_weights, v_weights, metric_num):
+    """
+    Calculate weighted distance between two vectors, u and v.
+
+    Parameters
+    ----------
+    u : 1D array of float
+        First vector.
+    v : 1D array of float
+        Second vector.
+    u_weights : 1D array of float
+        Weights for u.
+    v_weights : 1D array of float
+        Weights for v.
+    metric_num : int
+        Which metric to use (0 == "euclidean", 1=="wasserstein").
+
+    Raises
+    ------
+    NotImplementedError
+        "Specified metric is mispelled or has not been implemented yet.
+        If not implemented, consider submitting a pull request."
+
+    Returns
+    -------
+    d : float
+        Weighted distance between u and v.
+
+    """
+    if metric_num == 0:
+        d = euclidean_distance(u, v)
+    elif metric_num == 1:
+        # assume u and v are presorted, and weights are sorted by u and v
+        d = wasserstein_distance(u, v, u_weights, v_weights, True, True, True)
+    else:
+        raise NotImplementedError(
+            "Specified metric is mispelled or has not been implemented yet. "
+            "If not implemented, consider submitting a pull request."
+        )
+    return d
+
+
+@cuda.jit(
+    "void(float{0}[:,:], float{0}[:,:], float{0}[:,:], float{0}[:,:], "
+    "int{0}[:,:], float{0}[:], int{0})".format(bits),
+    fastmath=fastmath,
+)
+def sparse_distance_matrix(U, V, U_weights, V_weights, pairs, out, metric_num):
+    """
+    Calculate sparse pairwise distances between two sets of vectors for pairs.
+
+    Parameters
+    ----------
+    mat : numeric cuda array
+        First set of vectors for which to compute a single pairwise distance.
+    mat2 : numeric cuda array
+        Second set of vectors for which to compute a single pairwise distance.
+    pairs : cuda array of 2-tuples
+        All pairs for which distances are to be computed.
+    out : numeric cuda array
+        The initialized array which will be populated with distances.
+
+    Raises
+    ------
+    ValueError
+        Both matrices should have the same number of columns.
+
+    Returns
+    -------
+    None.
+
+    """
+    k = cuda.grid(1)
+
+    npairs = pairs.shape[0]
+
+    if k < npairs:
+        pair = pairs[k]
+        # extract indices of the pair for which the distance will be computed
+        i, j = pair
+
+        u = U[i]
+        v = V[j]
+        uw = U_weights[i]
+        vw = V_weights[j]
+
+        out[k] = compute_distance(u, v, uw, vw, metric_num)
+
+
+@cuda.jit(
+    "void(float{0}[:,:], float{0}[:,:], float{0}[:,:], int{0})".format(bits),
+    fastmath=fastmath,
+)
+def one_set_distance_matrix(U, U_weights, out, metric_num):
+    """
+    Calculate pairwise distances within single set of vectors.
+
+    Parameters
+    ----------
+    U : 2D array of float
+        Vertically stacked vectors.
+    U_weights : 2D array of float
+        Vertically stacked weight vectors.
+    out : 2D array of float
+        Initialized matrix to populate with pairwise distances.
+    metric_num : int
+        Which metric to use (0 == "euclidean", 1=="wasserstein").
+
+    Returns
+    -------
+    None.
+
+    """
+    i, j = cuda.grid(2)
+
+    dm_rows = U.shape[0]
+    dm_cols = U.shape[0]
+
+    if i < j and i < dm_rows and j < dm_cols and i != j:
+        u = U[i]
+        v = U[j]
+        uw = U_weights[i]
+        vw = U_weights[j]
+        d = compute_distance(u, v, uw, vw, metric_num)
+        out[i, j] = d
+        out[j, i] = d
+
+
+# faster compilation *and* runtimes with explicit signature
+@cuda.jit(
+    "void(float{0}[:,:], float{0}[:,:], float{0}[:,:], float{0}[:,:], "
+    "float{0}[:,:], int{0})".format(bits),
+    fastmath=fastmath,
+)
+def two_set_distance_matrix(U, V, U_weights, V_weights, out, metric_num):
+    """
+    Calculate pairwise distances between two sets of vectors.
+
+    Parameters
+    ----------
+    U : 2D array of float
+        Vertically stacked vectors.
+    V : 2D array of float
+        Vertically stacked vectors.
+    U_weights : 2D array of float
+        Vertically stacked weight vectors.
+    V_weights : 2D array of float
+        Vertically stacked weight vectors.
+    out : 2D array of float
+        Pairwise distance matrix between two sets of vectors.
+    metric_num : int
+        Distance metric number {"euclidean": 0, "wasserstein": 1}.
+
+    Returns
+    -------
+    None.
+
+    """
+    i, j = cuda.grid(2)
+
+    # distance matrix shape
+    dm_rows = U.shape[0]
+    dm_cols = V.shape[0]
+
+    if i < dm_rows and j < dm_cols:
+        u = U[i]
+        v = V[j]
+        uw = U_weights[i]
+        vw = V_weights[j]
+        d = compute_distance(u, v, uw, vw, metric_num)
+        out[i, j] = d
+
+
+def dist_matrix(
+    U, V=None, U_weights=None, V_weights=None, pairs=None, metric="euclidean"
+):  # noqa
+    """
+    Compute distance matrices using Numba/CUDA.
+
+    Parameters
+    ----------
+    mat : array
+        First set of vectors for which to compute pairwise distances.
+
+    mat2 : array, optional
+        Second set of vectors for which to compute pairwise distances.
+        If not specified, then mat2 is a copy of mat.
+
+    pairs : array, optional
+        List of 2-tuples which contain the indices for which to compute
+        distances. If mat2 was specified, then the second index accesses
+        mat2 instead of mat. If not specified, then the pairs are
+        auto-generated. If mat2 was specified, all combinations of the two
+        vector sets are used. If mat2 isn't specified, then only the upper
+        triangle (minus diagonal) pairs are computed.
+
+    metric : str, optional
+        Possible options are 'euclidean', 'wasserstein'.
+        Defaults to Euclidean distance. These are converted to integers
+        internally due to Numba's lack of support for string arguments
+        (2021-08-14). See compute_distance() for other keys.
+
+        For example:
+            0 - 'euclidean'
+            1 - 'wasserstein'
+    target : str, optional
+        Which target to use: "cuda" or "cpu". Default is "cuda".
+    inline : str, optional
+        Whether to inline functions: "always" or "never". Default is "never".
+    fastmath : bool, optional
+        Whether to use fastmath or not. The default is True.
+    USE_64 : bool, optional
+        Whether to use 64 bit or 34 bit types. The default is False.
+
+    Raises
+    ------
+    ValueError
+        DESCRIPTION.
+    NotImplementedError
+        DESCRIPTION.
+
+    Returns
+    -------
+    out : array
+        A pairwise distance matrix, or if pairs are specified, then a
+        vector of distances corresponding to the pairs.
+
+    """
+    rows, cols_check = U.shape
+
+    if cols_check != cols:
+        raise KeyError(
+            "For performance reasons and architecture constraints "
+            "the number of columns of U (which is the same as V) "
+            "must be defined as the environment variable: COLUMNS. "
+            'However, os.environ["COLUMNS"] does not match the '
+            "number of columns in U. Reset the environment variable "
+            'via e.g. `os.environ["COLUMNS"] = str(U.shape[0])` '
+            "defined at the top of your script and make sure that "
+            "dist_matrix is reloaded. You may also need to restart "
+            "Python."
+        )
+
+    if V is not None and cols is not V.shape[1]:
+        raise ValueError("The number of columns for U and V should match")
+
+    # is it a distance matrix between two sets of vectors?
+    # (rather than within a single set)
+    isXY = V is not None
+
+    # were pairs specified? (useful for sparse matrix generation)
+    pairQ = pairs is not None
+
+    # block, grid, and out shape
+    if pairQ:
+        block_dim = TPB
+        npairs = pairs.shape[0]
+        grid_dim = ceil(npairs / block_dim)
+    else:
+        block_dim = (TPB, TPB)
+        blockspergrid_x = ceil(rows / block_dim[0])
+        if isXY:
+            blockspergrid_y = ceil(V.shape[0] / block_dim[1])
+        else:
+            blockspergrid_y = ceil(rows / block_dim[1])
+        grid_dim = (blockspergrid_x, blockspergrid_y)
+
+    # CUDA setup
+    stream = cuda.stream()
+
+    # sorting and cumulative weights
+    if metric == "wasserstein":
+        # presort values (and weights by sorted value indices)
+        U_sorter = np.argsort(U)
+        U = np.take_along_axis(U, U_sorter, axis=-1)
+        U_weights = np.take_along_axis(U_weights, U_sorter, axis=-1)
+
+        # calculate cumulative weights
+        U_weights = np.cumsum(U_weights, axis=1)
+
+        # prepend a column of zeros
+        zero = np.zeros((U_weights.shape[0], 1))
+        U_weights = np.column_stack((zero, U_weights))
+
+        # do the same for V and V_weights
+        if isXY:
+            V_sorter = np.argsort(V)
+            V = np.take_along_axis(V, V_sorter, axis=-1)
+            V_weights = np.take_along_axis(V_weights, V_sorter, axis=-1)
+            V_weights = np.cumsum(V_weights, axis=1)
+            V_weights = np.column_stack((zero, V_weights))
+
+    # assign dummy arrays
+    if V is None:
+        V = np.zeros((2, 2))
+
+    if U_weights is None:
+        U_weights = np.zeros((2, 2))
+
+    if V_weights is None:
+        V_weights = np.zeros((2, 2))
+
+    if pairs is None:
+        pairs = np.zeros((2, 2))
+
+    # assign metric_num based on specified metric (no string support)
+    metric_dict = {"euclidean": 0, "wasserstein": 1}
+    metric_num = metric_dict[metric]
+
+    # # same for target_num
+    # target_dict = {"cuda": 0, "cpu": 1}
+    # target_num = target_dict[target]
+
+    m = U.shape[0]
+
+    if isXY:
+        m2 = V.shape[0]
+    else:
+        m2 = m
+
+    if pairQ:
+        shape = (npairs,)
+    else:
+        shape = (m, m2)
+
+    # values should be initialized instead of using cuda.device_array
+    out = np.zeros(shape)
+
+    if target == "cuda":
+        # copying/allocating to GPU
+        cU = cuda.to_device(np.asarray(U, dtype=np_float), stream=stream)
+
+        cU_weights = cuda.to_device(
+            np.asarray(U_weights, dtype=np_float), stream=stream
+        )
+
+        if isXY or pairQ:
+            cV = cuda.to_device(np.asarray(V, dtype=np_float), stream=stream)
+            cV_weights = cuda.to_device(
+                np.asarray(V_weights, dtype=np_float), stream=stream
+            )
+
+        if pairQ:
+            cpairs = cuda.to_device(np.asarray(pairs, dtype=np_int), stream=stream)
+
+        cuda_out = cuda.to_device(np.asarray(out, dtype=np_float), stream=stream)
+
+    elif target == "cpu":
+        cU = U
+        cV = V
+        cU_weights = U_weights
+        cV_weights = V_weights
+        cpairs = pairs
+        cuda_out = out
+
+    # distance matrix between two sets of vectors
+    if isXY and not pairQ:
+        fn = two_set_distance_matrix
+        if target == "cuda":
+            fn = fn[grid_dim, block_dim]
+        fn(cU, cV, cU_weights, cV_weights, cuda_out, metric_num)
+
+    # specified pairwise distances within single set of vectors
+    elif not isXY and pairQ:
+        fn = sparse_distance_matrix
+        if target == "cuda":
+            fn = fn[grid_dim, block_dim]
+        fn(cU, cU, cU_weights, cU_weights, cpairs, cuda_out, metric_num)
+
+    # distance matrix within single set of vectors
+    elif not isXY and not pairQ:
+        fn = one_set_distance_matrix
+        if target == "cuda":
+            fn = fn[grid_dim, block_dim]
+        fn(cU, cU_weights, cuda_out, metric_num)
+
+    # specified pairwise distances within single set of vectors
+    elif isXY and pairQ:
+        fn = sparse_distance_matrix
+        if target == "cuda":
+            fn = fn[grid_dim, block_dim]
+        fn(cU, cV, cU_weights, cV_weights, cpairs, cuda_out, metric_num)
+
+    out = cuda_out.copy_to_host(stream=stream)
+
+    return out
+
+
+# %% Code Graveyard
+
+# u_stack = cuda.local.array(cols, nb_int)
+# v_stack = cuda.local.array(cols, nb_int)
+
+# copy(u_sorted, utmp)
+# copy(v_sorted, vtmp)
+
+# copy(u_weights, u_sorted_weights)
+# copy(v_weights, v_sorted_weights)
+
+# stack = [0] * size
+# ids = list(range(len(arr)))
+
+# u_sorted_weights = cuda.local.array(cols, nb_float)
+# v_sorted_weights = cuda.local.array(cols, nb_float)
+
+# sorting stack
+# uv_stack = cuda.local.array(tot_cols, nb_int)
+
+# def sort_cum_prepend(X, X_weights):
+#     # presort values (and weights by sorted value indices)
+#     X_sorter = np.argsort(X)
+#     X = np.take_along_axis(X, X_sorter, axis=-1)
+#     X_weights = np.take_along_axis(X_weights, X_sorter, axis=-1)
+#     # calculate cumulative weights
+#     X_weights = np.cumsum(X_weights)
+#     # prepend a column of zeros
+#     zero = np.zeros((X_weights.shape[0], 1))
+#     X_weights = np.column_stack((zero, X_weights))
+
+# def mean_squared_error(y, y_pred, squared=False):
+#     """
+#     Return mean squared error (MSE) without using sklearn.
+
+#     If squared == True, then return root mean squared error (RMSE).
+
+#     Parameters
+#     ----------
+#     y : 1D numeric array
+#         "True" or first set of values.
+#     y_pred : 1D numeric array
+#         "Predicted" or second set of values.
+
+#     Returns
+#     -------
+#     rmse : numeric scalar
+#         DESCRIPTION.
+
+#     """
+#     mse = np.mean((y - y_pred)**2)
+#     if squared is True:
+#         rmse = np.sqrt(mse)
+#         return rmse
+#     else:
+#         return mse
+#         return mse
+
+# opt = False
+# os.environ["NUMBA_CUDA_DEBUGINFO"] = "0"
+# opt = True
+# os.environ["NUMBA_BOUNDSCHECK"] = "0"
+# os.environ["OPT"] = "0"
+
+
+# HACK
+# a "hacky" way to get compatibility between @njit and @cuda.jit
+# myjit = get_myjit(target=target, inline=inline)
+# mycudajit = get_myjit(device=False, target=target, inline=inline)
+
+# local array shapes needs to be defined globally due to lack of dynamic
+# array allocation support. Don't wrap with np.int32, etc. see
+# https://github.com/numba/numba/issues/7314
+
+# np.savetxt('100-zeros.csv',
+#           np.zeros(tmp, dtype=np.int32),
+#           delimiter=",")
+# OK to take cols from .shape
+# cols = np.genfromtxt('100-zeros.csv',
+#                      dtype=np.int32,
+#                      delimiter=",").shape[0]
+
+
+# HACK from numba.cuda.kernels.device.myjit import get_myjit
+# from numba.cuda.cudadrv.driver import driver
+# TPB = driver.get_device().MAX_THREADS_PER_BLOCK # max TPB causes crash..
+
+
+# os.environ["MACHINE_BITS"] = str(bits)
diff --git a/ElM2D/helper.py b/ElM2D/helper.py
new file mode 100644
index 0000000..3ff059b
--- /dev/null
+++ b/ElM2D/helper.py
@@ -0,0 +1,355 @@
+"""Helper functions for distance matrix computations."""
+import os
+from math import sqrt
+from numba import jit
+from time import time
+
+inline = os.environ.get("INLINE", "never")
+
+
+class Timer(object):
+    """
+    Simple timer class.
+
+    https://stackoverflow.com/a/5849861/13697228
+    Usage
+    -----
+    with Timer("description"):
+        # do stuff
+    """
+
+    def __init__(self, name=None):
+        self.name = name
+
+    def __enter__(self):
+        """Enter the timer."""
+        self.tstart = time()
+
+    def __exit__(self, type, value, traceback):
+        """Exit the timer."""
+        if self.name:
+            print("[%s]" % self.name,)
+        print(("Elapsed: {}\n").format(round((time() - self.tstart), 5)))
+
+
+@jit(inline=inline)
+def copy(a, out):
+    """
+    Copy a into out.
+
+    Parameters
+    ----------
+    a : vector
+        values to copy.
+    out : vector
+        values to copy into.
+
+    Returns
+    -------
+    None.
+
+    """
+    for i in range(len(a)):
+        i = int(i)
+        out[i] = a[i]
+
+
+@jit(inline=inline)
+def insertionSort(arr):
+    """
+    Perform insertion sorting on a vector.
+
+    Source: https://www.geeksforgeeks.org/insertion-sort/
+
+    Parameters
+    ----------
+    arr : 1D array
+        Array to be sorted in-place.
+
+    Returns
+    -------
+    None.
+
+    """
+    for i in range(1, len(arr)):
+        key = arr[i]
+
+        # Move elements of arr[0..i-1], that are greater than key, to one
+        # position ahead of their current position
+        j = i - 1
+        while j >= 0 and key < arr[j]:
+            arr[j + 1] = arr[j]
+            j -= 1
+        arr[j + 1] = key
+
+
+@jit(inline=inline)
+def insertionArgSort(arr, ids):
+    """
+    Perform insertion sorting on a vector and track the sorting indices.
+
+    Source: https://www.geeksforgeeks.org/insertion-sort/
+
+    Parameters
+    ----------
+    arr : 1D array
+        Array to be sorted in-place.
+
+    ids : 1D array
+        Initialized array to fill with sorting indices.
+
+    Returns
+    -------
+    None.
+
+    """
+    # fill ids
+    for i in range(len(arr)):
+        ids[i] = i
+
+    for i in range(1, len(arr)):
+        key = arr[i]
+        id_key = ids[i]
+
+        # Move elements of arr[0..i-1], that are greater than key, to one
+        # position ahead of their current position
+        j = i - 1
+        while j >= 0 and key < arr[j]:
+            arr[j + 1] = arr[j]
+            ids[j + 1] = ids[j]
+            j -= 1
+        arr[j + 1] = key
+        ids[j + 1] = id_key
+
+
+@jit(inline=inline)
+def concatenate(vec, vec2, out):
+    """
+    Concatenate two vectors.
+
+    It also reassigns the values of out back into vec and vec2 to prevent odd
+    roundoff issues.
+
+    Parameters
+    ----------
+    vec : vector
+        First vector to concatenate.
+    vec2 : vector
+        Second vector to concatenate.
+    out : vector
+        Initialization of concatenated vector.
+
+    Returns
+    -------
+    None.
+
+    """
+    # number of elements
+    n = len(vec)
+    n2 = len(vec2)
+    # populate out
+    for i in range(n):
+        out[i] = vec[i]
+    for i in range(n2):
+        out[i + n] = vec2[i]
+
+
+@jit(inline=inline)
+def diff(vec, out):
+    """
+    Compute gradient.
+
+    Parameters
+    ----------
+    vec : vector of floats
+        The vector for which to calculate the gradient.
+    out : vector of floats
+        Initialization of output vector (one less element than vec) which
+        contains the gradient.
+
+    Returns
+    -------
+    None.
+
+    """
+    # number of elements
+    n = len(vec) - 1
+    # progressively add next elements
+    for i in range(n):
+        out[i] = vec[i + 1] - vec[i]
+
+# maybe there's a faster implementation somewhere
+
+
+@jit(inline=inline)
+def bisect_right(a, v, ids):
+    """
+    Return indices where to insert items in v in list a, assuming a is sorted.
+
+    Parameters
+    ----------
+    arr : 1D array
+        Array for which to perform a binary search.
+    v : 1D array
+        Values to perform the binary search for.
+    ids : 1D array
+        Initialized list of indices, same size as v.
+
+    Returns
+    -------
+    None.
+
+    The return value i is such that all e in a[:i] have e <= x, and all e in
+    a[i:] have e > x.  So if x already appears in the list, a.insert(x) will
+    insert just after the rightmost x already there. Optional args lo
+    (default 0) and hi (default len(a)) bound the slice of a to be searched.
+
+    Source: modified from
+    https://github.com/python/cpython/blob/43bab0537ceb6e2ca3597f8f3a3c79733b897434/Lib/bisect.py#L15-L35 # noqa
+    """
+    n = len(a)
+    n2 = len(v)
+    for i in range(n2):
+        lo = 0
+        mid = 0
+        hi = n
+        x = v[i]
+        while lo < hi:
+            mid = (lo + hi) // 2
+            # Use __lt__ to match the logic in list.sort() and in heapq
+            if x < a[mid]:
+                hi = mid
+            else:
+                lo = mid + 1
+        ids[i] = lo
+
+
+@jit(inline=inline)
+def sort_by_indices(v, ids, out):
+    """
+    Sort v by ids and assign to out, as in out = v[ids].
+
+    Parameters
+    ----------
+    v : vector
+        values to sort.
+    ids : vector
+        indices to use for sorting.
+    out : vector
+        preallocated vector to put sorted values into.
+
+    Returns
+    -------
+    None.
+
+    """
+    for i in range(len(ids)):
+        idx = ids[i]
+        out[i] = v[idx]
+
+
+@jit(inline=inline)
+def cumsum(vec, out):
+    """
+    Return the cumulative sum of the elements in a vector.
+
+    Referenced https://stackoverflow.com/a/15889203/13697228
+
+    Parameters
+    ----------
+    vec : 1D array
+        Vector for which to compute the cumulative sum.
+
+    out: 1D array
+        Initialized vector which stores the cumulative sum.
+
+    Returns
+    -------
+    out : 1D array
+        The cumulative sum of elements in vec (same shape as vec).
+
+    """
+    # number of elements
+    n = len(vec)
+    # initialize
+    total = 0.0
+    # progressively add next elements
+    for i in range(n):
+        total += vec[i]
+        out[i] = total
+
+
+@jit(inline=inline)
+def divide(v, b, out):
+    """
+    Divide a vector by a scalar.
+
+    Parameters
+    ----------
+    v : vector
+        vector numerator.
+    b : float
+        scalar denominator.
+    out : vector
+        initialized vector to populate with divided values.
+
+    Returns
+    -------
+    None.
+
+    """
+    for i in range(len(v)):
+        out[i] = v[i] / b
+
+
+@jit(inline=inline)
+def integrate(u_cdf, v_cdf, deltas, p):
+    """
+    Integrate between two vectors using a p-norm.
+
+    Parameters
+    ----------
+    u_cdf : numeric vector
+        First CDF.
+    v_cdf : numeric vector
+        Second CDF of same length as a.
+    deltas : numeric vector
+        The differences between the original vectors a and b.
+    p : int, optional
+        The finite power for which to compute the p-norm. The default is 2.
+
+    If p == 1 or p == 2, use of power is avoided, which introduces an overhead
+    # of about 15%
+    Source: https://github.com/scipy/scipy/blob/47bb6febaa10658c72962b9615d5d5aa2513fa3a/scipy/stats/stats.py#L8404-L8488 # noqa
+    However, without support for math.square or np.square (2021-08-19),
+    math.pow is required for p == 2.
+
+    Returns
+    -------
+    out : numeric scalar
+        The p-norm of a and b.
+
+    """
+    n = len(u_cdf)
+    out = 0.0
+    if p == 1:
+        for i in range(n):
+            out += abs(u_cdf[i] - v_cdf[i]) * deltas[i]
+    elif p == 2:
+        for i in range(n):
+            out += ((u_cdf[i] - v_cdf[i]) ** p) * deltas[i]
+            out = sqrt(out)
+    elif p > 2:
+        for i in range(n):
+            out += (u_cdf[i] - v_cdf[i]) ** p * deltas[i]
+            out = out ** (1 / p)
+    return out
+
+
+# %% CODE GRAVEYARD
+# from numba.core import config
+# config_keys = dir(config)
+# if "INLINE" in config_keys:
+#     inline = config.INLINE
+# else:
+#     inline = "never"
diff --git a/ElM2D/metrics.py b/ElM2D/metrics.py
new file mode 100644
index 0000000..ef945e9
--- /dev/null
+++ b/ElM2D/metrics.py
@@ -0,0 +1,291 @@
+"""
+Numba/CUDA-compatible distance metrics.
+
+Created on Wed Sep  8 14:47:43 2021
+
+@author: sterg
+"""
+import os
+import numpy as np
+import helper as hp
+from math import sqrt
+from numba import cuda, jit  # noqa
+from numba.types import int32, float32, int64, float64  # noqa
+
+inline = os.environ.get("INLINE", "never")
+fastmath = bool(os.environ.get("FASTMATH", "1"))
+cols = os.environ.get("COLUMNS")  # 121 for ElM2D repo
+USE_64 = bool(os.environ.get("USE_64", "0"))
+target = os.environ.get("TARGET", "cuda")
+
+
+if USE_64 is None:
+    USE_64 = False
+if USE_64:
+    bits = 64
+    nb_float = float64
+    nb_int = int64
+    np_float = np.float64
+    np_int = np.int64
+else:
+    bits = 32
+    nb_float = float32
+    nb_int = int32
+    np_float = np.float32
+    np_int = np.int32
+
+if cols is not None:
+    cols = int(cols)
+    cols_plus_1 = cols + 1
+    tot_cols = cols * 2
+    tot_cols_minus_1 = tot_cols - 1
+else:
+    raise KeyError(
+        "For performance reasons and architecture constraints "
+        "the number of columns of U (which is the same as V) "
+        "must be defined as the environment variable, COLUMNS, "
+        'via e.g. `os.environ["COLUMNS"] = "100"`.'
+    )
+
+
+@jit(inline=inline)
+def cdf_distance(
+    u, v, u_weights, v_weights, p, presorted, cumweighted, prepended
+):  # noqa
+    r""" # noqa
+    Compute distance between two 1D distributions :math:`u` and :math:`v`.
+
+    The respective CDFs are :math:`U` and :math:`V`, and the
+    statistical distance is defined as:
+    .. math::
+        l_p(u, v) = \left( \int_{-\infty}^{+\infty} |U-V|^p \right)^{1/p}
+    p is a positive parameter; p = 1 gives the Wasserstein distance,
+    p = 2 gives the energy distance.
+
+    Parameters
+    ----------
+    u_values, v_values : array_like
+        Values observed in the (empirical) distribution.
+    u_weights, v_weights : array_like
+        Weight for each value.
+        `u_weights` (resp. `v_weights`) must have the same length as
+        `u_values` (resp. `v_values`). If the weight sum differs from
+        1, it must still be positive and finite so that the weights can
+        be normalized to sum to 1.
+    p : scalar
+        positive parameter that determines the type of cdf distance.
+    presorted : bool
+        Whether u and v have been sorted already *and* u_weights and
+        v_weights have been sorted using the same indices used to sort
+        u and v, respectively.
+    cumweighted : bool
+        Whether u_weights and v_weights have been converted to their
+        cumulative weights via e.g. np.cumsum().
+    prepended : bool
+        Whether a zero has been prepended to accumated, sorted
+        u_weights and v_weights.
+
+    By setting presorted, cumweighted, *and* prepended to False, the
+    computationproceeds proceeds in the same fashion as _cdf_distance
+    from scipy.stats.
+
+    Returns
+    -------
+    distance : float
+        The computed distance between the distributions.
+
+    Notes
+    -----
+    The input distributions can be empirical, therefore coming from
+    samples whose values are effectively inputs of the function, or
+    they can be seen as generalized functions, in which case they are
+    weighted sums of Dirac delta functions located at the specified
+    values.
+
+    References
+    ----------
+    .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan,
+            Hoyer, Munos "The Cramer Distance as a Solution to Biased
+            Wasserstein Gradients" (2017). :arXiv:`1705.10743`.
+    """
+    # allocate local float arrays
+    # combined vector
+    uv = cuda.local.array(tot_cols, nb_float)
+    uv_deltas = cuda.local.array(tot_cols_minus_1, nb_float)
+
+    # CDFs
+    u_cdf = cuda.local.array(tot_cols_minus_1, nb_float)
+    v_cdf = cuda.local.array(tot_cols_minus_1, nb_float)
+
+    # allocate local int arrays
+    # CDF indices via binary search
+    u_cdf_indices = cuda.local.array(tot_cols_minus_1, nb_int)
+    v_cdf_indices = cuda.local.array(tot_cols_minus_1, nb_int)
+
+    u_cdf_sorted_cumweights = cuda.local.array(tot_cols_minus_1, nb_float)
+    v_cdf_sorted_cumweights = cuda.local.array(tot_cols_minus_1, nb_float)
+
+    # short-circuit
+    if presorted and cumweighted and prepended:
+        u_sorted = u
+        v_sorted = v
+
+        u_0_cumweights = u_weights
+        v_0_cumweights = v_weights
+
+    # sorting, accumulating, and prepending (for compatibility)
+    else:
+        # check arguments
+        if not presorted and (cumweighted or prepended):
+            raise ValueError(
+                "if cumweighted or prepended are True, then presorted cannot be False"
+            )  # noqa
+
+        if (not presorted or not cumweighted) and prepended:
+            raise ValueError(
+                "if prepended is True, then presorted and cumweighted must both be True"
+            )  # noqa
+
+        # sorting
+        if not presorted:
+            # local arrays
+            u_sorted = cuda.local.array(cols, np_float)
+            v_sorted = cuda.local.array(cols, np_float)
+
+            u_sorter = cuda.local.array(cols, nb_int)
+            v_sorter = cuda.local.array(cols, nb_int)
+
+            u_sorted_weights = cuda.local.array(cols, nb_float)
+            v_sorted_weights = cuda.local.array(cols, nb_float)
+
+            # local copy since quickArgSortIterative sorts in-place
+            hp.copy(u, u_sorted)
+            hp.copy(v, v_sorted)
+
+            # sorting
+            hp.insertionArgSort(u_sorted, u_sorter)
+            hp.insertionArgSort(v_sorted, v_sorter)
+
+            # inplace to avoid extra cuda local array
+            hp.sort_by_indices(u_weights, u_sorter, u_sorted_weights)
+            hp.sort_by_indices(u_weights, u_sorter, v_sorted_weights)
+
+        # cumulative weights
+        if not cumweighted:
+            # local arrays
+            u_cumweights = cuda.local.array(cols, nb_float)
+            v_cumweights = cuda.local.array(cols, nb_float)
+            # accumulate
+            hp.cumsum(u_sorted_weights, u_cumweights)
+            hp.cumsum(v_sorted_weights, v_cumweights)
+
+        # prepend weights with zero
+        if not prepended:
+            zero = cuda.local.array(1, nb_float)
+
+            u_0_cumweights = cuda.local.array(cols_plus_1, nb_float)
+            v_0_cumweights = cuda.local.array(cols_plus_1, nb_float)
+
+            hp.concatenate(zero, u_cumweights, u_0_cumweights)
+            hp.concatenate(zero, v_cumweights, v_0_cumweights)
+
+    # concatenate u and v into uv
+    hp.concatenate(u_sorted, v_sorted, uv)
+
+    # sorting
+    # quickSortIterative(uv, uv_stack)
+    hp.insertionSort(uv)
+
+    # Get the respective positions of the values of u and v among the
+    # values of both distributions. See also np.searchsorted
+    hp.bisect_right(u_sorted, uv[:-1], u_cdf_indices)
+    hp.bisect_right(v_sorted, uv[:-1], v_cdf_indices)
+
+    # empirical CDFs
+    hp.sort_by_indices(u_0_cumweights, u_cdf_indices, u_cdf_sorted_cumweights)
+    hp.divide(u_cdf_sorted_cumweights, u_0_cumweights[-1], u_cdf)
+
+    hp.sort_by_indices(v_0_cumweights, v_cdf_indices, v_cdf_sorted_cumweights)
+    hp.divide(v_cdf_sorted_cumweights, v_0_cumweights[-1], v_cdf)
+
+    # # Integration
+    hp.diff(uv, uv_deltas)  # See also np.diff
+
+    out = hp.integrate(u_cdf, v_cdf, uv_deltas, p)
+
+    return out
+
+
+@jit(inline=inline)
+def wasserstein_distance(
+    u, v, u_weights, v_weights, presorted, cumweighted, prepended
+):  # noqa
+    r"""
+    Compute the first Wasserstein distance between two 1D distributions.
+
+    This distance is also known as the earth mover's distance, since it can be
+    seen as the minimum amount of "work" required to transform :math:`u` into
+    :math:`v`, where "work" is measured as the amount of distribution weight
+    that must be moved, multiplied by the distance it has to be moved.
+
+    Source
+    ------
+    https://github.com/scipy/scipy/blob/47bb6febaa10658c72962b9615d5d5aa2513fa3a/scipy/stats/stats.py#L8245-L8319 # noqa
+
+    Parameters
+    ----------
+    u_values, v_values : array_like
+        Values observed in the (empirical) distribution.
+    u_weights, v_weights : array_like, optional
+        Weight for each value. If unspecified, each value is assigned the same
+        weight.
+        `u_weights` (resp. `v_weights`) must have the same length as
+        `u_values` (resp. `v_values`). If the weight sum differs from 1, it
+        must still be positive and finite so that the weights can be normalized
+        to sum to 1.
+
+    Returns
+    -------
+    distance : float
+        The computed distance between the distributions.
+
+    Notes
+    -----
+    The input distributions can be empirical, therefore coming from samples
+    whose values are effectively inputs of the function, or they can be seen as
+    generalized functions, in which case they are weighted sums of Dirac delta
+    functions located at the specified values.
+
+    References
+    ----------
+    .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, Hoyer,
+           Munos "The Cramer Distance as a Solution to Biased Wasserstein
+           Gradients" (2017). :arXiv:`1705.10743`.
+    """
+    return cdf_distance(
+        u, v, u_weights, v_weights, np_int(1), presorted, cumweighted, prepended
+    )  # noqa
+
+
+@jit(inline=inline)
+def euclidean_distance(a, b):
+    """
+    Calculate Euclidean distance between vectors a and b.
+
+    Parameters
+    ----------
+    a : 1D array
+        First vector.
+    b : 1D array
+        Second vector.
+
+    Returns
+    -------
+    d : numeric scalar
+        Euclidean distance between vectors a and b.
+    """
+    d = 0
+    for i in range(len(a)):
+        d += (b[i] - a[i]) ** 2
+    d = sqrt(d)
+    return d
diff --git a/ElM2D/njit_dist_matrix.py b/ElM2D/njit_dist_matrix.py
new file mode 100644
index 0000000..06d8561
--- /dev/null
+++ b/ElM2D/njit_dist_matrix.py
@@ -0,0 +1,369 @@
+"""
+Nopython version of dist_matrix.
+
+Author: Sterling Baird
+"""
+from numba import prange, njit
+from numba.types import float32, int32
+import numpy as np
+
+np_int = np.int32
+np_float = np.float32
+
+nb_int = int32
+nb_float = float32
+
+fastmath = True
+parallel = True
+debug = False
+
+
+@njit(fastmath=fastmath, debug=debug)
+def wasserstein_distance(
+    u_values, v_values, u_weights=None, v_weights=None, p=1, presorted=False
+):
+    r"""
+    Compute first Wasserstein distance.
+
+    Compute, between two one-dimensional distributions :math:`u` and
+    :math:`v`, whose respective CDFs are :math:`U` and :math:`V`, the
+    statistical distance that is defined as:
+    .. math::
+        l_p(u, v) = \left( \int_{-\infty}^{+\infty} |U-V|^p \right)^{1/p}
+    p is a positive parameter; p = 1 gives the Wasserstein distance, p = 2
+    gives the energy distance.
+
+    Source:
+        https://github.com/scipy/scipy/blob/47bb6febaa10658c72962b9615d5d5aa2513fa3a/scipy/stats/stats.py#L8404
+
+    Parameters
+    ----------
+    u_values, v_values : array_like
+        Values observed in the (empirical) distribution.
+    u_weights, v_weights : array_like, optional
+        Weight for each value. If unspecified, each value is assigned the same
+        weight.
+        `u_weights` (resp. `v_weights`) must have the same length as
+        `u_values` (resp. `v_values`). If the weight sum differs from 1, it
+        must still be positive and finite so that the weights can be normalized
+        to sum to 1.
+
+    Returns
+    -------
+    distance : float
+        The computed distance between the distributions.
+
+    Notes
+    -----
+    The input distributions can be empirical, therefore coming from samples
+    whose values are effectively inputs of the function, or they can be seen as
+    generalized functions, in which case they are weighted sums of Dirac delta
+    functions located at the specified values.
+
+    References
+    ----------
+    .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, Hoyer,
+            Munos "The Cramer Distance as a Solution to Biased Wasserstein
+            Gradients" (2017). :arXiv:`1705.10743`.
+    """
+    if not presorted:
+        u_sorter = np.argsort(u_values)
+        v_sorter = np.argsort(v_values)
+
+        u_values = u_values[u_sorter]
+        v_values = v_values[v_sorter]
+
+        u_weights = u_weights[u_sorter]
+        v_weights = v_weights[v_sorter]
+
+    all_values = np.concatenate((u_values, v_values))
+    # all_values.sort(kind='mergesort')
+    all_values.sort()
+
+    # Compute the differences between pairs of successive values of u and v.
+    deltas = np.diff(all_values)
+
+    # Get the respective positions of the values of u and v among the values of
+    # both distributions.
+    u_cdf_indices = np.searchsorted(u_values, all_values[:-1], side="right")
+    v_cdf_indices = np.searchsorted(v_values, all_values[:-1], side="right")
+
+    zero = np.array([0])
+    # Calculate the CDFs of u and v using their weights, if specified.
+    if u_weights is None:
+        u_cdf = u_cdf_indices / u_values.size
+    else:
+        uw_cumsum = np.cumsum(u_weights)
+        u_sorted_cumweights = np.concatenate((zero, uw_cumsum))
+        u_cdf = u_sorted_cumweights[u_cdf_indices] / u_sorted_cumweights[-1]
+
+    if v_weights is None:
+        v_cdf = v_cdf_indices / v_values.size
+    else:
+        vw_cumsum = np.cumsum(v_weights)
+        v_sorted_cumweights = np.concatenate((zero, vw_cumsum))
+        v_cdf = v_sorted_cumweights[v_cdf_indices] / v_sorted_cumweights[-1]
+
+    # Compute the value of the integral based on the CDFs.
+    # If p = 1 or p = 2, we avoid using np.power, which introduces an overhead
+    # of about 15%.
+    if p == 1:
+        return np.sum(np.multiply(np.abs(u_cdf - v_cdf), deltas))
+    if p == 2:
+        return np.sqrt(np.sum(np.multiply(np.power(u_cdf - v_cdf, 2), deltas)))
+    return np.power(
+        np.sum(np.multiply(np.power(np.abs(u_cdf - v_cdf), p), deltas)), 1 / p
+    )
+
+
+@njit(fastmath=fastmath, debug=debug)
+def euclidean_distance(a, b):
+    """
+    Calculate Euclidean distance between vectors a and b.
+
+    Parameters
+    ----------
+    a : 1D array
+        First vector.
+    b : 1D array
+        Second vector.
+
+    Returns
+    -------
+    d : numeric scalar
+        Euclidean distance between vectors a and b.
+    """
+    d = 0
+    for i in range(len(a)):
+        d += (b[i] - a[i]) ** 2
+    d = np.sqrt(d)
+    return d
+
+
+@njit(fastmath=fastmath, debug=debug)
+def compute_distance(u, v, u_weights, v_weights, metric_num):
+    """
+    Calculate weighted distance between two vectors, u and v.
+
+    Parameters
+    ----------
+    u : 1D array of float
+        First vector.
+    v : 1D array of float
+        Second vector.
+    u_weights : 1D array of float
+        Weights for u.
+    v_weights : 1D array of float
+        Weights for v.
+    metric_num : int
+        Which metric to use (0 == "euclidean", 1=="wasserstein").
+
+    Raises
+    ------
+    NotImplementedError
+        "Specified metric is mispelled or has not been implemented yet.
+        If not implemented, consider submitting a pull request."
+
+    Returns
+    -------
+    d : float
+        Weighted distance between u and v.
+
+    """
+    if metric_num == 0:
+        # d = np.linalg.norm(vec - vec2)
+        d = euclidean_distance(u, v)
+    elif metric_num == 1:
+        # d = my_wasserstein_distance(vec, vec2)
+        d = wasserstein_distance(
+            u, v, u_weights=u_weights, v_weights=v_weights, p=1, presorted=True
+        )
+    else:
+        raise NotImplementedError(
+            "Specified metric is mispelled or has not been implemented yet. \
+                If not implemented, consider submitting a pull request."
+        )
+    return d
+
+
+@njit(fastmath=fastmath, parallel=parallel, debug=debug)
+def sparse_distance_matrix(U, V, U_weights, V_weights, pairs, out, isXY, metric_num):
+    """
+    Calculate sparse pairwise distances between two sets of vectors for pairs.
+
+    Parameters
+    ----------
+    mat : numeric cuda array
+        First set of vectors for which to compute a single pairwise distance.
+    mat2 : numeric cuda array
+        Second set of vectors for which to compute a single pairwise distance.
+    pairs : cuda array of 2-tuples
+        All pairs for which distances are to be computed.
+    out : numeric cuda array
+        The initialized array which will be populated with distances.
+
+    Raises
+    ------
+    ValueError
+        Both matrices should have the same number of columns.
+
+    Returns
+    -------
+    None.
+
+    """
+    npairs = pairs.shape[0]
+
+    for k in prange(npairs):
+        pair = pairs[k]
+        i, j = pair
+
+        u = U[i]
+        v = V[j]
+        uw = U_weights[i]
+        vw = V_weights[j]
+
+        d = compute_distance(u, v, uw, vw, metric_num)
+        out[k] = d
+
+
+@njit(fastmath=fastmath, parallel=parallel, debug=debug)
+def one_set_distance_matrix(U, U_weights, out, metric_num):
+    """
+    Calculate pairwise distances within single set of vectors.
+
+    Parameters
+    ----------
+    U : 2D array of float
+        Vertically stacked vectors.
+    U_weights : 2D array of float
+        Vertically stacked weight vectors.
+    out : 2D array of float
+        Initialized matrix to populate with pairwise distances.
+    metric_num : int
+        Which metric to use (0 == "euclidean", 1=="wasserstein").
+
+    Returns
+    -------
+    None.
+
+    """
+    dm_rows = U.shape[0]
+    dm_cols = U.shape[0]
+
+    for i in prange(dm_rows):
+        for j in range(dm_cols):
+            if i < j:
+                u = U[i]
+                v = U[j]
+                uw = U_weights[i]
+                vw = U_weights[j]
+                d = compute_distance(u, v, uw, vw, metric_num)
+                out[i, j] = d
+                out[j, i] = d
+
+
+@njit(fastmath=fastmath, parallel=parallel, debug=debug)
+def two_set_distance_matrix(U, V, U_weights, V_weights, out, metric_num):
+    """Calculate distance matrix between two sets of vectors."""
+    dm_rows = U.shape[0]
+    dm_cols = V.shape[0]
+
+    for i in prange(dm_rows):
+        for j in range(dm_cols):
+            u = U[i]
+            v = V[j]
+            uw = U_weights[i]
+            vw = V_weights[j]
+            d = compute_distance(u, v, uw, vw, metric_num)
+            out[i, j] = d
+
+
+def dist_matrix(
+    U, V=None, U_weights=None, V_weights=None, pairs=None, metric="euclidean"
+):
+    """
+    Compute pairwise distances using Numba/CUDA.
+
+    Parameters
+    ----------
+    mat : array
+        First set of vectors for which to compute pairwise distances.
+
+    mat2 : array, optional
+        Second set of vectors for which to compute pairwise distances. If not specified,
+        then mat2 is a copy of mat.
+
+    pairs : array, optional
+        List of 2-tuples which contain the indices for which to compute distances for.
+        If mat2 was specified, then the second index accesses mat2 instead of mat.
+        If not specified, then the pairs are auto-generated. If mat2 was specified,
+        all combinations of the two vector sets are used. If mat2 isn't specified,
+        then only the upper triangle (minus diagonal) pairs are computed.
+
+    metric : str, optional
+        Possible options are 'euclidean', 'wasserstein'.
+        Defaults to Euclidean distance. These are converted to integers internally
+        due to Numba's lack of support for string arguments (2021-08-14).
+        See compute_distance() for other keys. For example, 0 corresponds to Euclidean
+        distance and 1 corresponds to Wasserstein distance.
+
+    Returns
+    -------
+    out : array
+        A pairwise distance matrix, or if pairs are specified, then a vector of
+        distances corresponding to the pairs.
+
+    """
+    # is it distance matrix between two sets of vectors rather than within a single set?
+    isXY = V is not None
+
+    # were pairs specified? (useful for sparse matrix generation)
+    pairQ = pairs is not None
+
+    # assign metric_num based on specified metric (Numba doesn't support strings)
+    metric_dict = {"euclidean": 0, "wasserstein": 1}
+    metric_num = metric_dict[metric]
+
+    m = U.shape[0]
+
+    if isXY:
+        m2 = V.shape[0]
+    else:
+        m2 = m
+
+    if pairQ:
+        npairs = pairs.shape[0]
+        shape = (npairs, 1)
+    else:
+        shape = (m, m2)
+
+    if metric == "wasserstein":
+        U_sorter = np.argsort(U)
+        U = np.take_along_axis(U, U_sorter, axis=-1)
+        U_weights = np.take_along_axis(U_weights, U_sorter, axis=-1)
+
+        if isXY:
+            V_sorter = np.argsort(V)
+            V = np.take_along_axis(V, V_sorter, axis=-1)
+            V_weights = np.take_along_axis(V_weights, V_sorter, axis=-1)
+
+    out = np.zeros(shape, dtype=np_float)
+
+    if isXY and not pairQ:
+        # distance matrix between two sets of vectors
+        two_set_distance_matrix(U, V, U_weights, V_weights, out, metric_num)
+
+    elif not isXY and pairQ:
+        # specified pairwise distances within single set of vectors
+        sparse_distance_matrix(U, U, U_weights, U_weights, pairs, out, isXY, metric_num)
+
+    elif not isXY and not pairQ:
+        # distance matrix within single set of vectors
+        one_set_distance_matrix(U, U_weights, out, metric_num)
+
+    elif isXY and pairQ:
+        # specified pairwise distances between two sets of vectors
+        sparse_distance_matrix(U, V, U_weights, V_weights, pairs, out, isXY, metric_num)
+
+    return out
diff --git a/ElM2D/setup.cfg b/ElM2D/setup.cfg
new file mode 100644
index 0000000..08aedd7
--- /dev/null
+++ b/ElM2D/setup.cfg
@@ -0,0 +1,2 @@
+[metadata]
+description_file = README.md
diff --git a/ElM2D/test_ElM2D.py b/ElM2D/test_ElM2D.py
new file mode 100644
index 0000000..e5daca6
--- /dev/null
+++ b/ElM2D/test_ElM2D.py
@@ -0,0 +1,55 @@
+"""
+Test Element Mover's 2D Distance Matrix via network simplex and "wasserstein" methods.
+
+This test ensures that the fast implementation "wasserstein" produces "close" values
+to the original network simplex method.
+
+Note: "network_simplex" emd_algorithm produces a "freeze_support()" error in
+Spyder IPython and Spyder external terminal without the if __name == "__main__" on
+Windows. This is likely due to the way Pool is used internally within ElM2D.
+
+Created on Mon Sep  6 22:02:36 2021
+
+@author: sterg
+"""
+import unittest
+from numpy.testing import assert_allclose
+from helper import Timer
+import pandas as pd
+
+
+class Testing(unittest.TestCase):
+    def test_dm_close(self):
+        from ElM2D import ElM2D
+
+        mapper = ElM2D()
+
+        # df = pd.read_csv("train-debug.csv")
+        df = pd.read_csv("stable-mp.csv")
+        formulas = df["formula"]
+        sub_formulas = formulas[0:500]
+        with Timer("fit-cuda-wasserstein"):
+            mapper.fit(sub_formulas, target="cuda")
+            dm_wasserstein = mapper.dm
+
+        with Timer("fit-cpu-wasserstein"):
+            mapper.fit(sub_formulas)
+            dm_wasserstein = mapper.dm
+
+        mapper2 = ElM2D(emd_algorithm="network_simplex")
+
+        with Timer("fit-network_simplex"):
+            mapper2.fit(sub_formulas)
+            dm_network = mapper2.dm
+        assert_allclose(
+            dm_wasserstein,
+            dm_network,
+            atol=1e-3,
+            err_msg="wasserstein did not match network simplex.",
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
+# network_simplex gives a strange error (freeze_support() on Spyder IPython);
+# however dist_matrix does not give this error
diff --git a/ElM2D/test_dist_matrix.py b/ElM2D/test_dist_matrix.py
new file mode 100644
index 0000000..ade816e
--- /dev/null
+++ b/ElM2D/test_dist_matrix.py
@@ -0,0 +1,291 @@
+# -*- coding: utf-8 -*-
+"""Test distance matrix calculations using CUDA/Numba."""
+from scipy.spatial.distance import euclidean, cdist
+from scipy.stats import wasserstein_distance as scipy_wasserstein_distance
+from numpy.testing import assert_allclose
+import numpy as np
+import os
+from importlib import reload
+
+# os.environ["NUMBA_DISABLE_JIT"] = "1"
+
+# number of columns of U and V must be set as env var before import dist_matrix
+cols = 100
+os.environ["COLUMNS"] = str(cols)
+
+# other environment variables (set before importing dist_matrix)
+os.environ["USE_64"] = "0"
+os.environ["INLINE"] = "never"
+os.environ["FASTMATH"] = "1"
+os.environ["TARGET"] = "cuda"
+
+from helper import Timer  # noqa
+from numba.cuda.testing import unittest, CUDATestCase  # noqa
+import cuda_dist_matrix  # noqa
+
+# to overwrite env vars (source: https://stackoverflow.com/a/1254379/13697228)
+reload(cuda_dist_matrix)
+dist_matrix = cuda_dist_matrix.dist_matrix
+
+verbose_test = True
+
+
+class TestDistMat(CUDATestCase):
+    """Test distance matrix calculations on GPU for various metrics."""
+
+    def test_dist_matrix(self):
+        """
+        Loop through distance metrics and perform unit tests.
+
+        The four test cases are:
+            pairwise distances within a single set of vectors
+            pairwise distances between two sets of vectors
+            sparse pairwise distances within a single set of vectors
+            sparse pairwise distances between two sets of vectors
+
+        The ground truth for Euclidean comes from cdist.
+        The ground truth for Earth Mover's distance (1-Wasserstein) is via
+        a scipy.stats function.
+
+        Helper functions are used to generate test data and support the use of
+        Wasserstein distances in cdist.
+
+        Returns
+        -------
+        None.
+
+        """
+
+        def test_data(rows, cols):
+            """
+            Generate seeded test values and weights for two distributions.
+
+            Returns
+            -------
+            U : 2D array
+                Values of first distribution.
+            V : 2D array
+                Values of second distribution.
+            U_weights : 2D array
+                Weights of first distribution.
+            V_weights : 2D array
+                Weights of second distribution.
+
+            """
+            np.random.seed(42)
+            [U, V, U_weights, V_weights] = [
+                np.random.rand(rows, cols) for i in range(4)
+            ]
+            return U, V, U_weights, V_weights
+
+        def my_wasserstein_distance(u_uw, v_vw):
+            """
+            Return Earth Mover's distance using concatenated values and weights.
+
+            Parameters
+            ----------
+            u_uw : 1D numeric array
+                Horizontally stacked values and weights of first distribution.
+            v_vw : TYPE
+                Horizontally stacked values and weights of second distribution.
+
+            Returns
+            -------
+            distance : numeric scalar
+                Earth Mover's distance given two distributions.
+
+            """
+            # split into values and weights
+            n = len(u_uw)
+            i = n // 2
+            u = u_uw[0:i]
+            uw = u_uw[i:n]
+            v = v_vw[0:i]
+            vw = v_vw[i:n]
+            # calculate distance
+            distance = scipy_wasserstein_distance(u, v, u_weights=uw, v_weights=vw)
+            return distance
+
+        def join_wasserstein(U, V, Uw, Vw):
+            """
+            Horizontally stack values and weights for each distribution.
+
+            Weights are added as additional columns to values.
+
+            Example:
+                u_uw, v_vw = join_wasserstein(u, v, uw, vw)
+                d = my_wasserstein_distance(u_uw, v_vw)
+                cdist(u_uw, v_vw, metric=my_wasserstein_distance)
+
+            Parameters
+            ----------
+            u : 1D or 2D numeric array
+                First set of distribution values.
+            v : 1D or 2D numeric array
+                Second set of values of distribution values.
+            uw : 1D or 2D numeric array
+                Weights for first distribution.
+            vw : 1D or 2D numeric array
+                Weights for second distribution.
+
+            Returns
+            -------
+            u_uw : 1D or 2D numeric array
+                Horizontally stacked values and weights of first distribution.
+            v_vw : TYPE
+                Horizontally stacked values and weights of second distribution.
+
+            """
+            U_Uw = np.concatenate((U, Uw), axis=1)
+            V_Vw = np.concatenate((V, Vw), axis=1)
+            return U_Uw, V_Vw
+
+        tol = 1e-5
+
+        # test and production data
+        pairs = np.array([(0, 1), (1, 2), (2, 3)])
+        i, j = pairs[0]
+
+        for testQ in [True, False]:
+            # large # of rows with testQ == True is slow due to use of
+            # cdist and non-jitted scipy-wasserstein
+            if testQ:
+                rows = 6
+            else:
+                rows = 5000
+            print("====[PARTIAL TEST WITH {} ROWS]====".format(rows))
+            U, V, U_weights, V_weights = test_data(rows, cols)
+            Utest, Vtest, Uwtest, Vwtest = [
+                x[0:6] for x in [U, V, U_weights, V_weights]
+            ]  # noqa
+
+            for target in ["cuda"]:  # "cpu" not implemented
+                for metric in ["euclidean", "wasserstein"]:
+                    print("[" + target.upper() + "_" + metric.upper() + "]")
+
+                    if testQ:
+                        # compile
+                        dist_matrix(Utest, U_weights=Uwtest, metric=metric)
+                        with Timer("one set"):
+                            one_set = dist_matrix(
+                                U, U_weights=U_weights, metric=metric
+                            )  # noqa
+                            if verbose_test:
+                                print(one_set, "\n")
+
+                    # compile
+                    dist_matrix(
+                        Utest,
+                        V=Vtest,
+                        U_weights=Uwtest,
+                        V_weights=Vwtest,
+                        metric=metric,
+                    )
+                    with Timer("two set"):
+                        two_set = dist_matrix(
+                            U,
+                            V=V,
+                            U_weights=U_weights,
+                            V_weights=V_weights,
+                            metric=metric,
+                        )  # noqa
+                        if testQ and verbose_test:
+                            print(two_set, "\n")
+
+                        one_set_sparse = dist_matrix(
+                            U, U_weights=U_weights, pairs=pairs, metric=metric
+                        )  # noqa
+                        if testQ and verbose_test:
+                            print(one_set_sparse, "\n")
+
+                    two_set_sparse = dist_matrix(
+                        U,
+                        V=V,
+                        U_weights=U_weights,
+                        V_weights=V_weights,
+                        pairs=pairs,
+                        metric=metric,
+                    )
+                    if testQ and verbose_test:
+                        print(two_set_sparse, "\n")
+
+                    if testQ:
+                        if metric == "euclidean":
+                            with Timer("one set check (cdist)"):
+                                one_set_check = cdist(U, U)
+                            with Timer("two set check (cdist)"):
+                                two_set_check = cdist(U, V)
+
+                            one_sparse_check = [euclidean(U[i], U[j]) for i, j in pairs]
+                            two_sparse_check = [euclidean(U[i], V[j]) for i, j in pairs]
+
+                        elif metric == "wasserstein":
+                            U_Uw, V_Vw = join_wasserstein(U, V, U_weights, V_weights)
+
+                            with Timer("one set check (cdist)"):
+                                one_set_check = cdist(
+                                    U_Uw, U_Uw, metric=my_wasserstein_distance
+                                )
+                            with Timer("two set check (cdist)"):
+                                two_set_check = cdist(
+                                    U_Uw, V_Vw, metric=my_wasserstein_distance
+                                )
+
+                            one_sparse_check = [
+                                my_wasserstein_distance(U_Uw[i], U_Uw[j])  # noqa
+                                for i, j in pairs
+                            ]
+                            two_sparse_check = [
+                                my_wasserstein_distance(U_Uw[i], V_Vw[j])  # noqa
+                                for i, j in pairs
+                            ]
+
+                        # check results
+                        assert_allclose(
+                            one_set.ravel(),
+                            one_set_check.ravel(),
+                            rtol=tol,
+                            err_msg="one set {} {} distance matrix inaccurate".format(
+                                target, metric
+                            ),
+                        )  # noqa
+                        assert_allclose(
+                            two_set.ravel(),
+                            two_set_check.ravel(),
+                            rtol=tol,
+                            err_msg="two set {} {} distance matrix inaccurate".format(
+                                target, metric
+                            ),
+                        )  # noqa
+                        assert_allclose(
+                            one_set_sparse,
+                            one_sparse_check,
+                            rtol=tol,
+                            err_msg="one set {} {} sparse distance matrix \
+                                inaccurate".format(
+                                target, metric
+                            ),
+                        )  # noqa
+                        assert_allclose(
+                            two_set_sparse,
+                            two_sparse_check,
+                            rtol=tol,
+                            err_msg="two set {} {} distance matrix inaccurate".format(
+                                target, metric
+                            ),
+                        )  # noqa
+                    elif metric == "euclidean":
+                        with Timer("two set check (cdist)"):
+                            two_set_check = cdist(U, V)
+
+
+if __name__ == "__main__":
+    unittest.main()
+
+
+# TF = [
+#     np.allclose(one_set, one_set_check),
+#     np.allclose(two_set, two_set_check),
+#     np.allclose(one_set_sparse, one_sparse_check),
+#     np.allclose(two_set_sparse, two_sparse_check)
+#     ]
diff --git a/ElM2D/test_helper.py b/ElM2D/test_helper.py
new file mode 100644
index 0000000..67383ca
--- /dev/null
+++ b/ElM2D/test_helper.py
@@ -0,0 +1,83 @@
+"""
+Test helper functions for distance matrix calculations.
+"""
+import os
+import numpy as np
+from numba.cuda.testing import unittest, CUDATestCase
+import numba.cuda.kernels.device.helper as hp
+from numpy.testing import assert_allclose, assert_equal
+
+bits = int(os.environ.get("MACHINE_BITS", "32"))
+
+if bits == 32:
+    np_float = np.float32
+    np_int = np.float32
+elif bits == 64:
+    np_float = np.float64
+    np_int = np.float64
+
+tol = 1e-5
+
+
+class TestHelperFunctions(CUDATestCase):
+    def test_concatenate(self):
+        vec = np.random.rand(5)
+        vec2 = np.random.rand(10)
+        check = np.concatenate((vec, vec2))
+        out = np.zeros(len(vec) + len(vec2), dtype=np_float)
+        hp.concatenate(vec, vec2, out)
+
+        assert_allclose(out, check)
+        vec[0] = 100.0
+        assert_allclose(out, check)
+
+    def test_diff(self):
+        vec = np.random.rand(10)
+        out = np.zeros(len(vec) - 1)
+        hp.diff(vec, out)
+        check = np.diff(vec)
+        assert_equal(out, check)
+
+    def test_bisect_right(self):
+        a = np.random.rand(10)
+        a.sort()
+        v = np.random.rand(5)
+        ids = np.zeros_like(v[:-1], dtype=np_int)
+        hp.bisect_right(a, v[:-1], ids)
+        check = np.searchsorted(a, v[:-1], side="right")
+        assert_equal(ids, check)
+
+    def test_sort_by_indices(self):
+        v = np.random.rand(10)
+        ids = np.arange(len(v))
+        np.random.shuffle(ids)
+        out = np.zeros_like(v)
+        hp.sort_by_indices(v, ids, out)
+        check = v[ids]
+        assert_equal(out, check)
+
+        v = np.array([0.0, 0.5528931, 1.1455898, 1.5933731])
+        ids = np.array([1, 2, 3, 3, 3])
+        out = np.zeros_like(ids, dtype=np_float)
+        hp.sort_by_indices(v, ids, out)
+        check = v[ids]
+        assert_allclose(out, check)
+
+    def test_cumsum(self):
+        vec = np.random.rand(10)
+        out = np.zeros_like(vec, dtype=np_float)
+        hp.cumsum(vec, out)
+        check = np.cumsum(vec)
+        assert_allclose(out, check)
+
+    def test_divide(self):
+        v = np.random.rand(10)
+        b = 4.62
+        out = np.zeros_like(v, dtype=np_float)
+        hp.divide(v, b, out)
+        check = v / b
+        assert_allclose(out, check)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/ElM2D/test_njit.py b/ElM2D/test_njit.py
new file mode 100644
index 0000000..924f0ab
--- /dev/null
+++ b/ElM2D/test_njit.py
@@ -0,0 +1,220 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep  8 01:36:51 2021
+
+@author: sterg
+"""
+
+from scipy.stats import wasserstein_distance as scipy_wasserstein_distance
+from scipy.spatial.distance import cdist
+
+# generate test data
+np.random.seed(42)
+rows = 200
+cols = 100
+[U, V, U_weights, V_weights] = [np.random.rand(rows, cols) for i in range(4)]
+
+testpairs = np.array([(1, 2), (2, 3), (3, 4)])
+
+# %% unit tests
+one_set_check = scipy_wasserstein_distance(
+    U[0], U[1], u_weights=U_weights[0], v_weights=U_weights[1]
+)
+two_set_check = scipy_wasserstein_distance(
+    U[0], V[0], u_weights=U_weights[0], v_weights=V_weights[0]
+)
+i, j = pairs[0]
+one_sparse_check = scipy_wasserstein_distance(
+    U[i], U[j], u_weights=U_weights[i], v_weights=U_weights[j]
+)
+two_sparse_check = scipy_wasserstein_distance(
+    U[i], V[j], u_weights=U_weights[i], v_weights=V_weights[j]
+)
+
+tol = 1e-6
+print(
+    abs(one_set[0, 1] - one_set_check) < tol,
+    abs(two_set[0, 0] - two_set_check) < tol,
+    abs(one_set_sparse[0, 0] - one_sparse_check) < tol,
+    abs(two_set_sparse[0, 0] - two_sparse_check) < tol,
+)
+
+
+# %% wasserstein helper functions
+def my_wasserstein_distance(u_uw, v_vw):
+    """
+    Return Earth Mover's distance using concatenated values and weights.
+
+    Parameters
+    ----------
+    u_uw : 1D numeric array
+        Horizontally stacked values and weights of first distribution.
+    v_vw : TYPE
+        Horizontally stacked values and weights of second distribution.
+
+    Returns
+    -------
+    distance : numeric scalar
+        Earth Mover's distance given two distributions.
+
+    """
+    # split into values and weights
+    n = len(u_uw)
+    i = n // 2
+    u = u_uw[0:i]
+    uw = u_uw[i:n]
+    v = v_vw[0:i]
+    vw = v_vw[i:n]
+    # calculate distance
+    distance = wasserstein_distance(u, v, u_weights=uw, v_weights=vw)
+    return distance
+
+
+def join_wasserstein(U, V, Uw, Vw):
+    """
+    Horizontally stack values and weights for each distribution.
+
+    Weights are added as additional columns to values.
+
+    Example:
+        u_uw, v_vw = join_wasserstein(u, v, uw, vw)
+        d = my_wasserstein_distance(u_uw, v_vw)
+        cdist(u_uw, v_vw, metric=my_wasserstein_distance)
+
+    Parameters
+    ----------
+    u : 1D or 2D numeric array
+        First set of distribution values.
+    v : 1D or 2D numeric array
+        Second set of values of distribution values.
+    uw : 1D or 2D numeric array
+        Weights for first distribution.
+    vw : 1D or 2D numeric array
+        Weights for second distribution.
+
+    Returns
+    -------
+    u_uw : 1D or 2D numeric array
+        Horizontally stacked values and weights of first distribution.
+    v_vw : TYPE
+        Horizontally stacked values and weights of second distribution.
+
+    """
+    U_Uw = np.concatenate((U, Uw), axis=1)
+    V_Vw = np.concatenate((V, Vw), axis=1)
+    return U_Uw, V_Vw
+
+
+@njit(fastmath=fastmath, debug=debug)
+def wasserstein_distance_check(u_values, v_values, u_weights=None, v_weights=None, p=1):
+    r"""
+    Compute first
+
+    Compute, between two one-dimensional distributions :math:`u` and
+    :math:`v`, whose respective CDFs are :math:`U` and :math:`V`, the
+    statistical distance that is defined as:
+    .. math::
+        l_p(u, v) = \left( \int_{-\infty}^{+\infty} |U-V|^p \right)^{1/p}
+    p is a positive parameter; p = 1 gives the Wasserstein distance, p = 2
+    gives the energy distance.
+
+    Parameters
+    ----------
+    u_values, v_values : array_like
+        Values observed in the (empirical) distribution.
+    u_weights, v_weights : array_like, optional
+        Weight for each value. If unspecified, each value is assigned the same
+        weight.
+        `u_weights` (resp. `v_weights`) must have the same length as
+        `u_values` (resp. `v_values`). If the weight sum differs from 1, it
+        must still be positive and finite so that the weights can be normalized
+        to sum to 1.
+
+    Returns
+    -------
+    distance : float
+        The computed distance between the distributions.
+
+    Notes
+    -----
+    The input distributions can be empirical, therefore coming from samples
+    whose values are effectively inputs of the function, or they can be seen as
+    generalized functions, in which case they are weighted sums of Dirac delta
+    functions located at the specified values.
+
+    References
+    ----------
+    .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, Hoyer,
+           Munos "The Cramer Distance as a Solution to Biased Wasserstein
+           Gradients" (2017). :arXiv:`1705.10743`.
+    """
+    u_sorter = np.argsort(u_values)
+    v_sorter = np.argsort(v_values)
+
+    all_values = np.concatenate((u_values, v_values))
+    all_values.sort(kind="mergesort")
+
+    # Compute the differences between pairs of successive values of u and v.
+    deltas = np.diff(all_values)
+
+    # Get the respective positions of the values of u and v among the values of
+    # both distributions.
+    u_cdf_indices = u_values[u_sorter].searchsorted(all_values[:-1], "right")
+    v_cdf_indices = v_values[v_sorter].searchsorted(all_values[:-1], "right")
+
+    # Calculate the CDFs of u and v using their weights, if specified.
+    if u_weights is None:
+        u_cdf = u_cdf_indices / u_values.size
+    else:
+        u_sorted_cumweights = np.concatenate(([0], np.cumsum(u_weights[u_sorter])))
+        u_cdf = u_sorted_cumweights[u_cdf_indices] / u_sorted_cumweights[-1]
+
+    if v_weights is None:
+        v_cdf = v_cdf_indices / v_values.size
+    else:
+        v_sorted_cumweights = np.concatenate(([0], np.cumsum(v_weights[v_sorter])))
+        v_cdf = v_sorted_cumweights[v_cdf_indices] / v_sorted_cumweights[-1]
+
+    # Compute the value of the integral based on the CDFs.
+    # If p = 1 or p = 2, we avoid using np.power, which introduces an overhead
+    # of about 15%.
+    if p == 1:
+        return np.sum(np.multiply(np.abs(u_cdf - v_cdf), deltas))
+    if p == 2:
+        return np.sqrt(np.sum(np.multiply(np.square(u_cdf - v_cdf), deltas)))
+    return np.power(
+        np.sum(np.multiply(np.power(np.abs(u_cdf - v_cdf), p), deltas)), 1 / p
+    )
+
+
+U_Uw, V_Vw = join_wasserstein(U, V, U_weights, V_weights)
+
+
+# %% CODE GRAVEYARD
+# def setdiff(a, b):
+#     """
+#     Find the rows in a which are not in b.
+
+#     Source: modified from https://stackoverflow.com/a/11903368/13697228
+#     See also: https://www.mathworks.com/help/matlab/ref/double.setdiff.html
+
+#     Parameters
+#     ----------
+#     a : 2D array
+#         Set of vectors.
+#     b : 2D array
+#         Set of vectors.
+
+#     Returns
+#     -------
+#     out : 2D array
+#         Set of vectors in a that are not in b.
+
+#     """
+#     a_rows = a.view([("", a.dtype)] * a.shape[1])
+#     b_rows = b.view([("", b.dtype)] * b.shape[1])
+#     out = np.setdiff1d(a_rows, b_rows).view(a.dtype).reshape(-1, a.shape[1])
+#     return out
+
+# n_neighbors = 3
+# n_neighbors2 = 1
diff --git a/ElM2D/train-debug.csv b/ElM2D/train-debug.csv
new file mode 100644
index 0000000..e4de4c6
--- /dev/null
+++ b/ElM2D/train-debug.csv
@@ -0,0 +1,99 @@
+cif_file,formula,target
+Zr1_ICSD_164572.cif,Zr1,999
+K2Mg5Sn3_ICSD_421342.cif,K2Mg5Sn3,999
+C1In1La3_ICSD_80956.cif,C1In1La3,999
+Al3B2Ru4_ICSD_43844.cif,Al3B2Ru4,999
+Au1Ho1Pb1_ICSD_58487.cif,Au1Ho1Pb1,999
+F2Xe1_ICSD_260950.cif,F2Xe1,999
+Dy1S2_ICSD_630195.cif,Dy1S2,999
+Nb3Te4_ICSD_16606.cif,Nb3Te4,999
+Rb2Te1_ICSD_182743.cif,Rb2Te1,999
+Sb2Tb1_ICSD_651606.cif,Sb2Tb1,999
+Hf1Pd5_ICSD_168289.cif,Hf1Pd5,999
+O3Pb1Zr1_ICSD_262104.cif,O3Pb1Zr1,999
+Ag1Al1Se2_ICSD_28745.cif,Ag1Al1Se2,999
+Pd1Sb1Tb1_ICSD_648792.cif,Pd1Sb1Tb1,999
+Ga1La1Zn1_ICSD_634520.cif,Ga1La1Zn1,999
+Ag1As1S1_ICSD_604740.cif,Ag1As1S1,999
+Ca1Pd1_ICSD_58926.cif,Ca1Pd1,999
+Cd2Yb1_ICSD_620601.cif,Cd2Yb1,999
+K1Li1Se1_ICSD_290451.cif,K1Li1Se1,999
+Al2N1Nb3_ICSD_60644.cif,Al2N1Nb3,999
+Pt7Sb1_ICSD_54340.cif,Pt7Sb1,999
+Ag1Er1_ICSD_167893.cif,Ag1Er1,999
+Ca2O4Pb1_ICSD_36629.cif,Ca2O4Pb1,999
+Cl4K2Pd1_ICSD_23997.cif,Cl4K2Pd1,999
+P1Ru1Zr1_ICSD_648037.cif,P1Ru1Zr1,999
+As1Li1Mg1_ICSD_107954.cif,As1Li1Mg1,999
+Au1Ga1Zr1_ICSD_156264.cif,Au1Ga1Zr1,999
+In2Li1Rh1_ICSD_106806.cif,In2Li1Rh1,999
+Ca3Ge13Ir4_ICSD_619308.cif,Ca3Ge13Ir4,999
+Ba1Mg4Si3_ICSD_165630.cif,Ba1Mg4Si3,999
+Mo3P1_ICSD_43238.cif,Mo3P1,999
+Ni1Zr2_ICSD_102805.cif,Ni1Zr2,999
+As1Hf1Ru1_ICSD_610654.cif,As1Hf1Ru1,999
+Co2Ge1Zn1_ICSD_52994.cif,Co2Ge1Zn1,999
+N1Ni2W3_ICSD_86170.cif,N1Ni2W3,999
+B6K1_ICSD_166487.cif,B6K1,999
+Lu1Pb2_ICSD_104811.cif,Lu1Pb2,999
+Au1Dy1_ICSD_58439.cif,Au1Dy1,999
+Pt3Sn1_ICSD_105795.cif,Pt3Sn1,999
+Sn1Zr3_ICSD_106108.cif,Sn1Zr3,999
+Bi1_ICSD_51674.cif,Bi1,999
+In1P1Pd5_ICSD_640193.cif,In1P1Pd5,999
+In2O3_ICSD_14387.cif,In2O3,999
+C1Sc3Tl1_ICSD_50164.cif,C1Sc3Tl1,999
+Al1Pd1Y1_ICSD_156923.cif,Al1Pd1Y1,999
+Ho1In3_ICSD_104411.cif,Ho1In3,999
+Dy1Rh2Si2_ICSD_108427.cif,Dy1Rh2Si2,999
+Al2Y1_ICSD_186476.cif,Al2Y1,999
+Pu1Se1_ICSD_649958.cif,Pu1Se1,999
+Co1Y1_ICSD_20954.cif,Co1Y1,999
+Al4Er1Ni1_ICSD_57779.cif,Al4Er1Ni1,999
+Er5Si3_ICSD_57268.cif,Er5Si3,999
+As1Hf1_ICSD_42915.cif,As1Hf1,999
+Be2Re1_ICSD_58732.cif,Be2Re1,999
+Al1Ge1Sc1_ICSD_608009.cif,Al1Ge1Sc1,999
+C1Pb1Pd3_ICSD_108178.cif,C1Pb1Pd3,999
+Cu2In1Y1_ICSD_103023.cif,Cu2In1Y1,999
+Hf1Pt1_ICSD_104257.cif,Hf1Pt1,999
+As3La4_ICSD_610771.cif,As3La4,999
+Al16Hf6Pd7_ICSD_608137.cif,Al16Hf6Pd7,999
+Ba1Os2P2_ICSD_602106.cif,Ba1Os2P2,999
+B6Os1Y2_ICSD_603494.cif,B6Os1Y2,999
+Ho1In1_ICSD_104410.cif,Ho1In1,999
+Er1Ru2Si2_ICSD_106608.cif,Er1Ru2Si2,999
+Au1Be5_ICSD_150580.cif,Au1Be5,999
+Rh2Si2Y1_ICSD_602257.cif,Rh2Si2Y1,999
+Ce1Ga1Ni1_ICSD_102171.cif,Ce1Ga1Ni1,999
+Au1Ca1Cd1_ICSD_420574.cif,Au1Ca1Cd1,999
+Ga12Lu4Pd1_ICSD_634587.cif,Ga12Lu4Pd1,999
+O6Os2Rb1_ICSD_161127.cif,O6Os2Rb1,999
+Dy1Ga1Pt1_ICSD_629731.cif,Dy1Ga1Pt1,999
+Pt1Sb2_ICSD_43105.cif,Pt1Sb2,999
+La1Rh2_ICSD_104705.cif,La1Rh2,999
+Er1Ge2Rh2_ICSD_630646.cif,Er1Ge2Rh2,999
+F6K2Mn1_ICSD_47213.cif,F6K2Mn1,999
+Ho2Rh3Sn5_ICSD_639639.cif,Ho2Rh3Sn5,999
+Cl6Cs2Te1_ICSD_26704.cif,Cl6Cs2Te1,999
+Ni2Si2Zr1_ICSD_20317.cif,Ni2Si2Zr1,999
+P1Rh2_ICSD_38356.cif,P1Rh2,999
+Au1Pb4Rb3_ICSD_107447.cif,Au1Pb4Rb3,999
+N1Ru2Zr4_ICSD_644658.cif,N1Ru2Zr4,999
+Ca1O3Ti1_ICSD_153173.cif,Ca1O3Ti1,999
+As1Fe1Nb1_ICSD_610502.cif,As1Fe1Nb1,999
+Au1Cu4Tb1_ICSD_611769.cif,Au1Cu4Tb1,999
+Ir5Th1_ICSD_104573.cif,Ir5Th1,999
+P2Zn3_ICSD_24487.cif,P2Zn3,999
+In1Pd2_ICSD_417907.cif,In1Pd2,999
+As2Cu1Er1_ICSD_412553.cif,As2Cu1Er1,999
+Al1F3_ICSD_130021.cif,Al1F3,999
+C2Ir1_ICSD_181489.cif,C2Ir1,999
+Cu4O3_ICSD_100566.cif,Cu4O3,999
+Ir2S3Sn3_ICSD_640953.cif,Ir2S3Sn3,999
+Si136_ICSD_56721.cif,Si136,999
+Ce2Pd21Si6_ICSD_174079.cif,Ce2Pd21Si6,999
+O4Sb1V1_ICSD_84567.cif,O4Sb1V1,999
+In1Pt2Tb1_ICSD_640312.cif,In1Pt2Tb1,999
+Sr1Tl2_ICSD_106111.cif,Sr1Tl2,999
+Ca1O3Rh1_ICSD_164775.cif,Ca1O3Rh1,999