
Commit 36bba1f

refactor gmm to have more consistent API with other models
1 parent 56fb4a6 commit 36bba1f

File tree

1 file changed (+112, -97 lines)


numpy_ml/gmm/gmm.py

Lines changed: 112 additions & 97 deletions
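The refactor replaces the scattered `self.*` attributes with `parameters` / `hyperparameters` dicts and moves the data `X` out of instance state and into the method signatures, matching the other numpy_ml models. A minimal usage sketch of the resulting API, inferred from the signatures in the diff below (the import path and the toy data are assumptions, not part of the commit):

import numpy as np

# Assumed import path; the class itself lives in numpy_ml/gmm/gmm.py.
from numpy_ml.gmm import GMM

X = np.random.RandomState(0).randn(100, 2)  # illustrative 2-D data, not from the commit

gmm = GMM(C=3, seed=12345)
flag = gmm.fit(X, max_iter=100, tol=1e-3, verbose=False)  # returns 0 on success, -1 if components collapse

if flag == 0:
    # learned parameters are now kept in a dict rather than as individual attributes
    pi, mu, sigma = (gmm.parameters[k] for k in ("pi", "mu", "sigma"))
    ids = gmm.predict(X, soft_labels=False)        # (N,) most-probable component IDs
    log_probs = gmm.predict(X, soft_labels=True)   # (N, C) per-component log densities

The diff itself follows.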
@@ -1,5 +1,7 @@
+"""A Gaussian mixture model class"""
 import numpy as np
-from numpy.testing import assert_allclose
+
+from numpy_ml.utils.misc import logsumexp, log_gaussian_pdf


 class GMM(object):
@@ -31,48 +33,52 @@ def __init__(self, C=3, seed=None):
         sigma : :py:class:`ndarray <numpy.ndarray>` of shape `(C, d, d)`
             The cluster covariance matrices.
         """
-        self.C = C  # number of clusters
-        self.N = None  # number of objects
-        self.d = None  # dimension of each object
-        self.seed = seed
+        self.elbo = None
+        self.parameters = {}
+        self.hyperparameters = {
+            "C": C,
+            "seed": seed,
+        }

-        if self.seed:
-            np.random.seed(self.seed)
+        self.is_fit = False
+
+        if seed:
+            np.random.seed(seed)
+
+    def _initialize_params(self, X):
+        """Randomly initialize the starting GMM parameters."""
+        N, d = X.shape
+        C = self.hyperparameters["C"]

-    def _initialize_params(self):
-        """
-        Randomly initialize the starting GMM parameters.
-        """
-        C, d = self.C, self.d
         rr = np.random.rand(C)

-        self.pi = rr / rr.sum()  # cluster priors
-        self.Q = np.zeros((self.N, C))  # variational distribution q(T)
-        self.mu = np.random.uniform(-5, 10, C * d).reshape(C, d)  # cluster means
-        self.sigma = np.array([np.identity(d) for _ in range(C)])  # cluster covariances
+        self.parameters = {
+            "pi": rr / rr.sum(),  # cluster priors
+            "Q": np.zeros((N, C)),  # variational distribution q(T)
+            "mu": np.random.uniform(-5, 10, C * d).reshape(C, d),  # cluster means
+            "sigma": np.array([np.eye(d) for _ in range(C)]),  # cluster covariances
+        }

-        self.best_pi = None
-        self.best_mu = None
-        self.best_sigma = None
-        self.best_elbo = -np.inf
+        self.elbo = None
+        self.is_fit = False

-    def likelihood_lower_bound(self):
-        """
-        Compute the LLB under the current GMM parameters.
-        """
-        N = self.N
-        C = self.C
+    def likelihood_lower_bound(self, X):
+        """Compute the LLB under the current GMM parameters."""
+        N = X.shape[0]
+        P = self.parameters
+        C = self.hyperparameters["C"]
+        pi, Q, mu, sigma = P["pi"], P["Q"], P["mu"], P["sigma"]

         eps = np.finfo(float).eps
         expec1, expec2 = 0.0, 0.0
         for i in range(N):
-            x_i = self.X[i]
+            x_i = X[i]

             for c in range(C):
-                pi_k = self.pi[c]
-                z_nk = self.Q[i, c]
-                mu_k = self.mu[c, :]
-                sigma_k = self.sigma[c, :, :]
+                pi_k = pi[c]
+                z_nk = Q[i, c]
+                mu_k = mu[c, :]
+                sigma_k = sigma[c, :, :]

                 log_pi_k = np.log(pi_k + eps)
                 log_p_x_i = log_gaussian_pdf(x_i, mu_k, sigma_k)
@@ -110,49 +116,83 @@ def fit(self, X, max_iter=100, tol=1e-3, verbose=False):
             mixture components collapsed and training was halted prematurely
             (-1).
         """
-        self.X = X
-        self.N = X.shape[0]  # number of objects
-        self.d = X.shape[1]  # dimension of each object
-
-        self._initialize_params()
         prev_vlb = -np.inf
+        self._initialize_params(X)

         for _iter in range(max_iter):
             try:
-                self._E_step()
-                self._M_step()
-                vlb = self.likelihood_lower_bound()
+                self._E_step(X)
+                self._M_step(X)
+                vlb = self.likelihood_lower_bound(X)

                 if verbose:
-                    print("{}. Lower bound: {}".format(_iter + 1, vlb))
+                    print(f"{_iter + 1}. Lower bound: {vlb}")

                 converged = _iter > 0 and np.abs(vlb - prev_vlb) <= tol
                 if np.isnan(vlb) or converged:
                     break

                 prev_vlb = vlb

-                # retain best parameters across fits
-                if vlb > self.best_elbo:
-                    self.best_elbo = vlb
-                    self.best_mu = self.mu
-                    self.best_pi = self.pi
-                    self.best_sigma = self.sigma
-
             except np.linalg.LinAlgError:
                 print("Singular matrix: components collapsed")
                 return -1
+
+        self.elbo = vlb
+        self.is_fit = True
         return 0

-    def _E_step(self):
-        for i in range(self.N):
-            x_i = self.X[i, :]
+    def predict(self, X, soft_labels=True):
+        """
+        Return the log probability of each data point in `X` under each
+        mixture components.
+
+        Parameters
+        ----------
+        X : :py:class:`ndarray <numpy.ndarray>` of shape `(M, d)`
+            A collection of `M` data points, each with dimension `d`.
+        soft_labels : bool
+            If True, return the log probabilities of the M data points in X
+            under each mixture component. If False, return only the ID of the
+            most probable mixture. Default is True.
+
+        Returns
+        -------
+        y : :py:class:`ndarray <numpy.ndarray>` of shape `(M, C)` or `(M,)`
+            If `soft_labels` is True, `y` is a 2D array where index (i,j) gives
+            the log probability of the `i` th data point under the `j` th
+            mixture component. If `soft_labels` is False, `y` is a 1D array
+            where the `i` th index contains the ID of the most probable mixture
+            component.
+        """
+        assert self.is_fit, "Must call the `.fit` method before making predictions"
+
+        P = self.parameters
+        C = self.hyperparameters["C"]
+        mu, sigma = P["mu"], P["sigma"]
+
+        y = []
+        for x_i in X:
+            cprobs = [log_gaussian_pdf(x_i, mu[c, :], sigma[c, :, :]) for c in range(C)]

+            if not soft_labels:
+                y.append(np.argmax(cprobs))
+            else:
+                y.append(cprobs)
+
+        return np.array(y)
+
+    def _E_step(self, X):
+        P = self.parameters
+        C = self.hyperparameters["C"]
+        pi, Q, mu, sigma = P["pi"], P["Q"], P["mu"], P["sigma"]
+
+        for i, x_i in enumerate(X):
             denom_vals = []
-            for c in range(self.C):
-                pi_c = self.pi[c]
-                mu_c = self.mu[c, :]
-                sigma_c = self.sigma[c, :, :]
+            for c in range(C):
+                pi_c = pi[c]
+                mu_c = mu[c, :]
+                sigma_c = sigma[c, :, :]

                 log_pi_c = np.log(pi_c)
                 log_p_x_i = log_gaussian_pdf(x_i, mu_c, sigma_c)
@@ -163,63 +203,38 @@ def _E_step(self):
                 # log \sum_c exp{ log N(X_i | mu_c, Sigma_c) + log pi_c } ]
             log_denom = logsumexp(denom_vals)
             q_i = np.exp([num - log_denom for num in denom_vals])
-            assert_allclose(np.sum(q_i), 1, err_msg="{}".format(np.sum(q_i)))
+            np.testing.assert_allclose(np.sum(q_i), 1, err_msg="{}".format(np.sum(q_i)))

-            self.Q[i, :] = q_i
+            Q[i, :] = q_i

-    def _M_step(self):
-        C, N, X = self.C, self.N, self.X
-        denoms = np.sum(self.Q, axis=0)
+    def _M_step(self, X):
+        N, d = X.shape
+        P = self.parameters
+        C = self.hyperparameters["C"]
+        pi, Q, mu, sigma = P["pi"], P["Q"], P["mu"], P["sigma"]
+
+        denoms = np.sum(Q, axis=0)

         # update cluster priors
-        self.pi = denoms / N
+        pi = denoms / N

         # update cluster means
-        nums_mu = [np.dot(self.Q[:, c], X) for c in range(C)]
+        nums_mu = [np.dot(Q[:, c], X) for c in range(C)]
         for ix, (num, den) in enumerate(zip(nums_mu, denoms)):
-            self.mu[ix, :] = num / den if den > 0 else np.zeros_like(num)
+            mu[ix, :] = num / den if den > 0 else np.zeros_like(num)

         # update cluster covariances
         for c in range(C):
-            mu_c = self.mu[c, :]
+            mu_c = mu[c, :]
             n_c = denoms[c]

-            outer = np.zeros((self.d, self.d))
+            outer = np.zeros((d, d))
             for i in range(N):
-                wic = self.Q[i, c]
-                xi = self.X[i, :]
+                wic = Q[i, c]
+                xi = X[i, :]
                 outer += wic * np.outer(xi - mu_c, xi - mu_c)

             outer = outer / n_c if n_c > 0 else outer
-            self.sigma[c, :, :] = outer
-
-        assert_allclose(np.sum(self.pi), 1, err_msg="{}".format(np.sum(self.pi)))
-
-
-#######################################################################
-#                                  Utils                              #
-#######################################################################
-
-
-def log_gaussian_pdf(x_i, mu, sigma):
-    """
-    Compute log N(x_i | mu, sigma)
-    """
-    n = len(mu)
-    a = n * np.log(2 * np.pi)
-    _, b = np.linalg.slogdet(sigma)
-
-    y = np.linalg.solve(sigma, x_i - mu)
-    c = np.dot(x_i - mu, y)
-    return -0.5 * (a + b + c)
-
+            sigma[c, :, :] = outer

-def logsumexp(log_probs, axis=None):
-    """
-    Redefine scipy.special.logsumexp
-    see: http://bayesjumping.net/log-sum-exp-trick/
-    """
-    _max = np.max(log_probs)
-    ds = log_probs - _max
-    exp_sum = np.exp(ds).sum(axis=axis)
-    return _max + np.log(exp_sum)
+        np.testing.assert_allclose(np.sum(pi), 1, err_msg="{}".format(np.sum(pi)))
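
The two module-level helpers deleted at the bottom of the file are not removed from the project: the new import at the top pulls `logsumexp` and `log_gaussian_pdf` from `numpy_ml.utils.misc` instead. For reference, a small standalone sketch of the log-sum-exp normalization the E-step relies on (the numeric values are illustrative only, not from the commit):

import numpy as np

def logsumexp(log_probs, axis=None):
    """Numerically stable log(sum(exp(log_probs))) via the max-shift trick."""
    _max = np.max(log_probs)
    return _max + np.log(np.exp(log_probs - _max).sum(axis=axis))

# E-step responsibility for one data point: q_ic is proportional to
# pi_c * N(x_i | mu_c, sigma_c), computed in log space so that very small
# densities do not underflow to zero.
log_joint = np.array([-1050.0, -1047.0, -1052.0])   # log pi_c + log N(x_i | mu_c, sigma_c)
q_i = np.exp(log_joint - logsumexp(log_joint))       # normalized responsibilities
assert np.isclose(q_i.sum(), 1.0)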
