Source code for pymlfs.kmeans

import logging
import torch
from tqdm import tqdm


[docs]class Kmeans:
    """Implements a simplified version of the K-means algorithm."""

    def __init__(self, k: int, max_iters: int) -> None:
        """Constructor function.

        Init parameters.

        Parameters
        ----------
        k : int
            Number of clusters.
        max_iters : int
            Maximum number of iterations.
        """
        self.k = k
        self.max_iters = max_iters

        # logger
        self._logger_config()

[docs]    def fit(self, X: torch.Tensor) -> None:
        """
        This method is used to train the K-means algorithm.

        Parameters
        ----------
        X : torch.Tensor
            The training data (m x n) with m being the number of examples
            and n the number of features.
        """

        self.m, self.n = X.shape
        self.centroids = torch.rand(self.k, self.n)

        self.logger.info("Training K-means started")

        for iter in tqdm(range(self.max_iters + 1)):
            # Performing the Expectation step
            data_clusters = self._e_step(X)

            # Keeps in memory the last version of the centroids
            old_centroids = self.centroids.clone()

            # Performing the Maximization step
            self._m_step(X, data_clusters)

            # Convergence test
            if self._stopping_criterion(iter, old_centroids):
                break

        self.logger.info("Training K-means ended")

[docs]    def predict(self, X: torch.Tensor) -> torch.Tensor:
        """Performs cluster prediction

        This method is used to predict the cluster for a batch of examples.

        Parameters
        ----------
        X : torch.Tensor
            The test data (m x n) with m being the number of examples
            and n the number of features.

        Returns
        -------
        torch.Tensor
            returns pcluster predictions in a torch tensor.
        """
        return self._e_step(X)

[docs]    def _e_step(self, X: torch.Tensor) -> torch.Tensor:
        """Performs the expectation step.

        This method performs the expectation step of the EM algorithm.
        In other words, it assigns a cluster to every training point
        given the cluster's centroids.

        Parameters
        ----------
        X : torch.Tensor
            The training data (m x n) with m being the number
            of examples and n the number of features.

        Returns
        -------
            data_clusters : torch.Tensor
                Clusters of the input examples.
        """
        # Compute the distance between training points
        # and the current centroids
        distances = (X - self.centroids.reshape(self.k, 1, self.n)) ** 2
        distances = distances.sum(axis=2)

        # Assign the point to the closest centroid
        data_clusters = torch.argmin(distances, dim=0)

        return data_clusters

[docs]    def _m_step(self, X: torch.Tensor, data_clusters: torch.Tensor) -> None:
        """Performs the maximization step.

        This method performs the maximization step of the EM algorithm.
        In other words, given the clusters for each training point,
        it estimates the new centroids.

        Parameters
        ----------
        X : torch.Tensor
            The training data (m x n) with m being the number of examples
            and n the number of features.
        data_clusters : torch.Tensor
            Clusters of the input examples.
        """
        # The new centroid is the average over the examples assigned to it.
        for k in range(self.k):
            nb_samples = sum(data_clusters == k)
            if nb_samples != 0:
                self.centroids[k] = (1 / nb_samples) * X[data_clusters == k, :].sum(
                    axis=0
                )  # k x n

[docs]    def _stopping_criterion(self, iter: int, old_centroids: torch.Tensor) -> bool:
        """Stopping criterion definition.

        This method defines the stopping criterion for the K-means algorithm.

        Parameters
        ----------
        iter : int
            The curent iteration.
        old_centroids : torch.Tensor
            The previous version of clusters centroids.

        Returns
        -------
        bool
            returns True if the conditions of the stopping criterion are
            satisfied.
        """
        # Either the maxmimum number of iterations is reached
        # or the centroids do not move.
        return iter == self.max_iters or torch.equal(self.centroids, old_centroids)

[docs]    def _logger_config(self) -> None:
        """
        Logger configuration
        """
        self.logger = logging.getLogger("Kmeans")
        self.logger.setLevel(logging.INFO)
        formatter = logging.Formatter(
            "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
        )
        stream_handler = logging.StreamHandler()
        stream_handler.setLevel(logging.INFO)
        stream_handler.setFormatter(formatter)
        self.logger.addHandler(stream_handler)