Source code for mdance.cluster.nani

import numpy as np
from sklearn.cluster import KMeans, kmeans_plusplus
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score

from mdance.tools.bts import diversity_selection, calculate_comp_sim, quota_sampling


class KmeansNANI:
    """*k*-means NANI clustering algorithm (*N*-Ary Natural Initialization).
    
    Valid Values for ``init_type``: (*k* means number of clusters)
    | ``strat_all``: A number of bins are computed based on specified percentage of the data. Stratified sampling is then applied, and the first *k* points from the stratified data are selected as the initial centers.
    | ``strat_reduced``: Identifies high-density regions using complementary similarity, selecting a specified percentage of points. Applies stratified sampling to this subset using a number of bins based on the subset size, and selects the first *k* points as initial centers.
    | ``comp_sim``: Identifies high-density regions using complementary similarity, selecting a percentage% of the data. From this subset, diversity selection (with ``comp_sim`` as the sampling method) is used to choose the first *k* points as the initial centers.
    | ``div_select``: Applies diversity selection (using ``comp_sim`` as the sampling method) on specified percentage% of points. First *k* points are the initial centers.
    | ``quota``: Uses quota sampling to select initial centers based on complementary similarity values divided into bins.
    | ``k-means++`` selects the initial centers based on the greedy *k*-means++ algorithm.
    | ``random`` selects the initial centers randomly.
    | ``vanilla_kmeans++`` selects the initial centers based on the vanilla *k*-means++ algorithm.
    
    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        A feature array.
    n_clusters : int
        Number of clusters.
    metric : str
        The metric to use when calculating distance between *n* objects in 
        an array. It must be one of the options allowed by 
        :func:`mdance.tools.bts.extended_comparison`.
    N_atoms : int
        Number of atoms in the Molecular Dynamics (MD) system. ``N_atoms=1`` 
        for non-MD systems.
    init_type : str, default='strat_all'
        Type of initiator selection for initiating *k*-means. It must be one 
        of the options listed above.
    percentage : int, default=10
        Percentage of the dataset to be used for the initial selection of the 
        initial centers. Passed through ``**kwargs`` and only read when 
        ``init_type`` is one of ``comp_sim``, ``div_select``, 
        ``strat_reduced``, ``strat_all`` or ``quota``.
    
    Attributes
    ----------
    data : array-like of shape (n_samples, n_features)
        The feature array given at construction.
    n_clusters : int
        Number of clusters.
    metric : str
        Metric name given at construction.
    N_atoms : int
        Number of atoms given at construction.
    init_type : str
        Validated initiator selection type.
    percentage : int
        Validated percentage; only set for the ``init_type`` values that 
        use it (see above).
    
    Notes
    -----
    Labels, centers and the number of iterations are returned by 
    :meth:`kmeans_clustering` / :meth:`execute_kmeans_all` and the cluster 
    dictionary by :meth:`create_cluster_dict`; they are not stored on the 
    instance.
    """
    # init_type values whose initiator selection depends on ``percentage``.
    _PERCENTAGE_INIT_TYPES = ('comp_sim', 'div_select', 'strat_reduced', 
                              'strat_all', 'quota')
    # Every accepted init_type, in the order used by the error message.
    _VALID_INIT_TYPES = ('comp_sim', 'div_select', 'k-means++', 'random', 
                         'vanilla_kmeans++', 'strat_all', 'strat_reduced', 
                         'quota')
    # init_type values handled natively by scikit-learn's ``KMeans(init=...)``.
    _SKLEARN_INIT_TYPES = ('k-means++', 'random')
    _TOO_FEW_INITIATORS = ('The number of initiators is less than the number '
                           'of clusters. Try increasing the percentage.')

    def __init__(self, data, n_clusters, metric, N_atoms, init_type='strat_all', 
                 **kwargs):
        self.data = data
        self.n_clusters = n_clusters
        self.metric = metric
        self.N_atoms = N_atoms
        self.init_type = init_type
        self._check_init_type()
        if self.init_type in self._PERCENTAGE_INIT_TYPES:
            self.percentage = kwargs.get('percentage', 10)
            self._check_percentage()
            # Normalise NumPy integer scalars to a builtin int once validated.
            self.percentage = int(self.percentage)
    
    
    def _check_init_type(self):
        """Checks the ``init_type`` attribute.

        Raises
        ------
        ValueError
            If ``init_type`` is not one of the following: ``comp_sim``, 
            ``div_select``, ``k-means++``, ``random``, ``vanilla_kmeans++``, 
            ``strat_all``, ``strat_reduced``, ``quota``.
        """
        if self.init_type not in self._VALID_INIT_TYPES:
            # Built with join so the message carries no stray indentation 
            # whitespace from source-line continuations.
            raise ValueError('init_type must be one of the following: ' 
                             + ', '.join(self._VALID_INIT_TYPES) + '.')
    
    
    def _check_percentage(self):
        """Checks the ``percentage`` attribute.
        
        Raises
        ------
        TypeError
            If percentage is not an integer (builtin ``int`` or NumPy 
            integer scalar).
        ValueError
            If percentage is not between 0 and 100.
        """
        if not isinstance(self.percentage, (int, np.integer)):
            raise TypeError('percentage must be an integer [0, 100].')
        if not 0 <= self.percentage <= 100:
            raise ValueError('percentage must be an integer [0, 100].')
    
    
    def initiate_kmeans(self, **kwargs):
        """Initializes the *k*-means algorithm with the selected initiators.
        
        Parameters
        ----------
        **kwargs
            ``n_bins`` (int, default=10) is read when ``init_type`` is 
            ``quota`` and forwarded to 
            :func:`mdance.tools.bts.quota_sampling`. Other keys are ignored.
        
        Raises
        ------
        ValueError
            If the number of initiators is less than the number of clusters, 
            or if ``init_type`` is ``k-means++`` or ``random`` (those are 
            initialised inside :meth:`kmeans_clustering` and have no 
            explicit initiators).
        
        Returns
        -------
        numpy.ndarray
            The initial centers for *k*-means of shape (n_clusters, n_features).
        """
        if self.init_type in ('strat_reduced', 'comp_sim'):
            n_total = len(self.data)
            n_max = int(n_total * self.percentage / 100)
            # A subset of n_max points cannot supply n_clusters initiators. 
            # Checking first also avoids ``[-0:]``, which would silently 
            # select the whole dataset instead of an empty subset.
            if n_max < 1 or n_max < self.n_clusters:
                raise ValueError(self._TOO_FEW_INITIATORS)
            comp_sim = calculate_comp_sim(self.data, self.metric, self.N_atoms)
            sorted_indices = np.argsort(comp_sim)
            # Keep the n_max points with the largest comp_sim values.
            top_comp_sim_indices = sorted_indices[-n_max:]
            top_cc_data = self.data[top_comp_sim_indices]

            if self.init_type == 'strat_reduced':
                sampling_method = 'strat'
            else:
                sampling_method = 'comp_sim'
            # 100: select from the entire reduced subset.
            initiator_idxs = diversity_selection(top_cc_data, 100, self.metric, 
                                                 self.N_atoms, sampling_method, 
                                                 'medoid')
            initiators = top_cc_data[initiator_idxs]
            
        elif self.init_type == 'strat_all':
            initiator_idxs = diversity_selection(self.data, self.percentage, self.metric, 
                                                 self.N_atoms, 'strat', 'medoid')
            initiators = self.data[initiator_idxs]
        
        elif self.init_type == 'div_select':
            initiator_idxs = diversity_selection(self.data, self.percentage, self.metric, 
                                                 self.N_atoms, 'comp_sim', 'medoid')
            initiators = self.data[initiator_idxs]

        elif self.init_type == 'quota':
            n_bins = kwargs.get('n_bins', 10)
            initiator_idxs = quota_sampling(self.data, self.metric, 
                                            percentage=self.percentage,
                                            n_bins=n_bins,
                                            N_atoms=self.N_atoms)
            initiators = self.data[initiator_idxs]
        
        elif self.init_type == 'vanilla_kmeans++':
            # n_local_trials=1 gives the non-greedy ("vanilla") variant; the 
            # returned index array is not needed here.
            initiators, _ = kmeans_plusplus(self.data, self.n_clusters, 
                                            random_state=None, 
                                            n_local_trials=1)
        
        else:
            # 'k-means++' / 'random' (or an init_type mutated after 
            # construction): nothing to precompute.
            raise ValueError(f"init_type '{self.init_type}' has no explicit "
                             "initiators; use kmeans_clustering or "
                             "execute_kmeans_all instead.")
        
        if len(initiators) < self.n_clusters:
            raise ValueError(self._TOO_FEW_INITIATORS)
        
        return initiators[:self.n_clusters]
    
    
    def kmeans_clustering(self, initiators):
        """Executes the *k*-means algorithm with the selected initiators.

        Parameters
        ----------
        initiators : {numpy.ndarray, 'k-means++', 'random'}
            Method for selecting initial centers.
            ``k-means++`` selects initial centers in a smart way to speed up convergence.
            ``random`` selects initial centers randomly.
            numpy.ndarray selects initial centers based on the input array.
            Ignored (replaced by ``init_type``) when ``init_type`` is 
            ``k-means++`` or ``random``.

        Returns
        -------
        tuple
            Labels, centers and number of iterations.
        """
        if self.init_type in self._SKLEARN_INIT_TYPES:
            initiators = self.init_type
        # A single run (n_init=1) from the chosen initialisation.
        kmeans = KMeans(self.n_clusters, init=initiators, n_init=1, 
                        random_state=None)
        kmeans.fit(self.data)
        return kmeans.labels_, kmeans.cluster_centers_, kmeans.n_iter_


    def create_cluster_dict(self, labels):
        """Creates a dictionary with the labels as keys and the indices of the 
        data as values.
        
        Parameters
        ----------
        labels : array-like of shape (n_samples,)
            Cluster labels.
        
        Returns
        -------
        dict
            Dictionary with the labels ``0 .. n_clusters - 1`` as keys and 
            the indices of the data as values. A label with no members maps 
            to an empty index array.
        """
        # Convert first: comparing a plain list to an int yields a single 
        # bool rather than an element-wise mask.
        labels = np.asarray(labels)
        return {i: np.where(labels == i)[0] for i in range(self.n_clusters)}
    
    
    def compute_scores(self, labels):
        """Computes the Calinski-Harabasz and Davies-Bouldin scores.
        
        Parameters
        ----------
        labels : array-like of shape (n_samples,)
            Cluster labels.
        
        Returns
        -------
        tuple
            Calinski-Harabasz and Davies-Bouldin scores (in that order).
        """
        ch_score = calinski_harabasz_score(self.data, labels)
        db_score = davies_bouldin_score(self.data, labels)
        return ch_score, db_score


    def write_centroids(self, centers, n_iter, filename='centroids.txt'):
        """Writes the centroids to a file.

        Parameters
        ----------
        centers : array-like of shape (n_clusters, n_features)
            Centroids of the clusters.
        n_iter : int
            Number of iterations until convergence.
        filename : str, default='centroids.txt'
            Path of the comma-delimited output file.
        """
        header = f'Number of clusters: {self.n_clusters}, Number of iterations: {n_iter}\n\nCentroids\n'
        np.savetxt(filename, centers, delimiter=',', header=header)
    
    
    def execute_kmeans_all(self, **kwargs):
        """Function to complete all steps of NANI for all different ``init_type`` options.

        Parameters
        ----------
        **kwargs
            Forwarded to :meth:`initiate_kmeans` (e.g. ``n_bins`` for the 
            ``quota`` init type). Unused for ``k-means++`` and ``random``.

        Returns
        -------
        tuple
            Labels, centers and number of iterations.
        """
        if self.init_type in self._SKLEARN_INIT_TYPES:
            return self.kmeans_clustering(initiators=self.init_type)
        initiators = self.initiate_kmeans(**kwargs)
        return self.kmeans_clustering(initiators)


def compute_scores(data, labels):
    """Evaluates a clustering with two internal validation indices.
    
    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Feature array that was clustered.
    labels : array-like of shape (n_samples,)
        Cluster labels, one per row of ``data``.
    
    Returns
    -------
    tuple
        Calinski-Harabasz and Davies-Bouldin scores (in that order).
    """
    # Calinski-Harabasz is evaluated first, then Davies-Bouldin, matching 
    # the order of the returned tuple.
    return (
        calinski_harabasz_score(data, labels),
        davies_bouldin_score(data, labels),
    )