import numpy as np
from sklearn.cluster import KMeans, kmeans_plusplus
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score
from mdance.tools.bts import diversity_selection, calculate_comp_sim, quota_sampling
class KmeansNANI:
    """*k*-means NANI clustering algorithm (*N*-Ary Natural Initialization).

    Valid values for ``init_type`` (*k* is the number of clusters):

    | ``strat_all``: A number of bins are computed based on the specified
      percentage of the data. Stratified sampling is then applied, and the
      first *k* points from the stratified data are selected as the initial
      centers.
    | ``strat_reduced``: Identifies high-density regions using complementary
      similarity, selecting the specified percentage of points. Applies
      stratified sampling to this subset and selects the first *k* points as
      initial centers.
    | ``comp_sim``: Identifies high-density regions using complementary
      similarity, selecting the specified percentage of the data. From this
      subset, diversity selection (with ``comp_sim`` as the sampling method)
      is used to choose the first *k* points as the initial centers.
    | ``div_select``: Applies diversity selection (using ``comp_sim`` as the
      sampling method) on the specified percentage of points. The first *k*
      points are the initial centers.
    | ``quota``: Uses quota sampling to select initial centers based on
      complementary similarity values divided into bins.
    | ``k-means++``: Selects the initial centers with the greedy *k*-means++
      algorithm (delegated to scikit-learn's ``KMeans``).
    | ``random``: Selects the initial centers randomly (delegated to
      scikit-learn's ``KMeans``).
    | ``vanilla_kmeans++``: Selects the initial centers with the vanilla
      *k*-means++ algorithm (``n_local_trials=1``).

    Parameters
    ----------
    data : numpy.ndarray of shape (n_samples, n_features)
        A feature array. It is indexed with integer index arrays, so it must
        support NumPy fancy indexing.
    n_clusters : int
        Number of clusters.
    metric : str
        The metric to use when calculating distance between *n* objects in an
        array. It must be an option allowed by
        :func:`mdance.tools.bts.extended_comparison`.
    N_atoms : int
        Number of atoms in the Molecular Dynamics (MD) system. ``N_atoms=1``
        for non-MD systems.
    init_type : str, default='strat_all'
        Type of initiator selection for initiating *k*-means. It must be one
        of the options listed above.
    **kwargs
        ``percentage`` : int, default=10. Percentage of the dataset used for
        the selection of the initial centers. Only read for the ``comp_sim``,
        ``div_select``, ``strat_reduced``, ``strat_all`` and ``quota``
        initiator types.

    Attributes
    ----------
    data, n_clusters, metric, N_atoms, init_type
        The constructor arguments, stored unchanged.
    percentage : int
        Only set for the initiator types that use it (see ``**kwargs``).

    Notes
    -----
    Labels, centers and the number of iterations are *returned* by
    :meth:`kmeans_clustering` / :meth:`execute_kmeans_all`; they are not
    stored on the instance.
    """

    # Initiator strategies that read the ``percentage`` keyword argument.
    _PERCENTAGE_INIT_TYPES = ('comp_sim', 'div_select', 'strat_reduced',
                              'strat_all', 'quota')
    # Every accepted ``init_type``; the order is reused in the error message.
    _VALID_INIT_TYPES = ('comp_sim', 'div_select', 'k-means++', 'random',
                         'vanilla_kmeans++', 'strat_all', 'strat_reduced',
                         'quota')

    def __init__(self, data, n_clusters, metric, N_atoms, init_type='strat_all',
                 **kwargs):
        self.data = data
        self.n_clusters = n_clusters
        self.metric = metric
        self.N_atoms = N_atoms
        self.init_type = init_type
        self._check_init_type()
        # ``percentage`` is meaningless for the scikit-learn based initiators,
        # so it is only stored (and validated) when it will actually be used.
        if self.init_type in self._PERCENTAGE_INIT_TYPES:
            self.percentage = kwargs.get('percentage', 10)
            self._check_percentage()

    def _check_init_type(self):
        """Checks the ``init_type`` attribute.

        Raises
        ------
        ValueError
            If ``init_type`` is not one of the following: ``comp_sim``,
            ``div_select``, ``k-means++``, ``random``, ``vanilla_kmeans++``,
            ``strat_all``, ``strat_reduced``, ``quota``.
        """
        if self.init_type not in self._VALID_INIT_TYPES:
            # Built with join rather than a backslash-continued literal, which
            # would embed the source indentation in the message.
            raise ValueError(
                'init_type must be one of the following: '
                f'{", ".join(self._VALID_INIT_TYPES)}.'
            )

    def _check_percentage(self):
        """Checks the ``percentage`` attribute.

        Raises
        ------
        TypeError
            If percentage is not an integer (Python ``int`` or NumPy integer).
        ValueError
            If percentage is not between 0 and 100.
        """
        if not isinstance(self.percentage, (int, np.integer)):
            raise TypeError('percentage must be an integer [0, 100].')
        if not 0 <= self.percentage <= 100:
            raise ValueError('percentage must be an integer [0, 100].')

    def initiate_kmeans(self, **kwargs):
        """Selects the initial centers for *k*-means according to ``init_type``.

        Parameters
        ----------
        **kwargs
            ``n_bins`` : int, default=10. Number of bins, only used when
            ``init_type='quota'``.

        Raises
        ------
        ValueError
            If the number of initiators is less than the number of clusters,
            or if ``init_type`` is ``k-means++`` or ``random`` (those are
            handled by scikit-learn inside :meth:`kmeans_clustering` and have
            no explicit initiators).

        Returns
        -------
        numpy.ndarray
            The initial centers for *k*-means of shape (n_clusters, n_features).
        """
        too_few_msg = ('The number of initiators is less than the number of '
                       'clusters. Try increasing the percentage.')
        if self.init_type in ['strat_reduced', 'comp_sim']:
            n_total = len(self.data)
            n_max = int(n_total * self.percentage / 100)
            # The subset can never yield more than ``n_max`` initiators, so
            # fail before the expensive similarity calculation.
            if n_max < self.n_clusters:
                raise ValueError(too_few_msg)
            comp_sim = calculate_comp_sim(self.data, self.metric, self.N_atoms)
            sorted_indices = np.argsort(comp_sim)
            # Explicit start index: ``[-n_max:]`` would return the *whole*
            # array when ``n_max`` is 0.
            top_comp_sim_indices = sorted_indices[n_total - n_max:]
            top_cc_data = self.data[top_comp_sim_indices]
            if self.init_type == 'strat_reduced':
                sampling_method = 'strat'
            else:
                sampling_method = 'comp_sim'
            # The subset is already reduced, so 100% of it is passed on.
            initiator_idxs = diversity_selection(top_cc_data, 100, self.metric,
                                                 self.N_atoms, sampling_method,
                                                 'medoid')
            initiators = top_cc_data[initiator_idxs]
        elif self.init_type == 'strat_all':
            initiator_idxs = diversity_selection(self.data, self.percentage,
                                                 self.metric, self.N_atoms,
                                                 'strat', 'medoid')
            initiators = self.data[initiator_idxs]
        elif self.init_type == 'div_select':
            initiator_idxs = diversity_selection(self.data, self.percentage,
                                                 self.metric, self.N_atoms,
                                                 'comp_sim', 'medoid')
            initiators = self.data[initiator_idxs]
        elif self.init_type == 'quota':
            n_bins = kwargs.get('n_bins', 10)
            initiator_idxs = quota_sampling(self.data, self.metric,
                                            percentage=self.percentage,
                                            n_bins=n_bins,
                                            N_atoms=self.N_atoms)
            initiators = self.data[initiator_idxs]
        elif self.init_type == 'vanilla_kmeans++':
            # n_local_trials=1 disables the greedy variant.
            initiators, _ = kmeans_plusplus(self.data, self.n_clusters,
                                            random_state=None,
                                            n_local_trials=1)
        else:
            # 'k-means++' and 'random' would otherwise fall through with
            # ``initiators`` unbound and fail with an UnboundLocalError.
            raise ValueError(
                f"init_type '{self.init_type}' has no explicit initiators; "
                'pass it directly to kmeans_clustering instead.'
            )
        if len(initiators) < self.n_clusters:
            raise ValueError(too_few_msg)
        return initiators[:self.n_clusters]

    def kmeans_clustering(self, initiators):
        """Executes the *k*-means algorithm with the selected initiators.

        Parameters
        ----------
        initiators : {numpy.ndarray, 'k-means++', 'random'}
            Method for selecting initial centers. Ignored (replaced by
            ``init_type``) when ``init_type`` is ``k-means++`` or ``random``.
            ``k-means++`` selects initial centers in a smart way to speed up
            convergence.
            ``random`` selects initial centers randomly.
            numpy.ndarray selects initial centers based on the input array.

        Returns
        -------
        tuple
            Labels, centers and number of iterations.
        """
        if self.init_type in ['k-means++', 'random']:
            initiators = self.init_type
        # A single run: the initial centers are supplied, not re-drawn.
        kmeans = KMeans(self.n_clusters, init=initiators, n_init=1,
                        random_state=None)
        kmeans.fit(self.data)
        return kmeans.labels_, kmeans.cluster_centers_, kmeans.n_iter_

    def create_cluster_dict(self, labels):
        """Creates a dictionary with the labels as keys and the indices of the
        data as values.

        Parameters
        ----------
        labels : array-like of shape (n_samples,)
            Cluster labels.

        Returns
        -------
        dict
            Dictionary with the labels ``0 .. n_clusters - 1`` as keys and the
            indices of the data carrying that label as values. Labels that do
            not occur map to an empty index array.
        """
        return {label: np.where(labels == label)[0]
                for label in range(self.n_clusters)}

    def compute_scores(self, labels):
        """Computes the Calinski-Harabasz and Davies-Bouldin scores.

        Parameters
        ----------
        labels : array-like of shape (n_samples,)
            Cluster labels.

        Returns
        -------
        tuple
            Calinski-Harabasz and Davies-Bouldin scores (in that order).
        """
        ch_score = calinski_harabasz_score(self.data, labels)
        db_score = davies_bouldin_score(self.data, labels)
        return ch_score, db_score

    def write_centroids(self, centers, n_iter, filename='centroids.txt'):
        """Writes the centroids to a comma-delimited text file.

        Parameters
        ----------
        centers : array-like of shape (n_clusters, n_features)
            Centroids of the clusters.
        n_iter : int
            Number of iterations until convergence.
        filename : str, default='centroids.txt'
            Path of the output file.
        """
        header = f'Number of clusters: {self.n_clusters}, Number of iterations: {n_iter}\n\nCentroids\n'
        np.savetxt(filename, centers, delimiter=',', header=header)

    def execute_kmeans_all(self, **kwargs):
        """Function to complete all steps of NANI for all different
        ``init_type`` options.

        Parameters
        ----------
        **kwargs
            Forwarded to :meth:`initiate_kmeans` (e.g. ``n_bins`` for
            ``init_type='quota'``). Unused for ``k-means++`` and ``random``.

        Returns
        -------
        tuple
            Labels, centers and number of iterations.
        """
        if self.init_type in ('k-means++', 'random'):
            # scikit-learn picks the centers itself; no initiators needed.
            return self.kmeans_clustering(initiators=self.init_type)
        initiators = self.initiate_kmeans(**kwargs)
        return self.kmeans_clustering(initiators)
def compute_scores(data, labels):
    """Scores a clustering with two internal validation indices.

    Parameters
    ----------
    data : array-like
        Feature array that was clustered; passed unchanged to the
        scikit-learn scoring functions.
    labels : array-like of shape (n_samples,)
        Cluster labels, one per row of ``data``.

    Returns
    -------
    tuple
        Calinski-Harabasz and Davies-Bouldin scores (in that order).
    """
    calinski_harabasz = calinski_harabasz_score(data, labels)
    davies_bouldin = davies_bouldin_score(data, labels)
    return calinski_harabasz, davies_bouldin