Source code for mdance.tools.bts

import os
import random
import subprocess

import MDAnalysis as mda
import numpy as np
from shapeGMMTorch import align
import torch

import mdance.tools.esim as esim
from mdance.inputs.preprocess import gen_traj_numpy


[docs]def mean_sq_dev(matrix, N_atoms): """*O(N)* Mean square deviation (MSD) calculation for *n*-ary objects. Parameters ---------- matrix : array-like of shape (n_samples, n_features) A feature array. N_atoms : int Number of atoms in the Molecular Dynamics (MD) system. ``N_atom=1`` for non-MD systems. Returns ------- float normalized MSD value. See Also -------- msd_condensed : Condensed version of MSD calculation for *n*-ary objects. extended_comparisons : *n*-ary similarity calculation for all indices/metrics. Examples -------- >>> from mdance.tools import bts >>> import numpy as np >>> X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8]]) >>> bts.mean_sq_dev(X, N_atoms=1) 32.8 """ N = len(matrix) if N == 1: return 0 sq_data = matrix ** 2 c_sum = np.sum(matrix, axis=0) sq_sum = np.sum(sq_data, axis=0) msd = np.sum(2 * (N * sq_sum - c_sum ** 2)) / (N ** 2) norm_msd = msd / N_atoms return norm_msd
[docs]def msd_condensed(c_sum, sq_sum, N, N_atoms): """Condensed version of Mean square deviation (MSD) calculation for *n*-ary objects. Parameters ---------- c_sum : array-like of shape (n_features,) A feature array of the column-wsie sum of the data. sq_sum : array-like of shape (n_features,) A feature array of the column-wise sum of the squared data. N : int Number of data points. N_atoms : int Number of atoms in the Molecular Dynamics (MD) system. ``N_atom=1`` for non-MD systems. Returns ------- float normalized MSD value. See Also -------- mean_sq_dev : Full version of MSD calculation for *n*-ary objects. extended_comparisons : *n*-ary similarity calculation for all indices/metrics. Examples -------- >>> from mdance.tools import bts >>> import numpy as np >>> c_sum = np.array([21, 22]) >>> sq_sum = np.array([137, 130]) >>> bts.msd_condensed(c_sum, sq_sum, N=5, N_atoms=1) 32.8 """ if N == 1: return 0 msd = np.sum(2 * (N * sq_sum - c_sum ** 2)) / (N ** 2) norm_msd = msd / N_atoms return norm_msd
[docs]def extended_comparison(matrix, data_type='full', metric='MSD', N=None, N_atoms=1, **kwargs): """*O(N)* Extended comparison function for *n*-ary objects. Valid values for metric are: ``MSD``: Mean Square Deviation. Extended or Instant Similarity Metrics : | ``AC``: Austin-Colwell, ``BUB``: Baroni-Urbani-Buser, | ``CTn``: Consoni-Todschini n, ``Fai``: Faith, | ``Gle``: Gleason, ``Ja``: Jaccard, | ``Ja0``: Jaccard 0-variant, ``JT``: Jaccard-Tanimoto, | ``RT``: Rogers-Tanimoto, ``RR``: Russel-Rao, | ``SM``: Sokal-Michener, ``SSn``: Sokal-Sneath n. Parameters ---------- matrix : array-like of shape (n_samples, n_features) or tuple/list of \ length 1 or 2} A feature array of shape (n_samples, n_features) if ``data_type='full'``. Otherwise, tuple or list of length 1 (c_sum) or 2 (c_sum, sq_sum) if ``data_type='condensed'``. data_type : {'full', 'condensed'}, default='full' Type of data inputted. metric : str, default='MSD' The metric to when calculating distance between *n* objects in an array. It must be an options allowed by :func:`mdance.tools.bts.extended_comparison`. N : int, optional, default=None Number of data points. N_atoms : int, default=1 Number of atoms in the Molecular Dynamics (MD) system. ``N_atom=1`` for non-MD systems. c_threshold : int, default=None Coincidence threshold for calculating extended similarity. It must be an options allowed by :func:`mdance.tools.esim.calculate_counters`. w_factor : {'fraction', 'power_n'}, default='fraction' The type of weight function for calculating extended similarity. It must be an options allowed by :func:`mdance.tools.esim.calculate_counters`. Raises ------ TypeError If data is not a numpy.ndarray or tuple/list of length 2. Returns ------- float Extended comparison value. Examples -------- >>> from mdance.tools import bts >>> import numpy as np >>> X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8]]) >>> bts.extended_comparison(X, data_type='full', metric='MSD', N_atoms=1) 32.8 """ if not N: N = len(matrix) if data_type == 'full': if not isinstance(matrix, np.ndarray): raise TypeError('data must be a numpy.ndarray') if matrix.ndim != 2: raise ValueError('Input must be numpy ndarray of shape (n_samples, n_features)') c_sum = np.sum(matrix, axis=0) if metric == 'MSD': sq_data = matrix ** 2 sq_sum = np.sum(sq_data, axis=0) elif data_type == 'condensed': if not isinstance(matrix, (tuple, list)): raise TypeError('data must be a tuple or list of length 1 or 2') c_sum = matrix[0] if metric == 'MSD': sq_sum = matrix[1] if metric == 'MSD': return msd_condensed(c_sum, sq_sum, N, N_atoms) else: if 'c_threshold' in kwargs: c_threshold = kwargs['c_threshold'] else: c_threshold = None if 'w_factor' in kwargs: w_factor = kwargs['w_factor'] else: w_factor = 'fraction' esim_dict = esim.gen_sim_dict(c_sum, n_objects=N, c_threshold=c_threshold, w_factor=w_factor) return 1 - esim_dict[metric]
[docs]def calculate_comp_sim(matrix, metric, N_atoms=1): """*O(N)* Complementary similarity calculation for *n*-ary objects. Parameters ---------- matrix : array-like of shape (n_samples, n_features) A feature array. metric : str The metric to when calculating distance between *n* objects in an array. It must be an options allowed by :func:`mdance.tools.bts.extended_comparison`. N_atoms : int, default=1 Number of atoms in the Molecular Dynamics (MD) system. ``N_atom=1`` for non-MD systems. Returns ------- numpy.ndarray Array of complementary similarities for each object. Examples -------- >>> from mdance.tools import bts >>> import numpy as np >>> X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8]]) >>> bts.calculate_comp_sim(X, metric='MSD', N_atoms=1) array([31, 34.375, 36.75, 27.75, 23.875]) """ N = len(matrix) sq_data = matrix ** 2 c_sum = np.sum(matrix, axis=0) sq_sum = np.sum(sq_data, axis=0) comp_csum = c_sum - matrix comp_sqsum = sq_sum - sq_data if metric == 'MSD': comp_msd = np.sum(2 * ((N-1) * comp_sqsum - comp_csum ** 2), axis=1) / (N-1)**2 comp_sims = comp_msd / N_atoms else: comp_sims = [] for object in matrix: object_square = object ** 2 value = extended_comparison([c_sum - object, sq_sum - object_square], data_type='condensed', metric=metric, N=N-1, N_atoms=N_atoms) comp_sims.append(value) comp_sims = np.array(comp_sims) return comp_sims
[docs]def calculate_medoid(matrix, metric, N_atoms=1): """*O(N)* medoid calculation for *n*-ary objects. Parameters ---------- matrix : array-like of shape (n_samples, n_features) A feature array. metric : str The metric to when calculating distance between *n* objects in an array. It must be an options allowed by :func:`mdance.tools.bts.extended_comparison`. N_atoms : int, default=1 Number of atoms in the Molecular Dynamics (MD) system. ``N_atom=1`` for non-MD systems. Returns ------- int The index of the medoid in the dataset. Examples -------- >>> from mdance.tools import bts >>> import numpy as np >>> X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8]]) >>> bts.calculate_medoid(X, metric='MSD', N_atoms=1) 2 """ return np.argmax(calculate_comp_sim(matrix, metric, N_atoms))
[docs]def calculate_outlier(matrix, metric, N_atoms=1): """*O(N)* Outlier calculation for *n*-ary objects. Parameters ---------- matrix : array-like of shape (n_samples, n_features) A feature array. metric : str, default='MSD' The metric to when calculating distance between *n* objects in an array. It must be an options allowed by :func:`mdance.tools.bts.extended_comparison`. N_atoms : int, default=1 Number of atoms in the Molecular Dynamics (MD) system. ``N_atom=1`` for non-MD systems. Returns ------- int The index of the outlier in the dataset. Examples -------- >>> from mdance.tools import bts >>> import numpy as np >>> X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8]]) >>> bts.calculate_outlier(X, metric='MSD', N_atoms=1) 4 """ return np.argmin(calculate_comp_sim(matrix, metric, N_atoms))
[docs]def trim_outliers(matrix, n_trimmed, metric, N_atoms, criterion='comp_sim'): """*O(N)* method of trimming a desired percentage of outliers (most dissimilar) from a data matrix through complementary similarity. Parameters ---------- matrix : array-like of shape (n_samples, n_features) A feature array. n_trimmed : float or int The desired fraction of outliers to be removed or the number of outliers to be removed. float : Fraction of outliers to be removed. int : Number of outliers to be removed. metric : str, default='MSD' The metric to when calculating distance between *n* objects in an array. It must be an options allowed by :func:`mdance.tools.bts.extended_comparison`. N_atoms : int, default=1 Number of atoms in the Molecular Dynamics (MD) system. ``N_atom=1`` for non-MD systems. criterion : {'comp_sim', 'sim_to_medoid'}, default='comp_sim' Criterion to use for data trimming. ``comp_sim`` criterion removes the most dissimilar objects based on the complement similarity. ``sim_to_medoid`` criterion removes the most dissimilar objects based on the similarity to the medoid. Returns ------- numpy.ndarray A ndarray with desired fraction of outliers removed. Examples -------- >>> from mdance.tools import bts >>> import numpy as np >>> X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]]) >>> output = bts.trim_outliers(X, n_trimmed=0.6, metric='MSD', N_atoms=1) >>> output array([[2, 3], [8, 7], [8, 8]]) """ N = len(matrix) if isinstance(n_trimmed, int): cutoff = n_trimmed elif 0 < n_trimmed < 1: cutoff = int(np.floor(N * float(n_trimmed))) if criterion == 'comp_sim': c_sum = np.sum(matrix, axis = 0) sq_sum_total = np.sum(matrix ** 2, axis=0) comp_sims = [] for i, row in enumerate(matrix): c = c_sum - row sq = sq_sum_total - row ** 2 value = extended_comparison([c, sq], data_type='condensed', metric=metric, N=N - 1, N_atoms=N_atoms) comp_sims.append((i, value)) comp_sims = np.array(comp_sims) lowest_indices = np.argsort(comp_sims[:, 1])[:cutoff] matrix = np.delete(matrix, lowest_indices, axis=0) elif criterion == 'sim_to_medoid': medoid_index = calculate_medoid(matrix, metric, N_atoms=N_atoms) medoid = matrix[medoid_index] np.delete(matrix, medoid_index, axis=0) values = [] for i, frame in enumerate(matrix): value = extended_comparison(np.array([frame, medoid]), data_type='full', metric=metric, N_atoms=N_atoms) values.append((i, value)) values = np.array(values) highest_indices = np.argsort(values[:, 1])[-cutoff:] matrix = np.delete(matrix, highest_indices, axis=0) return matrix
[docs]def diversity_selection(matrix, percentage: int, metric, N_atoms=1, method='strat', start='medoid'): """*O(N)* method of selecting the most diverse subset of a data matrix using the complementary similarity. Parameters ---------- matrix : array-like of shape (n_samples, n_features) A feature array. percentage : int If ``method='strat'``, percentage indicates how many bins of stratified data will be generated. If ``method='comp_sim'``, percentage indicates the percentage of data to be selected. metric : str, default='MSD' The metric to when calculating distance between *n* objects in an array. It must be an options allowed by :func:`mdance.tools.bts.extended_comparison`. N_atoms : int, default=1 Number of atoms in the system used for normalization. ``N_atoms=1`` for non-Molecular Dynamics datasets. method : {'strat', 'comp_sim'}, default='strat' The method to use for diversity selection. ``strat``: stratified sampling. ``comp_sim``: maximizing the MSD between the selected objects and the rest of the data. start : {'medoid', 'outlier', 'random', list}, default='medoid' The initial seed for initiating diversity selection. Either from one of the options or a list of indices are valid inputs. Raises ------ ValueError If ``start`` is not ``medoid``, ``outlier``, ``random``, or a list. ValueError If ``percentage`` is too high. Returns ------- list List of indices of the diversity selected data. Examples -------- >>> from mdance.tools import bts >>> import numpy as np >>> X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [2, 9], [1, 8], [2, 7]]) >>> bts.diversity_selection(X, percentage=30, metric='MSD', N_atoms=1) [7 4] """ n_total = len(matrix) n_max = int(np.floor(n_total * percentage / 100)) if n_max > n_total: raise ValueError('Percentage is too high for the given matrix size') if method == 'strat': if n_max <= 1: raise ValueError('Percentage is too low for the given matrix size') if n_max == 1: indices_to_select = [0] else: step = (n_total - 1) / (n_max - 1) indices_to_select = np.round(np.arange(n_max) * step).astype(int) indices_to_select[0] = 0 comp_sims = calculate_comp_sim(matrix, metric=metric, N_atoms=N_atoms) sorted_comps = np.argsort(-comp_sims, kind='stable') selected_n = sorted_comps[indices_to_select].tolist() elif method == 'comp_sim': total_indices = np.array(range(n_total)) if start =='medoid': seed = calculate_medoid(matrix, metric=metric, N_atoms=N_atoms) selected_n = [seed] elif start == 'outlier': seed = calculate_outlier(matrix, metric=metric, N_atoms=N_atoms) selected_n = [seed] elif start == 'random': seed = random.randint(0, n_total - 1) selected_n = [seed] elif isinstance(start, list): selected_n = start else: raise ValueError('Select a correct starting point: medoid, outlier, \ random or outlier') n = len(selected_n) selection = [matrix[i] for i in selected_n] selection = np.array(selection) selected_condensed = np.sum(selection, axis=0) if metric == 'MSD': sq_selection = selection ** 2 sq_selected_condensed = np.sum(sq_selection, axis=0) while len(selected_n) < n_max: select_from_n = np.delete(total_indices, selected_n) if metric == 'MSD': new_index_n = get_new_index_n(matrix, metric=metric, selected_condensed=selected_condensed, sq_selected_condensed=sq_selected_condensed, n=n, select_from_n=select_from_n, N_atoms=N_atoms) sq_selected_condensed += matrix[new_index_n] ** 2 else: new_index_n = get_new_index_n(matrix, metric=metric, selected_condensed=selected_condensed, n=n, select_from_n=select_from_n) selected_condensed += matrix[new_index_n] selected_n.append(new_index_n) n = len(selected_n) else: raise ValueError('Select a correct sampling method: strat or comp_sim') return selected_n
[docs]def get_new_index_n(matrix, metric, selected_condensed, n, select_from_n, **kwargs): """Extract the new index to add to the list of selected indices. Parameters ---------- matrix : array-like of shape (n_samples, n_features) A feature array. metric : str, default='MSD' The metric to when calculating distance between *n* objects in an array. It must be an options allowed by :func:`mdance.tools.bts.extended_comparison`. selected_condensed : array-like of shape (n_features,) Condensed sum of the selected fingerprints. n : int Number of selected objects. select_from_n : array-like of shape (n_samples,) Array of indices to select from. sq_selected_condensed : array-like of shape (n_features,), optional Condensed sum of the squared selected fingerprints. (**kwargs) N_atoms : int, optional Number of atoms in the system used for normalization. ``N_atoms=1`` for non-Molecular Dynamics datasets. (**kwargs) Returns ------- int index of the new fingerprint to add to the selected indices. """ if 'sq_selected_condensed' in kwargs: sq_selected_condensed = kwargs['sq_selected_condensed'] if 'N_atoms' in kwargs: N_atoms = kwargs['N_atoms'] # Number of fingerprints already selected and the new one to add n_total = n + 1 # Placeholders values min_value = -1 index = len(matrix) + 1 # Calculate MSD for each unselected object and select the index with the highest value. for i in select_from_n: if metric == 'MSD': sim_index = extended_comparison([selected_condensed + matrix[i], sq_selected_condensed + (matrix[i] ** 2)], data_type='condensed', metric=metric, N=n_total, N_atoms=N_atoms) else: sim_index = extended_comparison([selected_condensed + matrix[i]], data_type='condensed', metric=metric, N=n_total) if sim_index > min_value: min_value = sim_index index = i else: pass return index
[docs]def align_traj(data, N_atoms, align_method=None): """Aligns trajectory using uniform or kronecker alignment. Parameters ---------- matrix : array-like of shape (n_samples, n_features) A feature array. N_atoms : int Number of atoms in the system. align_method : {'uni', 'kron'}, default=None Alignment method. ``uni`` or ``uniform``: Uniform alignment. ``kron`` or ``kronecker``: Kronecker alignment. Raises ------ ValueError if align_method is not ``uni``, ``kron``, or ``None``. Returns ------- numpy.ndarray matrix of aligned data. References ---------- Klem, H., Hocky, G. M., and McCullagh M., `"Size-and-Shape Space Gaussian Mixture Models for Structural Clustering of Molecular Dynamics Trajectories"`_. *Journal of Chemical Theory and Computation* **2022** 18 (5), 3218-3230 .. _"Size-and-Shape Space Gaussian Mixture Models for Structural Clustering of Molecular Dynamics Trajectories": https://pubs.acs.org/doi/abs/10.1021/acs.jctc.1c01290 """ if not align_method: return data data = data.reshape(len(data), N_atoms, 3) device = torch.device('cpu') dtype = torch.float32 traj_tensor = torch.tensor(data, device=device, dtype=dtype) traj_tensor = align.remove_center_of_geometry(traj_tensor) if align_method == 'uni' or align_method == 'uniform': uniform_aligned_traj_tensor, uniform_avg_tensor, uniform_var_tensor = align.maximum_likelihood_uniform_alignment(traj_tensor, verbose=True) aligned_traj_numpy = uniform_aligned_traj_tensor.cpu().numpy() elif align_method == 'kron' or align_method == 'kronecker': kronecker_aligned_traj_tensor, kronecker_avg_tensor, kronecker_precision_tensor, kronecker_lpdet_tensor = align.maximum_likelihood_kronecker_alignment(traj_tensor, verbose=True) aligned_traj_numpy = kronecker_aligned_traj_tensor.cpu().numpy() else: raise ValueError('Please select a correct alignment method: uni, kron, or None') reshaped = aligned_traj_numpy.reshape(aligned_traj_numpy.shape[0], -1) return reshaped
[docs]def equil_align(indices, sieve, input_top, input_traj, mdana_atomsel, cpptraj_atomsel, ref_index): """Aligns the frames in the trajectory to the reference frame. Parameters ---------- indices : list List of indices of the data points in the cluster. input_top : str Path to the input topology file. input_traj : str Path to the input trajectory file. mdana_atomsel : str Atom selection string for MDAnalysis. cpptraj_atomsel : str Atom selection string for cpptraj. ref_index : int Index of the reference frame. Returns ------- aligned_traj_numpy : numpy.ndarray Numpy array of the aligned trajectory. """ u = mda.Universe(input_top, input_traj) with mda.Writer(f'unaligned_traj.pdb', u.atoms.n_atoms) as W: for ts in u.trajectory[[i * sieve for i in indices]]: W.write(u.atoms) with open('cpptraj.in', 'w') as outfile: outfile.write(f'parm {input_top}\n') outfile.write(f'trajin unaligned_traj.pdb\n') outfile.write('autoimage\n') outfile.write(f'reference {input_traj} frame {ref_index}\n') outfile.write(f'rms ToAvg reference {cpptraj_atomsel}\n') outfile.write('trajout aligned_traj.pdb nobox\n') outfile.write('run\n') subprocess.run(['cpptraj', '-i', 'cpptraj.in'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) # Read aligned trajectory aligned_traj_numpy = gen_traj_numpy(input_top, 'aligned_traj.pdb', atomSel=mdana_atomsel) # Clean up os.remove('cpptraj.in') os.remove('unaligned_traj.pdb') os.remove('aligned_traj.pdb') return aligned_traj_numpy
[docs]def rep_sample(data, metric='MSD', N_atoms=1, n_bins=10, n_samples=100, hard_cap=True): """Representative sampling according to comp_sim values. Divides the range of comp_sim values in nbins and then uniformly selects n_samples molecules, consecutively taking one from each bin Parameters ---------- data : array-like of shape (n_samples, n_features) The data to be sampled. metric : str, default='MSD' The metric to be used for the comparison. N_atoms : int, default=1 Number of atoms in the Molecular Dynamics (MD) system. ``N_atom=1`` for non-MD systems. n_bins : int, default=10 Number of bins to divide the comp_sim values. n_samples : int, default=100 Number of samples to be selected. hard_cap : bool, default=True If True, the number of samples will be exactly n_samples. If False, the number of samples may not be exactly n_samples. Returns ------- sampled_mols : list List of indices of the sampled objects in the original data """ n = len(data) if n_samples < 1: n_samples = int(n * n_samples) cs = calculate_comp_sim(data, metric=metric, N_atoms=N_atoms) tups = [] for i, comp in enumerate(cs): tups.append((i, comp)) comp_sims = np.sort(cs) mi = np.min(comp_sims) ma = np.max(comp_sims) D = ma - mi step = D / n_bins bins = [] indices = np.array(list(range(n))) for i in range(n_bins - 1): low = mi + i * step up = mi + (i + 1) * step bins.append(indices[(comp_sims >= low) * (comp_sims < up)]) bins.append(indices[(comp_sims >= up) * (comp_sims <= ma)]) order_sampled = [] i = 0 while len(order_sampled) < n_samples: for b in bins: if len(b) > i: order_sampled.append(b[i]) if hard_cap: if len(order_sampled) >= n_samples: break else: pass i += 1 tups.sort(key = lambda tups : tups[1]) sampled_mols = [] for i in order_sampled: sampled_mols.append(tups[i][0]) return sampled_mols
[docs]def refine_dis_matrix(matrix): """Refine a distance matrix by setting the diagonal to zero and symmetrizing the matrix. Parameters ---------- matrix : array-like of shape (n_samples, n_features) A feature array. Returns ------- numpy.ndarray A refined 2D matrix. """ distances = np.array(matrix) if distances.ndim != 2: raise ValueError('Matrix must be 2D') if distances.shape[0] != distances.shape[1]: raise ValueError('Matrix must be square') distances += distances.T distances *= 0.5 distances -= np.min(distances) np.fill_diagonal(distances, 0) return distances
[docs]def quota_sampling(data, metric, percentage=10, n_bins=10, hard_cap=True, N_atoms=1, comp_sim=None): """Quota sampling according to complementary similarity values. Divides the range of comp_sim values in n_bins and then uniformly selects nsample frames, consecutively taking one from each bin. Parameters ---------- data : array-like of shape (n_samples, n_features) A feature array. metric : str The metric to when calculating distance between *n* objects in an array. percentage : int, default=10 Percentage of objects to sample. n_bins : int, default=10 Number of bins to divide the comp_sim range into. hard_cap : bool, default=True Whether to strictly enforce the number of samples. N_atoms : int, default=1 Number of atoms in the MD system. comp_sim : array-like, optional Pre-computed complementary similarity values. Returns ------- numpy.ndarray Indices of the sampled objects. """ if comp_sim is None: comp_sim = calculate_comp_sim(data, metric=metric, N_atoms=N_atoms) n_objects = len(data) else: n_objects = len(comp_sim) n_sample = int(n_objects * percentage / 100) if n_sample < 1 or n_sample < n_bins: raise ValueError("The number of objects to sample is too low for the number of bins. " "Please specify a higher percentage, or a lower number of bins") min_val = np.min(comp_sim) max_val = np.max(comp_sim) D = max_val - min_val step = D / n_bins bins = [] indices = np.array(range(n_objects)) for i in range(n_bins - 1): low = min_val + i * step up = min_val + (i + 1) * step ind = indices[(comp_sim >= low) * (comp_sim < up)] bin_comp_sim = comp_sim[ind] bins.append(ind[np.argsort(bin_comp_sim)]) low = min_val + (n_bins - 1) * step ind = indices[(comp_sim >= low) * (comp_sim <= max_val)] bin_comp_sim = comp_sim[ind] bins.append(ind[np.argsort(bin_comp_sim)]) # Sample the objects from each bin order_sampled = [] i = 0 while len(order_sampled) < n_sample: for b in bins: if len(b) > i: order_sampled.append(b[i]) if hard_cap and len(order_sampled) >= n_sample: break i += 1 return np.array(order_sampled[:n_sample])