Source code for mdance.prime.sim_calc

import glob
import json
import re

import numpy as np

from mdance.tools.esim import SimilarityIndex, calc_medoid, calc_outlier


[docs]class FrameSimilarity: """A class to calculate the similarity between clusters. Parameters ---------- cluster_folder : str The path to the folder containing the normalized cluster files. summary_file : str The path to the summary file containing the number of frames for each cluster. trim_frac : float The fraction of outliers to trim from the top cluster. n_clusters : int The number of clusters to analyze. weighted_by_frames : bool Whether to weight the similarity values by the number of frames in the cluster. n_ary : {'RR', 'SM'} The n_ary similarity metric to use. weight : {'nw', 'w', 'fraction'} The weight to use for the similarity metric. Returns ------- dict A dictionary containing the average similarity between each pair of clusters. References ---------- Chen, L., Mondal, A., Perez, A. & Miranda-Quintana, R.A. `"Protein Retrieval via Integrative Molecular Ensembles (PRIME) through Extended Similarity Indices."`_. *Journal of Chemical Theory and Computation* **2024** 20 (14), 6303-6315 Examples -------- >>> from mdance.prime.sim_calc import FrameSimilarity >>> sim = FrameSimilarity(cluster_folder="path/to/cluster_folder", summary_file="path/to/summary_file", ... trim_frac=0.1, n_clusters=10, weighted_by_frames=True, n_ary='RR', weight='nw') >>> sim.calculate_pairwise() >>> sim.calculate_union() >>> sim.calculate_medoid() >>> sim.calculate_outlier() .. _"Protein Retrieval via Integrative Molecular Ensembles (PRIME) through Extended Similarity Indices.": https://pubs.acs.org/doi/abs/10.1021/acs.jctc.4c00362 """ def __init__(self, cluster_folder=None, summary_file=None, trim_frac=None, n_clusters=None, weighted_by_frames=True, n_ary='RR', weight='nw'): self.c0 = np.load(f"{cluster_folder}/normed_clusttraj.c0.npy") if trim_frac: self.c0 = _trim_outliers(self.c0, trim_frac=trim_frac, n_ary=n_ary, weight=weight) self.input_files = sorted(glob.glob(f"{cluster_folder}/normed_clusttraj.c*"), key=lambda x: int(re.findall("\d+", x)[0]))[1:] self.summary_file = summary_file self.n_clusters = n_clusters self.weighted_by_frames = weighted_by_frames self.n_ary = n_ary self.weight = weight self.sims = {}
[docs] def calculate_pairwise(self): """Calculates pairwise similarity between each cluster and all other clusters. The similarity score is calculated as the average of pairwise similarity values between each frame in the cluster and the top c0 cluster. Returns ------- dict A dictionary containing the average similarity between each pair of clusters. ``weighted_by_frames=True`` will return the frame-weighted similarity values. ``weighted_by_frames=False`` will return the unweighted similarity values. """ for each, file in enumerate(self.input_files): ck = np.load(file) self.sims[each] = {} for i, x in enumerate(self.c0): total = 0 for j, y in enumerate(ck): c_total = np.sum(np.array([x, y]), axis=0) pair_sim = SimilarityIndex(c_total, 2, n_ary=self.n_ary, weight=self.weight, c_threshold=None, w_factor="fraction")() total += pair_sim avg = total / len(ck) if f"f{i}" not in self.sims[each]: self.sims[each][f"f{i}"] = [] self.sims[each][f"f{i}"] = avg nw_dict = _format_dict(self.sims) if not self.weighted_by_frames: return nw_dict elif self.weighted_by_frames: return weight_dict(file_path=None, summary_file=self.summary_file, dict=nw_dict, n_clusters=self.n_clusters)
[docs] def calculate_union(self): """Calculates the extended similarity between the union of frame in c0 and cluster k. The similarity score is calculated as the union similarity between all frames in the cluster and the top c0 cluster. Returns ------- dict A dictionary containing the average similarity between each pair of clusters. ``weighted_by_frames=True`` will return the frame-weighted similarity values. ``weighted_by_frames=False`` will return the unweighted similarity values. """ for each, file in enumerate(self.input_files): ck = np.load(file) self.sims[each] = {} for i, x in enumerate(self.c0): c_total = np.sum(ck, axis=0) + x n_fingerprints = len(ck) + 1 index = SimilarityIndex(c_total, n_fingerprints, n_ary=self.n_ary, weight=self.weight, c_threshold=None, w_factor="fraction")() if f"f{i}" not in self.sims[each]: self.sims[each][f"f{i}"] = [] self.sims[each][f"f{i}"] = index nw_dict = _format_dict(self.sims) if not self.weighted_by_frames: return nw_dict elif self.weighted_by_frames: return weight_dict(file_path=None, summary_file=self.summary_file, dict=nw_dict, n_clusters=self.n_clusters)
def _perform_calculation(self, index_func): """Auxillary function for ``calculate_medoid`` and ``calculate_outlier``. Parameters ---------- index_func : function The function to calculate the medoid or outlier of each cluster. Returns ------- dict A dictionary containing the average similarity between each pair of clusters. """ for each, file in enumerate(self.input_files): ck = np.load(file) index = index_func(ck, n_ary=self.n_ary, weight=self.weight) medoid = ck[index] self.sims[each] = {} for i, x in enumerate(self.c0): c_total = medoid + x if f"f{i}" not in self.sims[each]: self.sims[each][f"f{i}"] = [] pair_sim = SimilarityIndex(c_total, 2, n_ary=self.n_ary, weight=self.weight, c_threshold=None, w_factor="fraction")() self.sims[each][f"f{i}"] = pair_sim nw_dict = _format_dict(self.sims) return nw_dict
[docs] def calculate_medoid(self): """Calculates the pairwise similarity between every frame in c0 and the medoid of each cluster. The pairwise similarity value between each frame in c0 and the medoid of each cluster is calculated using similarity indices. Returns ------- dict A dictionary containing the average similarity between each pair of clusters. ``weighted_by_frames=True`` will return the frame-weighted similarity values. ``weighted_by_frames=False`` will return the unweighted similarity values. """ nw_dict = self._perform_calculation(calc_medoid) if not self.weighted_by_frames: return nw_dict elif self.weighted_by_frames: return weight_dict(file_path=None, summary_file=self.summary_file, dict=nw_dict, n_clusters=self.n_clusters)
[docs] def calculate_outlier(self): """Calculates the pairwise similarity between every frame in c0 and the outlier of each cluster. The pairwise similarity value between each frame in c0 and the outlier of each cluster is calculated using similarity indices. Returns ------- dict A dictionary containing the average similarity between each pair of clusters. ``weighted_by_frames=True`` will return the frame-weighted similarity values. ``weighted_by_frames=False`` will return the unweighted similarity values. """ nw_dict = self._perform_calculation(calc_outlier) if not self.weighted_by_frames: return nw_dict elif self.weighted_by_frames: return weight_dict(file_path=None, summary_file=self.summary_file, dict=nw_dict, n_clusters=self.n_clusters)
def _trim_outliers(total_data, trim_frac=0.1, n_ary='RR', weight='nw', removal='nan'): """Trims a desired percentage of outliers (most dissimilar) from the dataset by calculating largest complement similarity. Parameters ---------- total_data : numpy.ndarray The input data to be trimmed. trim_frac : float The fraction of outliers to trim. Defaults to 0.1. n_ary : {'RR', 'SM'} The similarity metric to use. weight : {'nw', 'w', 'fraction'} The weight to use for the similarity metric. removal : {'nan', 'delete'} The value to replace the trimmed data with. Defaults to 'nan'. Returns ------- numpy.ndarray The trimmed data. """ n_fingerprints = len(total_data) c_total = np.sum(total_data, axis = 0) comp_sims = [] for i, pixel in enumerate(total_data): c_total_i = c_total - total_data[i] sim_index = SimilarityIndex(c_total_i, n_fingerprints - 1, n_ary=n_ary, weight=weight, c_threshold=None, w_factor="fraction")() comp_sims.append(sim_index) comp_sims = np.array(comp_sims) cutoff = int(np.floor(n_fingerprints * float(trim_frac))) highest_indices = np.argpartition(-comp_sims, cutoff)[:cutoff] if removal == 'nan': total_data[highest_indices] = np.nan elif removal == 'delete': total_data = np.delete(total_data, highest_indices, axis=0) return total_data
[docs]def weight_dict(file_path=None, summary_file=None, dict=None, n_clusters=None): """Calculates frame-weighted similarity values by the number of frames in each cluster. Parameters ---------- file_path : str The path to the json file containing the unweighted similarity values. summary_file : str The path to the summary file containing the number of frames for each cluster. dict : dict A dictionary containing the unweighted similarity values. n_clusters : int The number of clusters to analyze. Returns ------- dict A dictionary containing the frame-weighted similarity values between each pair of clusters. """ if file_path: with open(file_path, 'r') as file: dict = json.load(file) elif dict: dict = dict for key in dict: dict[key].pop() num = np.loadtxt(summary_file, unpack=True, usecols=(1), skiprows=(1), delimiter=',') if n_clusters: num = num[0:n_clusters] weights = num weights = weights[1:] w_dict = {} for key in dict: old_list = dict[key] w_dict[key] = [old_list[i] * v for i, v in enumerate(weights)] for k in w_dict: average = sum(w_dict[k]) / len(w_dict[k]) w_dict[k].append(average) return w_dict
def _format_dict(dict): """Sorts dict to have frame # as the key and attaches the average value to the end of each key. Parameters ---------- dict : dict A dictionary containing the similarity values. Returns ------- dict A dictionary sorted by the keys with the average value attached to the end of each key. """ nw_dict = {} for i in sorted(dict): for k, v in dict[i].items(): if k not in nw_dict: nw_dict[k] = [None] * len(dict) nw_dict[k][i] = v for k in nw_dict: average = sum(nw_dict[k]) / len(nw_dict[k]) nw_dict[k].append(average) return nw_dict