Source code for mdance.tools.esim

""" 
Miranda Quintana Group - University of Florida
eSIM: extended similarity indices

Please, cite the original papers on the *n*-ary indices:

Miranda-Quintana, R.A., Bajusz, D., Rácz, A. & Héberger, K. J Cheminform 13, 32 (2021).
https://doi.org/10.1186/s13321-021-00505-3
Miranda-Quintana, R.A., Rácz, A., Bajusz, D. & Héberger, K. J Cheminform 13, 33 (2021).
https://doi.org/10.1186/s13321-021-00504-4
"""

from math import ceil, log

import numpy as np


[docs]def calculate_counters(c_total, n_objects, c_threshold=None, w_factor='fraction'):
    """Calculate 1-similarity, 0-similarity, and dissimilarity counters.

    Parameters
    ---------
    c_total : array-like of shape (n_objects, n_features)
        Vector containing the sums of each column of the fingerprint matrix.
    
    n_objects : int
        Number of objects to be compared.

    c_threshold : {None, 'dissimilar', int}, default=None
        Coincidence threshold.
            - None : Default, c_threshold = n_objects % 2
            - ``dissimilar`` : c_threshold = ceil(n_objects / 2)
            - int : Integer number < n_objects
            - float : Real number in the (0, 1) interval. Indicates the % 
              of the total data that will serve as threshold.

    w_factor : {"fraction", "power_n"}, default='fraction'
        Type of weight function that will be used.
            - ``fraction`` : similarity = d[k]/n
                    dissimilarity = 1 - (d[k] - n_objects % 2)/n_objects
            - ``power_n`` : similarity = n**-(n_objects - d[k])
                    dissimilarity = n**-(d[k] - n_objects % 2)
            - other values : similarity = dissimilarity = 1

    Returns
    -------
    counters : dict
        Dictionary with the weighted and non-weighted counters.
    """
    # Assign c_threshold
    if not c_threshold:
        c_threshold = n_objects % 2
    if c_threshold == 'dissimilar':
        c_threshold = ceil(n_objects / 2)
    if c_threshold == 'min':
        c_threshold = n_objects % 2
    if isinstance(c_threshold, int):
        if c_threshold >= n_objects:
            raise ValueError('c_threshold cannot be equal or greater than n_objects.')
        c_threshold = c_threshold
    if 0 < c_threshold < 1:
        c_threshold *= n_objects
    
    # Set w_factor
    if w_factor:
        if 'power' in w_factor:
            power = int(w_factor.split("_")[-1])
            def f_s(d):
                return power**-float(n_objects - d)
    
            def f_d(d):
                return power**-float(d - n_objects % 2)
        elif w_factor == 'fraction':
            def f_s(d):
                return d/n_objects
    
            def f_d(d):
                return 1 - (d - n_objects % 2)/n_objects
        else:
            def f_s(d):
                return 1
    
            def f_d(d):
                return 1
    else:
        def f_s(d):
            return 1
    
        def f_d(d):
            return 1
    
    # Calculate a, d, b + c
    
    a_indices = 2 * c_total - n_objects > c_threshold
    d_indices = n_objects - 2 * c_total > c_threshold
    dis_indices = np.abs(2 * c_total - n_objects) <= c_threshold
    
    a = np.sum(a_indices)
    d = np.sum(d_indices)
    total_dis = np.sum(dis_indices)
    
    a_w_array = f_s(2 * c_total[a_indices] - n_objects)
    d_w_array = f_s(abs(2 * c_total[d_indices] - n_objects))
    total_w_dis_array = f_d(abs(2 * c_total[dis_indices] - n_objects))
    
    w_a = np.sum(a_w_array)
    w_d = np.sum(d_w_array)
    total_w_dis = np.sum(total_w_dis_array)
        
    total_sim = a + d
    total_w_sim = w_a + w_d
    p = total_sim + total_dis
    w_p = total_w_sim + total_w_dis
    
    counters = {"a": a, "w_a": w_a, "d": d, "w_d": w_d,
                "total_sim": total_sim, "total_w_sim": total_w_sim,
                "total_dis": total_dis, "total_w_dis": total_w_dis,
                "p": p, "w_p": w_p}
    return counters


[docs]def gen_sim_dict(c_total, n_objects, c_threshold=None, w_factor='fraction'):
    """Generate a dictionary with the similarity indices.
    
    Valid values for metric are:

    ``MSD``: Mean Square Deviation.

    Extended or Instant Similarity Metrics

    | ``AC``: Austin-Colwell, ``BUB``: Baroni-Urbani-Buser, 
    | ``CTn``: Consoni-Todschini n, ``Fai``: Faith, 
    | ``Gle``: Gleason, ``Ja``: Jaccard, 
    | ``Ja0``: Jaccard 0-variant, ``JT``: Jaccard-Tanimoto, 
    | ``RT``: Rogers-Tanimoto, ``RR``: Russel-Rao,
    | ``SM``: Sokal-Michener, ``SSn``: Sokal-Sneath n.
    
    Parameters
    ----------
    c_total : array-like of shape (n_objects, n_features)
        Vector containing the sums of each column of the fingerprint matrix.
    n_objects : int
        Number of objects to be compared.
    c_threshold : {None, 'dissimilar', int}
        Coincidence threshold.
    w_factor : {'fraction', 'power_n'}
        Type of weight function that will be used.
    
    Returns
    -------
    dict
        Dictionary with the similarity indices.
    """
    
    counters = calculate_counters(c_total, n_objects, c_threshold=c_threshold, 
                                  w_factor=w_factor)
    bub_nw = ((counters['w_a'] * counters['w_d']) ** 0.5 + counters['w_a'])/\
             ((counters['a'] * counters['d']) ** 0.5 + counters['a'] + counters['total_dis'])
    fai_nw = (counters['w_a'] + 0.5 * counters['w_d'])/\
             (counters['p'])
    gle_nw = (2 * counters['w_a'])/\
             (2 * counters['a'] + counters['total_dis'])
    ja_nw = (3 * counters['w_a'])/\
            (3 * counters['a'] + counters['total_dis'])
    jt_nw = (counters['w_a'])/\
            (counters['a'] + counters['total_dis'])
    rt_nw = (counters['total_w_sim'])/\
            (counters['p'] + counters['total_dis'])
    rr_nw = (counters['w_a'])/\
            (counters['p'])
    sm_nw = (counters['total_w_sim'])/\
            (counters['p'])
    ss1_nw = (counters['w_a'])/\
             (counters['a'] + 2 * counters['total_dis'])
    ss2_nw = (2 * counters['total_w_sim'])/\
             (counters['p'] + counters['total_sim'])

    Indices = {'BUB':bub_nw, 'Fai':fai_nw, 'Gle':gle_nw, 'Ja':ja_nw, 'JT':jt_nw, 
               'RT':rt_nw, 'RR':rr_nw, 'SM':sm_nw, 'SS1':ss1_nw, 'SS2':ss2_nw}
    return Indices


[docs]class SimilarityIndex:
    """*O(N)* similarity index calculation for a set.
    
    Parameters
    ----------
    data : array-like of shape (n_objects, n_features)
        Vector containing the sums of each column of the fingerprint matrix.
    n_objects : int
        Number of objects to be compared.
    c_threshold : {None, 'dissimilar', int}
        Coincidence threshold.
    n_ary : str
        string with the initials of the desired similarity index to calculate the medoid from.
    w_factor : {'fraction', 'power_n'}
        Type of weight function that will be used.
    weight : str
        Type of weight function that will be used.
    return_dict : bool
        If True, returns a dictionary with all the similarity indices.
    
    Returns
    -------
    float or dict
        The similarity index.
    """
    def __init__(self, data, n_objects = None, c_threshold = None, n_ary = 'RR', 
                 w_factor = 'fraction', weight = 'nw', return_dict = False):
        self.data = data
        self.n_objects = n_objects
        self.n_ary = n_ary
        self.w_factor = w_factor
        self.c_threshold = c_threshold
        self.weight = weight
        self.counters = calculate_counters(self.data, self.n_objects, self.c_threshold, self.w_factor)
        self.return_dict = return_dict
        
        if self.return_dict == True:
         self.index_functions = \
         {'nw': {'AC': self.ac_nw, 'BUB': self.bub_nw, 'CT1': self.ct1_nw, 
            'CT2': self.ct2_nw, 'CT3': self.ct3_nw, 'CT4': self.ct4_nw, 
            'Fai': self.fai_nw, 'Gle': self.gle_nw, 'Ja': self.ja_nw,
            'Ja0': self.ja0_nw, 'JT': self.jt_nw, 'RT': self.rt_nw, 'RR': self.rr_nw,
            'SM': self.sm_nw, 'SS1': self.ss1_nw, 'SS2': self.ss2_nw},
         'w': {'AC': self.ac_w, 'BUB': self.bub_w, 'CT1': self.ct1_w, 'CT2': self.ct2_w, 'CT3': self.ct3_w,
            'CT4': self.ct4_w, 'Fai': self.fai_w, 'Gle': self.gle_w, 'Ja': self.ja_w,
            'Ja0': self.ja0_w, 'JT': self.jt_w, 'RT': self.rt_w, 'RR': self.rr_w,
            'SM': self.sm_w, 'SS1': self.ss1_w, 'SS2': self.ss2_w},
         'nw_nw': {'RR': self.rr_nw_nw, 'SM': self.sm_nw_nw}}
   
[docs]    def __call__(self):
        """The default method to be called when the class is instantiated.
        
        Returns:
        --------
        float or dict
            The similarity index or a dictionary with all the similarity indices.
        """
        if self.return_dict:
            return self.gen_dict()
        else:
            return getattr(self, f"{self.n_ary.lower()}_{self.weight}")() 

[docs]    def gen_sim_dict(self): 
        """
        Generates a dictionary of all similarity indices.
        """
        return {outer_key: {inner_key: inner_func() for inner_key, inner_func in inner_dict.items()}
            for outer_key, inner_dict in self.index_functions.items()}


    # Calculate for individual Index Functions
    # Weighted Indices
[docs]    def ac_w(self):
        ac_w = (2/np.pi) * np.arcsin(np.sqrt(self.counters['total_w_sim']/
                                            self.counters['w_p']))
        return ac_w

[docs]    def bub_w(self):
        bub_w = ((self.counters['w_a'] * self.counters['w_d'])**0.5 + self.counters['w_a'])/\
                ((self.counters['w_a'] * self.counters['w_d'])**0.5 + self.counters['w_a'] + self.counters['total_w_dis'])
        return bub_w

[docs]    def ct1_w(self):
        ct1_w = (log(1 + self.counters['w_a'] + self.counters['w_d']))/\
                (log(1 + self.counters['w_p']))
        return ct1_w

[docs]    def ct2_w(self):
        ct2_w = (log(1 + self.counters['w_p']) - log(1 + self.counters['total_w_dis']))/\
                (log(1 + self.counters['w_p']))
        return ct2_w
    
[docs]    def ct3_w(self):
        ct3_w = (log(1 + self.counters['w_a']))/\
                (log(1 + self.counters['w_p']))
        return ct3_w

[docs]    def ct4_w(self):
        ct4_w = (log(1 + self.counters['w_a']))/\
                (log(1 + self.counters['w_a'] + self.counters['total_w_dis']))
        return ct4_w

[docs]    def fai_w(self):
        fai_w = (self.counters['w_a'] + 0.5 * self.counters['w_d'])/\
                (self.counters['w_p'])
        return fai_w

[docs]    def gle_w(self):
        gle_w = (2 * self.counters['w_a'])/\
                (2 * self.counters['w_a'] + self.counters['total_w_dis'])
        return gle_w

[docs]    def ja_w(self):
        ja_w = (3 * self.counters['w_a'])/\
               (3 * self.counters['w_a'] + self.counters['total_w_dis'])
        return ja_w

[docs]    def ja0_w(self):
        ja0_w = (3 * self.counters['total_w_sim'])/\
                (3 * self.counters['total_w_sim'] + self.counters['total_w_dis'])
        return ja0_w

[docs]    def jt_w(self):
        jt_w = (self.counters['w_a'])/\
               (self.counters['w_a'] + self.counters['total_w_dis'])
        return jt_w

[docs]    def rt_w(self):
        rt_w = (self.counters['total_w_sim'])/\
               (self.counters['w_p'] + self.counters['total_w_dis'])
        return rt_w

[docs]    def rr_w(self):
        rr_w = (self.counters['w_a'])/\
               (self.counters['w_p'])
        return rr_w

[docs]    def sm_w(self):
        sm_w = (self.counters['total_w_sim'])/\
               (self.counters['w_p'])
        return sm_w

[docs]    def ss1_w(self):
        ss1_w = (self.counters['w_a'])/\
                (self.counters['w_a'] + 2 * self.counters['total_w_dis'])
        return ss1_w

[docs]    def ss2_w(self):
        ss2_w = (2 * self.counters['total_w_sim'])/\
                (self.counters['w_p'] + self.counters['total_w_sim'])
        return ss2_w

    # Non-Weighted Indices
[docs]    def ac_nw(self):
        ac_nw = (2/np.pi) * np.arcsin(np.sqrt(self.counters['total_w_sim']/
                                            self.counters['p']))
        return ac_nw
    
[docs]    def bub_nw(self):
        bub_nw = ((self.counters['w_a'] * self.counters['w_d'])**0.5 + self.counters['w_a'])/\
                ((self.counters['a'] * self.counters['d'])**0.5 + self.counters['a'] + self.counters['total_dis'])
        return bub_nw

[docs]    def ct1_nw(self):
        ct1_nw = (log(1 + self.counters['w_a'] + self.counters['w_d']))/\
                (log(1 + self.counters['p']))
        return ct1_nw

[docs]    def ct2_nw(self):
        ct2_nw = (log(1 + self.counters['w_p']) - log(1 + self.counters['total_w_dis']))/\
                (log(1 + self.counters['p']))
        return ct2_nw

[docs]    def ct3_nw(self):
        ct3_nw = (log(1 + self.counters['w_a']))/\
                (log(1 + self.counters['p']))
        return ct3_nw

[docs]    def ct4_nw(self):
        ct4_nw = (log(1 + self.counters['w_a']))/\
                (log(1 + self.counters['a'] + self.counters['total_dis']))
        return ct4_nw

[docs]    def fai_nw(self):
        fai_nw = (self.counters['w_a'] + 0.5 * self.counters['w_d'])/\
                (self.counters['p'])
        return fai_nw

[docs]    def gle_nw(self):
        gle_nw = (2 * self.counters['w_a'])/\
                (2 * self.counters['a'] + self.counters['total_dis'])
        return gle_nw

[docs]    def ja_nw(self):
        ja_nw = (3 * self.counters['w_a'])/\
                (3 * self.counters['a'] + self.counters['total_dis'])
        return ja_nw

[docs]    def ja0_nw(self):
        ja0_nw = (3 * self.counters['total_w_sim'])/\
                (3 * self.counters['total_sim'] + self.counters['total_dis'])
        return ja0_nw

[docs]    def jt_nw(self):
        jt_nw = (self.counters['w_a'])/\
                (self.counters['a'] + self.counters['total_dis'])
        return jt_nw

[docs]    def rt_nw(self):
        rt_nw = (self.counters['total_w_sim'])/\
                (self.counters['p'] + self.counters['total_dis'])
        return rt_nw

[docs]    def rr_nw(self):
        rr_nw = (self.counters['w_a'])/\
                (self.counters['p'])
        return rr_nw

[docs]    def sm_nw(self):
        sm_nw = (self.counters['total_w_sim'])/\
                (self.counters['p'])
        return sm_nw

[docs]    def ss1_nw(self):
        ss1_nw = (self.counters['w_a'])/\
                (self.counters['a'] + 2 * self.counters['total_dis'])
        return ss1_nw

[docs]    def ss2_nw(self):
        ss2_nw = (2 * self.counters['total_w_sim'])/\
                (self.counters['p'] + self.counters['total_sim'])
        return ss2_nw
    
    # Non-Weighted Indices for Denominator and Numerator
[docs]    def rr_nw_nw(self):
        rr_nw_nw = (self.counters['a'])/\
                (self.counters['p'])
        return rr_nw_nw
    
[docs]    def sm_nw_nw(self):
        sm_nw_nw = (self.counters['total_sim'])/\
                (self.counters['p'])
        return sm_nw_nw


[docs]def calc_medoid(data, n_ary = 'RR', w_factor = 'fraction', weight = 'nw', c_total = None):
    """*O(N)* medoid calculation for a set.
    
    Parameters
    ----------
    data : array-like of shape (n_objects, n_features)
        Vector containing the sums of each column of the fingerprint matrix.
    n_ary : str
        string with the initials of the desired similarity index to calculate the medoid from.
    w_factor : {'fraction', 'power_n'}
        Type of weight function that will be used.
    weight : str
        Type of weight function that will be used.
    c_total : array-like of shape (n_objects, n_features)
        Vector containing the sums of each column of the fingerprint matrix.
    
    Raises
    ------
    ValueError
        If the dimensions of the objects and columnwise sum differ.
    
    Returns
    -------
    int
        The index of the medoid.
    """
    if c_total and len(data[0]) != c_total: raise ValueError("Dimensions of objects and columnwise sum differ")
    elif not c_total: c_total = np.sum(data, axis = 0)
    n_objects = len(data)
    index = n_objects + 1
    min_sim = 1.01
    comp_sums = c_total - data
    for i, obj in enumerate(comp_sums):
        SI = SimilarityIndex(obj, n_objects = n_objects - 1, n_ary = n_ary, 
                             w_factor = w_factor, weight = weight)
        sim_index = SI()
        if sim_index < min_sim:
            min_sim = sim_index
            index = i
        else:
            pass
    return index


[docs]def calc_outlier(data, n_ary = 'RR', w_factor = 'fraction', weight = 'nw', c_total = None):
    """*O(N)* outlier calculation for a set.
    
    Parameters
    ----------
    data : array-like of shape (n_objects, n_features)
        Vector containing the sums of each column of the fingerprint matrix.
    n_ary : str
        string with the initials of the desired similarity index to calculate the medoid from.
    w_factor : {'fraction', 'power_n'}
        Type of weight function that will be used.
    weight : str
        Type of weight function that will be used.
    c_total : array-like of shape (n_objects, n_features)
        Vector containing the sums of each column of the fingerprint matrix.
    
    Raises
    ------
    ValueError
        If the dimensions of the objects and columnwise sum differ.
    
    Returns
    -------
    int
        The index of the outlier.
    """
    if c_total and len(data[0]) != c_total: raise ValueError("Dimensions of objects and columnwise sum differ")
    elif not c_total: c_total = np.sum(data, axis = 0)
    n_objects = len(data)
    index = n_objects + 1
    max_sim = -0.01
    comp_sums = c_total - data 
    for i, obj in enumerate(comp_sums):
        SI = SimilarityIndex(obj, n_objects = n_objects - 1, n_ary = n_ary, 
                             w_factor = w_factor, weight = weight)
        sim_index = SI()
        if sim_index > max_sim:
            max_sim = sim_index
            index = i
        else:
            pass
    return index


[docs]def calc_comp_sim(data, c_threshold = None, n_ary = 'RR', w_factor = 'fraction', 
                  weight = 'nw', c_total = None):
    """*O(N)* complementary similarity calculation for a set.
    
    Parameters
    ----------
    data : array-like of shape (n_objects, n_features)
        Vector containing the sums of each column of the fingerprint matrix.
    n_objects : int
        Number of objects to be compared.
    c_threshold : {None, 'dissimilar', int}
        Coincidence threshold.
    n_ary : str
        string with the initials of the desired similarity index to calculate the medoid from.
    w_factor : {'fraction', 'power_n'}
        Type of weight function that will be used.
    weight : str
        Type of weight function that will be used.
    c_total : array-like of shape (n_objects, n_features)
        Vector containing the sums of each column of the fingerprint matrix.
    
    Raises
    ------
    ValueError
        If the dimensions of the objects and columnwise sum differ.
    
    Returns
    -------
    list
        A list with the complementary similarities of all the molecules in the set.
    """
    if c_total and len(data[0]) != c_total: raise ValueError("Dimensions of objects and columnwise sum differ")
    elif not c_total: c_total = np.sum(data, axis = 0)

    n_objects = len(data)   
    comp_sums = c_total - data
    total = []
    for i, obj in enumerate(comp_sums):
        SI = SimilarityIndex(obj, n_objects = n_objects - 1, n_ary = n_ary, w_factor = w_factor, weight = weight)
        sim_index = SI()
        total.append((i, sim_index))
    return total