"""
Miranda Quintana Group - University of Florida
eSIM: extended similarity indices
Please, cite the original papers on the *n*-ary indices:
Miranda-Quintana, R.A., Bajusz, D., Rácz, A. & Héberger, K. J Cheminform 13, 32 (2021).
https://doi.org/10.1186/s13321-021-00505-3
Miranda-Quintana, R.A., Rácz, A., Bajusz, D. & Héberger, K. J Cheminform 13, 33 (2021).
https://doi.org/10.1186/s13321-021-00504-4
"""
from math import ceil, log
import numpy as np
[docs]def calculate_counters(c_total, n_objects, c_threshold=None, w_factor='fraction'):
"""Calculate 1-similarity, 0-similarity, and dissimilarity counters.
Parameters
---------
c_total : array-like of shape (n_objects, n_features)
Vector containing the sums of each column of the fingerprint matrix.
n_objects : int
Number of objects to be compared.
c_threshold : {None, 'dissimilar', int}, default=None
Coincidence threshold.
- None : Default, c_threshold = n_objects % 2
- ``dissimilar`` : c_threshold = ceil(n_objects / 2)
- int : Integer number < n_objects
- float : Real number in the (0, 1) interval. Indicates the %
of the total data that will serve as threshold.
w_factor : {"fraction", "power_n"}, default='fraction'
Type of weight function that will be used.
- ``fraction`` : similarity = d[k]/n
dissimilarity = 1 - (d[k] - n_objects % 2)/n_objects
- ``power_n`` : similarity = n**-(n_objects - d[k])
dissimilarity = n**-(d[k] - n_objects % 2)
- other values : similarity = dissimilarity = 1
Returns
-------
counters : dict
Dictionary with the weighted and non-weighted counters.
"""
# Assign c_threshold
if not c_threshold:
c_threshold = n_objects % 2
if c_threshold == 'dissimilar':
c_threshold = ceil(n_objects / 2)
if c_threshold == 'min':
c_threshold = n_objects % 2
if isinstance(c_threshold, int):
if c_threshold >= n_objects:
raise ValueError('c_threshold cannot be equal or greater than n_objects.')
c_threshold = c_threshold
if 0 < c_threshold < 1:
c_threshold *= n_objects
# Set w_factor
if w_factor:
if 'power' in w_factor:
power = int(w_factor.split("_")[-1])
def f_s(d):
return power**-float(n_objects - d)
def f_d(d):
return power**-float(d - n_objects % 2)
elif w_factor == 'fraction':
def f_s(d):
return d/n_objects
def f_d(d):
return 1 - (d - n_objects % 2)/n_objects
else:
def f_s(d):
return 1
def f_d(d):
return 1
else:
def f_s(d):
return 1
def f_d(d):
return 1
# Calculate a, d, b + c
a_indices = 2 * c_total - n_objects > c_threshold
d_indices = n_objects - 2 * c_total > c_threshold
dis_indices = np.abs(2 * c_total - n_objects) <= c_threshold
a = np.sum(a_indices)
d = np.sum(d_indices)
total_dis = np.sum(dis_indices)
a_w_array = f_s(2 * c_total[a_indices] - n_objects)
d_w_array = f_s(abs(2 * c_total[d_indices] - n_objects))
total_w_dis_array = f_d(abs(2 * c_total[dis_indices] - n_objects))
w_a = np.sum(a_w_array)
w_d = np.sum(d_w_array)
total_w_dis = np.sum(total_w_dis_array)
total_sim = a + d
total_w_sim = w_a + w_d
p = total_sim + total_dis
w_p = total_w_sim + total_w_dis
counters = {"a": a, "w_a": w_a, "d": d, "w_d": w_d,
"total_sim": total_sim, "total_w_sim": total_w_sim,
"total_dis": total_dis, "total_w_dis": total_w_dis,
"p": p, "w_p": w_p}
return counters
[docs]def gen_sim_dict(c_total, n_objects, c_threshold=None, w_factor='fraction'):
"""Generate a dictionary with the similarity indices.
Valid values for metric are:
``MSD``: Mean Square Deviation.
Extended or Instant Similarity Metrics
| ``AC``: Austin-Colwell, ``BUB``: Baroni-Urbani-Buser,
| ``CTn``: Consoni-Todschini n, ``Fai``: Faith,
| ``Gle``: Gleason, ``Ja``: Jaccard,
| ``Ja0``: Jaccard 0-variant, ``JT``: Jaccard-Tanimoto,
| ``RT``: Rogers-Tanimoto, ``RR``: Russel-Rao,
| ``SM``: Sokal-Michener, ``SSn``: Sokal-Sneath n.
Parameters
----------
c_total : array-like of shape (n_objects, n_features)
Vector containing the sums of each column of the fingerprint matrix.
n_objects : int
Number of objects to be compared.
c_threshold : {None, 'dissimilar', int}
Coincidence threshold.
w_factor : {'fraction', 'power_n'}
Type of weight function that will be used.
Returns
-------
dict
Dictionary with the similarity indices.
"""
counters = calculate_counters(c_total, n_objects, c_threshold=c_threshold,
w_factor=w_factor)
bub_nw = ((counters['w_a'] * counters['w_d']) ** 0.5 + counters['w_a'])/\
((counters['a'] * counters['d']) ** 0.5 + counters['a'] + counters['total_dis'])
fai_nw = (counters['w_a'] + 0.5 * counters['w_d'])/\
(counters['p'])
gle_nw = (2 * counters['w_a'])/\
(2 * counters['a'] + counters['total_dis'])
ja_nw = (3 * counters['w_a'])/\
(3 * counters['a'] + counters['total_dis'])
jt_nw = (counters['w_a'])/\
(counters['a'] + counters['total_dis'])
rt_nw = (counters['total_w_sim'])/\
(counters['p'] + counters['total_dis'])
rr_nw = (counters['w_a'])/\
(counters['p'])
sm_nw = (counters['total_w_sim'])/\
(counters['p'])
ss1_nw = (counters['w_a'])/\
(counters['a'] + 2 * counters['total_dis'])
ss2_nw = (2 * counters['total_w_sim'])/\
(counters['p'] + counters['total_sim'])
Indices = {'BUB':bub_nw, 'Fai':fai_nw, 'Gle':gle_nw, 'Ja':ja_nw, 'JT':jt_nw,
'RT':rt_nw, 'RR':rr_nw, 'SM':sm_nw, 'SS1':ss1_nw, 'SS2':ss2_nw}
return Indices
[docs]class SimilarityIndex:
"""*O(N)* similarity index calculation for a set.
Parameters
----------
data : array-like of shape (n_objects, n_features)
Vector containing the sums of each column of the fingerprint matrix.
n_objects : int
Number of objects to be compared.
c_threshold : {None, 'dissimilar', int}
Coincidence threshold.
n_ary : str
string with the initials of the desired similarity index to calculate the medoid from.
w_factor : {'fraction', 'power_n'}
Type of weight function that will be used.
weight : str
Type of weight function that will be used.
return_dict : bool
If True, returns a dictionary with all the similarity indices.
Returns
-------
float or dict
The similarity index.
"""
def __init__(self, data, n_objects = None, c_threshold = None, n_ary = 'RR',
w_factor = 'fraction', weight = 'nw', return_dict = False):
self.data = data
self.n_objects = n_objects
self.n_ary = n_ary
self.w_factor = w_factor
self.c_threshold = c_threshold
self.weight = weight
self.counters = calculate_counters(self.data, self.n_objects, self.c_threshold, self.w_factor)
self.return_dict = return_dict
if self.return_dict == True:
self.index_functions = \
{'nw': {'AC': self.ac_nw, 'BUB': self.bub_nw, 'CT1': self.ct1_nw,
'CT2': self.ct2_nw, 'CT3': self.ct3_nw, 'CT4': self.ct4_nw,
'Fai': self.fai_nw, 'Gle': self.gle_nw, 'Ja': self.ja_nw,
'Ja0': self.ja0_nw, 'JT': self.jt_nw, 'RT': self.rt_nw, 'RR': self.rr_nw,
'SM': self.sm_nw, 'SS1': self.ss1_nw, 'SS2': self.ss2_nw},
'w': {'AC': self.ac_w, 'BUB': self.bub_w, 'CT1': self.ct1_w, 'CT2': self.ct2_w, 'CT3': self.ct3_w,
'CT4': self.ct4_w, 'Fai': self.fai_w, 'Gle': self.gle_w, 'Ja': self.ja_w,
'Ja0': self.ja0_w, 'JT': self.jt_w, 'RT': self.rt_w, 'RR': self.rr_w,
'SM': self.sm_w, 'SS1': self.ss1_w, 'SS2': self.ss2_w},
'nw_nw': {'RR': self.rr_nw_nw, 'SM': self.sm_nw_nw}}
[docs] def __call__(self):
"""The default method to be called when the class is instantiated.
Returns:
--------
float or dict
The similarity index or a dictionary with all the similarity indices.
"""
if self.return_dict:
return self.gen_dict()
else:
return getattr(self, f"{self.n_ary.lower()}_{self.weight}")()
[docs] def gen_sim_dict(self):
"""
Generates a dictionary of all similarity indices.
"""
return {outer_key: {inner_key: inner_func() for inner_key, inner_func in inner_dict.items()}
for outer_key, inner_dict in self.index_functions.items()}
# Calculate for individual Index Functions
# Weighted Indices
[docs] def ac_w(self):
ac_w = (2/np.pi) * np.arcsin(np.sqrt(self.counters['total_w_sim']/
self.counters['w_p']))
return ac_w
[docs] def bub_w(self):
bub_w = ((self.counters['w_a'] * self.counters['w_d'])**0.5 + self.counters['w_a'])/\
((self.counters['w_a'] * self.counters['w_d'])**0.5 + self.counters['w_a'] + self.counters['total_w_dis'])
return bub_w
[docs] def ct1_w(self):
ct1_w = (log(1 + self.counters['w_a'] + self.counters['w_d']))/\
(log(1 + self.counters['w_p']))
return ct1_w
[docs] def ct2_w(self):
ct2_w = (log(1 + self.counters['w_p']) - log(1 + self.counters['total_w_dis']))/\
(log(1 + self.counters['w_p']))
return ct2_w
[docs] def ct3_w(self):
ct3_w = (log(1 + self.counters['w_a']))/\
(log(1 + self.counters['w_p']))
return ct3_w
[docs] def ct4_w(self):
ct4_w = (log(1 + self.counters['w_a']))/\
(log(1 + self.counters['w_a'] + self.counters['total_w_dis']))
return ct4_w
[docs] def fai_w(self):
fai_w = (self.counters['w_a'] + 0.5 * self.counters['w_d'])/\
(self.counters['w_p'])
return fai_w
[docs] def gle_w(self):
gle_w = (2 * self.counters['w_a'])/\
(2 * self.counters['w_a'] + self.counters['total_w_dis'])
return gle_w
[docs] def ja_w(self):
ja_w = (3 * self.counters['w_a'])/\
(3 * self.counters['w_a'] + self.counters['total_w_dis'])
return ja_w
[docs] def ja0_w(self):
ja0_w = (3 * self.counters['total_w_sim'])/\
(3 * self.counters['total_w_sim'] + self.counters['total_w_dis'])
return ja0_w
[docs] def jt_w(self):
jt_w = (self.counters['w_a'])/\
(self.counters['w_a'] + self.counters['total_w_dis'])
return jt_w
[docs] def rt_w(self):
rt_w = (self.counters['total_w_sim'])/\
(self.counters['w_p'] + self.counters['total_w_dis'])
return rt_w
[docs] def rr_w(self):
rr_w = (self.counters['w_a'])/\
(self.counters['w_p'])
return rr_w
[docs] def sm_w(self):
sm_w = (self.counters['total_w_sim'])/\
(self.counters['w_p'])
return sm_w
[docs] def ss1_w(self):
ss1_w = (self.counters['w_a'])/\
(self.counters['w_a'] + 2 * self.counters['total_w_dis'])
return ss1_w
[docs] def ss2_w(self):
ss2_w = (2 * self.counters['total_w_sim'])/\
(self.counters['w_p'] + self.counters['total_w_sim'])
return ss2_w
# Non-Weighted Indices
[docs] def ac_nw(self):
ac_nw = (2/np.pi) * np.arcsin(np.sqrt(self.counters['total_w_sim']/
self.counters['p']))
return ac_nw
[docs] def bub_nw(self):
bub_nw = ((self.counters['w_a'] * self.counters['w_d'])**0.5 + self.counters['w_a'])/\
((self.counters['a'] * self.counters['d'])**0.5 + self.counters['a'] + self.counters['total_dis'])
return bub_nw
[docs] def ct1_nw(self):
ct1_nw = (log(1 + self.counters['w_a'] + self.counters['w_d']))/\
(log(1 + self.counters['p']))
return ct1_nw
[docs] def ct2_nw(self):
ct2_nw = (log(1 + self.counters['w_p']) - log(1 + self.counters['total_w_dis']))/\
(log(1 + self.counters['p']))
return ct2_nw
[docs] def ct3_nw(self):
ct3_nw = (log(1 + self.counters['w_a']))/\
(log(1 + self.counters['p']))
return ct3_nw
[docs] def ct4_nw(self):
ct4_nw = (log(1 + self.counters['w_a']))/\
(log(1 + self.counters['a'] + self.counters['total_dis']))
return ct4_nw
[docs] def fai_nw(self):
fai_nw = (self.counters['w_a'] + 0.5 * self.counters['w_d'])/\
(self.counters['p'])
return fai_nw
[docs] def gle_nw(self):
gle_nw = (2 * self.counters['w_a'])/\
(2 * self.counters['a'] + self.counters['total_dis'])
return gle_nw
[docs] def ja_nw(self):
ja_nw = (3 * self.counters['w_a'])/\
(3 * self.counters['a'] + self.counters['total_dis'])
return ja_nw
[docs] def ja0_nw(self):
ja0_nw = (3 * self.counters['total_w_sim'])/\
(3 * self.counters['total_sim'] + self.counters['total_dis'])
return ja0_nw
[docs] def jt_nw(self):
jt_nw = (self.counters['w_a'])/\
(self.counters['a'] + self.counters['total_dis'])
return jt_nw
[docs] def rt_nw(self):
rt_nw = (self.counters['total_w_sim'])/\
(self.counters['p'] + self.counters['total_dis'])
return rt_nw
[docs] def rr_nw(self):
rr_nw = (self.counters['w_a'])/\
(self.counters['p'])
return rr_nw
[docs] def sm_nw(self):
sm_nw = (self.counters['total_w_sim'])/\
(self.counters['p'])
return sm_nw
[docs] def ss1_nw(self):
ss1_nw = (self.counters['w_a'])/\
(self.counters['a'] + 2 * self.counters['total_dis'])
return ss1_nw
[docs] def ss2_nw(self):
ss2_nw = (2 * self.counters['total_w_sim'])/\
(self.counters['p'] + self.counters['total_sim'])
return ss2_nw
# Non-Weighted Indices for Denominator and Numerator
[docs] def rr_nw_nw(self):
rr_nw_nw = (self.counters['a'])/\
(self.counters['p'])
return rr_nw_nw
[docs] def sm_nw_nw(self):
sm_nw_nw = (self.counters['total_sim'])/\
(self.counters['p'])
return sm_nw_nw
[docs]def calc_medoid(data, n_ary = 'RR', w_factor = 'fraction', weight = 'nw', c_total = None):
"""*O(N)* medoid calculation for a set.
Parameters
----------
data : array-like of shape (n_objects, n_features)
Vector containing the sums of each column of the fingerprint matrix.
n_ary : str
string with the initials of the desired similarity index to calculate the medoid from.
w_factor : {'fraction', 'power_n'}
Type of weight function that will be used.
weight : str
Type of weight function that will be used.
c_total : array-like of shape (n_objects, n_features)
Vector containing the sums of each column of the fingerprint matrix.
Raises
------
ValueError
If the dimensions of the objects and columnwise sum differ.
Returns
-------
int
The index of the medoid.
"""
if c_total and len(data[0]) != c_total: raise ValueError("Dimensions of objects and columnwise sum differ")
elif not c_total: c_total = np.sum(data, axis = 0)
n_objects = len(data)
index = n_objects + 1
min_sim = 1.01
comp_sums = c_total - data
for i, obj in enumerate(comp_sums):
SI = SimilarityIndex(obj, n_objects = n_objects - 1, n_ary = n_ary,
w_factor = w_factor, weight = weight)
sim_index = SI()
if sim_index < min_sim:
min_sim = sim_index
index = i
else:
pass
return index
[docs]def calc_outlier(data, n_ary = 'RR', w_factor = 'fraction', weight = 'nw', c_total = None):
"""*O(N)* outlier calculation for a set.
Parameters
----------
data : array-like of shape (n_objects, n_features)
Vector containing the sums of each column of the fingerprint matrix.
n_ary : str
string with the initials of the desired similarity index to calculate the medoid from.
w_factor : {'fraction', 'power_n'}
Type of weight function that will be used.
weight : str
Type of weight function that will be used.
c_total : array-like of shape (n_objects, n_features)
Vector containing the sums of each column of the fingerprint matrix.
Raises
------
ValueError
If the dimensions of the objects and columnwise sum differ.
Returns
-------
int
The index of the outlier.
"""
if c_total and len(data[0]) != c_total: raise ValueError("Dimensions of objects and columnwise sum differ")
elif not c_total: c_total = np.sum(data, axis = 0)
n_objects = len(data)
index = n_objects + 1
max_sim = -0.01
comp_sums = c_total - data
for i, obj in enumerate(comp_sums):
SI = SimilarityIndex(obj, n_objects = n_objects - 1, n_ary = n_ary,
w_factor = w_factor, weight = weight)
sim_index = SI()
if sim_index > max_sim:
max_sim = sim_index
index = i
else:
pass
return index
[docs]def calc_comp_sim(data, c_threshold = None, n_ary = 'RR', w_factor = 'fraction',
weight = 'nw', c_total = None):
"""*O(N)* complementary similarity calculation for a set.
Parameters
----------
data : array-like of shape (n_objects, n_features)
Vector containing the sums of each column of the fingerprint matrix.
n_objects : int
Number of objects to be compared.
c_threshold : {None, 'dissimilar', int}
Coincidence threshold.
n_ary : str
string with the initials of the desired similarity index to calculate the medoid from.
w_factor : {'fraction', 'power_n'}
Type of weight function that will be used.
weight : str
Type of weight function that will be used.
c_total : array-like of shape (n_objects, n_features)
Vector containing the sums of each column of the fingerprint matrix.
Raises
------
ValueError
If the dimensions of the objects and columnwise sum differ.
Returns
-------
list
A list with the complementary similarities of all the molecules in the set.
"""
if c_total and len(data[0]) != c_total: raise ValueError("Dimensions of objects and columnwise sum differ")
elif not c_total: c_total = np.sum(data, axis = 0)
n_objects = len(data)
comp_sums = c_total - data
total = []
for i, obj in enumerate(comp_sums):
SI = SimilarityIndex(obj, n_objects = n_objects - 1, n_ary = n_ary, w_factor = w_factor, weight = weight)
sim_index = SI()
total.append((i, sim_index))
return total