"""Fingerprint array comparison metrics.
Each is fully compatible with both dense and sparse inputs.
Author: Seth Axen
E-mail: seth.axen@gmail.com
"""
from __future__ import division
import numpy as np
import scipy
from scipy.sparse import csr_matrix, issparse, vstack
import scipy.sparse.linalg
import scipy.spatial
from e3fp.util import maybe_jit
[docs]def tanimoto(X, Y=None):
"""Compute the Tanimoto coefficients between `X` and `Y`.
Data must be binary. This is not checked.
Parameters
----------
X : array_like or sparse matrix
with shape (`n_fprints_X`, `n_bits`).
Y : array_like or sparse matrix, optional
with shape (`n_fprints_Y`, `n_bits`).
Returns
-------
tanimoto : array of shape (`n_fprints_X`, `n_fprints_Y`)
See Also
--------
soergel: Analog to Tanimoto for non-binary data.
cosine, dice, pearson
"""
X, Y = _check_array_pair(X, Y)
Xbits, Ybits, XYbits = _get_bitcount_arrays(X, Y, return_XYbits=True)
with np.errstate(divide="ignore"): # handle 0 in denominator
return np.asarray(np.nan_to_num(XYbits / (Xbits + Ybits.T - XYbits)))
[docs]def soergel(X, Y=None):
"""Compute the Soergel similarities between `X` and `Y`.
Soergel similarity is the complement of Soergel distance and can be
thought of as the analog of the Tanimoto coefficient for count/float-based
data. For binary data, it is equivalent to the Tanimoto coefficient.
Parameters
----------
X : array_like or sparse matrix
with shape (`n_fprints_X`, `n_bits`).
Y : array_like or sparse matrix, optional
with shape (`n_fprints_Y`, `n_bits`).
Returns
-------
soergel : array of shape (`n_fprints_X`, `n_fprints_Y`)
Notes
--------
If Numba is available, this function is jit-compiled and much more efficient.
See Also
--------
tanimoto: A fast version of this function for binary data.
pearson: Pearson correlation, also appropriate for non-binary data.
cosine, dice
"""
X, Y = _check_array_pair(X, Y)
S = np.empty((X.shape[0], Y.shape[0]), dtype=float)
if issparse(X):
return _sparse_soergel(X.data, X.indices, X.indptr,
Y.data, Y.indices, Y.indptr, S)
return _dense_soergel(X, Y, S)
[docs]def dice(X, Y=None):
"""Compute the Dice coefficients between `X` and `Y`.
Data must be binary. This is not checked.
Parameters
----------
X : array_like or sparse matrix
with shape (`n_fprints_X`, `n_bits`).
Y : array_like or sparse matrix, optional
with shape (`n_fprints_Y`, `n_bits`).
Returns
-------
dice : array of shape (`n_fprints_X`, `n_fprints_Y`)
See Also
--------
cosine, soergel, tanimoto, pearson
"""
X, Y = _check_array_pair(X, Y)
Xbits, Ybits, XYbits = _get_bitcount_arrays(X, Y, return_XYbits=True)
with np.errstate(divide="ignore"): # handle 0 in denominator
return np.asarray(np.nan_to_num(2 * XYbits / (Xbits + Ybits.T)))
[docs]def cosine(X, Y=None, assume_binary=False):
"""Compute the Cosine similarities between `X` and `Y`.
Parameters
----------
X : array_like or sparse matrix
with shape (`n_fprints_X`, `n_bits`).
Y : array_like or sparse matrix, optional
with shape (`n_fprints_Y`, `n_bits`).
assume_binary : bool, optional
Assume data is binary (results in efficiency boost). If data is not
binary, the result will be incorrect.
Returns
-------
cosine : array of shape (`n_fprints_X`, `n_fprints_Y`)
See Also
--------
dice, soergel, tanimoto
"""
X, Y = _check_array_pair(X, Y)
if not issparse(X):
return 1.0 - scipy.spatial.distance.cdist(X, Y, metric="cosine")
if assume_binary:
Xbits, Ybits, XYbits = _get_bitcount_arrays(X, Y, return_XYbits=True)
with np.errstate(divide="ignore"): # handle 0 in denominator
return np.asarray(np.nan_to_num(XYbits / np.sqrt(Xbits * Ybits.T)))
else:
return _sparse_cosine(X, Y)
[docs]def pearson(X, Y=None):
"""Compute the Pearson correlation between `X` and `Y`.
Parameters
----------
X : array_like or sparse matrix
with shape (`n_fprints_X`, `n_bits`).
Y : array_like or sparse matrix, optional
with shape (`n_fprints_Y`, `n_bits`).
Returns
-------
pearson : array of shape (`n_fprints_X`, `n_fprints_Y`)
See Also
--------
soergel: Soergel similarity for non-binary data
cosine, dice, tanimoto
"""
X, Y = _check_array_pair(X, Y)
Xlen = X.shape[0]
if issparse(X):
X = vstack((X, Y), format="csr")
X = X - X.mean(axis=1)
cov = (X * X.T) / (X.shape[1] - 1.0)
d = np.sqrt(np.diag(cov))
with np.errstate(divide="ignore"): # handle 0 in denominator
pearson = cov / np.outer(d, d)
else:
with np.errstate(divide="ignore"): # handle 0 in denominator
pearson = np.corrcoef(X, Y)
return np.asarray(np.nan_to_num(pearson[:Xlen, Xlen:]))
def _check_array(arr, dtype=float, force_sparse=False):
if force_sparse or issparse(arr):
return csr_matrix(arr, copy=False, dtype=dtype)
else:
return arr.astype(dtype, copy=False)
def _check_array_pair(X, Y=None, dtype=float, force_sparse=False):
if Y is not None and X.shape[1] != Y.shape[1]:
raise ValueError("Arrays must have same width.")
if force_sparse or issparse(X) or issparse(Y):
force_sparse = True # ensure if one is sparse, all are sparse.
X = _check_array(X, dtype=dtype, force_sparse=force_sparse)
if Y is None or Y is X:
Y = X
else:
Y = _check_array(Y, dtype=dtype, force_sparse=force_sparse)
return X, Y
def _get_bitcount_arrays(X, Y, return_XYbits=False):
if issparse(X):
Xbits = np.sum(X, axis=1)
if Y is X:
Ybits = Xbits
else:
Ybits = np.sum(Y, axis=1)
if return_XYbits:
XYbits = (X * Y.T).toarray()
return Xbits, Ybits, XYbits
else:
Xbits = np.sum(X, axis=1, keepdims=True)
if Y is X:
Ybits = Xbits
else:
Ybits = np.sum(Y, axis=1, keepdims=True)
if return_XYbits:
XYbits = np.dot(X, Y.T)
return Xbits, Ybits, XYbits
return Xbits, Ybits
def _sparse_cosine(X, Y):
Xnorm = scipy.sparse.linalg.norm(X, axis=1)
if Y is X:
Ynorm = Xnorm
else:
Ynorm = scipy.sparse.linalg.norm(Y, axis=1)
XY = (X * Y.T).toarray()
with np.errstate(divide="ignore"): # handle 0 in denominator
return np.nan_to_num(XY / np.outer(Xnorm, Ynorm))
@maybe_jit(nopython=True, nogil=True, cache=True)
def _dense_soergel(X, Y, S):
for ix in range(S.shape[0]):
for iy in range(S.shape[1]):
sum_abs_diff = 0
sum_max = 0
for j in range(X.shape[1]):
diff = X[ix, j] - Y[iy, j]
if diff > 0:
sum_abs_diff += diff
sum_max += X[ix, j]
else:
sum_abs_diff -= diff
sum_max += Y[iy, j]
if sum_max == 0:
S[ix, iy] = 0
continue
S[ix, iy] = 1 - sum_abs_diff / sum_max
return S
@maybe_jit(nopython=True, nogil=True, cache=True)
def _sparse_soergel(Xdata, Xindices, Xindptr, Ydata, Yindices, Yindptr, S):
for ix in range(S.shape[0]):
if Xindptr[ix] == Xindptr[ix + 1]:
for iy in range(S.shape[1]): # no X values in row
S[ix, iy] = 0
continue
jxindmax = Xindptr[ix + 1] - 1
for iy in range(S.shape[1]):
if Yindptr[iy] == Yindptr[iy + 1]: # no Y values in row
S[ix, iy] = 0
continue
sum_abs_diff = 0
sum_max = 0
# Implementation of the final step of merge sort
jyindmax = Yindptr[iy + 1] - 1
jx = Xindptr[ix]
jy = Yindptr[iy]
while jx <= jxindmax and jy <= jyindmax:
jxind = Xindices[jx]
jyind = Yindices[jy]
if jxind < jyind:
sum_max += Xdata[jx]
sum_abs_diff += Xdata[jx]
jx += 1
elif jyind < jxind:
sum_max += Ydata[jy]
sum_abs_diff += Ydata[jy]
jy += 1
else:
diff = Xdata[jx] - Ydata[jy]
if diff > 0:
sum_abs_diff += diff
sum_max += Xdata[jx]
else:
sum_abs_diff -= diff
sum_max += Ydata[jy]
jx += 1
jy += 1
while jx <= jxindmax:
sum_max += Xdata[jx]
sum_abs_diff += Xdata[jx]
jx += 1
while jy <= jyindmax:
sum_max += Ydata[jy]
sum_abs_diff += Ydata[jy]
jy += 1
if sum_max == 0:
S[ix, iy] = 0
continue
S[ix, iy] = 1 - sum_abs_diff / sum_max
return S