"""Fingerprint array comparison metrics.
Each is fully compatible with both dense and sparse inputs.
Author: Seth Axen
E-mail: seth.axen@gmail.com
"""
from __future__ import division
import numpy as np
import scipy
from scipy.sparse import csr_matrix, issparse, vstack
from ._fast import soergel as fast_soergel
def tanimoto(X, Y=None):
    """Compute the Tanimoto coefficients between `X` and `Y`.

    Data must be binary. This is not checked.

    Parameters
    ----------
    X : array_like or sparse matrix
        with shape (`n_fprints_X`, `n_bits`).
    Y : array_like or sparse matrix, optional
        with shape (`n_fprints_Y`, `n_bits`). If omitted, `X` is compared
        against itself.

    Returns
    -------
    tanimoto : array of shape (`n_fprints_X`, `n_fprints_Y`)
        Entries whose denominator is zero (e.g. two fingerprints with no
        bits set) are reported as 0.

    See Also
    --------
    soergel: Analog to Tanimoto for non-binary data.
    cosine, dice, pearson
    """
    X, Y = _check_array_pair(X, Y)
    Xbits, Ybits, XYbits = _get_bitcount_arrays(X, Y, return_XYbits=True)
    # A zero denominator here always comes with a zero numerator, and NumPy
    # classifies 0/0 as "invalid" rather than "divide"; silence both and let
    # nan_to_num map the resulting nan to 0.
    with np.errstate(divide="ignore", invalid="ignore"):
        union_bits = Xbits + Ybits.T - XYbits
        return np.asarray(np.nan_to_num(XYbits / union_bits))
def soergel(X, Y=None):
    """Compute the Soergel similarities between `X` and `Y`.

    Soergel similarity is the complement of Soergel distance. It plays the
    role of the Tanimoto coefficient for count/float-based data, and for
    binary data it is equivalent to the Tanimoto coefficient.

    Parameters
    ----------
    X : array_like or sparse matrix
        with shape (`n_fprints_X`, `n_bits`).
    Y : array_like or sparse matrix, optional
        with shape (`n_fprints_Y`, `n_bits`).

    Returns
    -------
    soergel : array of shape (`n_fprints_X`, `n_fprints_Y`)

    See Also
    --------
    tanimoto: A fast version of this function for binary data.
    pearson: Pearson correlation, also appropriate for non-binary data.
    cosine, dice
    """
    X, Y = _check_array_pair(X, Y)
    # Both arrays share one representation after the pair check, so testing
    # X alone tells the compiled routine which code path to take.
    inputs_are_sparse = issparse(X)
    return fast_soergel(X, Y, sparse=inputs_are_sparse)
def dice(X, Y=None):
    """Compute the Dice coefficients between `X` and `Y`.

    Data must be binary. This is not checked.

    Parameters
    ----------
    X : array_like or sparse matrix
        with shape (`n_fprints_X`, `n_bits`).
    Y : array_like or sparse matrix, optional
        with shape (`n_fprints_Y`, `n_bits`). If omitted, `X` is compared
        against itself.

    Returns
    -------
    dice : array of shape (`n_fprints_X`, `n_fprints_Y`)
        Entries whose denominator is zero (e.g. two fingerprints with no
        bits set) are reported as 0.

    See Also
    --------
    cosine, soergel, tanimoto, pearson
    """
    X, Y = _check_array_pair(X, Y)
    Xbits, Ybits, XYbits = _get_bitcount_arrays(X, Y, return_XYbits=True)
    # Two empty fingerprints give 0/0, which NumPy reports as "invalid" (not
    # "divide"); silence both and let nan_to_num map the nan to 0.
    with np.errstate(divide="ignore", invalid="ignore"):
        total_bits = Xbits + Ybits.T
        return np.asarray(np.nan_to_num(2 * XYbits / total_bits))
def cosine(X, Y=None, assume_binary=False):
    """Compute the Cosine similarities between `X` and `Y`.

    Parameters
    ----------
    X : array_like or sparse matrix
        with shape (`n_fprints_X`, `n_bits`).
    Y : array_like or sparse matrix, optional
        with shape (`n_fprints_Y`, `n_bits`). If omitted, `X` is compared
        against itself.
    assume_binary : bool, optional
        Assume data is binary (results in efficiency boost). If data is not
        binary, the result will be incorrect. Only consulted for sparse
        inputs; dense inputs are always delegated to SciPy's ``cdist``.

    Returns
    -------
    cosine : array of shape (`n_fprints_X`, `n_fprints_Y`)

    See Also
    --------
    dice, soergel, tanimoto
    """
    X, Y = _check_array_pair(X, Y)
    if not issparse(X):
        # Import the function explicitly: a bare ``import scipy`` does not
        # guarantee that the ``scipy.spatial`` subpackage has been loaded.
        from scipy.spatial.distance import cdist

        return 1.0 - cdist(X, Y, metric="cosine")

    if not assume_binary:
        # np.asarray keeps the documented ndarray return type even if the
        # helper hands back an np.matrix.
        return np.asarray(_sparse_cosine(X, Y))

    Xbits, Ybits, XYbits = _get_bitcount_arrays(X, Y, return_XYbits=True)
    # For binary rows the squared norm equals the bit count. np.multiply
    # broadcasts (n, 1) against (1, m) to the full (n, m) grid whether the
    # counts arrive as ndarrays or np.matrix objects.
    norm_products = np.sqrt(np.multiply(Xbits, Ybits.T))
    # An empty fingerprint yields 0/0 ("invalid" in NumPy terms, not
    # "divide"); silence both and map the nan to 0.
    with np.errstate(divide="ignore", invalid="ignore"):
        return np.asarray(np.nan_to_num(XYbits / norm_products))
def pearson(X, Y=None):
    """Compute the Pearson correlation between `X` and `Y`.

    Parameters
    ----------
    X : array_like or sparse matrix
        with shape (`n_fprints_X`, `n_bits`).
    Y : array_like or sparse matrix, optional
        with shape (`n_fprints_Y`, `n_bits`). If omitted, `X` is compared
        against itself.

    Returns
    -------
    pearson : array of shape (`n_fprints_X`, `n_fprints_Y`)
        Undefined correlations (nan) are reported as 0.

    See Also
    --------
    soergel: Soergel similarity for non-binary data
    cosine, dice, tanimoto
    """
    X, Y = _check_array_pair(X, Y)
    Xlen = X.shape[0]
    if issparse(X):
        # Centering destroys sparsity, so work on a dense ndarray from here
        # on; explicit ndarray ops avoid depending on np.matrix semantics.
        stacked = vstack((X, Y), format="csr")
        centered = stacked.toarray() - np.asarray(stacked.mean(axis=1))
        cov = centered.dot(centered.T) / (centered.shape[1] - 1.0)
        d = np.sqrt(np.diag(cov))
        # Constant rows have zero variance and give 0/0, which NumPy flags
        # as "invalid" rather than "divide"; silence both.
        with np.errstate(divide="ignore", invalid="ignore"):
            corr = cov / np.outer(d, d)
    else:
        # np.corrcoef is the function the deprecated ``scipy.corrcoef``
        # alias pointed to.
        with np.errstate(divide="ignore", invalid="ignore"):
            corr = np.corrcoef(X, Y)
    # The stacked correlation matrix is (X+Y) x (X+Y); the upper-right block
    # holds the X-versus-Y correlations.
    return np.asarray(np.nan_to_num(corr[:Xlen, Xlen:]))
def _check_array(arr, dtype=np.double, force_sparse=False):
    """Coerce `arr` to `dtype`, as a CSR matrix or a dense array.

    Parameters
    ----------
    arr : array_like or sparse matrix
        Input data.
    dtype : data-type, optional
        Element type of the result.
    force_sparse : bool, optional
        Return a CSR matrix even if `arr` is dense.

    Returns
    -------
    ndarray or csr_matrix
        A CSR matrix if `arr` is sparse or `force_sparse` is set, otherwise
        a dense array. A dense array that already has `dtype` is returned
        as-is, without copying.
    """
    if force_sparse or issparse(arr):
        return csr_matrix(arr, copy=False, dtype=dtype)
    # asanyarray (rather than arr.astype) also accepts plain sequences such
    # as nested lists, while still avoiding a copy when no cast is needed.
    return np.asanyarray(arr, dtype=dtype)
def _check_array_pair(X, Y=None, dtype=np.double, force_sparse=False):
    """Validate and coerce a pair of fingerprint arrays.

    Parameters
    ----------
    X : array_like or sparse matrix
        First array.
    Y : array_like or sparse matrix, optional
        Second array. If omitted, `X` is paired with itself.
    dtype : data-type, optional
        Element type both arrays are converted to.
    force_sparse : bool, optional
        Convert both arrays to CSR matrices even if both are dense.

    Returns
    -------
    X, Y : ndarray or csr_matrix
        Converted arrays. If either input is sparse, both are returned
        sparse. When `Y` was omitted or was the same object as `X`, the
        returned `Y` is the very same object as the returned `X`.

    Raises
    ------
    ValueError
        If `X` and `Y` differ in their second dimension.
    """
    # np.shape works on ndarrays, sparse matrices and plain sequences alike.
    if Y is not None and np.shape(X)[1] != np.shape(Y)[1]:
        raise ValueError("Arrays must have same width.")

    # Decide this BEFORE converting X: conversion may return a new object,
    # after which an identity test against the caller's Y would fail and
    # the shared-input shortcut would be lost.
    y_is_x = Y is None or Y is X

    # Ensure that if one array is sparse, both end up sparse.
    force_sparse = bool(force_sparse or issparse(X) or issparse(Y))

    X = _check_array(X, dtype=dtype, force_sparse=force_sparse)
    if y_is_x:
        Y = X
    else:
        Y = _check_array(Y, dtype=dtype, force_sparse=force_sparse)
    return X, Y
def _row_bitcounts(arr, sparse):
    """Return the row sums of `arr` as an ndarray of shape (n_rows, 1)."""
    if sparse:
        # Sparse matrices return an np.matrix from sum(); convert it so
        # both code paths hand back a plain ndarray of the same shape.
        return np.asarray(arr.sum(axis=1))
    return np.sum(arr, axis=1, keepdims=True)


def _get_bitcount_arrays(X, Y, return_XYbits=False):
    """Compute per-row sums of `X` and `Y` and, optionally, their products.

    Parameters
    ----------
    X, Y : ndarray or csr_matrix
        Arrays with the same number of columns; both dense or both sparse.
        Whether the sparse path is taken is decided from `X` alone.
    return_XYbits : bool, optional
        Also return the matrix product of `X` with the transpose of `Y`.

    Returns
    -------
    Xbits : ndarray of shape (n_rows_X, 1)
        Row sums of `X`.
    Ybits : ndarray of shape (n_rows_Y, 1)
        Row sums of `Y`; the same object as `Xbits` when `Y is X`.
    XYbits : ndarray of shape (n_rows_X, n_rows_Y)
        Only returned if `return_XYbits` is True.
    """
    sparse = issparse(X)
    Xbits = _row_bitcounts(X, sparse)
    # Reuse the X sums when both inputs are the same object.
    Ybits = Xbits if Y is X else _row_bitcounts(Y, sparse)
    if not return_XYbits:
        return Xbits, Ybits

    if sparse:
        XYbits = X.dot(Y.T).toarray()
    else:
        XYbits = np.dot(X, Y.T)
    return Xbits, Ybits, XYbits
def _sparse_cosine(X, Y):
    """Compute cosine similarities between the rows of two sparse matrices.

    Parameters
    ----------
    X, Y : sparse matrix
        Matrices with the same number of columns.

    Returns
    -------
    ndarray of shape (n_rows_X, n_rows_Y)
        Cosine similarity of every row pair. Pairs involving an all-zero
        row are reported as 0.
    """
    # Row-wise Euclidean norms as (n, 1) ndarrays. np.sqrt replaces the
    # deprecated ``scipy.sqrt`` alias; the sums of squares are non-negative,
    # so the results are identical.
    Xnorm = np.sqrt(np.asarray(X.multiply(X).sum(axis=1)))
    if Y is X:
        Ynorm = Xnorm
    else:
        Ynorm = np.sqrt(np.asarray(Y.multiply(Y).sum(axis=1)))
    XY = X.dot(Y.T).toarray()
    # An all-zero row gives 0/0, which NumPy reports as "invalid" rather
    # than "divide"; silence both and map the nan to 0.
    with np.errstate(divide="ignore", invalid="ignore"):
        return np.nan_to_num(XY / (Xnorm * Ynorm.T))