Source code for e3fp.fingerprint.metrics.array_metrics

"""Fingerprint array comparison metrics.

Each is fully compatible with both dense and sparse inputs.

Author: Seth Axen
E-mail: seth.axen@gmail.com
"""
from __future__ import division

import numpy as np
import scipy
from scipy.sparse import csr_matrix, issparse, vstack
from ._fast import soergel as fast_soergel


def tanimoto(X, Y=None):
    """Compute the Tanimoto coefficients between `X` and `Y`.

    Data must be binary. This is not checked.

    Pairs in which both fingerprints are empty (no bits set) have an
    undefined coefficient (0/0); these entries are reported as 0.

    Parameters
    ----------
    X : array_like or sparse matrix
        with shape (`n_fprints_X`, `n_bits`).
    Y : array_like or sparse matrix, optional
        with shape (`n_fprints_Y`, `n_bits`).

    Returns
    -------
    tanimoto : array of shape (`n_fprints_X`, `n_fprints_Y`)

    See Also
    --------
    soergel: Analog to Tanimoto for non-binary data.
    cosine, dice, pearson
    """
    X, Y = _check_array_pair(X, Y)
    Xbits, Ybits, XYbits = _get_bitcount_arrays(X, Y, return_XYbits=True)
    # |X or Y| = |X| + |Y| - |X and Y|, broadcast to (n_X, n_Y).
    union_bits = Xbits + Ybits.T - XYbits
    # An all-zero pair gives 0/0, which NumPy reports as "invalid" rather
    # than "divide"; silence both and map the resulting NaN to 0.
    with np.errstate(divide="ignore", invalid="ignore"):
        return np.asarray(np.nan_to_num(XYbits / union_bits))
def soergel(X, Y=None):
    """Compute the Soergel similarities between `X` and `Y`.

    Soergel similarity is the complement of Soergel distance and can be
    thought of as the analog of the Tanimoto coefficient for
    count/float-based data. For binary data, it is equivalent to the
    Tanimoto coefficient.

    Parameters
    ----------
    X : array_like or sparse matrix
        with shape (`n_fprints_X`, `n_bits`).
    Y : array_like or sparse matrix, optional
        with shape (`n_fprints_Y`, `n_bits`).

    Returns
    -------
    soergel : array of shape (`n_fprints_X`, `n_fprints_Y`)

    See Also
    --------
    tanimoto: A fast version of this function for binary data.
    pearson: Pearson correlation, also appropriate for non-binary data.
    cosine, dice
    """
    X, Y = _check_array_pair(X, Y)
    # After pairing, X and Y share a representation, so inspecting X is
    # enough to tell the compiled routine which code path to take.
    inputs_are_sparse = issparse(X)
    return fast_soergel(X, Y, sparse=inputs_are_sparse)
def dice(X, Y=None):
    """Compute the Dice coefficients between `X` and `Y`.

    Data must be binary. This is not checked.

    Pairs in which both fingerprints are empty (no bits set) have an
    undefined coefficient (0/0); these entries are reported as 0.

    Parameters
    ----------
    X : array_like or sparse matrix
        with shape (`n_fprints_X`, `n_bits`).
    Y : array_like or sparse matrix, optional
        with shape (`n_fprints_Y`, `n_bits`).

    Returns
    -------
    dice : array of shape (`n_fprints_X`, `n_fprints_Y`)

    See Also
    --------
    cosine, soergel, tanimoto, pearson
    """
    X, Y = _check_array_pair(X, Y)
    Xbits, Ybits, XYbits = _get_bitcount_arrays(X, Y, return_XYbits=True)
    # |X| + |Y| for every pair, broadcast to (n_X, n_Y).
    total_bits = Xbits + Ybits.T
    # An all-zero pair gives 0/0, which NumPy reports as "invalid" rather
    # than "divide"; silence both and map the resulting NaN to 0.
    with np.errstate(divide="ignore", invalid="ignore"):
        return np.asarray(np.nan_to_num(2 * XYbits / total_bits))
def cosine(X, Y=None, assume_binary=False):
    """Compute the Cosine similarities between `X` and `Y`.

    Parameters
    ----------
    X : array_like or sparse matrix
        with shape (`n_fprints_X`, `n_bits`).
    Y : array_like or sparse matrix, optional
        with shape (`n_fprints_Y`, `n_bits`).
    assume_binary : bool, optional
        Assume data is binary (results in efficiency boost). If data is not
        binary, the result will be incorrect. Only consulted for sparse
        input; dense input is always delegated to SciPy's `cdist`.

    Returns
    -------
    cosine : array of shape (`n_fprints_X`, `n_fprints_Y`)

    See Also
    --------
    dice, soergel, tanimoto
    """
    X, Y = _check_array_pair(X, Y)

    if not issparse(X):
        # Import the submodule explicitly: a bare ``import scipy`` does not
        # guarantee that ``scipy.spatial`` has been loaded.
        from scipy.spatial.distance import cdist

        return 1.0 - cdist(X, Y, metric="cosine")

    if not assume_binary:
        # Normalise to a plain ndarray so every metric returns the same type.
        return np.asarray(_sparse_cosine(X, Y))

    Xbits, Ybits, XYbits = _get_bitcount_arrays(X, Y, return_XYbits=True)
    # For binary rows the squared norm equals the bit count. Convert to
    # ndarrays first so that ``*`` broadcasts (n_X, 1) against (1, n_Y)
    # whether the helper hands back ndarrays or np.matrix objects.
    norm_products = np.sqrt(np.asarray(Xbits) * np.asarray(Ybits).T)
    # An all-zero fingerprint gives 0/0, which NumPy reports as "invalid"
    # rather than "divide"; silence both and map the resulting NaN to 0.
    with np.errstate(divide="ignore", invalid="ignore"):
        return np.asarray(np.nan_to_num(XYbits / norm_products))
def pearson(X, Y=None):
    """Compute the Pearson correlation between `X` and `Y`.

    Rows with zero variance have an undefined correlation (0/0); these
    entries are reported as 0.

    Parameters
    ----------
    X : array_like or sparse matrix
        with shape (`n_fprints_X`, `n_bits`).
    Y : array_like or sparse matrix, optional
        with shape (`n_fprints_Y`, `n_bits`).

    Returns
    -------
    pearson : array of shape (`n_fprints_X`, `n_fprints_Y`)

    See Also
    --------
    soergel: Soergel similarity for non-binary data
    cosine, dice, tanimoto
    """
    X, Y = _check_array_pair(X, Y)
    Xlen = X.shape[0]

    # Constant rows produce 0/0 ("invalid") and a single-column input divides
    # by zero ("divide"); both are expected here and cleaned up below.
    with np.errstate(divide="ignore", invalid="ignore"):
        if issparse(X):
            stacked = vstack((X, Y), format="csr")
            # Subtracting the row means densifies the data; work on a plain
            # ndarray from here on.
            centered = np.asarray(stacked - stacked.mean(axis=1))
            cov = centered.dot(centered.T) / (centered.shape[1] - 1.0)
            std = np.sqrt(np.diag(cov))
            corr = cov / np.outer(std, std)
        else:
            # numpy.corrcoef replaces the deprecated ``scipy.corrcoef`` alias.
            corr = np.corrcoef(X, Y)

    # Rows of X were stacked above rows of Y, so the X-vs-Y correlations are
    # the upper-right block of the full correlation matrix.
    return np.asarray(np.nan_to_num(corr[:Xlen, Xlen:]))
def _check_array(arr, dtype=np.double, force_sparse=False):
    """Return `arr` as a CSR matrix or array with the requested dtype.

    Parameters
    ----------
    arr : array_like or sparse matrix
        Input data.
    dtype : data-type, optional
        Type the result is cast to.
    force_sparse : bool, optional
        Convert to a CSR matrix even when `arr` is dense.

    Returns
    -------
    scipy.sparse.csr_matrix or ndarray
        Sparse if `arr` was sparse or `force_sparse` is set, dense
        otherwise. No copy is made when the input already matches.
    """
    if force_sparse or issparse(arr):
        return csr_matrix(arr, copy=False, dtype=dtype)
    # asanyarray lets plain sequences through while leaving ndarrays (and
    # their subclasses) untouched.
    return np.asanyarray(arr).astype(dtype, copy=False)


def _check_array_pair(X, Y=None, dtype=np.double, force_sparse=False):
    """Validate and convert two arrays to a shared dtype and representation.

    Parameters
    ----------
    X : array_like or sparse matrix
        First array.
    Y : array_like or sparse matrix, optional
        Second array. If omitted (or the very same object as `X`), the
        converted `X` is returned in both positions.
    dtype : data-type, optional
        Type both results are cast to.
    force_sparse : bool, optional
        Convert both to CSR matrices regardless of input type.

    Returns
    -------
    X, Y : pair of ndarrays or pair of CSR matrices
        If either input is sparse, both outputs are sparse.

    Raises
    ------
    ValueError
        If `X` and `Y` have a different number of columns.
    """
    # Record identity before conversion: callers use ``Y is X`` downstream to
    # skip duplicate work, and conversion may replace X with a new object.
    same_input = Y is None or Y is X
    # np.shape works for sequences, ndarrays and sparse matrices alike.
    if not same_input and np.shape(X)[1] != np.shape(Y)[1]:
        raise ValueError("Arrays must have same width.")

    # Ensure that if one input is sparse, both are.
    force_sparse = bool(force_sparse or issparse(X) or issparse(Y))

    X = _check_array(X, dtype=dtype, force_sparse=force_sparse)
    if same_input:
        return X, X
    return X, _check_array(Y, dtype=dtype, force_sparse=force_sparse)


def _get_bitcount_arrays(X, Y, return_XYbits=False):
    """Compute per-row sums of `X` and `Y` and, optionally, their products.

    Parameters
    ----------
    X, Y : ndarray or sparse matrix
        Arrays of shape (`n_X`, `n_bits`) and (`n_Y`, `n_bits`), both dense
        or both sparse.
    return_XYbits : bool, optional
        Also return the (`n_X`, `n_Y`) matrix of row-by-row dot products.

    Returns
    -------
    Xbits : array of shape (`n_X`, 1)
    Ybits : array of shape (`n_Y`, 1)
        The same object as `Xbits` when `Y is X`.
    XYbits : ndarray of shape (`n_X`, `n_Y`)
        Only returned if `return_XYbits` is True.
    """
    sparse_input = issparse(X)

    def _row_sums(M):
        if sparse_input:
            # Sparse sums come back as np.matrix; normalise to a column
            # ndarray so dense and sparse inputs behave identically.
            return np.asarray(M.sum(axis=1)).reshape(-1, 1)
        return np.sum(M, axis=1, keepdims=True)

    Xbits = _row_sums(X)
    Ybits = Xbits if Y is X else _row_sums(Y)
    if not return_XYbits:
        return Xbits, Ybits

    if sparse_input:
        XYbits = X.dot(Y.T).toarray()
    else:
        XYbits = np.dot(X, Y.T)
    return Xbits, Ybits, XYbits


def _sparse_cosine(X, Y):
    """Compute cosine similarities between the rows of two sparse matrices.

    Parameters
    ----------
    X, Y : sparse matrix
        Matrices of shape (`n_X`, `n_bits`) and (`n_Y`, `n_bits`).

    Returns
    -------
    ndarray of shape (`n_X`, `n_Y`)
        Entries involving an all-zero row (0/0) are reported as 0.
    """

    def _row_norms(M):
        # np.sqrt replaces the deprecated ``scipy.sqrt`` alias; the sum of
        # squares is converted to a column ndarray so ``*`` below broadcasts.
        squared = np.asarray(M.multiply(M).sum(axis=1))
        return np.sqrt(squared).reshape(-1, 1)

    Xnorm = _row_norms(X)
    Ynorm = Xnorm if Y is X else _row_norms(Y)
    XY = X.dot(Y.T).toarray()
    # An all-zero row gives 0/0, which NumPy reports as "invalid" rather
    # than "divide"; silence both and map the resulting NaN to 0.
    with np.errstate(divide="ignore", invalid="ignore"):
        return np.nan_to_num(XY / (Xnorm * Ynorm.T))