Source code for e3fp.fingerprint.metrics.array_metrics

"""Fingerprint array comparison metrics.

Each is fully compatible with both dense and sparse inputs.

Author: Seth Axen
E-mail: seth.axen@gmail.com
"""
from __future__ import division

import numpy as np
import scipy
from scipy.sparse import csr_matrix, issparse, vstack
import scipy.sparse.linalg
import scipy.spatial
from e3fp.util import maybe_jit


[docs]def tanimoto(X, Y=None): """Compute the Tanimoto coefficients between `X` and `Y`. Data must be binary. This is not checked. Parameters ---------- X : array_like or sparse matrix with shape (`n_fprints_X`, `n_bits`). Y : array_like or sparse matrix, optional with shape (`n_fprints_Y`, `n_bits`). Returns ------- tanimoto : array of shape (`n_fprints_X`, `n_fprints_Y`) See Also -------- soergel: Analog to Tanimoto for non-binary data. cosine, dice, pearson """ X, Y = _check_array_pair(X, Y) Xbits, Ybits, XYbits = _get_bitcount_arrays(X, Y, return_XYbits=True) with np.errstate(divide="ignore"): # handle 0 in denominator return np.asarray(np.nan_to_num(XYbits / (Xbits + Ybits.T - XYbits)))
[docs]def soergel(X, Y=None): """Compute the Soergel similarities between `X` and `Y`. Soergel similarity is the complement of Soergel distance and can be thought of as the analog of the Tanimoto coefficient for count/float-based data. For binary data, it is equivalent to the Tanimoto coefficient. Parameters ---------- X : array_like or sparse matrix with shape (`n_fprints_X`, `n_bits`). Y : array_like or sparse matrix, optional with shape (`n_fprints_Y`, `n_bits`). Returns ------- soergel : array of shape (`n_fprints_X`, `n_fprints_Y`) Notes -------- If Numba is available, this function is jit-compiled and much more efficient. See Also -------- tanimoto: A fast version of this function for binary data. pearson: Pearson correlation, also appropriate for non-binary data. cosine, dice """ X, Y = _check_array_pair(X, Y) S = np.empty((X.shape[0], Y.shape[0]), dtype=float) if issparse(X): return _sparse_soergel(X.data, X.indices, X.indptr, Y.data, Y.indices, Y.indptr, S) return _dense_soergel(X, Y, S)
[docs]def dice(X, Y=None): """Compute the Dice coefficients between `X` and `Y`. Data must be binary. This is not checked. Parameters ---------- X : array_like or sparse matrix with shape (`n_fprints_X`, `n_bits`). Y : array_like or sparse matrix, optional with shape (`n_fprints_Y`, `n_bits`). Returns ------- dice : array of shape (`n_fprints_X`, `n_fprints_Y`) See Also -------- cosine, soergel, tanimoto, pearson """ X, Y = _check_array_pair(X, Y) Xbits, Ybits, XYbits = _get_bitcount_arrays(X, Y, return_XYbits=True) with np.errstate(divide="ignore"): # handle 0 in denominator return np.asarray(np.nan_to_num(2 * XYbits / (Xbits + Ybits.T)))
[docs]def cosine(X, Y=None, assume_binary=False): """Compute the Cosine similarities between `X` and `Y`. Parameters ---------- X : array_like or sparse matrix with shape (`n_fprints_X`, `n_bits`). Y : array_like or sparse matrix, optional with shape (`n_fprints_Y`, `n_bits`). assume_binary : bool, optional Assume data is binary (results in efficiency boost). If data is not binary, the result will be incorrect. Returns ------- cosine : array of shape (`n_fprints_X`, `n_fprints_Y`) See Also -------- dice, soergel, tanimoto """ X, Y = _check_array_pair(X, Y) if not issparse(X): return 1.0 - scipy.spatial.distance.cdist(X, Y, metric="cosine") if assume_binary: Xbits, Ybits, XYbits = _get_bitcount_arrays(X, Y, return_XYbits=True) with np.errstate(divide="ignore"): # handle 0 in denominator return np.asarray(np.nan_to_num(XYbits / np.sqrt(Xbits * Ybits.T))) else: return _sparse_cosine(X, Y)
[docs]def pearson(X, Y=None): """Compute the Pearson correlation between `X` and `Y`. Parameters ---------- X : array_like or sparse matrix with shape (`n_fprints_X`, `n_bits`). Y : array_like or sparse matrix, optional with shape (`n_fprints_Y`, `n_bits`). Returns ------- pearson : array of shape (`n_fprints_X`, `n_fprints_Y`) See Also -------- soergel: Soergel similarity for non-binary data cosine, dice, tanimoto """ X, Y = _check_array_pair(X, Y) Xlen = X.shape[0] if issparse(X): X = vstack((X, Y), format="csr") X = X - X.mean(axis=1) cov = (X * X.T) / (X.shape[1] - 1.0) d = np.sqrt(np.diag(cov)) with np.errstate(divide="ignore"): # handle 0 in denominator pearson = cov / np.outer(d, d) else: with np.errstate(divide="ignore"): # handle 0 in denominator pearson = np.corrcoef(X, Y) return np.asarray(np.nan_to_num(pearson[:Xlen, Xlen:]))
def _check_array(arr, dtype=float, force_sparse=False): if force_sparse or issparse(arr): return csr_matrix(arr, copy=False, dtype=dtype) else: return arr.astype(dtype, copy=False) def _check_array_pair(X, Y=None, dtype=float, force_sparse=False): if Y is not None and X.shape[1] != Y.shape[1]: raise ValueError("Arrays must have same width.") if force_sparse or issparse(X) or issparse(Y): force_sparse = True # ensure if one is sparse, all are sparse. X = _check_array(X, dtype=dtype, force_sparse=force_sparse) if Y is None or Y is X: Y = X else: Y = _check_array(Y, dtype=dtype, force_sparse=force_sparse) return X, Y def _get_bitcount_arrays(X, Y, return_XYbits=False): if issparse(X): Xbits = np.sum(X, axis=1) if Y is X: Ybits = Xbits else: Ybits = np.sum(Y, axis=1) if return_XYbits: XYbits = (X * Y.T).toarray() return Xbits, Ybits, XYbits else: Xbits = np.sum(X, axis=1, keepdims=True) if Y is X: Ybits = Xbits else: Ybits = np.sum(Y, axis=1, keepdims=True) if return_XYbits: XYbits = np.dot(X, Y.T) return Xbits, Ybits, XYbits return Xbits, Ybits def _sparse_cosine(X, Y): Xnorm = scipy.sparse.linalg.norm(X, axis=1) if Y is X: Ynorm = Xnorm else: Ynorm = scipy.sparse.linalg.norm(Y, axis=1) XY = (X * Y.T).toarray() with np.errstate(divide="ignore"): # handle 0 in denominator return np.nan_to_num(XY / np.outer(Xnorm, Ynorm)) @maybe_jit(nopython=True, nogil=True, cache=True) def _dense_soergel(X, Y, S): for ix in range(S.shape[0]): for iy in range(S.shape[1]): sum_abs_diff = 0 sum_max = 0 for j in range(X.shape[1]): diff = X[ix, j] - Y[iy, j] if diff > 0: sum_abs_diff += diff sum_max += X[ix, j] else: sum_abs_diff -= diff sum_max += Y[iy, j] if sum_max == 0: S[ix, iy] = 0 continue S[ix, iy] = 1 - sum_abs_diff / sum_max return S @maybe_jit(nopython=True, nogil=True, cache=True) def _sparse_soergel(Xdata, Xindices, Xindptr, Ydata, Yindices, Yindptr, S): for ix in range(S.shape[0]): if Xindptr[ix] == Xindptr[ix + 1]: for iy in range(S.shape[1]): # no X values in row S[ix, iy] = 0 continue jxindmax = Xindptr[ix + 1] - 1 for iy in range(S.shape[1]): if Yindptr[iy] == Yindptr[iy + 1]: # no Y values in row S[ix, iy] = 0 continue sum_abs_diff = 0 sum_max = 0 # Implementation of the final step of merge sort jyindmax = Yindptr[iy + 1] - 1 jx = Xindptr[ix] jy = Yindptr[iy] while jx <= jxindmax and jy <= jyindmax: jxind = Xindices[jx] jyind = Yindices[jy] if jxind < jyind: sum_max += Xdata[jx] sum_abs_diff += Xdata[jx] jx += 1 elif jyind < jxind: sum_max += Ydata[jy] sum_abs_diff += Ydata[jy] jy += 1 else: diff = Xdata[jx] - Ydata[jy] if diff > 0: sum_abs_diff += diff sum_max += Xdata[jx] else: sum_abs_diff -= diff sum_max += Ydata[jy] jx += 1 jy += 1 while jx <= jxindmax: sum_max += Xdata[jx] sum_abs_diff += Xdata[jx] jx += 1 while jy <= jyindmax: sum_max += Ydata[jy] sum_abs_diff += Ydata[jy] jy += 1 if sum_max == 0: S[ix, iy] = 0 continue S[ix, iy] = 1 - sum_abs_diff / sum_max return S