Source code for e3fp.fingerprint.fprinter

"""Tools for generating E3FP fingerprints.

Author: Seth Axen
E-mail: seth.axen@gmail.com
"""
from __future__ import division, print_function
import os
import logging

import numpy as np
from rdkit import Chem
import mmh3
from python_utilities.io_tools import touch_dir
from python_utilities.scripting import setup_logging
from ..config.params import get_default_value
from .fprint import Fingerprint, CountFingerprint
from .structs import Shell, shell_to_pdb
from . import array_ops


BITS = 2 ** 32
LEVEL_DEF = get_default_value("fingerprinting", "level", int)
RADIUS_MULTIPLIER_DEF = get_default_value(
    "fingerprinting", "radius_multiplier", float
)
COUNTS_DEF = get_default_value("fingerprinting", "counts", bool)
STEREO_DEF = get_default_value("fingerprinting", "stereo", bool)
INCLUDE_DISCONNECTED_DEF = get_default_value(
    "fingerprinting", "include_disconnected", bool
)
RDKIT_INVARIANTS_DEF = get_default_value(
    "fingerprinting", "rdkit_invariants", bool
)
EXCLUDE_FLOATING_DEF = get_default_value(
    "fingerprinting", "exclude_floating", bool
)
REMOVE_DUPLICATE_SUBSTRUCTS_DEF = get_default_value(
    "fingerprinting", "remove_duplicate_substructs", bool
)
IDENT_DTYPE = np.int64  # np.dtype to use for identifiers
Y_AXIS_PRECISION = 0.1  # angstroms
Z_AXIS_PRECISION = 0.01  # rad
POLAR_CONE_RAD = np.pi / 36  # rad
MMH3_SEED = 0
BOND_TYPES = {
    None: 5,
    Chem.BondType.SINGLE: 1,
    Chem.BondType.DOUBLE: 2,
    Chem.BondType.TRIPLE: 3,
    Chem.BondType.AROMATIC: 4,
}

setup_logging(reset=False)


[docs]class Fingerprinter(object):
    """E3FP fingerprint generator.

    Parameters
    ----------
    bits : int or None, optional
        Maximum number of bits to which to fold returned fingerprint. Multiple
        of 2 is strongly recommended.
    level : int or None, optional
        Maximum number of iterations for fingerprint generation. If None or
        -1, run until no new substructures are identified. Because this could
        produce a different final level number for each conformer, it is
        recommended to manually specify a level.
    radius_multiplier : float, optional
        Multiple by which to increase shell size. At iteration 0, shell radius
        is 0*`radius_multiplier`, at iteration 2, radius is
        2*`radius_multiplier`, etc.
    counts : bool, optional
        Instead of simple bit-based `Fingerprint` object, generate
        `CountFingerprint` that tracks number of times each bit appears in a
        fingerprint.
    stereo : bool, optional
        Differentiate based on stereography. Resulting fingerprints are not
        comparable to non-stereo fingerprints.
    remove_duplicate_substructs : bool, optional
        If a substructure arises that corresponds to an identifier already in
        the fingerprint, then the identifier for the duplicate substructure is
        not added to fingerprint.
    include_disconnected : bool, optional:
        Include disconnected atoms from hashes and substructure. E3FP's
        advantage over ECFP relies on disconnected atoms, so the option to
        turn this off is present only for testing/comparison.
    rdkit_invariants : bool, optional
        Use the atom invariants used by RDKit for its Morgan fingerprint.
    exclude_floating : bool, optional:
        Mask atoms with no bonds (usually floating ions) from the fingerprint.
        These are often placed arbitrarily and can confound the fingerprint.

    Attributes
    ----------
    current_level : int
        The maximum level/iteration to which the fingerprinter has been run
        on the current conformer.
    level_shells : dict
        Dict matching level to set of all shells accepted at that level.
    """

    def __init__(
        self,
        bits=BITS,
        level=LEVEL_DEF,
        radius_multiplier=RADIUS_MULTIPLIER_DEF,
        stereo=STEREO_DEF,
        counts=COUNTS_DEF,
        include_disconnected=INCLUDE_DISCONNECTED_DEF,
        rdkit_invariants=RDKIT_INVARIANTS_DEF,
        exclude_floating=EXCLUDE_FLOATING_DEF,
        remove_duplicate_substructs=REMOVE_DUPLICATE_SUBSTRUCTS_DEF,
    ):
        """Initialize fingerprinter settings."""
        self.mol = None
        if level is None:
            level = -1
        self.level = level
        if not np.log2(bits).is_integer():
            logging.warning(
                "bits are not multiple of 2. Multiples of 2 are "
                "highly recommended"
            )
        self.bits = bits
        self.radius_multiplier = radius_multiplier
        if counts:
            self.fp_type = CountFingerprint
        else:
            self.fp_type = Fingerprint
        self.stereo = stereo
        self.remove_duplicate_substructs = remove_duplicate_substructs

        if self.level == -1 and not self.remove_duplicate_substructs:
            raise Exception(
                "No termination condition specified. 'level' must be "
                "provided or 'remove_duplicate_substructs' must be True"
            )

        self.include_disconnected = include_disconnected
        self.rdkit_invariants = rdkit_invariants
        self.exclude_floating = exclude_floating

        self.bond_types = BOND_TYPES
        self.reset()

[docs]    def run(self, conf=None, mol=None, return_substruct=False):
        """Generate fingerprint from provided conformer or mol and conf id.

        Parameters
        ----------
        conf : RDKit Conformer or int, optional
            Input conformer or conformer in `mol`.
        mol : RDKit Mol, optional
            Input molecule object, with at least one conformer. If `conf` not
            specified, first conformer is used.
        return_substruct : bool, optional
            Return dict mapping substructure to fingerprint indices. Keys are
            indices, values are list of substructures, represented as a tuple
            of atom indices where the first index is the central atom and the
            remaining indices (within the sphere) are sorted.
        """
        if mol is None:  # mol not provided; get from conf
            try:
                mol = conf.GetOwningMol()
            except AttributeError:  # conf is int ID; use existing mol
                mol = self.mol
        else:
            if not isinstance(conf, Chem.Conformer):
                try:
                    conf = mol.GetConformer(conf)
                except TypeError:  # conf isn't ID either. Fall back to first
                    conf = mol.GetConformer(0)

        if mol is not self.mol:
            self.reset_mol()
            self.initialize_mol(mol)
        elif conf is not self.conf:
            self.reset_conf()

        self.initialize_conformer(conf)

        for i in iter(self):
            pass

[docs]    def reset(self):
        """Clear all variables associated with the last run."""
        self.reset_mol()

[docs]    def reset_mol(self):
        """Clear all variables associated with the molecule."""
        self.atoms = None
        self.bound_atoms_dict = {}
        self.connectivity = {}
        self.init_identifiers = {}
        self.reset_conf()

[docs]    def reset_conf(self):
        """Clear only conformer-specific variables."""
        self.all_shells = []
        self.atom_coords = None
        # self.current_level = None
        self.identifiers_to_shells = {}
        self.level_shells = {}
        self.past_substructs = set()
        self.shells_gen = None

[docs]    def initialize_mol(self, mol):
        """Set general properties of `mol` that apply to all its conformers.

        Parameters
        ----------
        mol : RDKit Mol
            Input molecule `Mol` object.
        """
        self.mol = mol
        self.atoms = np.array(
            [x.GetIdx() for x in mol.GetAtoms() if x.GetAtomicNum() > 1]
        )  # ignore hydrogens

        if self.exclude_floating and len(self.atoms) > 1:
            # ignore floating atoms
            self.atoms = np.array(
                [
                    x.GetIdx()
                    for x in mol.GetAtoms()
                    if (x.GetAtomicNum() > 1 and x.GetDegree() > 0)
                ]
            )

        self.bound_atoms_dict = bound_atoms_from_mol(self.mol, self.atoms)
        self.connectivity = {}
        for i, atom1 in enumerate(self.atoms):
            for j, atom2 in enumerate(self.atoms):
                if i <= j:
                    break
                pair = (int(atom1), int(atom2))
                bond = self.mol.GetBondBetweenAtoms(*pair)
                if bond is not None:
                    bond = bond.GetBondType()
                self.connectivity[pair] = self.bond_types[bond]
                self.connectivity[pair[::-1]] = self.connectivity[pair]

        self.initialize_identifiers()

[docs]    def initialize_conformer(self, conf):
        """Retrieve atom coordinates and instantiate shells generator.

        Parameters
        ----------
        conf : RDKit Conformer
            Conformer to fingerprint
        """
        self.conf = conf
        self.atom_coords = coords_from_atoms(self.atoms, self.conf)
        self.shells_gen = ShellsGenerator(
            self.conf,
            self.atoms,
            radius_multiplier=self.radius_multiplier,
            atom_coords=self.atom_coords,
            include_disconnected=self.include_disconnected,
            bound_atoms_dict=self.bound_atoms_dict,
        )

[docs]    def initialize_identifiers(self):
        """Set initial identifiers for atoms."""
        self.init_identifiers = identifiers_from_invariants(
            self.mol, self.atoms, rdkit_invariants=self.rdkit_invariants
        )

    def __next__(self):
        """Run next iteration of fingerprinting."""
        if self.current_level is None:
            shells_dict = next(self.shells_gen)
            if self.current_level != 0:
                raise Exception(
                    "ShellsGenerator is not at level 0 at start."
                    " This should never happen."
                )

            for atom, shell in shells_dict.items():
                shell.identifier = self.init_identifiers[atom]
                self.identifiers_to_shells.setdefault(
                    shell.identifier, set()
                ).add(shell)
                self.past_substructs.add(shell.substruct)

            level_shells = set(shells_dict.values())
        else:
            # stop if maximum level has been reached
            if self.current_level >= self.level and self.level != -1:
                logging.debug("Hit maximum level")
                raise StopIteration

            # stop if all substructs contain all atoms (there will never
            # be another new substruct), and that substruct has been seen
            if self.remove_duplicate_substructs and all(
                (len(x.substruct.atoms) == len(self.atoms))
                for x in self.shells_gen.get_shells_at_level(
                    self.current_level
                ).values()
            ):
                logging.debug("Ran out of substructs")
                raise StopIteration

            shells_dict = next(self.shells_gen)

            for atom, shell in shells_dict.items():
                identifier = identifier_from_shell(
                    shell,
                    self.atom_coords,
                    self.connectivity,
                    self.current_level,
                    self.stereo,
                )
                shell.identifier = identifier

            accepted_shells = sorted(
                shells_dict.values(), key=self._shell_to_tuple
            )

            # filter shells that correspond to already seen substructs
            if self.remove_duplicate_substructs:
                unique_substruct_shells = []
                for shell in accepted_shells:
                    if shell.substruct in self.past_substructs:
                        logging.debug(
                            (
                                "Shell with identifier {} at level {} is not "
                                "unique. Removing."
                            ).format(shell.identifier, self.current_level)
                        )
                        shell.is_duplicate = True
                        for x in self.past_substructs:
                            if x == shell.substruct:
                                shell.duplicate = x.shell
                                continue
                        continue
                    unique_substruct_shells.append(shell)
                    self.past_substructs.add(shell.substruct)

                accepted_shells = unique_substruct_shells

            # store shells
            for shell in accepted_shells:
                self.identifiers_to_shells.setdefault(
                    shell.identifier, set()
                ).add(shell)

            level_shells = self.level_shells[self.current_level - 1].union(
                set(accepted_shells)
            )
            if len(level_shells) == len(
                self.level_shells[self.current_level - 1]
            ):
                self.shells_gen.back()
                logging.debug("No new shells added. Convergence reached.")
                raise StopIteration

        self.all_shells.extend(shells_dict.values())
        self.level_shells[self.current_level] = level_shells

    next = __next__

    @staticmethod
    def _shell_to_tuple(shell):
        return (shell.identifier, shell.center_atom)

[docs]    def get_shells_at_level(self, level=-1, exact=False, atom_mask=set()):
        """Get set of shells at the specified level.

        Parameters
        ----------
        level : int or None, optional
            Level/iteration
        exact : bool, optional
            Exact level
        atom_mask : int or set of int, optional
            Don't return shells whose substructures contain these atoms.

        Returns
        -------
        set of Shell : Shells at level
        """
        if level in (-1, None) or level not in self.level_shells:
            if exact or len(self.level_shells) == 0:
                raise IndexError(
                    "Level {!r} fingerprints have not yet been "
                    "generated".format(level)
                )

            true_level = self.current_level
            if level in (-1, None):
                if self.level not in (-1, None):
                    level = self.level
                else:
                    level = true_level
        else:
            true_level = level

        shells = self.level_shells[true_level]
        if len(atom_mask) > 0:
            try:
                atom_mask = set(atom_mask)
            except TypeError:
                atom_mask = {atom_mask}
            shells = {
                x for x in shells if x.substruct.atoms.isdisjoint(atom_mask)
            }

        return shells

[docs]    def get_fingerprint_at_level(
        self, level=-1, bits=None, exact=False, atom_mask=set()
    ):
        """Get the fingerprint at the specified level.

        Parameters
        ----------
        level : int or None, optional
            Level/iteration
        bits : int or None, optional
            Return fingerprints folded to this number of bits. If unspecified,
            defaults to bits set when instantiated.
        exact : bool, optional
            Exact level
        atom_mask : int or set of int, optional
            Don't return shells whose substructures contain these atoms.

        Returns
        -------
        Fingerprint : Fingerprint at level
        """
        if bits in (-1, None):
            bits = self.bits

        shells = self.get_shells_at_level(
            level=level, exact=exact, atom_mask=atom_mask
        )

        identifiers = signed_to_unsigned_int(
            np.array([x.identifier for x in shells], dtype=IDENT_DTYPE)
        )

        fprint = self.fp_type.from_indices(identifiers, level=level)

        return fprint.fold(bits)

[docs]    def substructs_to_pdb(
        self,
        level=None,
        bits=None,
        out_dir="substructs",
        reorient=True,
        exact=False,
    ):
        """Save all accepted substructs from current level to PDB.

        Parameters
        ----------
        level : int or None, optional
            Level of fingerprinting/number of iterations
        bits : int or None, optional
            Folding level of identifiers
        out_dir : str, optional
            Directory to which to save PDB files.
        reorient : bool, optional
            Reorient substructure to match stereo quadrants.
        """
        shells = self.get_shells_at_level(level=level, exact=exact)

        if bits in (-1, None):
            bits = self.bits

        touch_dir(out_dir)

        out_files = []
        for shell in shells:
            identifier = signed_to_unsigned_int(shell.identifier) % bits
            out_file = os.path.join(out_dir, "{}.pdb.gz".format(identifier))
            shell_to_pdb(
                self.mol,
                shell,
                self.atom_coords,
                self.bound_atoms_dict,
                out_file,
                reorient=reorient,
            )
            out_files.append(out_file)
        return out_files

    @property
    def current_level(self):
        try:
            return self.shells_gen.level
        except AttributeError:
            return None

    def __iter__(self):
        return self


[docs]class ShellsGenerator(object):
    """Generate nested `Shell` objects from molecule upon request."""

    def __init__(
        self,
        conf,
        atoms,
        radius_multiplier=0.5,
        include_disconnected=True,
        atom_coords=None,
        bound_atoms_dict=None,
    ):
        """Initialize the generator.

        After initialization, the generator can be iterated to generate a
        `dict` matching atom ids to that atom's shell at that
        iteration/level.

        Parameters
        ----------
        conf : RDKit Conformer
            Conformer for which to generate shells.
        atoms : list of int
            Atom ids.
        radius_multiplier : float, optional
            Multiplier by which to increase shell radii.
        include_disconnected : bool, optional
            Include atoms not bonded to center atoms within shell.
        atom_coords : dict or None, optional
            Dict matching atom ids to 1-D array of coordinates. If None, will
            be generated.
        bound_atoms_dict : dict or None, optional
            Dict matching atom ids to set of bound atom ids. If None, will be
            generated.
        """
        self.atoms = tuple(atoms)
        self.radius_multiplier = radius_multiplier
        self.shells_dict = {}
        self.include_disconnected = include_disconnected
        self.level = None

        if atom_coords is None:
            atom_coords = coords_from_atoms(self.atoms, conf)
        atom_coords = [atom_coords.get(x) for x in self.atoms]
        self.distance_matrix = array_ops.make_distance_matrix(atom_coords)

        overlap_atoms = [
            (self.atoms[i], self.atoms[j])
            for i, j in zip(*np.where(self.distance_matrix <= array_ops.EPS))
            if i < j
        ]
        if len(overlap_atoms) > 0:
            logging.warning(
                "Overlapping atoms {} in conformer {} of molecule"
                " {}. Fingerprinting will continue but is less "
                "reliable.".format(
                    ", ".join(map(repr, overlap_atoms)),
                    conf.GetId(),
                    conf.GetOwningMol().GetProp("_Name"),
                )
            )

        if not include_disconnected and bound_atoms_dict is None:
            bound_atoms_dict = bound_atoms_from_mol(conf.GetOwningMol(), atoms)
        self.bound_atoms_dict = bound_atoms_dict

[docs]    def get_match_atoms(self, rad):
        """Get atoms within shell at radius `rad`.

        Parameters
        ----------
        rad : float
            Radius of shell.

        Returns
        -------
        dict : Dict matching atom id to set of ids for other atoms within
               shell
        """
        match_atoms_dict = {x: set() for x in self.atoms}
        atom_pair_indices_list = list(
            zip(*np.where(self.distance_matrix <= rad))
        )
        for i, j in atom_pair_indices_list:
            if i <= j:
                continue
            atom1, atom2 = (self.atoms[i], self.atoms[j])
            match_atoms_dict[atom1].add(atom2)
            match_atoms_dict[atom2].add(atom1)
        if not self.include_disconnected:
            match_atoms_dict = {
                k: v.intersection(self.bound_atoms_dict[k])
                for k, v in match_atoms_dict.items()
            }
        return match_atoms_dict

    def __next__(self):
        """Get next iteration's `dict` of atom shells."""
        if self.level is None:
            self.level = 0
            self.shells_dict[self.level] = {
                x: Shell(x, radius=0.0) for x in self.atoms
            }
            return self.shells_dict[self.level]

        self.level += 1
        self.shells_dict[self.level] = {}
        rad = self.level * self.radius_multiplier
        match_atoms_dict = self.get_match_atoms(rad)
        for atom in self.atoms:
            match_atoms = match_atoms_dict[atom]
            last_match_shells = [
                self.shells_dict[self.level - 1].get(x) for x in match_atoms
            ]
            last_shell = self.shells_dict[self.level - 1][atom]
            shell = Shell(
                atom, last_match_shells, radius=rad, last_shell=last_shell
            )
            self.shells_dict[self.level][atom] = shell
        return self.shells_dict[self.level]

    next = __next__

[docs]    def back(self):
        """Back up one iteration."""
        if self.level in {None, 0}:
            return
        del self.shells_dict[self.level]
        self.level -= 1

[docs]    def get_shells_at_level(self, level):
        """Get `dict` of atom shells at specified level/iteration.

        If not run to `level`, raises `IndexError`.

        Parameters
        ----------
        level : int
            Level/iteration from which to retrieve shells `dict`.

        Returns
        -------
        dict: Dict matching atom ids to that atom's `Shell` at that level.
        """
        if level not in self.shells_dict:
            raise IndexError(
                "Level {!r} shells have not been generated".format(level)
            )
        return self.shells_dict[level]

    def __iter__(self):
        return self


# Getting atom properties
[docs]def coords_from_atoms(atoms, conf):
    """Build `dict` matching atom id to coordinates.

    Parameters
    ----------
    atoms : list of int
        Atom ids
    conf : RDKit Conformer
        Conformer from which to fetch coordinates

    Returns
    -------
    dict : Dict matching atom id to 1-D array of coordinates.
    """
    coordinates = [
        np.array(conf.GetAtomPosition(int(x)), dtype=float) for x in atoms
    ]
    return dict(zip(atoms, coordinates))


[docs]def bound_atoms_from_mol(mol, atoms):
    """Build `dict` matching atom id to ids of bounded atoms.

    Bound atoms not in `atoms` are ignored.

    Parameters
    ----------
    mol : RDKit Mol
        Input mol
    atoms : list of int
        List of atom IDs

    Returns
    -------
    dict : Dict matching atom id to set of bound atom ids.
    """
    atoms = set(atoms)
    bound_atoms_dict = {}
    for bond in mol.GetBonds():
        atom1, atom2 = (bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())
        if (atom1 not in atoms) or (atom2 not in atoms):
            continue
        bound_atoms_dict.setdefault(atom1, set()).add(atom2)
        bound_atoms_dict.setdefault(atom2, set()).add(atom1)
    return bound_atoms_dict


# Generating identifiers
[docs]def hash_int64_array(array, seed=MMH3_SEED):
    """Hash an int64 array into a 32-bit integer.

    Parameters
    ----------
    array : ndarray of int64
        Numpy array containing integers
    seed : any, optional
        Seed for MurmurHash3.

    Returns
    -------
    int : 32-bit integer
    """
    if not np.issubdtype(array.dtype, IDENT_DTYPE):
        raise TypeError(
            "Provided array has dtype {} not {}".format(
                array.dtype, IDENT_DTYPE.__name__
            )
        )
    # ensure all hashed integers are positive
    hashed_int = mmh3.hash(array, seed)
    return hashed_int


[docs]def signed_to_unsigned_int(a, bits=BITS):
    """Convert `int` between +/-`bits` to an int between 0 and `bits`.

    Parameters
    ----------
    a : int or ndarray of int
        Integer
    bits : int, optional
        Maximum size of int. E.g. 32-bit is 2^32.

    Returns
    -------
    int : unsigned integer
    """
    return (a + bits) % bits


[docs]def identifiers_from_invariants(
    mol, atoms, rdkit_invariants=RDKIT_INVARIANTS_DEF
):
    """Initialize ids according to Daylight invariants.

    Parameters
    ----------
    mol : RDKit Mol
        Input molecule
    atoms : list of int
        IDs for atoms in mol for which to generate identifiers.
    rdkit_invariants : bool, optional
        Use the atom invariants used by RDKit for its Morgan fingerprint.

    Returns
    -------
    ndarray of int64 : initial identifiers for atoms
    """
    if rdkit_invariants:
        identifiers = [
            hash_int64_array(
                rdkit_invariants_from_atom(mol.GetAtomWithIdx(int(x)))
            )
            for x in atoms
        ]
    else:
        identifiers = [
            hash_int64_array(invariants_from_atom(mol.GetAtomWithIdx(int(x))))
            for x in atoms
        ]
    atom_to_identifier_dict = dict(zip(atoms, identifiers))
    return atom_to_identifier_dict


[docs]def invariants_from_atom(atom):
    """Get seven invariants from atom.

    Invariants used are the six Daylight invariants, plus an indicator of
    whether the atom is in a ring, as detailed in [1].

    References
    ----------
    1. D Rogers, M Hahn. J. Chem. Inf. Model., 2010, 50 (5), pp 742-754
       https://doi.org/10.1021/ci100050t

    Parameters
    ----------
    atom : RDKit Atom
        Input atom

    Returns
    -------
    1-D array if int64: Array of 7 invariants
    """
    num_hs = atom.GetTotalNumHs()
    return np.array(
        [
            atom.GetTotalDegree() - num_hs,  # Num heavy neighbors
            atom.GetTotalValence() - num_hs,
            atom.GetAtomicNum(),
            int(atom.GetMass()),
            atom.GetFormalCharge(),
            num_hs,
            int(atom.IsInRing()),
        ],
        dtype=IDENT_DTYPE,
    )


[docs]def rdkit_invariants_from_atom(atom):
    """Get the 6 atom invariants RDKit uses for its Morgan fingerprints.

    Parameters
    ----------
    atom : RDKit Atom
        Input atom

    Returns
    -------
    1-D array if int64: Array of 6 invariants
    """
    delta_mass = int(
        atom.GetMass()
        - Chem.GetPeriodicTable().GetAtomicWeight(atom.GetAtomicNum())
    )
    return np.array(
        [
            atom.GetAtomicNum(),
            atom.GetTotalDegree(),
            atom.GetTotalNumHs(),
            atom.GetFormalCharge(),
            delta_mass,
            int(atom.IsInRing()),
        ],
        dtype=IDENT_DTYPE,
    )


[docs]def identifier_from_shell(shell, atom_coords, connectivity, level, stereo):
    """Determine new identifier for a shell at a specific level.

    Parameters
    ----------
    shell : Shell
        Shell for which to determine identifier
    atom_coords : dict
        Dict matching atom ids to coords.
    connectivity : dict
        Dict matching atom id pair tuples to their bond order (5 for unbound).
    level : int
        Level/iteration
    stereo : bool
        Add stereo indicators
    """
    header = [level, shell.last_shell.identifier]
    atom_tuples = atom_tuples_from_shell(
        shell, atom_coords, connectivity, stereo
    )
    flat_atom_tuples = [y for x in atom_tuples for y in x]
    arr = np.array(header + flat_atom_tuples, dtype=IDENT_DTYPE)
    return hash_int64_array(arr)


[docs]def atom_tuples_from_shell(shell, atom_coords, connectivity, stereo):
    """Generate sorted atom tuples for neighboring atoms.

    Parameters
    ----------
    shell : Shell
        Shell for which to build atom tuples
    atom_coords : dict
        Dict matching atom ids to coords.
    connectivity : dict
        Dict matching atom id pair tuples to their bond order (5 for unbound).
    stereo : bool
        Add stereo indicators to tuples
    """
    if len(shell.shells) == 0:
        return []

    # create list matching "bond order" to last iteration's atom identifier
    atom_tuples = [
        (connectivity[(shell.center_atom, x.center_atom)], x.identifier, x)
        for x in shell.shells
    ]

    # add stereo indicator
    if stereo:
        atom_tuples.sort(key=_first_two)
        stereo_indicators = stereo_indicators_from_shell(
            shell, atom_tuples, atom_coords
        )
        atom_tuples = [
            x[:-1] + (y,) + (x[-1],)
            for x, y in zip(atom_tuples, stereo_indicators)
        ]

    # final sort
    atom_tuples = [x[:-1] for x in atom_tuples]
    atom_tuples.sort()
    return atom_tuples


def _first_two(xs):
    return (xs[0], xs[1])


# Methods used for stereo indicators
[docs]def pick_y(atom_tuples, cent_coords, y_precision=Y_AXIS_PRECISION):
    """Pick a y-coordinate from atom tuples or mean coordinate.

    Parameters
    ----------
    atom_tuples : list of tuple
        Sorted list of atom tuples
    cent_coords : Nx3 array of float
        Coordinates of atoms with center atom at origin.
    y_precision : str, optional
        For mean to be chosen for y-coordinate, it must be at least this
        distance from the origin. Useful when atoms are symmetrical around
        the center atom where a slight shift in any atom results in a very
        different y.

    Returns
    -------
    1x3 array of float or None : y-coordinate
    int or None : index to y-atom, if y was chosen from the atoms.
    """
    y_inds = get_first_unique_tuple_inds(atom_tuples, 1, assume_sorted=True)
    # select y-axis vector
    if len(y_inds) > 0:  # unique atom could be found
        y = cent_coords[y_inds, :]
        return y, y_inds[0]
    elif len(atom_tuples) == 2:  # exactly two identical atoms
        y = cent_coords[0, :]
        return y, 0
    else:  # no unique atom could be found
        # assign y-axis as vector to mean coordinate
        y = np.mean(cent_coords, axis=0)
        if np.linalg.norm(y) < y_precision:
            return None, None
        else:
            return y, None


[docs]def pick_z(
    connectivity,
    identifiers,
    cent_coords,
    y,
    long_angle,
    z_precision=Z_AXIS_PRECISION,
):
    """Pick a z-coordinate orthogonal to `y`.

    Parameters
    ----------
    connectivity : dict
        Dict matching atom id pair tuples to their bond order (5 for unbound).
    identifiers : iterable of int
        Atom identifiers
    cent_coords : Nx3 array of float
        Coordinates of atoms with center atom at origin.
    y : 1x3 array of float
        y-coordinate
    long_angle : Nx1 array of float
        Absolute angle of atoms from orthogonal to `y`.
    z_precision : float, optional
        Minimum difference in `long_angle` between two potential z-atoms.
        Used as a tie breaker to prevent small shift in one atom resulting
        in very different z.

    Returns
    -------
    1x3 array of float or None : z-coordinate
    """
    angle_from_right = sorted(
        zip(
            np.divide(long_angle, z_precision).astype(int),
            connectivity,
            identifiers,
            range(len(identifiers)),
        )
    )

    z_angle_inds = get_first_unique_tuple_inds(
        angle_from_right, 1, assume_sorted=True
    )

    if len(z_angle_inds) > 0:
        z_ind = angle_from_right[z_angle_inds[0]][-1]
        z = array_ops.project_to_plane(cent_coords[z_ind, :], y)
        return z
    else:
        return None


[docs]def stereo_indicators_from_shell(
    shell, atom_tuples, atom_coords_dict, add_transform_to_shell=True
):
    """Get `list` of `int` indicating location of atoms on unit sphere.

    Parameters
    ----------
    shell : Shell
        Shell for which to get stereo indicators.
    atom_tuples : list of tuple
        List of atom tuples.
    atom_coords_dict : dict
        Dict matching atom ids to coords.
    add_transform_to_shell : bool, optional
        Calculate transformation matrix to align coordinates to unit sphere,
        and add to shell.

    Returns
    -------
    list of int : stereo indicators for atoms in `atom_tuples`.
    """
    cent_coord = atom_coords_dict[shell.center_atom]

    y = None
    z = None
    if len(atom_tuples) > 0:
        (connectivity, identifiers, shells) = list(zip(*atom_tuples))

        stereo_indicators = np.zeros((len(atom_tuples),), dtype=IDENT_DTYPE)
        atoms = [x.center_atom for x in shells]
        mask = np.ones(len(atom_tuples), dtype=bool)

        cent_coords = (
            np.array(
                [atom_coords_dict.get(x) for x in atoms], dtype=float
            )
            - cent_coord
        )
        # mask atom lying on center atom from consideration
        cent_overlap_indices = np.all(cent_coords == np.zeros(3), axis=1)
        mask[cent_overlap_indices] = False

        # pick y based on first unique atom tuple or mean
        y, y_ind = pick_y(atom_tuples, cent_coords)
        if y_ind is not None:
            mask[y_ind] = False  # use to ensure y not chosen for z

        if y is not None:  # y was picked
            # pick z based on closeness to pi/2 from y-axis
            long_angle = np.pi / 2.0 - array_ops.calculate_angles(
                cent_coords, y
            )
            # perfect right angles
            long_angle[np.fabs(long_angle) < array_ops.EPS] = 0.0
            long_sign = np.asarray(np.sign(long_angle), dtype=IDENT_DTYPE)
            long_sign[long_sign == 0] = 1
            long_angle = np.fabs(long_angle)
            tmp_conn = np.array(connectivity, dtype=IDENT_DTYPE)[mask]
            tmp_ident = np.array(identifiers, dtype=IDENT_DTYPE)[mask]
            z = pick_z(
                tmp_conn, tmp_ident, cent_coords[mask], y, long_angle[mask]
            )

            if z is not None:  # z was picked
                quad_indicators = quad_indicators_from_coords(
                    cent_coords, y, y_ind, z, long_sign
                )
                stereo_indicators = quad_indicators

            #  set indicators for atoms near poles to +/-1
            pole_inds = np.pi / 2 - long_angle < POLAR_CONE_RAD
            stereo_indicators[pole_inds] = long_sign[pole_inds]

        stereo_indicators[cent_overlap_indices] = 0
        stereo_indicators = stereo_indicators.flatten().tolist()
    else:
        stereo_indicators = []

    if add_transform_to_shell:
        shell.transform_matrix = array_ops.make_transform_matrix(
            cent_coord, y, z
        )

    return stereo_indicators


[docs]def quad_indicators_from_coords(cent_coords, y, y_ind, z, long_sign):
    """Create angle indicators for four quadrants in each hemisphere.

    Parameters
    ----------
    cent_coords : Nx3 array of float
        Array of centered coordinates.
    y : 1-D array of float
        Vector lying along y-axis.
    y_ind : int
        Index of `cent_coords` corresponding to `y`.
    z : 1-D array of float
        Vector lying along z-axis
    long_sign : Nx1 array of int
        Array of signs of vectors in `cent_coords` indicating whether they are
        above (+1) or below (-1) the xz-plane.

    Returns
    -------
    Nx1 array of int : Quadrant indicators. Clockwise from `z` around `y`,
                    indicators are 2, 3, 4, 5 for vectors above the xz-plane
                    and -2, -3, -4, -5 for vectors below the xz-plane.
    """
    atom_lats = array_ops.project_to_plane(cent_coords, y)

    with np.errstate(invalid="ignore"):  # y atom_lat should be (0, 0, 0)
        angle_from_z = array_ops.calculate_angles(atom_lats, z, y).flatten()

    if y_ind is not None:
        angle_from_z[y_ind] = 0.0  # otherwise, will be nan

    # offset by pi/4 so z-axis isn't an edge case
    lat_angle = array_ops.rotate_angles(angle_from_z, np.pi / 4)

    # create quadrant indicators
    quad_indicators = 2 + np.asarray(
        lat_angle * 4 / (2 * np.pi), dtype=IDENT_DTYPE
    )
    quad_indicators *= long_sign
    return quad_indicators


[docs]def get_first_unique_tuple_inds(
    tuples_list, num_ret, ignore=[], assume_sorted=True
):
    """Return indices of first `num_ret` unique tuples in a list.

    Only first 2 values of each tuple are considered.

    Parameters
    ----------
    tuples_list : list of tuple
        List of tuples. Only first two unique values are considered.
    num_ret : int
        Maximum number of first unique tuples to return.
    ignore : list, optional
        Indices for tuples not be considered as unique.
    assume_sorted : bool, optional
        If True, assume list is already sorted by tuples.

    Returns
    -------
    tuple of int : List of at most `num_ret` ints indicating index of
                   unique tuples in list.
    """
    unique = {}
    repeated = set(tuples_list[x][:2] for x in ignore)
    for i, tup in enumerate(tuples_list):
        val = tup[:2]
        if val in repeated:
            if assume_sorted and len(unique) == num_ret:
                break  # okay to stop looking
        elif val in unique:
            del unique[val]
            repeated.add(val)
        else:
            unique[val] = i

    return tuple(sorted(unique.values())[:num_ret])