Source code for e3fp.conformer.generator
"""Conformer generation.
Author: Seth Axen
E-mail: seth.axen@gmail.com
"""
import logging
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import PropertyMol
from .util import add_conformer_energies_to_mol
# Heavily modified by Seth Axen from code under the following license
__author__ = "Steven Kearnes"
__copyright__ = "Copyright 2014, Stanford University"
__license__ = "3-clause BSD"
# options
FORCEFIELD_CHOICES = ("uff", "mmff94", "mmff94s")
# default values
NUM_CONF_DEF = -1
FIRST_DEF = -1
POOL_MULTIPLIER_DEF = 1
RMSD_CUTOFF_DEF = 0.5
MAX_ENERGY_DIFF_DEF = -1.0
FORCEFIELD_DEF = "uff"
SEED_DEF = -1
[docs]class ConformerGenerator(object):
"""Generate conformers using RDKit.
Procedure
---------
1. Generate a pool of conformers.
2. Minimize conformers.
3. Filter conformers using an RMSD threshold and optional minimum energy
difference.
Note that pruning is done _after_ minimization, which differs from the
protocol described in the references.
References
----------
* http://rdkit.org/docs/GettingStartedInPython.html
#working-with-3d-molecules
* http://pubs.acs.org/doi/full/10.1021/ci2004658
* https://github.com/skearnes/rdkit-utils/blob/master/rdkit_utils/
conformers.py
"""
def __init__(
self,
num_conf=NUM_CONF_DEF,
first=FIRST_DEF,
rmsd_cutoff=RMSD_CUTOFF_DEF,
max_energy_diff=MAX_ENERGY_DIFF_DEF,
forcefield=FORCEFIELD_DEF,
pool_multiplier=POOL_MULTIPLIER_DEF,
seed=SEED_DEF,
get_values=False,
sparse_rmsd=True,
store_energies=True,
):
"""Initialize generator settings.
Parameters
----------
num_conf : int, optional
Maximum number of conformers to generate (after pruning). -1
results in auto selection of max_conformers.
first : int, optional
Terminate when this number of conformers has been accepted, and
only return those conformers.
pool_multiplier : int, optional
Factor to multiply by max_conformers to generate the initial
conformer pool. Since conformers are filtered after energy
minimization, increasing the size of the pool increases the chance
of identifying max_conformers unique conformers.
rmsd_cutoff : float, optional
RMSD cutoff for pruning conformers. If None or negative, no
pruning is performed.
max_energy_diff : float, optional
If set, conformers with energies this amount above the minimum
energy conformer are not accepted.
forcefield : {'uff', 'mmff94', 'mmff94s'}, optional
Force field to use for conformer energy calculation and
minimization.
seed : int, optional
Random seed for conformer generation. If -1, the random number
generator is unseeded.
get_values : boolean, optional
Return tuple of key values, for storage.
sparse_rmsd : bool, optional
If `get_values` is True, instead of returning full symmetric RMSD
matrix, only return flattened upper triangle.
store_energies : bool, optional
Store conformer energies as property in mol.
"""
self.max_conformers = num_conf
self.first_conformers = first
if not rmsd_cutoff or rmsd_cutoff < 0:
rmsd_cutoff = -1.0
self.rmsd_cutoff = rmsd_cutoff
if max_energy_diff is None or max_energy_diff < 0:
max_energy_diff = -1.0
self.max_energy_diff = max_energy_diff
if forcefield not in FORCEFIELD_CHOICES:
raise ValueError(
"%s is not a valid option for forcefield" % forcefield
)
self.forcefield = forcefield
self.pool_multiplier = pool_multiplier
self.seed = seed
self.get_values = get_values
self.sparse_rmsd = sparse_rmsd
self.store_energies = store_energies
def __call__(self, mol):
"""Generate conformers for a molecule.
Parameters
----------
mol : RDKit Mol
Molecule.
Returns
-------
RDKit Mol : copy of the input molecule with embedded conformers
"""
return self.generate_conformers(mol)
[docs] def generate_conformers(self, mol):
"""Generate conformers for a molecule.
Parameters
----------
mol : RDKit Mol
Molecule.
Returns
-------
RDKit Mol : copy of the input molecule with embedded conformers
"""
# initial embedding
mol = self.embed_molecule(mol)
if not mol.GetNumConformers():
msg = "No conformers generated for molecule"
if mol.HasProp("_Name"):
name = mol.GetProp("_Name")
msg += ' "{}".'.format(name)
else:
msg += "."
raise RuntimeError(msg)
# minimization and filtering
self.minimize_conformers(mol)
mol, indices, energies, rmsds = self.filter_conformers(mol)
if self.store_energies:
add_conformer_energies_to_mol(mol, energies)
if self.get_values is True:
if self.sparse_rmsd:
rmsds_mat = rmsds[np.triu_indices_from(rmsds, k=1)]
else:
rmsds_mat = rmsds
return mol, (self.max_conformers, indices, energies, rmsds_mat)
else:
return mol
[docs] @staticmethod
def get_num_conformers(mol):
"""Return ideal number of conformers from rotatable bond number in model.
Parameters
----------
mol : Mol
RDKit `Mol` object for molecule
Yields
------
num_conf : int
Target number of conformers to accept
"""
num_rot = AllChem.CalcNumRotatableBonds(mol)
if num_rot < 8:
return 50
elif num_rot >= 8 and num_rot <= 12:
return 200
elif num_rot > 12:
return 300
else:
return 0
[docs] def embed_molecule(self, mol):
"""Generate conformers, possibly with pruning.
Parameters
----------
mol : RDKit Mol
Molecule.
"""
logging.debug("Adding hydrogens for %s" % mol.GetProp("_Name"))
mol = Chem.AddHs(mol) # add hydrogens
logging.debug("Hydrogens added to %s" % mol.GetProp("_Name"))
logging.debug("Sanitizing mol for %s" % mol.GetProp("_Name"))
Chem.SanitizeMol(mol)
logging.debug("Mol sanitized for %s" % mol.GetProp("_Name"))
if self.max_conformers == -1 or type(self.max_conformers) is not int:
self.max_conformers = self.get_num_conformers(mol)
n_confs = self.max_conformers * self.pool_multiplier
if self.first_conformers == -1:
self.first_conformers = self.max_conformers
logging.debug(
"Embedding %d conformers for %s" % (n_confs, mol.GetProp("_Name"))
)
AllChem.EmbedMultipleConfs(
mol,
numConfs=n_confs,
maxAttempts=10 * n_confs,
pruneRmsThresh=-1.0,
randomSeed=self.seed,
ignoreSmoothingFailures=True,
)
logging.debug("Conformers embedded for %s" % mol.GetProp("_Name"))
return mol
[docs] def get_molecule_force_field(self, mol, conf_id=None, **kwargs):
"""Get a force field for a molecule.
Parameters
----------
mol : RDKit Mol
Molecule.
conf_id : int, optional
ID of the conformer to associate with the force field.
**kwargs : dict, optional
Keyword arguments for force field constructor.
"""
if self.forcefield == "uff":
ff = AllChem.UFFGetMoleculeForceField(
mol, confId=conf_id, **kwargs
)
elif self.forcefield.startswith("mmff"):
AllChem.MMFFSanitizeMolecule(mol)
mmff_props = AllChem.MMFFGetMoleculeProperties(
mol, mmffVariant=self.forcefield
)
ff = AllChem.MMFFGetMoleculeForceField(
mol, mmff_props, confId=conf_id, **kwargs
)
else:
raise ValueError(
"Invalid forcefield " + "'{}'.".format(self.forcefield)
)
return ff
[docs] def minimize_conformers(self, mol):
"""Minimize molecule conformers.
Parameters
----------
mol : RDKit Mol
Molecule.
"""
logging.debug("Minimizing conformers for %s" % mol.GetProp("_Name"))
for conf in mol.GetConformers():
ff = self.get_molecule_force_field(mol, conf_id=conf.GetId())
ff.Minimize()
logging.debug("Conformers minimized for %s" % mol.GetProp("_Name"))
[docs] def get_conformer_energies(self, mol):
"""Calculate conformer energies.
Parameters
----------
mol : RDKit Mol
Molecule.
Returns
-------
energies : array_like
Minimized conformer energies.
"""
num_conf = mol.GetNumConformers()
energies = np.empty((num_conf,), dtype=float)
for i, conf in enumerate(mol.GetConformers()):
ff = self.get_molecule_force_field(mol, conf_id=conf.GetId())
energies[i] = ff.CalcEnergy()
return energies
[docs] def filter_conformers(self, mol):
"""Filter conformers which do not meet an RMSD threshold.
Parameters
----------
mol : RDKit Mol
Molecule.
Returns
-------
A new RDKit Mol containing the chosen conformers, sorted by
increasing energy.
"""
logging.debug("Pruning conformers for %s" % mol.GetProp("_Name"))
energies = self.get_conformer_energies(mol)
energy_below_threshold = np.ones_like(energies, dtype=np.bool_)
sort = np.argsort(energies) # sort by increasing energy
confs = np.array(mol.GetConformers())
# remove hydrogens to speed up substruct match
mol = Chem.RemoveHs(mol)
accepted = [] # always accept lowest-energy conformer
rejected = []
rmsds = np.zeros((confs.shape[0], confs.shape[0]), dtype=float)
for i, fit_ind in enumerate(sort):
accepted_num = len(accepted)
# always accept lowest-energy conformer
if accepted_num == 0:
accepted.append(fit_ind)
# pre-compute if Es are in acceptable range of min E
if self.max_energy_diff != -1.0:
energy_below_threshold = (
energies <= energies[fit_ind] + self.max_energy_diff
)
continue
# reject conformers after first_conformers is reached
if accepted_num >= self.first_conformers:
rejected.append(fit_ind)
continue
# check if energy is too high
if not energy_below_threshold[fit_ind]:
rejected.append(fit_ind)
continue
# get RMSD to selected conformers
these_rmsds = np.zeros((accepted_num,), dtype=float)
# reverse so all confs aligned to lowest energy
for j, accepted_ind in self.reverse_enumerate(accepted):
this_rmsd = AllChem.GetBestRMS(
mol,
mol,
confs[accepted_ind].GetId(),
confs[fit_ind].GetId(),
)
# reject conformers within the RMSD threshold
if this_rmsd < self.rmsd_cutoff:
rejected.append(fit_ind)
break
else:
these_rmsds[-j - 1] = this_rmsd
else:
rmsds[fit_ind, accepted] = these_rmsds
rmsds[accepted, fit_ind] = these_rmsds
accepted.append(fit_ind)
# slice and order rmsds and energies to match accepted list
rmsds = rmsds[np.ix_(accepted, accepted)]
energies = energies[accepted]
# create a new molecule with all conformers, sorted by energy
new = PropertyMol.PropertyMol(mol)
new.RemoveAllConformers()
conf_ids = [conf.GetId() for conf in mol.GetConformers()]
for i in accepted:
conf = mol.GetConformer(conf_ids[i])
new.AddConformer(conf, assignId=True)
logging.debug("Conformers filtered for %s" % mol.GetProp("_Name"))
return new, np.asarray(accepted, dtype=int), energies, rmsds
[docs] @staticmethod
def reverse_enumerate(iterable):
"""Enumerate, but with the last result first but still numbered last.
Parameters
----------
iterable : some 1-D iterable
Returns
-------
iterable:
Reverse of `enumerate` function
"""
return zip(reversed(range(len(iterable))), reversed(iterable))
# magic methods
def __repr__(self):
return """ConformerGenerator(num_conf=%r, first=%r,\
\n pool_multiplier=%r, rmsd_cutoff=%r,\
\n max_energy_diff=%r, forcefield=%r,\
\n get_values=%r, sparse_rmsd=%r)""" % (
self.max_conformers,
self.first,
self.pool_multiplier,
self.rmsd_cutoff,
self.max_energy_diff,
self.forcefield,
self.get_values,
self.sparse_rmsd,
)
def __str__(self):
return self.__repr__()