Source code for openforcefield.utils.structure

#!/usr/bin/env python

"""
Utility subroutines for manipulating parmed Structure objects

"""
#=============================================================================================
# GLOBAL IMPORTS
#=============================================================================================

import os
import time

import numpy as np

from simtk import openmm, unit

#=============================================================================================
# PARMED UTILITIES
#=============================================================================================

# TODO: Can we get rid of this, or convert it to use Molecule instead?
[docs]def generateSMIRNOFFStructure(oemol): """ Given an OpenEye molecule (oechem.OEMol), create an OpenMM System and use to generate a ParmEd structure using the SMIRNOFF forcefield parameters. Parameters ---------- oemol : openeye.oechem.OEMol OpenEye molecule Returns ------- molecule_structure : parmed.Structure The resulting Structure """ from openforcefield.topology import Molecule, Topology from openforcefield.typing.engines.smirnoff import ForceField off_mol = Molecule.from_openeye(oemol) off_top = Topology.from_molecules([off_mol]) mol_ff = ForceField('smirnoff99Frosst.offxml') # Create OpenMM System and Topology. omm_top = generateTopologyFromOEMol(oemol) # If it's a nonperiodic box, then we can't use default (PME) settings if omm_top.getPeriodicBoxVectors() is None: mol_ff.get_handler("Electrostatics", {})._method = 'Coulomb' system = mol_ff.create_openmm_system(off_top) # Convert to ParmEd structure. import parmed xyz = extractPositionsFromOEMol(oemol) molecule_structure = parmed.openmm.load_topology(omm_top, system, xyz=xyz) return molecule_structure
[docs]def generateProteinStructure(proteinpdb, protein_forcefield='amber99sbildn.xml', solvent_forcefield='tip3p.xml'): """ Given an OpenMM PDBFile, create the OpenMM System of the protein using OpenMM's ForceField and then generate the parametrized ParmEd Structure of the protein. Parameters ---------- proteinpdb : simtk.openmm.app.PDBFile object, Loaded PDBFile object of the protein. protein_forcefield : xml file, default='amber99sbildn.xml' Forcefield parameters for protein solvent_forcefield : xml file, default='tip3p.xml' Forcefield parameters for solvent Returns ------- solv_structure : parmed.structure.Structure The parameterized Structure of the protein with solvent molecules. (No ligand). """ # Generate protein Structure object using OpenMM ForceField from simtk.openmm import app import parmed forcefield = openmm.app.ForceField(protein_forcefield, solvent_forcefield) protein_system = forcefield.createSystem( proteinpdb.topology ) protein_structure = parmed.openmm.load_topology(proteinpdb.topology, protein_system, xyz=proteinpdb.positions) return protein_structure
[docs]def combinePositions(proteinPositions, molPositions): """ Loops through the positions from the ParmEd structures of the protein and ligand, divides by unit.angstroms which will ensure both positions arrays are in the same units. Parameters ---------- proteinPositions : list of 3-element Quantity tuples. Positions list taken directly from the protein Structure. molPositions : list of 3-element Quantity tuples. Positions list taken directly from the molecule Structure. Returns ------- positions : list of 3-element Quantity tuples. ex. unit.Quantity(positions, positions_unit) Combined positions of the protein and molecule Structures. """ positions_unit = unit.angstroms positions0_dimensionless = np.array(proteinPositions / positions_unit) positions1_dimensionless = np.array(molPositions / positions_unit) coordinates = np.vstack( (positions0_dimensionless, positions1_dimensionless)) natoms = len(coordinates) positions = np.zeros([natoms, 3], np.float32) for index in range(natoms): (x, y, z) = coordinates[index] positions[index, 0] = x positions[index, 1] = y positions[index, 2] = z positions = unit.Quantity(positions, positions_unit) return positions
[docs]def mergeStructure(proteinStructure, molStructure): """ Combines the parametrized ParmEd structures of the protein and ligand to create the Structure for the protein:ligand complex, while retaining the SMIRNOFF parameters on the ligand. Preserves positions and box vectors. (Not as easily achieved using native OpenMM tools). Parameters ---------- proteinStructure : parmed.structure.Structure The parametrized structure of the protein. moleculeStructure : parmed.structure.Structure The parametrized structure of the ligand. Returns ------- structure : parmed.structure.Structure The parametrized structure of the protein:ligand complex. """ structure = proteinStructure + molStructure positions = combinePostions(proteinStructure.positions, molStructure.positions) # Concatenate positions arrays (ensures same units) structure.positions = positions # Restore original box vectors structure.box = proteinStructure.box return structure
# TODO: Migrate into Molecule
[docs]def generateTopologyFromOEMol(molecule): """ Generate an OpenMM Topology object from an OEMol molecule. Parameters ---------- molecule : openeye.oechem.OEMol The molecule from which a Topology object is to be generated. Returns ------- topology : simtk.openmm.app.Topology The Topology object generated from `molecule`. """ from openeye import oechem # Avoid manipulating the molecule mol = oechem.OEMol(molecule) # Create a Topology object with one Chain and one Residue. from simtk.openmm.app import Topology topology = Topology() chain = topology.addChain() resname = mol.GetTitle() residue = topology.addResidue(resname, chain) # Make sure the atoms have names, otherwise bonds won't be created properly below if any([atom.GetName() =='' for atom in mol.GetAtoms()]): oechem.OETriposAtomNames(mol) # Check names are unique; non-unique names will also cause a problem atomnames = [ atom.GetName() for atom in mol.GetAtoms() ] if any( atomnames.count(atom.GetName())>1 for atom in mol.GetAtoms()): raise Exception("Error: Reference molecule must have unique atom names in order to create a Topology.") # Create atoms in the residue. for atom in mol.GetAtoms(): name = atom.GetName() element = openmm.app.element.Element.getByAtomicNumber(atom.GetAtomicNum()) openmm_atom = topology.addAtom(name, element, residue) # Create bonds. atoms = { atom.name : atom for atom in topology.atoms() } for bond in mol.GetBonds(): aromatic = None if bond.IsAromatic(): aromatic = 'Aromatic' # Add bond, preserving order assessed by OEChem topology.addBond(atoms[bond.GetBgn().GetName()], atoms[bond.GetEnd().GetName()], type=aromatic, order=bond.GetOrder()) return topology
[docs]def get_testdata_filename(relative_path): """Get the full path to one of the reference files in testsystems. In the source distribution, these files are in ``openforcefield/data/``, but on installation, they're moved to somewhere in the user's python site-packages directory. Parameters ---------- name : str Name of the file to load (with respect to the repex folder). """ from pkg_resources import resource_filename fn = resource_filename('openforcefield', os.path.join('data', relative_path)) if not os.path.exists(fn): raise ValueError("Sorry! %s does not exist. If you just added it, you'll have to re-install" % fn) return fn
# TODO: Do we need this?
[docs]def normalize_molecules(molecules): """ Normalize all molecules in specified set. ARGUMENTS molecules (list of OEMol) - molecules to be normalized (in place) """ from openeye import oechem # Add explicit hydrogens. for molecule in molecules: oechem.OEAddExplicitHydrogens(molecule) # Build a conformation for all molecules with Omega. print("Building conformations for all molecules...") from openeye import oeomega omega = oeomega.OEOmega() omega.SetMaxConfs(1) omega.SetFromCT(True) for molecule in molecules: #omega.SetFixMol(molecule) omega(molecule) end_time = time.time() elapsed_time = end_time - start_time print("%.3f s elapsed" % elapsed_time) # Regularize all molecules through writing as mol2. print("Regularizing all molecules...") ligand_mol2_dirname = os.path.dirname(mcmcDbName) + '/mol2' if( not os.path.exists( ligand_mol2_dirname ) ): os.makedirs(ligand_mol2_dirname) ligand_mol2_filename = ligand_mol2_dirname + '/temp' + os.path.basename(mcmcDbName) + '.mol2' start_time = time.time() omolstream = oechem.oemolostream(ligand_mol2_filename) for molecule in molecules: # Write molecule as mol2, changing molecule through normalization. oechem.OEWriteMolecule(omolstream, molecule) omolstream.close() end_time = time.time() elapsed_time = end_time - start_time print("%.3f s elapsed" % elapsed_time) # Assign AM1-BCC charges. print("Assigning AM1-BCC charges...") start_time = time.time() for molecule in molecules: # Assign AM1-BCC charges. if molecule.NumAtoms() == 1: # Use formal charges for ions. OEFormalPartialCharges(molecule) else: # Assign AM1-BCC charges for multiatom molecules. OEAssignPartialCharges(molecule, OECharges_AM1BCC, False) # use explicit hydrogens # Check to make sure we ended up with partial charges. if OEHasPartialCharges(molecule) == False: print("No charges on molecule: '%s'" % molecule.GetTitle()) print("IUPAC name: %s" % OECreateIUPACName(molecule)) # TODO: Write molecule out # Delete themolecule. molecules.remove(molecule) end_time = time.time() elapsed_time = end_time - start_time print("%.3f s elapsed" % elapsed_time) print("%d molecules remaining" % len(molecules)) return
# TODO: Migrate into Molecule
[docs]def read_molecules(filename, verbose=True): """ Read molecules from an OpenEye-supported file. Parameters ---------- filename : str Filename from which molecules are to be read (e.g. mol2, sdf) Returns ------- molecules : list of OEMol List of molecules read from file """ from openeye import oechem if not os.path.exists(filename): built_in = get_testdata_filename('molecules/%s' % filename) if not os.path.exists(built_in): raise Exception("File '%s' not found." % filename) filename = built_in if verbose: print("Loading molecules from '%s'..." % filename) start_time = time.time() molecules = list() input_molstream = oechem.oemolistream(filename) flavor = oechem.OEIFlavor_Generic_Default | oechem.OEIFlavor_MOL2_Default | oechem.OEIFlavor_MOL2_Forcefield input_molstream.SetFlavor(oechem.OEFormat_MOL2, flavor) molecule = oechem.OECreateOEGraphMol() while oechem.OEReadMolecule(input_molstream, molecule): # If molecule has no title, try getting SD 'name' tag if molecule.GetTitle() == '': name = oechem.OEGetSDData(molecule, 'name').strip() molecule.SetTitle(name) # Append to list. molecule_copy = oechem.OEMol(molecule) molecules.append(molecule_copy) input_molstream.close() if verbose: print("%d molecules read" % len(molecules)) end_time = time.time() elapsed_time = end_time - start_time if verbose: print("%.3f s elapsed" % elapsed_time) return molecules
# TODO: Migrate into Molecule
[docs]def setPositionsInOEMol(oemol, positions): """Set the positions in an OEMol using a position array with units from simtk.unit, i.e. from OpenMM. Atoms must have same order. Arguments: --------- oemol : OEMol OpenEye molecule positions : Nx3 array Unit-bearing via simtk.unit Nx3 array of coordinates """ from openeye import oechem if oemol.NumAtoms() != len(positions): raise ValueError("Number of atoms in molecule does not match length of position array.") pos_unitless = positions/unit.angstroms coordlist = [] for idx in range(len(pos_unitless)): for j in range(3): coordlist.append(pos_unitless[idx][j]) oemol.SetCoords(oechem.OEFloatArray(coordlist))
# TODO: Migrate into Molecule
[docs]def extractPositionsFromOEMol(oemol): """Get the positions from an OEMol and return in a position array with units via simtk.unit, i.e. foramtted for OpenMM. Adapted from choderalab/openmoltools test function extractPositionsFromOEMOL Arguments: ---------- oemol : OEMol OpenEye molecule Returns: -------- positions : Nx3 array Unit-bearing via simtk.unit Nx3 array of coordinates """ positions = unit.Quantity(np.zeros([oemol.NumAtoms(), 3], np.float32), unit.angstroms) coords = oemol.GetCoords() for index in range(oemol.NumAtoms()): positions[index,:] = unit.Quantity(coords[index], unit.angstroms) return positions
[docs]def read_typelist(filename): """ Read a parameter type or decorator list from a file. Lines in these files have the format "SMARTS/SMIRKS shorthand" lines beginning with '%' are ignored Parameters ---------- filename : str Path and name of file to be read Could be file in openforcefield/data/ Returns ------- typelist : list of tuples Typelist[i] is element i of the typelist in format (smarts, shorthand) """ if filename is None: return None if not os.path.exists(filename): built_in = get_testdata_filename(filename) if not os.path.exists(built_in): raise Exception("File '%s' not found." % filename) filename = built_in typelist = list() ifs = open(filename) lines = ifs.readlines() used_typenames = list() for line in lines: # Strip trailing comments index = line.find('%') if index != -1: line = line[0:index] # Split into tokens. tokens = line.split() # Process if we have enough tokens if len(tokens) >= 2: smarts = tokens[0] typename = ' '.join(tokens[1:]) if typename not in used_typenames: typelist.append([smarts,typename]) used_typenames.append(typename) else: raise Exception("Error in file '%s' -- each entry must " "have a unique name." % filename ) ifs.close() return typelist
# TODO: This only works with the OpenEye toolkit installed; replace with Molecule API
[docs]def positions_from_oemol(mol): """ Extract OpenMM positions from OEMol. Parameters ---------- mol : oechem.openeye.OEMol OpenEye molecule from which to extract coordinates. Returns ------- positions : simtk.unit.Quantity of dimension (nparticles,3) """ from openeye import oechem, oeomega if mol.GetDimension() != 3: # Assign coordinates omega = oeomega.OEOmega() omega.SetMaxConfs(1) omega.SetIncludeInput(False) omega.SetCanonOrder(False) omega.SetSampleHydrogens(True) # Word to the wise: skipping this step can lead to significantly different charges! status = omega(mol) # generate conformation coordinates = mol.GetCoords() natoms = len(coordinates) positions = np.zeros([natoms,3], np.float32) for index in range(natoms): (x,y,z) = coordinates[index] positions[index,0] = x positions[index,1] = y positions[index,2] = z positions = unit.Quantity(positions, unit.angstroms) return positions
[docs]def check_energy_is_finite(system, positions): """ Check the potential energy is not NaN. Parameters ---------- system : simtk.openmm.System The system to check positions : simtk.unit.Quantity of dimension (natoms,3) with units of length The positions to use """ integrator = openmm.VerletIntegrator(1.0 * unit.femtoseconds) context = openmm.Context(system, integrator) context.setPositions(positions) state = context.getState(getEnergy=True) energy = state.getPotentialEnergy() / unit.kilocalories_per_mole if np.isnan(energy): raise Exception('Potential energy is NaN')
[docs]def get_energy(system, positions): """ Return the potential energy. Parameters ---------- system : simtk.openmm.System The system to check positions : simtk.unit.Quantity of dimension (natoms,3) with units of length The positions to use Returns --------- energy """ integrator = openmm.VerletIntegrator(1.0 * unit.femtoseconds) context = openmm.Context(system, integrator) context.setPositions(positions) state = context.getState(getEnergy=True) energy = state.getPotentialEnergy() / unit.kilocalories_per_mole return energy
#============================================================================= # OPENMM MERGING AND EXPORTING UTILITY FUNCTIONS #============================================================================= # TODO: Reorganize this file, moving exporters to openforcefield.exporters
[docs]def get_molecule_parameterIDs(molecules, forcefield): """Process a list of molecules with a specified SMIRNOFF ffxml file and determine which parameters are used by which molecules, returning collated results. Parameters ---------- molecules : list of openforcefield.topology.Molecule List of molecules (with explicit hydrogens) to parse forcefield : openforcefield.typing.engines.smirnoff.ForceField The ForceField to apply Returns ------- parameters_by_molecule : dict Parameter IDs used in each molecule, keyed by isomeric SMILES generated from provided OEMols. Each entry in the dict is a list which does not necessarily have unique entries; i.e. parameter IDs which are used more than once will occur multiple times. parameters_by_ID : dict Molecules in which each parameter ID occur, keyed by parameter ID. Each entry in the dict is a set of isomeric SMILES for molecules in which that parameter occurs. No frequency information is stored. """ from openforcefield.topology import Topology # Create storage parameters_by_molecule = dict() parameters_by_ID = dict() # Generate isomeric SMILES for each molecule, ensuring all molecules are unique isosmiles = [ molecule.to_smiles() for molecule in molecules ] already_seen = set() duplicates = set(smiles for smiles in isosmiles if smiles in already_seen or already_seen.add(smiles)) if len(duplicates) > 0: raise ValueError("Error: get_molecule_parameterIDs has been provided a list of oemols which contains some duplicates: {}".format(duplicates)) # Assemble molecules into a Topology topology = Topology() for molecule in molecules: topology.add_molecule(molecule) # Label molecules labels = forcefield.label_molecules(topology) # Organize labels into output dictionary by looping over all molecules/smiles for idx in range(len(isosmiles)): # Pull smiles, initialize storage smi = isosmiles[idx] parameters_by_molecule[smi] = [] # Organize data for this molecule data = labels[idx] for force_type in data.keys(): for atom_indices, parameter_type in data[force_type].items(): pid = parameter_type.id # Store pid to molecule parameters_by_molecule[smi].append(pid) # Store which molecule this pid occurred in if pid not in parameters_by_ID: parameters_by_ID[pid] = set() parameters_by_ID[pid].add(smi) else: parameters_by_ID[pid].add(smi) return parameters_by_molecule, parameters_by_ID
[docs]def getMolParamIDToAtomIndex(molecule, forcefield): """Take a Molecule and a SMIRNOFF forcefield object and return a dictionary, keyed by parameter ID, where each entry is a tuple of ( smirks, [[atom1, ... atomN], [atom1, ... atomN]) giving the SMIRKS corresponding to that parameter ID and a list of the atom groups in that molecule that parameter is applied to. Parameters ---------- molecule : openforcefield.topology.Molecule Molecule to investigate forcefield : ForceField SMIRNOFF ForceField object (obtained from an ffxml via ForceField(ffxml)) containing FF of interest. Returns ------- param_usage : dictionary Dictionary, keyed by parameter ID, where each entry is a tuple of ( smirks, [[atom1, ... atomN], [atom1, ... atomN]) giving the SMIRKS corresponding to that parameter ID and a list of the atom groups in that molecule that parameter is applied to. """ topology = Topology() topology.add_molecule(molecule) labels = ff.labal_molecules(topology) param_usage = {} for mol_entry in range(len(labels)): for force in labels[mol_entry].keys(): for (atom_indices, pid, smirks) in labels[mol_entry][force]: if not pid in param_usage: param_usage[pid] = (smirks, [atom_indices]) else: param_usage[pid][1].append( atom_indices ) return param_usage
[docs]def merge_system(topology0, topology1, system0, system1, positions0, positions1, label0="AMBER system", label1="SMIRNOFF system", verbose=True): """Merge two given OpenMM systems. Returns the merged OpenMM System. Parameters ---------- topology0 : OpenMM Topology Topology of first system (i.e. a protein) topology1 : OpenMM Topology Topology of second system (i.e. a ligand) system0 : OpenMM System First system for merging (usually from AMBER) system1 : OpenMM System Second system for merging (usually from SMIRNOFF) positions0 : simtk.unit.Quantity wrapped Positions to use for energy evaluation comparison positions1 (optional) : simtk.unit.Quantity wrapped (optional) Positions to use for second OpenMM system label0 (optional) : str String labeling system0 for output. Default, "AMBER system" label1 (optional) : str String labeling system1 for output. Default, "SMIRNOFF system" verbose (optional) : bool Print out info on topologies, True/False (default True) Returns ---------- topology : OpenMM Topology system : OpenMM System positions: unit.Quantity position array """ #Load OpenMM Systems to ParmEd Structures import parmed structure0 = parmed.openmm.load_topology( topology0, system0 ) structure1 = parmed.openmm.load_topology( topology1, system1 ) #Merge parameterized Structure structure = structure0 + structure1 topology = structure.topology #Concatenate positions arrays positions_unit = unit.angstroms positions0_dimensionless = np.array( positions0 / positions_unit ) positions1_dimensionless = np.array( positions1 / positions_unit ) coordinates = np.vstack((positions0_dimensionless,positions1_dimensionless)) natoms = len(coordinates) positions = np.zeros([natoms,3], np.float32) for index in range(natoms): (x,y,z) = coordinates[index] positions[index,0] = x positions[index,1] = y positions[index,2] = z positions = unit.Quantity(positions, positions_unit) #Generate merged OpenMM system system = structure.createSystem() if verbose: print("Generating ParmEd Structures...\n \t{}: {}\n \t{}: {}\n".format(label0, structure0, label1, structure1)) print("Merged ParmEd Structure: {}".format( structure )) return topology, system, positions
[docs]def save_system_to_amber(openmm_topology, system, positions, prmtop, inpcrd): """Save an OpenMM System, with provided topology and positions, to AMBER prmtop and coordinate files. Parameters ---------- openmm_topology : OpenMM Topology Topology of the system to be saved, perhaps as loaded from a PDB file or similar. system : OpenMM System Parameterized System to be saved, containing components represented by Topology positions : unit.Quantity position array Position array containing positions of atoms in topology/system prmtop : filename AMBER parameter file name to write inpcrd : filename AMBER coordinate file name (ASCII crd format) to write """ import parmed structure = parmed.openmm.topsystem.load_topology(openmm_topology, system, positions) structure.save(prmtop, overwrite = True, format="amber") structure.save(inpcrd, format='rst7', overwrite = True)
[docs]def save_system_to_gromacs(openmm_topology, system, positions, top, gro): """Save an OpenMM System, with provided topology and positions, to AMBER prmtop and coordinate files. Parameters ---------- openmm_topology : OpenMM Topology Topology of the system to be saved, perhaps as loaded from a PDB file or similar. system : OpenMM System Parameterized System to be saved, containing components represented by Topology positions : unit.Quantity position array Position array containing positions of atoms in topology/system top : filename GROMACS topology file name to write gro : filename GROMACS coordinate file name (.gro format) to write """ import parmed structure = parmed.openmm.topsystem.load_topology(openmm_topology, system, positions ) structure.save(top, overwrite = True, format="gromacs") structure.save(gro, overwrite = True, format="gro")