Source code for openforcefield.typing.chemistry.environment

#!/usr/bin/env python

#==============================================================================
# MODULE DOCSTRING
#==============================================================================

"""
environment.py

.. warning :: This file is  will be updated to comply with PEP8.

Classes defining a chemical environment for atoms and how they are connected
using networkx graph objects to organize and make changes to the structure.
Output will be in the form of SMARTS and SMIRKS.

AUTHORS

Caitlin Bannan <bannanc@uci.edu>, Mobley Lab, University of California Irvine,
with contributions from John Chodera, Memorial Sloan Kettering Cancer Center
and David Mobley, UC Irvine.

"""

#==============================================================================
# GLOBAL IMPORTS
#==============================================================================

import re
import copy

import networkx as nx
from numpy import random

import openforcefield.utils


#==============================================================================
# Functions
#==============================================================================

def _find_embedded_brackets(string, in_char, out_char):
    """
    Finds the substring surrounded by the in_char and out_char
    intended use is to identify embedded bracketed sequences

    string - a string you want separated
    in_char - regular expression for the character you're looking for '\(' for '('
    out_char - regular expression for the closing character such as '\)' for ')'

    string = "[#1$(*-C(-[#7,#8,F,#16,Cl,Br])-[#7,#8,F,#16,Cl,Br]):1]"
    sub_string, in_idx, out_idx = _find_embedded_brackets(string, '\(','\)')
    # sub_string = (*-C(-[#7,#8,F,#16,Cl,Br])-[#7,#8,F,#16,Cl,Br])  in_idx = 4, out_idx = 50
    """
    in_list = [m.start() for m in re.finditer(in_char, string)]
    out_list = [m.start() for m in re.finditer(out_char, string)]
    # If no occurance of the in_char return an empty string
    if len(in_list) == 0:
        return "", -1, -1
    # If no out_char returns the first in_char to the end
    if len(out_list) == 0:
        return string[in_list[0]:], in_list[0], -1

    # Otherwise find closure from the first in_char
    list_idx = 0
    while list_idx < len(in_list) - 1:
        if in_list[list_idx+1] > out_list[list_idx]:
            break
        list_idx+=1
    in_idx = in_list[0]
    out_idx = out_list[list_idx]
    return string[in_idx:out_idx+1], in_idx, out_idx

def _convert_embedded_SMIRKS(smirks):
    """
    Converts a SMIRKS string with the $(...) in an atom to the
    form expected by the environment parser

    smirks = any smirks string, if no $(...) then the original smirks is returned

    initial_smirks = "[#1$(*~[#6]):1]"
    new_smirks = _convert_embedded_SMIRKS(initial_smirks)
    # new_smirks = [#1:1]~[#6]
    """
    a_out = 0
    while smirks.find('$(') != -1:
        # Find first atom
        atom, a_in, a_out = _find_embedded_brackets(smirks, r'\[', r'\]')
        d = atom.find('$(')
        # Find atom with the $ string embedded
        while d == -1:
            atom, temp_in, temp_out = _find_embedded_brackets(smirks[a_out+1:], r'\[', r'\]')
            a_in = a_out + temp_in + 1
            a_out += temp_out + 1
            d = atom.find('$(')

        # Store the smirks pattern before and after the relevant atom
        pre_smirks = smirks[:a_in]
        post_smirks = smirks[a_out+1:]

        # Check for ring index, i.e. the 1s in "[#6:1]1-CCCCC1"
        match = re.match(r'(\d+)',post_smirks)
        if match is not None: # leftover starts with int
            ring_out = re.findall(r'(\d+)',post_smirks)[0]
            # update post_smirks
            post_smirks = post_smirks[match.end():]
        else:
            ring_out = ''

        embedded, p_in, p_out = _find_embedded_brackets(atom, r'\(', r'\)')
        # two forms of embedded strings $(*~stuff) or $([..]~stuff)
        # in the latter case the first atom refers the current atom
        if embedded[1] == '[':
            first, f_in, f_out = _find_embedded_brackets(embedded, r'\[', r'\]')
            first = _convert_embedded_SMIRKS(first)
            new_atom = atom[:d]+first[1:-1]+atom[p_out+1:]
            embedded = embedded[f_out+1:]
            # if embedded is empty between brackets, remove it
            if embedded.replace('(','').replace(')','') == '':
                embedded = ''

        elif embedded[1] == '*': # embedded[1] = *
            new_atom = atom[:d]+atom[p_out+1:]
            embedded = embedded[2:]

        else: # embedded starts with a "no bracket" atom such as 'C'
            embedded = embedded[1:] # remove leading '('
            # atoms by symbol don't need brackets, this covers atomic symbols and aromatic atoms
            no_bracket = r'(!?[A-Z][a-z]?|!?[cnops])'
            match = re.match(no_bracket, embedded)
            if match is not None:
                new_atom = atom[:d]+embedded[:match.end()]+atom[p_out+1:]
                embedded = embedded[match.end():]
            else:
                new_atom = atom[:d]+atom[p_out+1]

        # Look for ring insided embedded SMIRKS "[#6$(*1CCC1)]"
        match = re.match(r'(\d+)', embedded)
        if match is not None: # embedded starts with an int
            ring_in = re.findall(r'(\d+)', embedded)[0]
            embedded = '(' + embedded[match.end():]
        else:
            ring_in = ''
            if embedded != '':
                embedded = '(' + embedded

        # Make new smirks
        smirks = pre_smirks+new_atom+ring_out+ring_in+embedded+post_smirks

    return smirks

def _remove_blanks_repeats(init_list, remove_list = ['']):
    """
    Returns the input list 'init_list'
    without any repeating entries or blank strings ''
    """
    final_list = [item for item in init_list if item not in remove_list]
    return list( set(final_list) )


class SMIRKSMismatchError(openforcefield.utils.MessageException):
    """
    Exception for cases where smirks are inappropriate
    for the environment type they are being parsed into
    """
    pass


class SMIRKSParsingError(openforcefield.utils.MessageException):
    """
    Exception for when SMIRKS are not parseable for any environment
    """
    pass


[docs]class ChemicalEnvironment(object): """Chemical environment abstract base class that matches an atom, bond, angle, etc. .. warning :: This class is largely redundant with the same one in the Chemper package, and will likely be removed. """
[docs] class Atom(object): """Atom representation, which may have some ORtypes and ANDtypes properties. Attributes ---------- ORtypes : list of tuples in the form (base, [list of decorators]) where bases and decorators are both strings The descriptor types that will be combined with logical OR ANDtypes : list of string The descriptor types that will be combined with logical AND """ def __init__(self, ORtypes = None, ANDtypes = None, index = None, ring = None): """Initialize an Atom object with optional descriptors. Parameters ----------- ORtypes: list of tuples for ORbases and ORdecorators, in the form (base, [list of decorators]) optional, default = [] ANDtypes: list of str, strings that will be AND'd together in a SMARTS optional, default = None index : int, optional, default=None If not None, the specified index will be attached as a SMIRKS index (e.g. '[#6:1]') ring : int, optional, default = None If not None, the specified ring index will be attached at the end of the atom i.e. '[#6:1]1' """ # List of 2 tuples in the form [ (ORbase, ORdecorator), ...] if ORtypes is not None: self.ORtypes = copy.deepcopy(ORtypes) else: self.ORtypes = list() # Set of strings that will be AND'd to the the end if ANDtypes is not None: self.ANDtypes = list(copy.deepcopy(ANDtypes)) else: self.ANDtypes = list() self.index = index self.ring = ring self._atom = True
[docs] def asSMARTS(self): """Return the atom representation as SMARTS. Returns -------- smarts : str The SMARTS string for this atom """ smarts = '[' # Add the OR'd features if self.ORtypes: ORList = list() for (base, ORdecorators) in self.ORtypes: if len(base) > 0 and base[0] == '$': # after a $base an explicit '&' is necessary if ORdecorators: OR = base+'&'+''.join(ORdecorators) else: OR = base else: # base doesn't start with $ OR = base+''.join(ORdecorators) ORList.append(OR) smarts += ','.join(ORList) else: smarts += '*' if len(self.ANDtypes) > 0: smarts += ';' + ';'.join(self.ANDtypes) if self.ring is not None: return smarts + ']' + str(self.ring) else: return smarts + ']'
[docs] def asSMIRKS(self): """Return the atom representation as SMIRKS. Returns -------- smirks : str The SMIRKS string for this atom """ smirks = self.asSMARTS() # No index specified so SMIRKS = SMARTS if self.index == None: return smirks # Add label to the end of SMARTS else: sub_string, start, end = _find_embedded_brackets(smirks, r'\[', r'\]') if self.ring is not None: return sub_string[:-1] + ':' + str(self.index) + ']'+str(self.ring) else: return sub_string[:-1] + ':' + str(self.index) + ']'
[docs] def addORtype(self, ORbase, ORdecorators): """ Adds ORtype to the set for this atom. Parameters ---------- ORbase: string, such as '#6' ORdecorators: list of strings, such as ['X4','+0'] """ ORdecorators = _remove_blanks_repeats(ORdecorators, ['',ORbase]) self.ORtypes.append((ORbase, ORdecorators))
[docs] def addANDtype(self, ANDtype): """ Adds ANDtype to the set for this atom. Parameters ---------- ANDtype: string added to the list of ANDtypes for this atom """ self.ANDtypes.append(ANDtype) self.ANDtypes = _remove_blanks_repeats(self.ANDtypes)
[docs] def getORtypes(self): """ returns a copy of the dictionary of ORtypes for this atom """ return copy.deepcopy(self.ORtypes)
[docs] def setORtypes(self, newORtypes): """ sets new ORtypes for this atom Parameters ---------- newORtypes: list of tuples in the form (base, [ORdecorators]) for example: ('#6', ['X4','H0','+0']) --> '#6X4H0+0' """ self.ORtypes = list() if newORtypes is not None: for (base, decs) in newORtypes: adjusted_decs = _remove_blanks_repeats(decs, ['', base]) self.ORtypes.append( (base, adjusted_decs) )
[docs] def getANDtypes(self): """ returns a copy of the list of ANDtypes for this atom """ return list(copy.deepcopy(self.ANDtypes))
[docs] def setANDtypes(self, newANDtypes): """ sets new ANDtypes for this atom Parameters ---------- newANDtypes: list of strings strings that will be AND'd together in a SMARTS """ if newANDtypes is None: self.ANDtypes = list() else: self.ANDtypes = _remove_blanks_repeats(newANDtypes)
[docs] class Bond(Atom): """Bond representation, which may have ORtype and ANDtype descriptors. Attributes ---------- ORtypes : list of tuples of ORbases and ORdecorators in form (base: [list of decorators]) The ORtype types that will be combined with logical OR ANDtypes : list of string The ANDtypes that will be combined with logical AND """ # Implementation identical to atoms apart from what is put in the asSMARTS/asSMIRKS strings def __init__(self, ORtypes = None, ANDtypes = None): """ Parameters ----------- ORtypes: list of tuples, optional, default = None tuples have form (base, [ORdecorators]) bond descriptors that will be OR'd together in a SMARTS ANDtypes: list of str, optional, default = None strings that will be AND'd together in a SMARTS index: integer, default = None This is for book keeping inside environments and will not be shown in SMARTS or SMIRKS example: bond1 in a Bond is the bond between atom1 and atom2 """ super(ChemicalEnvironment.Bond,self).__init__(ORtypes, ANDtypes, None, None) self._atom = False return
[docs] def asSMARTS(self): """Return the atom representation as SMARTS. Returns ------- smarts : str The SMARTS string for just this atom """ if self.ORtypes: ORcombos = list() for (ORbase, ORdecorators) in self.ORtypes: ORcombos.append(ORbase+''.join(ORdecorators)) smarts = ','.join(ORcombos) else: smarts = '~' if len(self.ANDtypes) > 0: smarts += ';' + ';'.join(self.ANDtypes) return smarts
[docs] def asSMIRKS(self): """ Returns ------- smarts : str The SMIRKS string for just this bond """ #the same as asSMARTS() # for consistency asSMARTS() or asSMIRKS() can be called # for all environment objects return self.asSMARTS()
[docs] def getOrder(self): """ Returns a float for the order of this bond for multiple ORtypes or ~ it returns the minimum possible order the intended application is for checking valence around a given atom """ # Minimum order for empty ORtypes is 1: if not self.ORtypes: return 1 orderDict = {'~':1., '-':1., ':': 1.5, '=':2., '#':3., '!-':1.5, '!:':1., '!=':1., '!#':1.} orderList = [orderDict[base] for (base, decor) in self.ORtypes] return min(orderList)
[docs] @staticmethod def validate(smirks, ensure_valence_type=None, toolkit='openeye'): """Validate the provided SMIRKS string is valid, and if requested, tags atoms appropriate to the specified valence type. Parameters ---------- smirks : str The SMIRKS expression to validate ensure_valence_type : str, optional, default=None If specified, ensure the tagged atoms are appropriate to the specified valence type This method will raise a :class:`SMIRKSParsingError` if the provided SMIRKS string is not valid. """ chemenv = ChemicalEnvironment(smirks, toolkit=toolkit) if ensure_valence_type: valence_type = chemenv.getType() if valence_type != ensure_valence_type: raise SMIRKSParsingError("Tagged atoms in SMARTS string '%s' specifies valence type '%s', expected '%s'." % (smirks, valence_type, ensure_valence_type))
[docs] def __init__(self, smirks = None, label = None, replacements = None, toolkit='openeye'): """Initialize a chemical environment abstract base class. smirks = string, optional if smirks is not None, a chemical environment is built from the provided SMIRKS string label = anything, optional intended to be used to label this chemical environment could be a string, int, or float, or anything replacements = list of lists, optional, [substitution, smarts] form for parsing SMIRKS """ # TODO: Refactor all this class to use the ToolkitRegistry API. if toolkit.lower() == 'openeye' and openforcefield.utils.OpenEyeToolkitWrapper.is_available(): self.toolkit = 'openeye' elif toolkit.lower() == 'rdkit' and openforcefield.utils.RDKitToolkitWrapper.is_available(): self.toolkit = 'rdkit' else: raise ValueError("Could not find toolkit {}, please use/install " "openeye or rdkit.".format(toolkit)) # Define the regular expressions used for all SMIRKS decorators # There are a limited number of descriptors for smirks string they are: # That is a # followed by one or more ints w/or w/o at ! in front '!#16' element_num = r"!?[#]\d+" # covers element symbols, i.e. N,C,O,Br not followed by a number element_sym = "!?[A-Z][a-z]?" # covers element symbols that are aromatic: aro_sym = "!?[cnops]" # replacement strings replace_str = r"\$\w+" # a or A w/ or w/o a ! in front 'A' aro_ali = "!?[aA]" # the decorators (D,H,j,r,V,X,^) followed by one or more integers needs_int = r"!?[DHjrVX^]\d+" # R(x), +, - do not need to be followed by a integer w/ or w/o a ! 'R2' optional_int = r"!?[Rx+-]\d*" # chirality options, "@", "@@", "@int" w/ or w/o a ! in front chirality = r"!?[@]\d+|!?[@]@?" # Generate RegEx string for decorators: self.no_bracket_atom_reg = r'('+'|'.join([element_sym, aro_sym, replace_str])+')' self.atom_reg = '|'.join([element_num, aro_ali, needs_int, optional_int, chirality, replace_str, element_sym, aro_sym]) self.atom_reg = r'('+self.atom_reg+')' # Define bond regular expression options below in order: # single, double, triple, aromatic, directional up bond, directional down bond # Each can have ! in from and directional can have ? after # up and down bonds have lots of \ to fit the python requirements self.bond_regs = ['!?[-]', '!?[=]', '!?[#]', '!?[:]', '!?[@]', '!?[\\\\]\\??', '!?[\\/]\\??'] self.bond_regs = r'('+'|'.join(self.bond_regs)+')' # Note, not looking for ~ because that is used for empty bonds # Create an empty graph which will store Atom objects. self._graph = nx.Graph() self.label = label self.replacements = replacements if smirks is not None: # Check that it is a valid SMIRKS if not self.isValid(smirks): raise SMIRKSParsingError("Error Provided SMIRKS ('%s') was \ not parseable with %s tools" % (smirks, self.toolkit)) # Check for SMIRKS not supported by Chemical Environments if smirks.find('.') != -1: raise SMIRKSParsingError("Error: Provided SMIRKS ('%s') \ contains a '.' indicating multiple molecules in the same pattern. This type \ of pattern is not parseable into ChemicalEnvironments" % smirks) if smirks.find('>') != -1: raise SMIRKSParsingError("Error: Provided SMIRKS ('%s') \ contains a '>' indicating a reaction. This type of pattern is not parseable \ into ChemicalEnvironments." % smirks) # try parsing into environment object try: self._parse_smirks(smirks) except: raise SMIRKSParsingError("Error SMIRKS (%s) was not parseable\ into a ChemicalEnvironment" % smirks) # Check that the created Environment is valid if not self.isValid(): raise SMIRKSParsingError("Input SMIRKS (%s), converted to %s \ is now invalid" % (smirks, self.asSMIRKS())) return
def _graph_remove_node(self, node): self._graph.remove_node(node) return True def _graph_nodes(self, data=False): """ When data is False returns a list of nodes in graph otherwise returns a dictionary in the form {node: data} """ if data: return dict(self._graph.nodes(data=True)) return list(self._graph.nodes()) def _graph_edges(self, data=False, node=None): """ returns a list of tuples, If data is False it has the form [(node1, node2)] Otherwise it includes the data [(node1, node2, data_dictionary)] If node is not None then the list includes only edges connected to that node """ if node is None: return list(self._graph.edges(data=data)) return list(self._graph.edges(node, data=data)) def _graph_neighbors(self, node): """ returns a list of neighbors for the given node """ return list(self._graph.neighbors(node)) def _graph_get_edge_data(self, node1, node2): """ returns a dictionary for the data at the edged connecting node1 and node2 in graph """ return self._graph.get_edge_data(node1, node2)
[docs] def isValid(self, smirks=None): """ Returns if the environment is valid, that is if it creates a parseable SMIRKS string. """ if smirks is None: smirks = self._asSMIRKS() if self.toolkit == 'openeye': return self._oe_isValid(smirks) elif self.toolkit == 'rdkit': return self._rdk_isValid(smirks) else: raise Exception("Could not import openeye.oechem or rdkit.Chem")
def _rdk_isValid(self, smirks): from rdkit import Chem if self.replacements is not None: for substring, replace_with in self.replacements: smirks = smirks.replace(substring, '('+replace_with+')') ss = Chem.MolFromSmarts(smirks) if ss is None: print(smirks, 'not parsed') return ss is not None def _oe_isValid(self, smirks): """ Returns if the atom is valid, that is if it creates a parseable SMIRKS string. """ from openeye import oechem qmol = oechem.OEQMol() if self.replacements is not None: smirks = oechem.OESmartsLexReplace(smirks, self.replacements) return oechem.OEParseSmarts(qmol, smirks) def _parse_smirks(self,input_smirks): """ This function converts a smirks string to a Chemical Environment """ smirks = _convert_embedded_SMIRKS(input_smirks) atoms = dict() # store created atom idx = 1 # current atom being created store = list() # to store indices while branching bondingTo = idx # which atom are we going to bond to atom_string, start, end = _find_embedded_brackets(smirks, r'\[', r'\]') if start != 0: # first atom is not in square brackets if start != -1: start_string = smirks[:start] else: start_string = smirks # Check for atoms not between square brackets split = re.split(self.no_bracket_atom_reg, start_string) atom_string = split[1] # update leftover for this condition if start != -1: # there is at least 1 more bracketed atom leftover = ''.join(split[2:])+smirks[start:] else: leftover = ''.join(split[2:]) else: # First atom is in square brackets leftover = smirks[end+1:] # remove square brackets for parsing atom_string = atom_string[1:-1] # Check for ring index, i.e. the 1s in "[#6:1]1-CCCCC1" match = re.match(r'(\d+)',leftover) if match is not None: # leftover starts with int ring = re.findall(r'(\d+)',leftover)[0] leftover = leftover[match.end():] else: ring = None # Get atom information and create first atom OR, AND, index = self._getAtomInfo(atom_string) new_atom = self.addAtom(None, newORtypes = OR, newANDtypes = AND, newAtomIndex = index, newAtomRing = ring, beyondBeta = True) atoms[idx] = new_atom while len(leftover) > 0: idx += 1 # Check for branching if leftover[0] == ')': bondingTo = store.pop() leftover = leftover[1:] continue if leftover[0] == '(': store.append(bondingTo) leftover = leftover[1:] continue # find beginning and end of next [atom] atom_string, start, end = _find_embedded_brackets(leftover, r'\[', r'\]') if start != -1: # no more square brackets bond_string = leftover[:start] else: bond_string = leftover # Check for atoms not between square brackets bond_split = re.split(self.no_bracket_atom_reg, bond_string) # Next atom is not in brackets for example C in "[#7:1]-C" if len(bond_split) > 1: bond_string = bond_split[0] atom_string = '['+bond_split[1]+']' # update leftover for this condition if start != -1: # ther is at least 1 more bracketed atom leftover = ''.join(bond_split[2:])+leftover[start:] else: leftover = ''.join(bond_split[2:]) else: # next atom is in the brackets [atom] # bond and atom string stay the same, update leftover leftover = leftover[end+1:] # Get bond and atom info bOR, bAND = self._getBondInfo(bond_string) aOR, aAND, index = self._getAtomInfo(atom_string[1:-1]) # Check for ring index, i.e. the 1s in "[#6:1]1-CCCCC1" match = re.match(r'(\d+)',leftover) if match is not None: # leftover starts with int ring = re.findall(r'(\d+)',leftover)[0] leftover = leftover[match.end():] else: ring = None # create new atom new_atom = self.addAtom(atoms[bondingTo], bondORtypes=bOR, bondANDtypes=bAND, newORtypes=aOR, newANDtypes=aAND, newAtomIndex=index, newAtomRing=ring, beyondBeta=True) # update state atoms[idx] = new_atom bondingTo = idx return def _getAtomInfo(self, atom): """ given atom string, returns ORtypes, ANDtypes, and index """ # Find atom index colon = atom.find(':') if colon == -1: index = None else: index = int(atom[colon+1:]) atom = atom[:colon] split = atom.split(';') # Get ANDtypes (and split them if they don't use ;) ANDtypes = list() for a in split[1:]: ANDtypes += re.findall(self.atom_reg, a) # Get ORtypes ORList = split[0].split(',') ORtypes = list() # Separate ORtypes into bases and decorators for OR in ORList: ORbase, ORdecors = self._separateORtypes(OR) if ORbase != None: ORtypes.append( (ORbase, ORdecors) ) return ORtypes, ANDtypes, index def _separateORtypes(self, ORtype): """ Separates ORtype (i.e. "#6X4R+0") into a base and decorators (i.e. '#6', ['X4','R','+0'] ) """ # special case 1: wild card if ORtype == '*': return None, [] # if ORbase is a wildcard if ORtype[0] == '*': return '*', re.findall(self.atom_reg, ORtype[1:]) # Split up decorators by RegEx strings for atoms split = re.findall(self.atom_reg, ORtype) if len(split) == 0: return None, [] base = split[0] decs = _remove_blanks_repeats(split[1:], ['',base]) return base, decs def _getBondInfo(self, bond): """ given bond strings returns ORtypes and ANDtypes """ # blank bond string is single or aromatic # empty ORtypes in Chemical Environments are treated as ~ bonds if bond == "": ANDtypes = list() ORtypes = [ ('-', []), (':', []) ] return ORtypes, ANDtypes # AND types indicated by ; at the end split = bond.split(';') ANDtypes = list() for a in split[1:]: ANDtypes += re.findall(self.bond_regs, a) # ORtypes are divided by , ORList = split[0].split(',') ORtypes = list() for OR in ORList: if OR == '~': continue or_divide = re.findall(self.bond_regs, OR) if len(or_divide) > 0: ORtypes.append( (or_divide[0], or_divide[1:])) return ORtypes, ANDtypes
[docs] def asSMIRKS(self, smarts = False): """ Returns a SMIRKS representation of the chemical environment Parameters ----------- smarts: optional, boolean if True, returns a SMARTS instead of SMIRKS without index labels """ return self._asSMIRKS(None, None, smarts)
def _asSMIRKS(self, initialAtom = None, neighbors = None, smarts = False): """Return a SMIRKS representation of the chemical environment. Parameters ---------- initalAtom = optional, atom object This is randomly selected if not chosen. neighbors = optional, list of atom objects This is all of the initalAtom neighbors if not specified generally this is used only for the recursive calls so initial atoms are not reprinted smarts = optional, boolean if True, returns a SMARTS string instead of SMIRKS """ # If empty chemical environment if len(self._graph_nodes()) == 0: return "" if initialAtom == None: initialAtom = self.getAtoms()[0] if neighbors is None: neighbors = self._graph_neighbors(initialAtom) # sort neighbors to guarantee order is constant neighbors = sorted(neighbors, key=lambda atom: atom.asSMIRKS()) # initialize smirks for starting atom if smarts: smirks = initialAtom.asSMARTS() else: smirks = initialAtom.asSMIRKS() # loop through neighbors for idx, neighbor in enumerate(neighbors): # get the SMIRKS for the bond between these atoms # bonds are the same if smarts or smirks bond_edge = self._graph_get_edge_data(initialAtom, neighbor) bondSMIRKS = bond_edge['bond'].asSMIRKS() # Get the neighbors for this neighbor new_neighbors = self._graph_neighbors(neighbor) # Remove initialAtom so it doesn't get reprinted new_neighbors.remove(initialAtom) # Call asSMIRKS again to get the details for that atom atomSMIRKS = self._asSMIRKS(neighbor, new_neighbors, smarts) # Use ( ) for branch atoms (all but last) if idx < len(neighbors) - 1: smirks += '(' + bondSMIRKS + atomSMIRKS + ')' # This is for the atoms that are a part of the main chain else: smirks += bondSMIRKS + atomSMIRKS return smirks
[docs] def selectAtom(self, descriptor = None): """Select a random atom fitting the descriptor. Parameters ---------- descriptor: optional, None None - returns any atom with equal probability int - will return an atom with that index 'Indexed' - returns a random indexed atom 'Unindexed' - returns a random unindexed atom 'Alpha' - returns a random alpha atom 'Beta' - returns a random beta atom Returns -------- a single Atom object fitting the description or None if no such atom exists """ if descriptor == None: return random.choice(self._graph_nodes()) try: descriptor = int(descriptor) except: descriptor = descriptor if type(descriptor) is int: for atom in self.getAtoms(): if atom.index == descriptor: return atom return None atoms = self.getComponentList('atom',descriptor) if len(atoms) == 0: return None return random.choice(atoms)
[docs] def getComponentList(self, component_type, descriptor = None): """ Returns a list of atoms or bonds matching the descriptor Parameters ---------- component_type: string: 'atom' or 'bond' descriptor: string, optional 'all', 'Indexed', 'Unindexed', 'Alpha', 'Beta' Returns ------- """ if descriptor is not None: d = descriptor.lower() else: d = None if not component_type.lower() in ['atom', 'bond']: raise Exception("Error: 'getComponentList()' component_type must be 'atom' or 'bond'") if component_type.lower() == 'atom': if d == 'indexed': return self.getIndexedAtoms() elif d == 'unindexed': return self.getUnindexedAtoms() elif d == 'alpha': return self.getAlphaAtoms() elif d == 'beta': return self.getBetaAtoms() else: return self.getAtoms() elif component_type.lower() == 'bond': if d == 'indexed': return self.getIndexedBonds() elif d == 'unindexed': return self.getUnindexedBonds() elif d == 'alpha': return self.getAlphaBonds() elif d == 'beta': return self.getBetaBonds() return self.getBonds() return None
[docs] def selectBond(self, descriptor = None): """Select a random bond fitting the descriptor. Parameters ---------- descriptor: optional, None None - returns any bond with equal probability int - will return an bond with that index 'Indexed' - returns a random indexed bond 'Unindexed' - returns a random unindexed bond 'Alpha' - returns a random alpha bond 'Beta' - returns a random beta bond Returns ------- a single Bond object fitting the description or None if no such atom exists """ try: descriptor = int(descriptor) except: descriptor = descriptor if type(descriptor) is int: for bond in self.getBonds(): if bond._bond_type == descriptor: return bond return None bonds = self.getComponentList('bond', descriptor) if len(bonds) == 0: return None return random.choice(bonds)
[docs] def addAtom(self, bondToAtom, bondORtypes= None, bondANDtypes = None, newORtypes = None, newANDtypes = None, newAtomIndex = None, newAtomRing = None, beyondBeta = False): """Add an atom to the specified target atom. Parameters ----------- bondToAtom: atom object, required atom the new atom will be bound to bondORtypes: list of tuples, optional strings that will be used for the ORtypes for the new bond bondANDtypes: list of strings, optional strings that will be used for the ANDtypes for the new bond newORtypes: list of strings, optional strings that will be used for the ORtypes for the new atom newANDtypes: list of strings, optional strings that will be used for the ANDtypes for the new atom newAtomIndex: int, optional integer label that could be used to index the atom in a SMIRKS string beyondBeta: boolean, optional if True, allows bonding beyond beta position Returns -------- newAtom: atom object for the newly created atom """ if bondToAtom == None: if len(self._graph_nodes()) > 0: return None newType = newAtomIndex if newType == None: newType = 0 newAtom = self.Atom(newORtypes, newANDtypes, newAtomIndex, newAtomRing) self._graph.add_node(newAtom, atom_type = newType) return newAtom # Check if we can get past beta position bondToType = self._graph_nodes(data=True)[bondToAtom]['atom_type'] if bondToType < 0 and not beyondBeta: return None # determine the type integer for the new atom and bond if newAtomIndex != None: newType = newAtomIndex bondType = max(newType, bondToType) - 1 else: if bondToType > 0: newType = 0 else: newType = bondToType - 1 bondType = newType # create new bond newBond = self.Bond(bondORtypes, bondANDtypes) # create new atom newAtom = self.Atom(newORtypes, newANDtypes, newAtomIndex, newAtomRing) # Add node for newAtom self._graph.add_node(newAtom, atom_type = newType) # Connect original atom and new atom self._graph.add_edge(bondToAtom, newAtom, bond = newBond, bond_type = bondType) newBond._bond_type = bondType return newAtom
[docs] def removeAtom(self, atom, onlyEmpty = False): """Remove the specified atom from the chemical environment. if the atom is not indexed for the SMIRKS string or used to connect two other atoms. Parameters ---------- atom: atom object, required atom to be removed if it meets the conditions. onlyEmpty: boolean, optional True only an atom with no ANDtypes and 1 ORtype can be removed Returns -------- Boolean True: atom was removed, False: atom was not removed """ # labeled atoms can't be removed if atom.index is not None: return False # Atom connected to more than one other atom cannot be removed if len(self._graph_neighbors(atom)) > 1: return False # if you can remove "decorated atoms" remove it if not onlyEmpty: self._graph_remove_node(atom) return True if len(atom.ANDtypes) > 0: return False elif len(atom.ORtypes) > 1: return False self._graph_remove_node(atom) return True
[docs] def getAtoms(self): """ Returns ------- list of atoms in the environment """ return self._graph_nodes()
[docs] def getBonds(self, atom = None): """ Parameters ---------- atom: Atom object, optional, returns bonds connected to atom returns all bonds in fragment if atom is None Returns -------- a complete list of bonds in the fragment """ if atom == None: edge_list = self._graph_edges(data=True) bonds = [data['bond'] for a1, a2, data in edge_list] else: bonds = [] for (a1, a2, info) in self._graph_edges(data=True, node=atom): bonds.append(info['bond']) return bonds
[docs] def getBond(self, atom1, atom2): """ Get bond betwen two atoms Parameters ----------- atom1 and atom2: atom objects Returns -------- bond object between the atoms or None if no bond there """ if atom2 in self._graph_neighbors(atom1): return self._graph_get_edge_data(atom1, atom2)['bond'] else: return None
[docs] def getIndexedAtoms(self): """ returns the list of Atom objects with an index """ index_atoms = [] for atom, info in self._graph_nodes(data=True).items(): if info['atom_type'] > 0: index_atoms.append(atom) return index_atoms
[docs] def getUnindexedAtoms(self): """ returns a list of Atom objects that are not indexed """ unindexed_atoms = [] for atom, info in self._graph_nodes(data=True).items(): if info['atom_type'] < 1: unindexed_atoms.append(atom) return unindexed_atoms
[docs] def getAlphaAtoms(self): """ Returns a list of atoms alpha to any indexed atom that are not also indexed """ alpha_atoms = [] for atom, info in self._graph_nodes(data=True).items(): if info['atom_type'] == 0: alpha_atoms.append(atom) return alpha_atoms
[docs] def getBetaAtoms(self): """ Returns a list of atoms beta to any indexed atom that are not alpha or indexed atoms """ beta_atoms = [] for atom, info in self._graph_nodes(data=True).items(): if info['atom_type'] == -1: beta_atoms.append(atom) return beta_atoms
[docs] def getIndexedBonds(self): """ Returns a list of Bond objects that connect two indexed atoms """ indexedBonds = [] for bond in self.getBonds(): if bond._bond_type > 0: indexedBonds.append(bond) return indexedBonds
[docs] def getUnindexedBonds(self): """ Returns a list of Bond objects that connect an indexed atom to an unindexed atom two unindexed atoms """ unindexedBonds = [] for bond in self.getBonds(): if bond._bond_type < 1: unindexedBonds.append(bond) return unindexedBonds
[docs] def getAlphaBonds(self): """ Returns a list of Bond objects that connect an indexed atom to alpha atoms """ alphaBonds = [] for bond in self.getBonds(): if bond._bond_type == 0: alphaBonds.append(bond) return alphaBonds
[docs] def getBetaBonds(self): """ Returns a list of Bond objects that connect alpha atoms to beta atoms """ betaBonds = [] for bond in self.getBonds(): if bond._bond_type == -1: betaBonds.append(bond) return betaBonds
[docs] def isAlpha(self, component): """ Takes an atom or bond are returns True if it is alpha to an indexed atom """ if component._atom: return self._graph_nodes(data=True)[component]['atom_type'] == 0 else: return component._bond_type == 0
[docs] def isUnindexed(self, component): """ returns True if the atom or bond is not indexed """ if component._atom: return component.index == None else: return component._bond_type < 1
[docs] def isIndexed(self, component): """ returns True if the atom or bond is indexed """ if component._atom: return component.index != None else: return component._bond_type > 0
[docs] def isBeta(self, component): """ Takes an atom or bond are returns True if it is beta to an indexed atom """ if component._atom: return self._graph_nodes(data=True)[component]['atom_type'] == -1 else: return component._bond_type == -1
# TODO: We may want to overhaul ChemicalEnvironment.getType() to return one of ['atom', 'bond', 'angle', 'proper', 'improper'] # and check to make sure the expected connectivity is represented in the SMIRKS expression.
[docs] def getType(self): """ Uses number of indexed atoms and bond connectivity to determine the type of chemical environment Returns ------- chemical environemnt type: 'Atom', 'Bond', 'Angle', 'ProperTorsion', 'ImproperTorsion' None if number of indexed atoms is 0 or > 4 """ index_atoms = self.getIndexedAtoms() natoms = len(index_atoms) if natoms == 1: return "Atom" if natoms == 2: return "Bond" if natoms == 3: return "Angle" if natoms == 4: atom2 = self.selectAtom(2) atom4 = self.selectAtom(4) bond24 = self.getBond(atom2, atom4) if bond24 != None: return "ImproperTorsion" return "ProperTorsion" else: return None
[docs] def getNeighbors(self, atom): """ Returns atoms that are bound to the given atom in the form of a list of Atom objects """ return self._graph_neighbors(atom)
[docs] def getValence(self, atom): """ Returns the valence (number of neighboring atoms) around the given atom """ return len(self._graph_neighbors(atom))
[docs] def getBondOrder(self, atom): """ Returns minimum bond order around a given atom 0 if atom has no neighbors aromatic bonds count as 1.5 any bond counts as 1.0 """ order = 0. for a1, a2, info in self._graph_edges(data=True, node=atom): order += info['bond'].getOrder() return order
class AtomChemicalEnvironment(ChemicalEnvironment): """Chemical environment matching one labeled atom. """ def __init__(self, smirks = "[*:1]", label = None, replacements = None, toolkit='openeye'): """Initialize a chemical environment corresponding to matching a single atom. Parameters ----------- smirks: string, optional the default is an empty Atom corresponding to "[*:1]" label = anything, optional intended to be used to label this chemical environment could be a string, int, or float, or anything replacements = list of lists, optional, [substitution, smarts] form for parsing SMIRKS For example: # create an atom that is carbon, nitrogen, or oxygen with no formal charge atom = AtomChemicalEnvironment([['#6', '#7', '#8'], ['+0']]) print atom.asSMIRKS() # prints: "[#6,#7,#8;+0:1]" """ # Initialize base class super(AtomChemicalEnvironment,self).__init__(smirks, label, replacements, toolkit) correct, expected = self._checkType() if not correct: assigned = self.getType() raise SMIRKSMismatchError("The SMIRKS (%s) was assigned the type %s when %s was expected" % (smirks, assigned, expected)) self.atom1 = self.selectAtom(1) def _checkType(self): return (self.getType() == 'Atom'), 'Atom' def asSMIRKS(self, smarts = False): """ Returns a SMIRKS representation of the chemical environment Parameters ----------- smarts: optional, boolean if True, returns a SMARTS instead of SMIRKS without index labels """ return self._asSMIRKS(self.atom1, None, smarts) def asAtomtypeSMARTS(self): """ Makes SMARTS string for one atom Returns -------- string for the SMARTS string for the first atom (labeled with index :1) This is a single atom with neighbors as decorators in the form: [atom1$(*~neighbor1)$(*~neighbor2)...] """ # smarts for atom1 without last ']' smarts = self.atom1.asSMARTS()[:-1] for idx, neighbor in enumerate(self._graph_neighbors(self.atom1)): new_neighbors = self._graph_neighbors(neighbor) new_neighbors.remove(self.atom1) bondSMARTS = self._graph_get_edge_data(self.atom1, neighbor)['bond'].asSMARTS() neighborSMARTS = self._asSMIRKS(neighbor, new_neighbors, True) smarts += '$(*' + bondSMARTS + neighborSMARTS + ')' return smarts + ']' class BondChemicalEnvironment(AtomChemicalEnvironment): """Chemical environment matching two labeled atoms (or a bond). """ def __init__(self, smirks = "[*:1]~[*:2]", label = None, replacements = None, toolkit='openeye'): """Initialize a chemical environment corresponding to matching two atoms (bond). Parameters ----------- smirks: string, optional the default is an empty Bond corresponding to "[*:1]~[*:2]" label = anything, optional intended to be used to label this chemical environment could be a string, int, or float, or anything replacements = list of lists, optional, [substitution, smarts] form for parsing SMIRKS """ # Initialize base class super(BondChemicalEnvironment,self).__init__(smirks, label, replacements, toolkit) # Add initial atom self.atom2 = self.selectAtom(2) if self.atom2 == None: raise Exception("Error: Bonds need 2 indexed atoms, there were not enough in %s" % smirks) self.bond2 = self._graph_get_edge_data(self.atom1, self.atom2)['bond'] def _checkType(self): return (self.getType() == 'Bond'), 'Bond' class AngleChemicalEnvironment(BondChemicalEnvironment): """Chemical environment matching three marked atoms (angle). """ def __init__(self, smirks = "[*:1]~[*:2]~[*:3]", label = None, replacements = None, toolkit='openeye'): """Initialize a chemical environment corresponding to matching three atoms. Parameters ----------- smirks: string, optional the default is an empty Angle corresponding to "[*:1]~[*:2]~[*:3]" label = anything, optional intended to be used to label this chemical environment could be a string, int, or float, or anything replacements = list of lists, optional, [substitution, smarts] form for parsing SMIRKS """ # Initialize base class super(AngleChemicalEnvironment,self).__init__(smirks, label, replacements, toolkit) # Add initial atom self.atom3 = self.selectAtom(3) self.bond2 = self._graph_get_edge_data(self.atom2, self.atom3)['bond'] def _checkType(self): return (self.getType() == 'Angle'), 'Angle' class TorsionChemicalEnvironment(AngleChemicalEnvironment): """Chemical environment matching four marked atoms (torsion). """ def __init__(self, smirks = "[*:1]~[*:2]~[*:3]~[*:4]", label = None, replacements = None, toolkit='openeye'): """Initialize a chemical environment corresponding to matching four atoms (torsion). Parameters ----------- smirks: string, optional the default is an empty Torsion corresponding to "[*:1]~[*:2]~[*:3]~[*:4]" label = anything, optional intended to be used to label this chemical environment could be a string, int, or float, or anything replacements = list of lists, optional, [substitution, smarts] form for parsing SMIRKS """ # Initialize base class super(TorsionChemicalEnvironment,self).__init__(smirks, label, replacements, toolkit) # Add initial atom self.atom4 = self.selectAtom(4) self.bond3 = self._graph_get_edge_data(self.atom3, self.atom4)['bond'] def _checkType(self): return (self.getType() == 'ProperTorsion'), 'ProperTorsion' class ImproperChemicalEnvironment(AngleChemicalEnvironment): """Chemical environment matching four marked atoms (improper). """ def __init__(self, smirks = "[*:1]~[*:2](~[*:3])~[*:4]", label = None, replacements = None, toolkit='openeye'): """Initialize a chemical environment corresponding four atoms (improper). Parameters ----------- smirks: string, optional the default is an empty Improper corresponding to "[*:1]~[*:2](~[*:3])~[*:4]" label = anything, optional intended to be used to label this chemical environment could be a string, int, or float, or anything """ # Initialize base class super(ImproperChemicalEnvironment,self).__init__(smirks, label, replacements, toolkit) # Add initial atom self.atom4 = self.selectAtom(4) self.bond3 = self._graph_get_edge_data(self.atom2, self.atom4)['bond'] def _checkType(self): return (self.getType() == 'Improper'), 'Improper'