Skip to content

Chemical Utilities

Examples

Examples of using the Chemical Utilities are listed in the Examples section at the bottom of this page.

The majority of the chemical utilities in Workbench use either RDKit or Mordred (Community). Including these utilities makes it possible to use and deploy this functionality in AWS (FeatureSets, Models, Endpoints).

Examples

Canonical Smiles

examples/chem_utils/canonicalize_smiles.py
"""Example for computing Canonicalize SMILES strings"""
import pandas as pd
from workbench.utils.chem_utils import canonicalize

# Each row: (id, input SMILES, expected canonical SMILES)
FIELDS = ("id", "smiles", "expected")
ROWS = [
    ("Acetylacetone", "CC(=O)CC(=O)C", "CC(=O)CC(C)=O"),
    ("Imidazole", "c1cnc[nH]1", "c1c[nH]cn1"),
    ("Pyridone", "C1=CC=NC(=O)C=C1", "O=c1cccccn1"),
    ("Guanidine", "C(=N)N=C(N)N", "N=CN=C(N)N"),
    ("Catechol", "c1cc(c(cc1)O)O", "Oc1ccccc1O"),
    ("Formamide", "C(=O)N", "NC=O"),
    ("Urea", "C(=O)(N)N", "NC(N)=O"),
    ("Phenol", "c1ccc(cc1)O", "Oc1ccccc1"),
]

# One record (dict) per molecule, keyed by FIELDS
test_data = [dict(zip(FIELDS, row)) for row in ROWS]

# Build the input DataFrame, run the canonicalization, and show the result
df = pd.DataFrame(test_data)
result_df = canonicalize(df)
print(result_df)

Output

              id            smiles       expected smiles_canonical
0  Acetylacetone     CC(=O)CC(=O)C  CC(=O)CC(C)=O    CC(=O)CC(C)=O
1      Imidazole        c1cnc[nH]1     c1c[nH]cn1       c1c[nH]cn1
2       Pyridone  C1=CC=NC(=O)C=C1    O=c1cccccn1      O=c1cccccn1
3      Guanidine      C(=N)N=C(N)N     N=CN=C(N)N       N=CN=C(N)N
4       Catechol    c1cc(c(cc1)O)O     Oc1ccccc1O       Oc1ccccc1O
5      Formamide            C(=O)N           NC=O             NC=O
6           Urea         C(=O)(N)N        NC(N)=O          NC(N)=O
7         Phenol       c1ccc(cc1)O      Oc1ccccc1        Oc1ccccc1

Tautomerize Smiles

examples/chem_utils/tautomerize_smiles.py
"""Example for Tautomerizing SMILES strings"""
import pandas as pd
from workbench.utils.chem_utils import tautomerize_smiles

# Each row: (id, input SMILES, expected SMILES after tautomerization)
FIELDS = ("id", "smiles", "expected")
ROWS = [
    # Hydroxybenzaldehydes: the first input is expected back unchanged; the second is
    # expected rewritten with the aldehyde group written first.
    # NOTE(review): these SMILES place the hydroxyl meta (first row) and para (second row)
    # to the aldehyde, while both ids name the ortho isomer -- confirm the labels.
    ("Salicylaldehyde (Keto)", "O=Cc1cccc(O)c1", "O=Cc1cccc(O)c1"),
    ("2-Hydroxybenzaldehyde (Enol)", "Oc1ccc(C=O)cc1", "O=Cc1ccc(O)cc1"),
    # Acetylacetone: the expected value keeps both carbonyls (diketo form), not an enol.
    ("Acetylacetone", "CC(=O)CC(=O)C", "CC(=O)CC(C)=O"),
    # Imidazole: the ring N-H can sit on either nitrogen; a single fixed form is expected.
    ("Imidazole", "c1cnc[nH]1", "c1c[nH]cn1"),
    # Pyridone: the expected value keeps the carbonyl (lactam form), written as an aromatic ring.
    # NOTE(review): both SMILES describe a seven-membered ring, whereas pyridone is
    # six-membered -- confirm the input.
    ("Pyridone", "C1=CC=NC(=O)C=C1", "O=c1cccccn1"),
    # Guanidine: amine-imine shift; the expected value moves the double bonds relative to the input.
    ("Guanidine", "C(=N)N=C(N)N", "N=C(N)N=CN"),
    # The remaining molecules are expected back unchanged apart from SMILES atom ordering.
    ("Catechol", "c1cc(c(cc1)O)O", "Oc1ccccc1O"),
    ("Formamide", "C(=O)N", "NC=O"),
    ("Urea", "C(=O)(N)N", "NC(N)=O"),
    ("Phenol", "c1ccc(cc1)O", "Oc1ccccc1"),
]

# One record (dict) per molecule, keyed by FIELDS
test_data = [dict(zip(FIELDS, row)) for row in ROWS]

# Build the input DataFrame, run the tautomerization, and show the result
df = pd.DataFrame(test_data)
result_df = tautomerize_smiles(df)
print(result_df)

Output

                             id       smiles_orig        expected smiles_canonical          smiles
0        Salicylaldehyde (Keto)    O=Cc1cccc(O)c1  O=Cc1cccc(O)c1   O=Cc1cccc(O)c1  O=Cc1cccc(O)c1
1  2-Hydroxybenzaldehyde (Enol)    Oc1ccc(C=O)cc1  O=Cc1ccc(O)cc1   O=Cc1ccc(O)cc1  O=Cc1ccc(O)cc1
2                 Acetylacetone     CC(=O)CC(=O)C   CC(=O)CC(C)=O    CC(=O)CC(C)=O   CC(=O)CC(C)=O
3                     Imidazole        c1cnc[nH]1      c1c[nH]cn1       c1c[nH]cn1      c1c[nH]cn1
4                      Pyridone  C1=CC=NC(=O)C=C1     O=c1cccccn1      O=c1cccccn1     O=c1cccccn1
5                     Guanidine      C(=N)N=C(N)N      N=C(N)N=CN       N=CN=C(N)N      N=C(N)N=CN
6                      Catechol    c1cc(c(cc1)O)O      Oc1ccccc1O       Oc1ccccc1O      Oc1ccccc1O
7                     Formamide            C(=O)N            NC=O             NC=O            NC=O
8                          Urea         C(=O)(N)N         NC(N)=O          NC(N)=O         NC(N)=O
9                        Phenol       c1ccc(cc1)O       Oc1ccccc1        Oc1ccccc1       Oc1ccccc1

Additional Resources