Skip to content

Chemical Utilities

Endpoint Examples

Examples of using the Endpoint class are listed at the bottom of this page; see Examples.

The majority of the chemical utilities in Workbench use either RDKit or Mordred (Community). Including these utilities allows this functionality to be used and deployed in AWS (FeatureSets, Models, Endpoints).

Chem/RDKIT/Mordred utilities for Workbench

add_compound_tags(df, mol_column='molecule')

Adds 'tags' and 'meta' columns to a DataFrame, tagging compounds based on their properties.

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame containing molecular data.

required
mol_column str

Column name containing RDKit molecule objects.

'molecule'

Returns:

Type Description

pd.DataFrame: Updated DataFrame with a 'tags' column.

Source code in src/workbench/utils/chem_utils.py
def add_compound_tags(df, mol_column="molecule"):
    """
    Adds 'tags' and 'meta' columns to a DataFrame, tagging compounds based on their properties.

    Every row receives a list of tags (any of "salt", "frag", "heavy_metals",
    "toxic_element", "toxic_group", "metalloenzyme", "druglike") and a dict that
    stores the results of toxic_elements() / toxic_groups() under the
    "toxic_elements" / "toxic_groups" keys whenever those checks return something.

    Args:
        df (pd.DataFrame): Input DataFrame containing molecular data (modified in place).
        mol_column (str): Column name containing RDKit molecule objects.

    Returns:
        pd.DataFrame: The same DataFrame with 'tags' and 'meta' columns added.
    """
    # Collect results positionally and assign whole columns at the end. Writing
    # cell-by-cell with df.at[idx, ...] misbehaves when the index has duplicate labels.
    all_tags = []
    all_meta = []

    for mol in df[mol_column]:
        tags = []
        meta = {}

        # Check for salts
        if contains_salts(mol):
            tags.append("salt")

        # Check for fragments (should be done after salt check)
        # Only the fragment count is needed, so skip building fragment molecules
        if len(Chem.GetMolFrags(mol)) > 1:
            tags.append("frag")

        # Check for heavy metals
        if contains_heavy_metals(mol):
            tags.append("heavy_metals")

        # Check for toxic elements
        te = toxic_elements(mol)
        if te:
            tags.append("toxic_element")
            meta["toxic_elements"] = te

        # Check for toxic groups
        tg = toxic_groups(mol)
        if tg:
            tags.append("toxic_group")
            meta["toxic_groups"] = tg

        # Check for metalloenzyme-relevant metals
        if contains_metalloenzyme_relevant_metals(mol):
            tags.append("metalloenzyme")

        # Check for drug-likeness
        if is_druglike_compound(mol):
            tags.append("druglike")

        all_tags.append(tags)
        all_meta.append(meta)

    df["tags"] = all_tags
    df["meta"] = all_meta
    return df

canonicalize(df, remove_mol_col=True)

Generate RDKit's canonical SMILES for each molecule in the input DataFrame.

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame containing a column named 'SMILES' (case-insensitive).

required
remove_mol_col bool

Whether to drop the intermediate 'molecule' column. Default is True.

True

Returns:

Type Description
DataFrame

pd.DataFrame: A DataFrame with an additional 'smiles_canonical' column and, optionally, the 'molecule' column.

Source code in src/workbench/utils/chem_utils.py
def canonicalize(df: pd.DataFrame, remove_mol_col: bool = True) -> pd.DataFrame:
    """
    Generate RDKit's canonical SMILES for each molecule in the input DataFrame.

    The DataFrame is modified in place: rows whose SMILES cannot be parsed are
    logged and dropped, and only the largest fragment of each molecule is kept
    before the canonical SMILES is written.

    Args:
        df (pd.DataFrame): Input DataFrame containing a column named 'SMILES' (case-insensitive).
        remove_mol_col (bool): Whether to drop the intermediate 'molecule' column. Default is True.

    Returns:
        pd.DataFrame: A DataFrame with an additional 'smiles_canonical' column and,
                      optionally, the 'molecule' column.

    Raises:
        ValueError: If no column named 'smiles' (any capitalization) is present.
    """
    # Locate the SMILES column regardless of capitalization
    smiles_column = None
    for name in df.columns:
        if name.lower() == "smiles":
            smiles_column = name
            break
    if smiles_column is None:
        raise ValueError("Input DataFrame must have a 'SMILES' column")

    def _largest_fragment(mol):
        return remove_disconnected_fragments(mol) if mol else None

    def _to_canonical_smiles(mol):
        # isomericSmiles=True keeps the isomeric information in the output
        return Chem.MolToSmiles(mol, isomericSmiles=True) if mol else None

    # Parse every SMILES string into an RDKit molecule
    df["molecule"] = df[smiles_column].apply(Chem.MolFromSmiles)

    # Report, then discard, the rows that failed to parse
    unparsed = df["molecule"].isna()
    invalid_indices = df[unparsed].index
    if not invalid_indices.empty:
        log.critical(f"Invalid SMILES strings at indices: {invalid_indices.tolist()}")
    df.dropna(subset=["molecule"], inplace=True)

    # Keep the largest fragment, then write its canonical SMILES
    df["molecule"] = df["molecule"].apply(_largest_fragment)
    df["smiles_canonical"] = df["molecule"].apply(_to_canonical_smiles)

    # The molecule column is only an intermediate unless the caller asked to keep it
    if remove_mol_col:
        df.drop(columns=["molecule"], inplace=True)

    return df

compute_molecular_descriptors(df)

Compute and add all the Molecular Descriptors

Parameters:

Name Type Description Default
df(pd.DataFrame)

The DataFrame to process and generate RDKit/Mordred Descriptors

required

Returns:

Type Description
DataFrame

pd.DataFrame: The input DataFrame with all the RDKit Descriptors added

Source code in src/workbench/utils/chem_utils.py
def compute_molecular_descriptors(df: pd.DataFrame) -> pd.DataFrame:
    """Compute and add all the Molecular Descriptors

    Rows whose SMILES cannot be converted to a molecule are logged and dropped.
    Descriptors are computed on the largest fragment of each molecule. The
    caller's DataFrame is not modified; a new DataFrame is returned.

    Args:
        df(pd.DataFrame): The DataFrame to process and generate RDKit/Mordred Descriptors

    Returns:
        pd.DataFrame: The input columns (lowercased) followed by the RDKit and Mordred
            descriptor columns, indexed like the surviving input rows.

    Raises:
        ValueError: If no column named 'smiles' (any capitalization) is present.
    """
    # Check for the smiles column (any capitalization)
    smiles_column = next((col for col in df.columns if col.lower() == "smiles"), None)
    if smiles_column is None:
        raise ValueError("Input DataFrame must have a 'smiles' column")

    # Compute/add all the Molecular Descriptors
    log.info("Computing Molecular Descriptors...")

    # Convert SMILES to RDKit molecule objects when the caller didn't supply them
    delete_mol_column = "molecule" not in df.columns
    if delete_mol_column:
        log.info("Converting SMILES to RDKit Molecules...")
        # assign() builds a new frame, so the caller's DataFrame does not
        # silently gain an intermediate 'molecule' column
        df = df.assign(molecule=df[smiles_column].apply(Chem.MolFromSmiles))

    # Make sure our molecules are not None
    failed_smiles = df[df["molecule"].isnull()][smiles_column].tolist()
    if failed_smiles:
        log.error(f"Failed to convert the following SMILES to molecules: {failed_smiles}")
    df = df.dropna(subset=["molecule"])

    # The descriptor frames below are built row-by-row (positionally). combine_first
    # aligns on index labels, so do the merge on a positional index and put the
    # caller's index back afterwards. Otherwise dropped rows or a non-default index
    # would attach descriptors to the wrong rows.
    original_index = df.index
    df = df.reset_index(drop=True)

    # If we have fragments in our compounds, get the largest fragment before computing descriptors
    largest_frags = df["molecule"].apply(remove_disconnected_fragments)

    # Now get all the RDKIT Descriptors
    all_descriptors = [x[0] for x in Descriptors._descList]

    # There's an overflow issue that happens with the IPC descriptor, so we'll remove it
    # See: https://github.com/rdkit/rdkit/issues/1527
    if "Ipc" in all_descriptors:
        all_descriptors.remove("Ipc")

    # Make sure we don't have duplicates (dict.fromkeys keeps a deterministic order)
    all_descriptors = list(dict.fromkeys(all_descriptors))

    # RDKit Molecular Descriptor Calculator Class
    log.info("Computing RDKit Descriptors...")
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(all_descriptors)
    column_names = calc.GetDescriptorNames()
    descriptor_values = [calc.CalcDescriptors(m) for m in largest_frags]
    rdkit_features_df = pd.DataFrame(descriptor_values, columns=column_names, index=df.index)

    # Now compute Mordred Features
    log.info("Computing Mordred Descriptors...")
    descriptor_choice = [AcidBase, Aromatic, Polarizability, RotatableBond]
    calc = Calculator()
    for des in descriptor_choice:
        calc.register(des)
    mordred_df = calc.pandas(largest_frags, nproc=1)
    mordred_df.index = df.index  # one row per molecule, same order as df

    # Combine the DataFrame with the RDKit and Mordred Descriptors added
    # Note: This will overwrite any existing columns with the same name. This is a good thing
    #       since we want computed descriptors to overwrite anything in the input dataframe
    output_df = mordred_df.combine_first(rdkit_features_df).combine_first(df)

    # Lowercase all column names and ensure no duplicate column names
    output_df.columns = output_df.columns.str.lower()
    output_df = output_df.loc[:, ~output_df.columns.duplicated()]

    # Reorder the columns to have all the ones in the input df first and then the descriptors
    input_columns = df.columns.str.lower()
    output_df = output_df[list(input_columns) + [col for col in output_df.columns if col not in input_columns]]

    # Drop the intermediate 'molecule' column if it was added
    if delete_mol_column:
        del output_df["molecule"]

    # Restore the index labels of the rows that survived the molecule check
    output_df.index = original_index

    # Return the DataFrame with the RDKit and Mordred Descriptors added
    return output_df

compute_morgan_fingerprints(df, radius=2, n_bits=2048, counts=True)

Compute and add Morgan fingerprints to the DataFrame.

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame containing SMILES strings.

required
radius int

Radius for the Morgan fingerprint.

2
n_bits int

Number of bits for the fingerprint.

2048
counts bool

Count simulation for the fingerprint.

True

Returns:

Type Description
DataFrame

pd.DataFrame: The input DataFrame with the Morgan fingerprints added as bit strings.

Note

See: https://greglandrum.github.io/rdkit-blog/posts/2021-07-06-simulating-counts.html

Source code in src/workbench/utils/chem_utils.py
def compute_morgan_fingerprints(df: pd.DataFrame, radius=2, n_bits=2048, counts=True) -> pd.DataFrame:
    """Compute and add Morgan fingerprints to the DataFrame.

    The DataFrame is modified in place. Rows whose molecule is None (for example
    because the SMILES string could not be parsed) get pd.NA as their fingerprint
    instead of aborting the whole computation.

    Args:
        df (pd.DataFrame): Input DataFrame containing SMILES strings.
        radius (int): Radius for the Morgan fingerprint.
        n_bits (int): Number of bits for the fingerprint (multiplied by 4 when counts is True).
        counts (bool): Count simulation for the fingerprint.

    Returns:
        pd.DataFrame: The input DataFrame with the Morgan fingerprints added as bit strings
            in a 'morgan_fingerprint' column.

    Raises:
        ValueError: If no column named 'smiles' (any capitalization) is present.

    Note:
        See: https://greglandrum.github.io/rdkit-blog/posts/2021-07-06-simulating-counts.html
    """
    delete_mol_column = False

    # Check for the SMILES column (case-insensitive)
    smiles_column = next((col for col in df.columns if col.lower() == "smiles"), None)
    if smiles_column is None:
        raise ValueError("Input DataFrame must have a 'smiles' column")

    # Sanity check the molecule column (sometimes it gets serialized, which doesn't work)
    if "molecule" in df.columns and df["molecule"].dtype == "string":
        log.warning("Detected serialized molecules in 'molecule' column. Removing...")
        del df["molecule"]

    # Convert SMILES to RDKit molecule objects (vectorized)
    if "molecule" not in df.columns:
        log.info("Converting SMILES to RDKit Molecules...")
        delete_mol_column = True
        df["molecule"] = df[smiles_column].apply(Chem.MolFromSmiles)

    # Report rows without a molecule; they are carried through as NA fingerprints
    missing_count = int(df["molecule"].isna().sum())
    if missing_count:
        log.warning(f"{missing_count} rows have no valid molecule; their fingerprints will be NA")

    def _largest_fragment(mol):
        # A None molecule must not reach the fragment helper: it has no fragments to split
        return None if mol is None else remove_disconnected_fragments(mol)

    # If we have fragments in our compounds, get the largest fragment before computing fingerprints
    largest_frags = df["molecule"].apply(_largest_fragment)

    # Create a Morgan fingerprint generator
    if counts:
        n_bits *= 4  # Multiply by 4 to simulate counts
    morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits, countSimulation=counts)

    # Compute Morgan fingerprints (vectorized)
    fingerprints = largest_frags.apply(
        lambda mol: (morgan_generator.GetFingerprint(mol).ToBitString() if mol is not None else pd.NA)
    )

    # Add the fingerprints to the DataFrame
    df["morgan_fingerprint"] = fingerprints

    # Drop the intermediate 'molecule' column if it was added
    if delete_mol_column:
        del df["molecule"]
    return df

contains_heavy_metals(mol)

Check if a molecule contains any heavy metals (broad filter).

Parameters:

Name Type Description Default
mol

RDKit molecule object.

required

Returns:

Name Type Description
bool

True if any heavy metals are detected, False otherwise.

Source code in src/workbench/utils/chem_utils.py
def contains_heavy_metals(mol):
    """
    Check if a molecule contains any heavy metals (broad filter).

    The filter flags a molecule as soon as one of its atoms has a symbol in
    the set Zn, Cu, Fe, Mn, Co, Pb, Hg, Cd, As.

    Args:
        mol: RDKit molecule object.

    Returns:
        bool: True if any heavy metals are detected, False otherwise.
    """
    flagged_symbols = {"Zn", "Cu", "Fe", "Mn", "Co", "Pb", "Hg", "Cd", "As"}
    symbols_present = {atom.GetSymbol() for atom in mol.GetAtoms()}
    # Any overlap between the two sets means a flagged element is present
    return not flagged_symbols.isdisjoint(symbols_present)

contains_metalloenzyme_relevant_metals(mol)

Check if a molecule contains metals relevant to metalloenzymes.

Parameters:

Name Type Description Default
mol

RDKit molecule object.

required

Returns:

Name Type Description
bool

True if metalloenzyme-relevant metals are detected, False otherwise.

Source code in src/workbench/utils/chem_utils.py
def contains_metalloenzyme_relevant_metals(mol):
    """
    Check if a molecule contains metals relevant to metalloenzymes.

    A molecule qualifies when at least one atom has a symbol in the set
    Zn, Cu, Fe, Mn, Co.

    Args:
        mol: RDKit molecule object.

    Returns:
        bool: True if metalloenzyme-relevant metals are detected, False otherwise.
    """
    relevant_symbols = {"Zn", "Cu", "Fe", "Mn", "Co"}
    symbols_present = {atom.GetSymbol() for atom in mol.GetAtoms()}
    # Any overlap between the two sets means a relevant metal is present
    return not relevant_symbols.isdisjoint(symbols_present)

contains_salts(mol)

Check if a molecule contains common salts or counterions.

Parameters:

Name Type Description Default
mol

RDKit molecule object.

required

Returns:

Name Type Description
bool

True if salts are detected, False otherwise.

Source code in src/workbench/utils/chem_utils.py
def contains_salts(mol):
    """
    Check if a molecule contains common salts or counterions.

    The molecule is tested against SMARTS patterns for sodium, potassium,
    chloride, magnesium, calcium, ammonium and sulfate ions.

    Args:
        mol: RDKit molecule object.

    Returns:
        bool: True if salts are detected, False otherwise.
    """
    # Define common inorganic salt fragments (SMARTS patterns).
    # A bracket expression describes exactly one atom, so the polyatomic sulfate
    # ion has to be written as a full substructure rather than "[SO4--]".
    salt_patterns = ["[Na+]", "[K+]", "[Cl-]", "[Mg+2]", "[Ca+2]", "[NH4+]", "[O-]S(=O)(=O)[O-]"]
    for pattern in salt_patterns:
        query = Chem.MolFromSmarts(pattern)
        # Guard against a pattern that failed to parse instead of passing
        # a missing query on to the substructure search
        if query is not None and mol.HasSubstructMatch(query):
            return True
    return False

custom_tautomer_canonicalization(mol)

Domain-specific processing of a molecule to select the canonical tautomer.

This function enumerates all possible tautomers for a given molecule and applies custom logic to select the canonical form.

Parameters:

Name Type Description Default
mol Mol

The RDKit molecule for which the canonical tautomer is to be determined.

required

Returns:

Name Type Description
str str

The SMILES string of the selected canonical tautomer.

Source code in src/workbench/utils/chem_utils.py
def custom_tautomer_canonicalization(mol: Mol) -> str:
    """Domain-specific processing of a molecule to select the canonical tautomer.

    Every tautomer produced by RDKit's TautomerEnumerator is scored, and the
    first tautomer with the highest score is returned as SMILES. The score
    rewards aromatic rings (+2 each) and penalizes hydrogen bond donors
    (-1 each) and exact molecular weight (-0.01 per unit).

    Args:
        mol (Mol): The RDKit molecule for which the canonical tautomer is to be determined.

    Returns:
        str: The SMILES string of the selected canonical tautomer.
    """

    def _score(candidate):
        # Example scoring: balance HBD, MW, and aromaticity
        donors = CalcNumHBD(candidate)
        weight = CalcExactMolWt(candidate)
        aromatic_ring_count = candidate.GetRingInfo().NumAromaticRings()
        return -donors - 0.01 * weight + aromatic_ring_count * 2

    best_tautomer = None
    best_score = float("-inf")
    for candidate in TautomerEnumerator().Enumerate(mol):
        candidate_score = _score(candidate)
        # Strict comparison: on ties the earliest enumerated tautomer wins
        if candidate_score > best_score:
            best_tautomer, best_score = candidate, candidate_score

    return Chem.MolToSmiles(best_tautomer)

geometric_mean(series)

Computes the geometric mean manually to avoid using scipy.

Source code in src/workbench/utils/chem_utils.py
def geometric_mean(series: pd.Series) -> float:
    """Geometric mean computed as exp(mean(log(x))), avoiding a scipy dependency."""
    mean_of_logs = np.log(series).mean()
    return np.exp(mean_of_logs)

halogen_toxicity_score(mol)

Calculate the halogen count and toxicity threshold for a molecule.

Parameters:

Name Type Description Default
mol Mol

RDKit molecule object.

required

Returns:

Type Description
(int, int)

Tuple[int, int]: (halogen_count, halogen_threshold), where the threshold

(int, int)

scales with molecule size (the larger of 2 and 20% of the atom count).

Source code in src/workbench/utils/chem_utils.py
def halogen_toxicity_score(mol: Mol) -> (int, int):
    """
    Calculate the halogen count and toxicity threshold for a molecule.

    Args:
        mol: RDKit molecule object.

    Returns:
        Tuple[int, int]: (halogen_count, halogen_threshold), where the threshold
        is 20% of the atom count (truncated to an int) but never less than 2.
    """
    halogen_symbols = ("F", "Cl", "Br", "I")

    # Tally the atoms whose element is a halogen
    halogen_count = 0
    for atom in mol.GetAtoms():
        if atom.GetSymbol() in halogen_symbols:
            halogen_count += 1

    # The threshold grows with molecule size, with a floor of 2
    size_scaled_limit = int(mol.GetNumAtoms() * 0.2)
    halogen_threshold = size_scaled_limit if size_scaled_limit > 2 else 2

    return halogen_count, halogen_threshold

img_from_smiles(smiles, width=500, height=500, background='rgba(64, 64, 64, 1)')

Generate an image of the molecule represented by the given SMILES string.

Parameters:

Name Type Description Default
smiles str

A SMILES string representing the molecule.

required
width int

Width of the image in pixels. Default is 500.

500
height int

Height of the image in pixels. Default is 500.

500
background str

Background color of the image. Default is dark grey

'rgba(64, 64, 64, 1)'

Returns:

Name Type Description
str Optional[str]

PIL image of the molecule or None if the SMILES string is invalid.

Source code in src/workbench/utils/chem_utils.py
def img_from_smiles(
    smiles: str, width: int = 500, height: int = 500, background: str = "rgba(64, 64, 64, 1)"
) -> Optional[str]:
    """
    Generate an image of the molecule represented by the given SMILES string.

    Args:
        smiles (str): A SMILES string representing the molecule.
        width (int): Width of the image in pixels. Default is 500.
        height (int): Height of the image in pixels. Default is 500.
        background (str): Background color of the image. Default is dark grey

    Returns:
        The image produced by Draw.MolToImage (a PIL image), or None if the
        SMILES string is invalid. A warning is logged for an invalid SMILES.
    """
    # Drawing options: dark mode for dark backgrounds, then the background colour itself
    draw_options = Draw.MolDrawOptions()
    if is_dark(background):
        rdMolDraw2D.SetDarkMode(draw_options)
    draw_options.setBackgroundColour(rgba_to_tuple(background))

    # Parse the SMILES; bail out early when it is not a valid molecule
    molecule = Chem.MolFromSmiles(smiles)
    if not molecule:
        log.warning(f"Invalid SMILES: {smiles}")
        return None

    return Draw.MolToImage(molecule, options=draw_options, size=(width, height))

is_druglike_compound(mol)

Filter for drug-likeness and QSAR relevance based on Lipinski's Rule of Five. Returns False for molecules unlikely to be orally bioavailable.

Parameters:

Name Type Description Default
mol Mol

RDKit molecule object.

required

Returns:

Name Type Description
bool bool

True if the molecule is drug-like, False otherwise.

Source code in src/workbench/utils/chem_utils.py
def is_druglike_compound(mol: Mol) -> bool:
    """
    Filter for drug-likeness and QSAR relevance based on Lipinski's Rule of Five.
    Returns False for molecules unlikely to be orally bioavailable.

    Molecules without any ring must additionally satisfy a stricter set of
    limits (MW <= 300, logP <= 3, H-bond donors <= 3, H-bond acceptors <= 3).

    Args:
        mol: RDKit molecule object.

    Returns:
        bool: True if the molecule is drug-like, False otherwise.
    """
    mw = Descriptors.MolWt(mol)
    logp = Descriptors.MolLogP(mol)
    hbd = Descriptors.NumHDonors(mol)
    hba = Descriptors.NumHAcceptors(mol)

    # Lipinski's Rule of Five: any single violation rejects the molecule
    ro5_violations = (mw > 500, logp > 5, hbd > 5, hba > 10)
    if any(ro5_violations):
        return False

    # Ring-free (linear) molecules only pass when they meet the stricter limits
    is_acyclic = mol.GetRingInfo().NumRings() == 0
    meets_strict_limits = mw <= 300 and logp <= 3 and hbd <= 3 and hba <= 3
    if is_acyclic and not meets_strict_limits:
        return False

    return True

log_to_category(log_series)

Convert a pandas Series of log values to concentration categories.

Parameters: log_series (pd.Series): Series of logarithmic values (log10).

Returns: pd.Series: Series of concentration categories.

Source code in src/workbench/utils/chem_utils.py
def log_to_category(log_series: pd.Series) -> pd.Series:
    """
    Convert a pandas Series of log values to concentration categories.

    Values up to and including -5 are "low", values above -5 up to and
    including -4 are "medium", and anything above -4 is "high".

    Parameters:
    log_series (pd.Series): Series of logarithmic values (log10).

    Returns:
    pd.Series: Series of concentration categories.
    """
    # Upper (inclusive) edge of each category, in ascending order
    upper_edges = {"low": -5, "medium": -4, "high": float("inf")}
    bin_edges = [-float("inf"), *upper_edges.values()]
    return pd.cut(log_series, bins=bin_edges, labels=list(upper_edges))

log_to_micromolar(log_series)

Convert a pandas Series of logarithmic values (log10) back to concentrations in µM (micromolar).

Parameters: log_series (pd.Series): Series of logarithmic values (log10).

Returns: pd.Series: Series of concentrations in micromolar.

Source code in src/workbench/utils/chem_utils.py
def log_to_micromolar(log_series: pd.Series) -> pd.Series:
    """
    Convert a pandas Series of logarithmic values (log10) back to concentrations in µM (micromolar).

    Parameters:
    log_series (pd.Series): Series of logarithmic values (log10).

    Returns:
    pd.Series: Series of concentrations in micromolar.
    """
    # Undo the log10 to get mol/L, then scale by 1e6 to express the value in µM
    molar = 10**log_series
    return molar * 1e6

micromolar_to_log(series_μM)

Convert a pandas Series of concentrations in µM (micromolar) to their logarithmic values (log10).

Parameters: series_µM (pd.Series): Series of concentrations in micromolar.

Returns: pd.Series: Series of logarithmic values (log10).

Source code in src/workbench/utils/chem_utils.py
def micromolar_to_log(series_µM: pd.Series) -> pd.Series:
    """
    Convert a pandas Series of concentrations in µM (micromolar) to their logarithmic values (log10).

    Values below 1e-9 (including zero and negatives) are raised to 1e-9 first
    so the logarithm is always defined.

    Parameters:
    series_µM (pd.Series): Series of concentrations in micromolar.

    Returns:
    pd.Series: Series of logarithmic values (log10) of the concentration in mol/L.
    """
    # Floor at 1e-9 (alignment with another project), then scale µM to mol/L
    floored = series_µM.clip(lower=1e-9)
    molar = floored * 1e-6
    return np.log10(molar)

project_fingerprints(df, projection='UMAP')

Project fingerprints onto a 2D plane using dimensionality reduction techniques.

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame containing fingerprint data.

required
projection str

Dimensionality reduction technique to use (TSNE or UMAP).

'UMAP'

Returns:

Type Description
DataFrame

pd.DataFrame: The input DataFrame with the projected coordinates added as 'x' and 'y' columns.

Source code in src/workbench/utils/chem_utils.py
def project_fingerprints(df: pd.DataFrame, projection: str = "UMAP") -> pd.DataFrame:
    """Project fingerprints onto a 2D plane using dimensionality reduction techniques.

    The DataFrame is modified in place: a 'fingerprint_bits' column holding the
    fingerprint as a boolean array is added alongside the 'x' and 'y' coordinates.
    A small random jitter is added to both coordinates.

    Args:
        df (pd.DataFrame): Input DataFrame containing fingerprint data.
        projection (str): Dimensionality reduction technique to use (TSNE or UMAP).

    Returns:
        pd.DataFrame: The input DataFrame with the projected coordinates added as 'x' and 'y' columns.

    Raises:
        ValueError: If no column whose name contains 'fingerprint' is present.
    """
    # Use the first column whose name mentions "fingerprint" (any capitalization)
    fingerprint_column = None
    for name in df.columns:
        if "fingerprint" in name.lower():
            fingerprint_column = name
            break
    if fingerprint_column is None:
        raise ValueError("Input DataFrame must have a fingerprint column")

    def _bitstring_to_array(fp):
        # One boolean per character of the bit string
        return np.array([int(bit) for bit in fp], dtype=np.bool_)

    # Expand every bit string and stack the rows into a single matrix
    df["fingerprint_bits"] = df[fingerprint_column].apply(_bitstring_to_array)
    X = np.vstack(df["fingerprint_bits"].values)

    # Fall back to TSNE when UMAP was requested but is not installed
    if projection == "UMAP" and umap is None:
        log.warning("UMAP is not available. Using TSNE instead.")
        projection = "TSNE"

    # Run the projection
    if projection == "TSNE":
        embedding = TSNE(n_components=2, perplexity=30, random_state=42).fit_transform(X)
    else:
        embedding = umap.UMAP(metric="jaccard").fit_transform(X)

    # Store the coordinates; vertices that disconnect from the manifold
    # come back as NaN, so those are replaced with 0
    for axis, name in enumerate(("x", "y")):
        df[name] = embedding[:, axis]
        df[name] = df[name].fillna(0)

    # Jitter
    jitter_scale = 0.1
    row_count = len(df)
    for name in ("x", "y"):
        df[name] += np.random.uniform(0, jitter_scale, row_count)

    return df

remove_disconnected_fragments(mol)

Remove disconnected fragments from a molecule, keeping the fragment with the most heavy atoms.

Parameters:

Name Type Description Default
mol Mol

RDKit molecule object.

required

Returns:

Name Type Description
Mol Mol

The fragment with the most heavy atoms, or None if no such fragment exists.

Source code in src/workbench/utils/chem_utils.py
def remove_disconnected_fragments(mol: Chem.Mol) -> Chem.Mol:
    """
    Remove disconnected fragments from a molecule, keeping the fragment with the most heavy atoms.

    Args:
        mol (Mol): RDKit molecule object. None is accepted and passed through, so
            this can be applied directly to a column that contains failed parses.

    Returns:
        Mol: The fragment with the most heavy atoms, or None if no such fragment exists.
    """
    # A missing molecule has no fragments; return None instead of failing in RDKit
    if mol is None:
        return None

    fragments = Chem.GetMolFrags(mol, asMols=True)
    return max(fragments, key=lambda frag: frag.GetNumHeavyAtoms()) if fragments else None

rollup_experimental_data(df, id, time, target, use_gmean=False)

Rolls up a dataset by selecting the largest time per unique ID and averaging the target value if multiple records exist at that time. Supports both arithmetic and geometric mean.

Parameters:

Name Type Description Default
df DataFrame

Input dataframe.

required
id str

Column representing the unique molecule ID.

required
time str

Column representing the time.

required
target str

Column representing the target value.

required
use_gmean bool

Whether to use the geometric mean instead of the arithmetic mean.

False

Returns:

Type Description
DataFrame

pd.DataFrame: Rolled-up dataframe with all original columns retained.

Source code in src/workbench/utils/chem_utils.py
def rollup_experimental_data(
    df: pd.DataFrame, id: str, time: str, target: str, use_gmean: bool = False
) -> pd.DataFrame:
    """
    Rolls up a dataset by selecting the largest time per unique ID and averaging the target value
    if multiple records exist at that time. Supports both arithmetic and geometric mean.

    For every other column, the first value within each (id, time) group is kept.

    Parameters:
        df (pd.DataFrame): Input dataframe.
        id (str): Column representing the unique molecule ID.
        time (str): Column representing the time.
        target (str): Column representing the target value.
        use_gmean (bool): Whether to use the geometric mean instead of the arithmetic mean.

    Returns:
        pd.DataFrame: Rolled-up dataframe with all original columns retained.
    """
    # Keep only the rows recorded at each ID's latest time
    latest_time = df.groupby(id)[time].transform("max")
    latest_rows = df[df[time] == latest_time]

    # Pick the averaging function
    if use_gmean:
        averager = geometric_mean
    else:
        averager = np.mean

    def _rollup_target(values):
        # A lone measurement is passed through untouched
        if len(values) > 1:
            return averager(values)
        return values.iloc[0]

    # Pass-through columns take their first value; the target gets averaged
    key_columns = (target, id, time)
    aggregations = {}
    for col in df.columns:
        if col not in key_columns:
            aggregations[col] = "first"
    aggregations[target] = _rollup_target

    return latest_rows.groupby([id, time]).agg(aggregations).reset_index()

show(smiles, width=500, height=500)

Displays an image of the molecule represented by the given SMILES string.

Parameters:

Name Type Description Default
smiles str

A SMILES string representing the molecule.

required
width int

Width of the image in pixels. Default is 500.

500
height int

Height of the image in pixels. Default is 500.

500

Returns: None

Source code in src/workbench/utils/chem_utils.py
def show(smiles: str, width: int = 500, height: int = 500) -> None:
    """
    Displays an image of the molecule represented by the given SMILES string.

    Nothing is displayed when img_from_smiles() yields no image.

    Args:
        smiles (str): A SMILES string representing the molecule.
        width (int): Width of the image in pixels. Default is 500.
        height (int): Height of the image in pixels. Default is 500.

    Returns:
    None
    """
    image = img_from_smiles(smiles, width, height)
    if not image:
        return
    image.show()

standard_tautomer_canonicalization(mol)

Standard processing of a molecule to select the canonical tautomer.

RDKit's TautomerEnumerator uses heuristics to select a canonical tautomer, such as preferring keto over enol forms and minimizing formal charges.

Parameters:

Name Type Description Default
mol Mol

The RDKit molecule for which the canonical tautomer is to be determined.

required

Returns:

Name Type Description
str str

The SMILES string of the canonical tautomer.

Source code in src/workbench/utils/chem_utils.py
def standard_tautomer_canonicalization(mol: Mol) -> str:
    """Standard processing of a molecule to select the canonical tautomer.

    The choice of tautomer is delegated entirely to RDKit's
    `TautomerEnumerator.Canonicalize`, which uses heuristics such as
    preferring keto over enol forms and minimizing formal charges.

    Args:
        mol (Mol): The RDKit molecule for which the canonical tautomer is to be determined.

    Returns:
        str: The SMILES string of the canonical tautomer.
    """
    return Chem.MolToSmiles(TautomerEnumerator().Canonicalize(mol))

svg_from_smiles(smiles, width=500, height=500, background='rgba(64, 64, 64, 1)')

Generate an SVG image of the molecule represented by the given SMILES string.

Parameters:

Name Type Description Default
smiles str

A SMILES string representing the molecule.

required
width int

Width of the image in pixels. Default is 500.

500
height int

Height of the image in pixels. Default is 500.

500
background str

Background color of the image. Default is dark grey.

'rgba(64, 64, 64, 1)'

Returns:

Type Description
Optional[str]

Optional[str]: Encoded SVG string of the molecule or None if the SMILES string is invalid.

Source code in src/workbench/utils/chem_utils.py
def svg_from_smiles(
    smiles: str, width: int = 500, height: int = 500, background: str = "rgba(64, 64, 64, 1)"
) -> Optional[str]:
    """
    Render the molecule described by a SMILES string as a base64-encoded SVG data URI.

    Args:
        smiles (str): A SMILES string representing the molecule.
        width (int): Width of the image in pixels. Default is 500.
        height (int): Height of the image in pixels. Default is 500.
        background (str): Background color of the image. Default is dark grey.

    Returns:
        Optional[str]: A "data:image/svg+xml;base64,..." string containing the drawing,
            or None if the SMILES string could not be parsed into a molecule.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if not molecule:
        # Unparseable SMILES: nothing to draw
        return None

    # The drawer needs 2D coordinates for the depiction
    AllChem.Compute2DCoords(molecule)

    canvas = rdMolDraw2D.MolDraw2DSVG(width, height)

    # Dark-mode styling is switched on for dark backgrounds; the background
    # colour itself is always applied afterwards
    draw_opts = canvas.drawOptions()
    if is_dark(background):
        rdMolDraw2D.SetDarkMode(draw_opts)
    draw_opts.setBackgroundColour(rgba_to_tuple(background))

    canvas.DrawMolecule(molecule)
    canvas.FinishDrawing()

    # Base64-encode the SVG text and wrap it in a data URI
    svg_bytes = canvas.GetDrawingText().encode("utf-8")
    encoded_svg = base64.b64encode(svg_bytes).decode("utf-8")
    return f"data:image/svg+xml;base64,{encoded_svg}"

tautomerize_smiles(df)

Perform tautomer enumeration and canonicalization on a DataFrame.

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame containing SMILES strings.

required

Returns:

Type Description
DataFrame

pd.DataFrame: The canonicalized DataFrame, with the original 'smiles' column renamed to 'smiles_orig', a 'smiles_canonical' column, and 'smiles' replaced by the canonical tautomer SMILES.

Source code in src/workbench/utils/chem_utils.py
def tautomerize_smiles(df: pd.DataFrame) -> pd.DataFrame:
    """
    Canonicalize the SMILES in a DataFrame and swap in their canonical tautomers.

    Args:
        df (pd.DataFrame): Input DataFrame containing SMILES strings.

    Returns:
        pd.DataFrame: The DataFrame produced by `canonicalize`, with the incoming
            'smiles' column renamed to 'smiles_orig' and 'smiles' now holding the
            canonical tautomer SMILES (pd.NA where the molecule is missing or
            tautomerization raised an error).
    """
    # canonicalize() is asked to keep its 'molecule' column so the RDKit
    # objects can be reused for the tautomer step below
    df = canonicalize(df, remove_mol_col=False)

    def _tautomer_or_na(molecule):
        """Return the canonical tautomer SMILES, or pd.NA when it cannot be computed."""
        if molecule:
            try:
                # RDKit's standard tautomer canonicalization; a custom
                # canonicalizer could be substituted here
                return standard_tautomer_canonicalization(molecule)
            except Exception as e:
                log.warning(f"Tautomerization failed: {str(e)}")
        return pd.NA

    df["smiles_tautomer"] = df["molecule"].apply(_tautomer_or_na)

    # The RDKit molecule objects were only needed as an intermediate
    df.drop(columns=["molecule"], inplace=True)

    # Keep the incoming SMILES under 'smiles_orig' and expose the tautomer as 'smiles'
    column_swap = {"smiles": "smiles_orig", "smiles_tautomer": "smiles"}
    df.rename(columns=column_swap, inplace=True)

    return df

toxic_elements(mol)

Identifies toxic elements or specific forms of elements in a molecule.

Parameters:

Name Type Description Default
mol Mol

RDKit molecule object.

required

Returns:

Type Description
Optional[List[str]]

Optional[List[str]]: List of toxic elements or specific forms if found, otherwise None.

Notes

Halogen toxicity logic integrates with halogen_toxicity_score and scales thresholds based on molecule size.

Source code in src/workbench/utils/chem_utils.py
def toxic_elements(mol: Mol) -> Optional[List[str]]:
    """
    Identifies toxic elements or specific forms of elements in a molecule.

    Three kinds of findings are reported:
        - The symbol of any atom in the always-toxic set (Pb, Hg, Cd, As, Be, Tl, Sb).
        - "N+" when the molecule has a positively charged nitrogen, unless the
          molecule contains a quaternary ammonium ([N+](C)(C)(C)C) anywhere, in
          which case no "N+" is reported at all.
        - The symbol of each halogen present (Cl, Br, I, F) when
          `halogen_toxicity_score` reports a count above its threshold.

    Args:
        mol: RDKit molecule object.

    Returns:
        Optional[List[str]]: Sorted list of toxic elements or specific forms if found,
            otherwise None.

    Notes:
        Halogen toxicity logic integrates with `halogen_toxicity_score` and scales thresholds
        based on molecule size.
    """
    # Always toxic elements (heavy metals and known toxic single elements)
    always_toxic = {"Pb", "Hg", "Cd", "As", "Be", "Tl", "Sb"}
    halogens = {"Cl", "Br", "I", "F"}
    toxic_found = set()

    # Both checks below look at the whole molecule, not the current atom, so
    # their answer cannot change between atoms. They are evaluated lazily (only
    # if a relevant atom is encountered) and at most once per call.
    has_benign_quaternary = None
    halogen_excess = None

    for atom in mol.GetAtoms():
        symbol = atom.GetSymbol()

        if symbol in always_toxic:
            toxic_found.add(symbol)

        elif symbol == "N":
            # Conditionally toxic nitrogen (positively charged)
            if atom.GetFormalCharge() > 0:
                if has_benign_quaternary is None:
                    # Benign quaternary ammonium (e.g., choline-like structures)
                    has_benign_quaternary = mol.HasSubstructMatch(Chem.MolFromSmarts("[N+](C)(C)(C)C"))
                if not has_benign_quaternary:
                    toxic_found.add("N+")

        elif symbol in halogens:
            # Halogens are only flagged when the molecule is excessively halogenated
            if halogen_excess is None:
                halogen_count, halogen_threshold = halogen_toxicity_score(mol)
                halogen_excess = halogen_count > halogen_threshold
            if halogen_excess:
                toxic_found.add(symbol)

    # Sorted so the result does not depend on set iteration order
    return sorted(toxic_found) if toxic_found else None

toxic_groups(mol)

Check if a molecule contains known toxic functional groups using RDKit's functional groups and SMARTS patterns.

Parameters:

Name Type Description Default
mol Mol

The molecule to evaluate.

required

Returns:

Type Description
Optional[List[str]]

Optional[List[str]]: List of SMARTS patterns for the toxic groups found; None if no toxic group is found or if the molecule matches an exempt (stabilizing) functional group.

Source code in src/workbench/utils/chem_utils.py
def toxic_groups(mol: Chem.Mol) -> Optional[List[str]]:
    """
    Report which known toxic functional groups a molecule contains.

    Matching draws on three sources, in this order: selected entries of RDKit's
    functional group hierarchy, the module's custom toxic SMARTS patterns, and a
    special case for positively charged nitrogen. A molecule that matches any of
    the exempt (stabilizing) patterns is reported as having no toxic groups,
    regardless of what else matched.

    Args:
        mol (rdkit.Chem.Mol): The molecule to evaluate.

    Returns:
        Optional[List[str]]: List of SMARTS patterns for toxic groups if found, otherwise None.
    """
    found = []

    # RDKit functional groups, looked up by their hierarchy label
    for label in ("Nitro", "Azide", "Alcohol", "Aldehyde", "Halogen", "TerminalAlkyne"):
        fg_node = next(node for node in fgroup_hierarchy if node.label == label)
        fg_smarts = fg_node.smarts
        if mol.HasSubstructMatch(Chem.MolFromSmarts(fg_smarts)):
            found.append(fg_smarts)

    # Custom patterns: the SMARTS text is recorded, the precompiled query does the matching
    found.extend(
        smarts
        for smarts, compiled in zip(toxic_smarts_patterns, compiled_toxic_smarts)
        if mol.HasSubstructMatch(compiled)
    )

    # Charged nitrogen is flagged unless the molecule contains a quaternary ammonium
    if mol.HasSubstructMatch(Chem.MolFromSmarts("[N+]")) and not mol.HasSubstructMatch(
        Chem.MolFromSmarts("C[N+](C)(C)C")
    ):
        found.append("[N+]")

    # Any exempt (stabilizing) group overrides everything collected above
    if any(mol.HasSubstructMatch(exempt) for exempt in compiled_exempt_smarts):
        return None

    return found or None

Examples

Canonical Smiles

examples/chem_utils/canonicalize_smiles.py
"""Example: canonicalize SMILES strings with Workbench's chem utilities"""
import pandas as pd
from workbench.utils.chem_utils import canonicalize

# Each case is (id, input SMILES, expected canonical SMILES)
fields = ("id", "smiles", "expected")
cases = [
    ("Acetylacetone", "CC(=O)CC(=O)C", "CC(=O)CC(C)=O"),
    ("Imidazole", "c1cnc[nH]1", "c1c[nH]cn1"),
    ("Pyridone", "C1=CC=NC(=O)C=C1", "O=c1cccccn1"),
    ("Guanidine", "C(=N)N=C(N)N", "N=CN=C(N)N"),
    ("Catechol", "c1cc(c(cc1)O)O", "Oc1ccccc1O"),
    ("Formamide", "C(=O)N", "NC=O"),
    ("Urea", "C(=O)(N)N", "NC(N)=O"),
    ("Phenol", "c1ccc(cc1)O", "Oc1ccccc1"),
]
test_data = [dict(zip(fields, case)) for case in cases]

# One row per case, columns in the order given by `fields`
df = pd.DataFrame(test_data)

# canonicalize() adds the 'smiles_canonical' column shown in the output
result_df = canonicalize(df)
print(result_df)

Output

              id            smiles       expected smiles_canonical
0  Acetylacetone     CC(=O)CC(=O)C  CC(=O)CC(C)=O    CC(=O)CC(C)=O
1      Imidazole        c1cnc[nH]1     c1c[nH]cn1       c1c[nH]cn1
2       Pyridone  C1=CC=NC(=O)C=C1    O=c1cccccn1      O=c1cccccn1
3      Guanidine      C(=N)N=C(N)N     N=CN=C(N)N       N=CN=C(N)N
4       Catechol    c1cc(c(cc1)O)O     Oc1ccccc1O       Oc1ccccc1O
5      Formamide            C(=O)N           NC=O             NC=O
6           Urea         C(=O)(N)N        NC(N)=O          NC(N)=O
7         Phenol       c1ccc(cc1)O      Oc1ccccc1        Oc1ccccc1

Tautomerize Smiles

examples/chem_utils/tautomerize_smiles.py
"""Example: compute canonical tautomer SMILES with Workbench's chem utilities"""
import pandas as pd
from workbench.utils.chem_utils import tautomerize_smiles

# Each case is (id, input SMILES, expected tautomer SMILES)
fields = ("id", "smiles", "expected")
cases = [
    # Hydroxybenzaldehydes: the expected output is the input in canonical atom order.
    # NOTE(review): these SMILES encode 3- and 4-hydroxybenzaldehyde, not
    # salicylaldehyde (2-hydroxybenzaldehyde) -- confirm the intended ids.
    ("Salicylaldehyde (Keto)", "O=Cc1cccc(O)c1", "O=Cc1cccc(O)c1"),
    ("2-Hydroxybenzaldehyde (Enol)", "Oc1ccc(C=O)cc1", "O=Cc1ccc(O)cc1"),
    # Acetylacetone: the expected output keeps the diketo form.
    ("Acetylacetone", "CC(=O)CC(=O)C", "CC(=O)CC(C)=O"),
    # Imidazole: same molecule, rewritten in canonical atom order.
    ("Imidazole", "c1cnc[nH]1", "c1c[nH]cn1"),
    # NOTE(review): this SMILES closes a seven-membered ring, so it is not a
    # pyridone (six-membered) -- confirm the intended structure.
    ("Pyridone", "C1=CC=NC(=O)C=C1", "O=c1cccccn1"),
    # The one case where the tautomer differs from the plain canonical SMILES
    # (N=CN=C(N)N): the double bonds and hydrogens shift between nitrogens.
    ("Guanidine", "C(=N)N=C(N)N", "N=C(N)N=CN"),
    # Catechol, Formamide, Urea, Phenol: expected output is the plain canonical SMILES.
    ("Catechol", "c1cc(c(cc1)O)O", "Oc1ccccc1O"),
    ("Formamide", "C(=O)N", "NC=O"),
    ("Urea", "C(=O)(N)N", "NC(N)=O"),
    ("Phenol", "c1ccc(cc1)O", "Oc1ccccc1"),
]
test_data = [dict(zip(fields, case)) for case in cases]

# One row per case, columns in the order given by `fields`
df = pd.DataFrame(test_data)

# The result keeps the inputs as 'smiles_orig' and puts the tautomer in 'smiles'
result_df = tautomerize_smiles(df)
print(result_df)

Output

                             id       smiles_orig        expected smiles_canonical          smiles
0        Salicylaldehyde (Keto)    O=Cc1cccc(O)c1  O=Cc1cccc(O)c1   O=Cc1cccc(O)c1  O=Cc1cccc(O)c1
1  2-Hydroxybenzaldehyde (Enol)    Oc1ccc(C=O)cc1  O=Cc1ccc(O)cc1   O=Cc1ccc(O)cc1  O=Cc1ccc(O)cc1
2                 Acetylacetone     CC(=O)CC(=O)C   CC(=O)CC(C)=O    CC(=O)CC(C)=O   CC(=O)CC(C)=O
3                     Imidazole        c1cnc[nH]1      c1c[nH]cn1       c1c[nH]cn1      c1c[nH]cn1
4                      Pyridone  C1=CC=NC(=O)C=C1     O=c1cccccn1      O=c1cccccn1     O=c1cccccn1
5                     Guanidine      C(=N)N=C(N)N      N=C(N)N=CN       N=CN=C(N)N      N=C(N)N=CN
6                      Catechol    c1cc(c(cc1)O)O      Oc1ccccc1O       Oc1ccccc1O      Oc1ccccc1O
7                     Formamide            C(=O)N            NC=O             NC=O            NC=O
8                          Urea         C(=O)(N)N         NC(N)=O          NC(N)=O         NC(N)=O
9                        Phenol       c1ccc(cc1)O       Oc1ccccc1        Oc1ccccc1       Oc1ccccc1

Additional Resources