Chemical Utilities

Examples

Examples of using the Chemical Utilities are listed at the bottom of this page; see Examples.

The majority of the chemical utilities in Workbench use either RDKit or Mordred (Community). Including these utilities allows this functionality to be used and deployed in AWS (FeatureSets, Models, Endpoints).

Chem/RDKIT/Mordred utilities for Workbench

`log = logging.getLogger('workbench')` `module-attribute`

FIXME: Let's figure out why we're not just using these RDKit methods

from rdkit.Chem import PandasTools
df = PandasTools.LoadSDF('file.sdf', molColName='ROMol', smilesName='SMILES')
PandasTools.WriteSDF(df, 'file.sdf', molColName='ROMol', properties=list(df.columns))

`add_compound_tags(df, mol_column='molecule')`

Adds a 'tags' column to a DataFrame, tagging compounds based on their properties.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	Input DataFrame containing molecular data.	required
`mol_column`	`str`	Column name containing RDKit molecule objects.	`'molecule'`

Returns:

Type	Description
`DataFrame`	pd.DataFrame: Updated DataFrame with a 'tags' column.

Source code in src/workbench/utils/chem_utils.py

def add_compound_tags(df, mol_column="molecule") -> pd.DataFrame:
    """
    Adds 'tags' and 'meta' columns to a DataFrame, tagging compounds based on their properties.

    Every row receives a list of string tags (any of "salt", "frag", "heavy_metals",
    "toxic_element", "toxic_group", "metalloenzyme", "druglike") and a 'meta' dict that
    stores the matched toxic elements/groups under "toxic_elements" / "toxic_groups".

    Note:
        The input DataFrame is modified in place (and also returned); existing 'tags'
        or 'meta' columns are overwritten. Rows whose molecule is None get an empty
        tag list and an empty meta dict.

    Args:
        df (pd.DataFrame): Input DataFrame containing molecular data.
        mol_column (str): Column name containing RDKit molecule objects.

    Returns:
        pd.DataFrame: Updated DataFrame with 'tags' and 'meta' columns.
    """
    # Collect the per-row results positionally and assign each column once at the end.
    # Writing cell-by-cell through df.at[idx, ...] is unreliable when the index holds
    # duplicate labels (e.g. after pd.concat), because idx then addresses several rows.
    all_tags = []
    all_meta = []

    for mol in df[mol_column]:
        tags = []
        meta = {}

        if mol is None:
            log.warning("Found a None molecule, skipping...")
            all_tags.append(tags)
            all_meta.append(meta)
            continue

        # Check for salts
        if contains_salts(mol):
            tags.append("salt")

        # Check for fragments (kept after the salt check so "salt" precedes "frag").
        # Only the number of fragments matters, so the atom-index form is sufficient.
        if len(Chem.GetMolFrags(mol)) > 1:
            tags.append("frag")

        # Check for heavy metals
        if contains_heavy_metals(mol):
            tags.append("heavy_metals")

        # Check for toxic elements (details go into the meta dict)
        te = toxic_elements(mol)
        if te:
            tags.append("toxic_element")
            meta["toxic_elements"] = te

        # Check for toxic groups (details go into the meta dict)
        tg = toxic_groups(mol)
        if tg:
            tags.append("toxic_group")
            meta["toxic_groups"] = tg

        # Check for metalloenzyme-relevant metals
        if contains_metalloenzyme_relevant_metals(mol):
            tags.append("metalloenzyme")

        # Check for drug-likeness
        if is_druglike_compound(mol):
            tags.append("druglike")

        all_tags.append(tags)
        all_meta.append(meta)

    df["tags"] = all_tags
    df["meta"] = all_meta
    return df

`add_salt_features(df)`

Add salt features to dataframe with 'molecule' column containing RDKit molecules

Source code in src/workbench/utils/chem_utils.py

def add_salt_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add salt features to dataframe with 'molecule' column containing RDKit molecules

    Args:
        df (pd.DataFrame): Input DataFrame with RDKit molecules in the 'molecule' column.

    Returns:
        pd.DataFrame: A new DataFrame holding the input columns followed by one column per
        salt feature. Rows whose molecule yields no features get None for every
        column listed by _get_salt_feature_columns().
    """
    salt_features_list = []

    for mol in df["molecule"]:
        features, _ = extract_advanced_salt_features(mol)

        if features is None:
            # Handle invalid molecules
            features = {col: None for col in _get_salt_feature_columns()}

        salt_features_list.append(features)

    # Re-use the input index: pd.concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would pair features with the wrong rows (and add NaN rows) whenever
    # the input index is not 0..n-1, e.g. after rows were dropped or filtered.
    salt_df = pd.DataFrame(salt_features_list, index=df.index)
    return pd.concat([df, salt_df], axis=1)

`canonicalize(df, remove_mol_col=True)`

Generate RDKit's canonical SMILES for each molecule in the input DataFrame.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	Input DataFrame containing a column named 'SMILES' (case-insensitive).	required
`remove_mol_col`	`bool`	Whether to drop the intermediate 'molecule' column. Default is True.	`True`

Returns:

Type	Description
`DataFrame`	pd.DataFrame: A DataFrame with an additional 'smiles_canonical' column and, optionally, the 'molecule' column.

Source code in src/workbench/utils/chem_utils.py

def canonicalize(df: pd.DataFrame, remove_mol_col: bool = True) -> pd.DataFrame:
    """
    Generate RDKit's canonical SMILES for each molecule in the input DataFrame.

    The input DataFrame is modified in place: rows whose SMILES cannot be parsed are
    dropped, and the new column(s) are added directly to it.

    Args:
        df (pd.DataFrame): Input DataFrame containing a column named 'SMILES' (case-insensitive).
        remove_mol_col (bool): Whether to drop the intermediate 'molecule' column. Default is True.

    Returns:
        pd.DataFrame: The same DataFrame with an additional 'smiles_canonical' column and,
                      optionally, the 'molecule' column.

    Raises:
        ValueError: If no column named 'smiles' (any capitalization) is present.
    """
    # Find the SMILES column, ignoring capitalization
    smiles_column = None
    for name in df.columns:
        if name.lower() == "smiles":
            smiles_column = name
            break
    if smiles_column is None:
        raise ValueError("Input DataFrame must have a 'SMILES' column")

    # Parse every SMILES string into an RDKit molecule
    df["molecule"] = df[smiles_column].apply(Chem.MolFromSmiles)

    # Report the rows that failed to parse, then discard them
    unparsed = df["molecule"].isna()
    bad_rows = df[unparsed].index
    if len(bad_rows) > 0:
        log.critical(f"Invalid SMILES strings at indices: {bad_rows.tolist()}")
    df.dropna(subset=["molecule"], inplace=True)

    def _largest_fragment(mol):
        # Keep only the largest connected fragment
        if mol:
            return remove_disconnected_fragments(mol)
        return None

    def _to_isomeric_smiles(mol):
        # Canonical SMILES that keeps the isomeric (stereo) information
        if mol:
            return Chem.MolToSmiles(mol, isomericSmiles=True)
        return None

    df["molecule"] = df["molecule"].apply(_largest_fragment)
    df["smiles_canonical"] = df["molecule"].apply(_to_isomeric_smiles)

    # The molecule objects are only an intermediate unless the caller asked to keep them
    if remove_mol_col:
        df.drop(columns=["molecule"], inplace=True)

    return df

`compute_molecular_descriptors(df, tautomerize=True)`

Compute and add all the Molecular Descriptors

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	Input DataFrame containing SMILES strings.	required
`tautomerize`	`bool`	Whether to tautomerize the SMILES strings.	`True`

Returns:

Type	Description
`DataFrame`	pd.DataFrame: The input DataFrame with all the RDKit Descriptors added

Source code in src/workbench/utils/chem_utils.py

def compute_molecular_descriptors(df: pd.DataFrame, tautomerize=True) -> pd.DataFrame:
    """Compute and add all the Molecular Descriptors

    RDKit descriptors (all of Descriptors._descList except 'Ipc'), a subset of Mordred
    descriptors (AcidBase, Aromatic, Polarizability, RotatableBond) and the
    stereochemistry descriptors are computed on the largest fragment of each molecule.
    Rows whose SMILES cannot be parsed are logged and dropped. The caller's DataFrame
    is not modified.

    Args:
        df (pd.DataFrame): Input DataFrame containing SMILES strings.
        tautomerize (bool): Whether to tautomerize the SMILES strings.

    Returns:
        pd.DataFrame: The input DataFrame with all the RDKit Descriptors added

    Raises:
        ValueError: If no column named 'smiles' (any capitalization) is present.
    """

    # Check for the smiles column (any capitalization)
    smiles_column = next((col for col in df.columns if col.lower() == "smiles"), None)
    if smiles_column is None:
        raise ValueError("Input DataFrame must have a 'smiles' column")

    # Compute/add all the Molecular Descriptors
    log.info("Computing Molecular Descriptors...")

    # Convert SMILES to RDKit molecule objects.
    # assign() returns a new frame, so the caller's DataFrame does not end up with a
    # stray 'molecule' column.
    log.info("Converting SMILES to RDKit Molecules...")
    df = df.assign(molecule=df[smiles_column].apply(Chem.MolFromSmiles))

    # Make sure our molecules are not None
    failed_smiles = df.loc[df["molecule"].isnull(), smiles_column].tolist()
    if failed_smiles:
        log.error(f"Failed to convert the following SMILES to molecules: {failed_smiles}")
    # Explicit copy so the column assignments below operate on an independent frame
    df = df.dropna(subset=["molecule"]).copy()

    # If we have fragments in our compounds, get the largest fragment before computing descriptors
    df["molecule"] = df["molecule"].apply(remove_disconnected_fragments)

    # Tautomerize the molecules if requested
    if tautomerize:
        log.info("Tautomerizing molecules...")
        tautomer_enumerator = TautomerEnumerator()
        df["molecule"] = df["molecule"].apply(tautomer_enumerator.Canonicalize)

    # Now get all the RDKIT Descriptors
    all_descriptors = [x[0] for x in Descriptors._descList]

    # There's an overflow issue that happens with the IPC descriptor, so we'll remove it
    # See: https://github.com/rdkit/rdkit/issues/1527
    if "Ipc" in all_descriptors:
        all_descriptors.remove("Ipc")

    # Make sure we don't have duplicates (dict.fromkeys keeps a deterministic order,
    # unlike a set whose string ordering varies between interpreter runs)
    all_descriptors = list(dict.fromkeys(all_descriptors))

    # RDKit Molecular Descriptor Calculator Class
    log.info("Computing RDKit Descriptors...")
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(all_descriptors)
    descriptor_values = [calc.CalcDescriptors(m) for m in df["molecule"]]

    # Lowercase the column names.
    # The frame must carry df's index: combine_first() below aligns on index labels, and
    # a default RangeIndex would attach descriptors to the wrong rows once any invalid
    # SMILES has been dropped (or whenever the input index is not 0..n-1).
    column_names = [name.lower() for name in calc.GetDescriptorNames()]
    rdkit_features_df = pd.DataFrame(descriptor_values, columns=column_names, index=df.index)

    # Now compute Mordred Features
    log.info("Computing Mordred Descriptors...")
    descriptor_choice = [AcidBase, Aromatic, Polarizability, RotatableBond]
    calc = MordredCalculator()
    for des in descriptor_choice:
        calc.register(des)
    mordred_df = calc.pandas(df["molecule"], nproc=1)

    # Rows come back in input order; pin the index so alignment below cannot drift
    mordred_df.index = df.index

    # Lowercase the column names
    mordred_df.columns = [col.lower() for col in mordred_df.columns]

    # Compute stereochemistry descriptors
    stereo_df = compute_stereochemistry_descriptors(df)

    # Combine the DataFrame with the RDKit and Mordred Descriptors added.
    # combine_first() only fills cells that are missing/null in the left-hand frame, so the
    # precedence is: input + stereo columns, then Mordred, then RDKit.
    # NOTE(review): an earlier comment here said computed descriptors should overwrite
    #               same-named input columns; with combine_first a non-null input value
    #               wins instead. Confirm which behavior is intended.
    output_df = stereo_df.combine_first(mordred_df).combine_first(rdkit_features_df)

    # Ensure no duplicate column names
    output_df = output_df.loc[:, ~output_df.columns.duplicated()]

    # Reorder the columns to have all the ones in the input df first and then the descriptors
    input_columns = df.columns.tolist()
    output_df = output_df[input_columns + [col for col in output_df.columns if col not in input_columns]]

    # Drop the intermediate 'molecule' column
    output_df = output_df.drop(columns=["molecule"])

    # Return the DataFrame with the RDKit and Mordred Descriptors added
    return output_df

`compute_morgan_fingerprints(df, radius=2, n_bits=2048, counts=True)`

Compute and add Morgan fingerprints to the DataFrame.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	Input DataFrame containing SMILES strings.	required
`radius`	`int`	Radius for the Morgan fingerprint.	`2`
`n_bits`	`int`	Number of bits for the fingerprint.	`2048`
`counts`	`bool`	Count simulation for the fingerprint.	`True`

Returns:

Type	Description
`DataFrame`	pd.DataFrame: The input DataFrame with the Morgan fingerprints added as bit strings.

Note

See: https://greglandrum.github.io/rdkit-blog/posts/2021-07-06-simulating-counts.html

Source code in src/workbench/utils/chem_utils.py

def compute_morgan_fingerprints(df: pd.DataFrame, radius=2, n_bits=2048, counts=True) -> pd.DataFrame:
    """Compute and add Morgan fingerprints to the DataFrame.

    Fingerprints are computed on the largest fragment of each molecule and stored as
    bit strings in a 'fingerprint' column. An existing 'molecule' column is re-used
    (unless it has the pandas "string" dtype, in which case it is rebuilt from the
    SMILES); rows whose SMILES cannot be parsed are logged and dropped. The caller's
    DataFrame is not modified.

    Args:
        df (pd.DataFrame): Input DataFrame containing SMILES strings.
        radius (int): Radius for the Morgan fingerprint.
        n_bits (int): Number of bits for the fingerprint.
        counts (bool): Count simulation for the fingerprint. When True, the generated
            fingerprint has 4 * n_bits bits.

    Returns:
        pd.DataFrame: The input DataFrame with the Morgan fingerprints added as bit strings.

    Raises:
        ValueError: If no column named 'smiles' (any capitalization) is present.

    Note:
        See: https://greglandrum.github.io/rdkit-blog/posts/2021-07-06-simulating-counts.html
    """
    delete_mol_column = False

    # Check for the SMILES column (case-insensitive)
    smiles_column = next((col for col in df.columns if col.lower() == "smiles"), None)
    if smiles_column is None:
        raise ValueError("Input DataFrame must have a 'smiles' column")

    # Work on a copy: otherwise the caller's frame is mutated inconsistently (it could be
    # left with a stray 'molecule' column but no 'fingerprint' column, depending on the path).
    df = df.copy()

    # Sanity check the molecule column (sometimes it gets serialized, which doesn't work)
    if "molecule" in df.columns and df["molecule"].dtype == "string":
        log.warning("Detected serialized molecules in 'molecule' column. Removing...")
        df = df.drop(columns=["molecule"])

    # Convert SMILES to RDKit molecule objects
    if "molecule" not in df.columns:
        log.info("Converting SMILES to RDKit Molecules...")
        delete_mol_column = True
        df["molecule"] = df[smiles_column].apply(Chem.MolFromSmiles)
        # Make sure our molecules are not None
        failed_smiles = df.loc[df["molecule"].isnull(), smiles_column].tolist()
        if failed_smiles:
            log.error(f"Failed to convert the following SMILES to molecules: {failed_smiles}")
        df = df.dropna(subset=["molecule"]).copy()

    # If we have fragments in our compounds, get the largest fragment before computing fingerprints
    largest_frags = df["molecule"].apply(remove_disconnected_fragments)

    # Create a Morgan fingerprint generator
    if counts:
        n_bits *= 4  # Multiply by 4 to simulate counts
    morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits, countSimulation=counts)

    # Compute Morgan fingerprints; a missing fragment yields pd.NA instead of a bit string
    fingerprints = largest_frags.apply(
        lambda mol: (morgan_generator.GetFingerprint(mol).ToBitString() if mol else pd.NA)
    )

    # Add the fingerprints to the DataFrame
    df["fingerprint"] = fingerprints

    # Drop the intermediate 'molecule' column if it was added
    if delete_mol_column:
        df = df.drop(columns=["molecule"])
    return df

`compute_stereochemistry_descriptors(df)`

Compute stereochemistry descriptors for molecules in a DataFrame.

This function analyzes the stereochemical properties of molecules, including:

- Chiral centers (R/S configuration)
- Double bond stereochemistry (E/Z configuration)

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	Input DataFrame with RDKit molecule objects in 'molecule' column	required

Returns:

Type	Description
`DataFrame`	pd.DataFrame: DataFrame with added stereochemistry descriptors

Source code in src/workbench/utils/chem_utils.py

def compute_stereochemistry_descriptors(df: pd.DataFrame) -> pd.DataFrame:
    """Compute stereochemistry descriptors for molecules in a DataFrame.

    This function analyzes the stereochemical properties of molecules, including:
    - Chiral centers (R/S configuration)
    - Double bond stereochemistry (E/Z configuration)

    The following columns are added to a copy of the input:
    chiral_centers, r_cnt, s_cnt, db_stereo, e_cnt, z_cnt, chiral_fp, db_fp, has_stereo.
    Molecules that are None, or that raise during processing, get zeros for every
    descriptor (and has_stereo False). An empty input yields an empty frame that still
    carries all descriptor columns.

    Args:
        df (pd.DataFrame): Input DataFrame with RDKit molecule objects in 'molecule' column

    Returns:
        pd.DataFrame: DataFrame with added stereochemistry descriptors

    Raises:
        ValueError: If the DataFrame has no 'molecule' column.
    """
    if "molecule" not in df.columns:
        raise ValueError("Input DataFrame must have a 'molecule' column")

    log.info("Computing stereochemistry descriptors...")
    output_df = df.copy()

    # Fixed list of descriptor columns; also used when there are no rows at all
    descriptor_keys = (
        "chiral_centers",
        "r_cnt",
        "s_cnt",
        "db_stereo",
        "e_cnt",
        "z_cnt",
        "chiral_fp",
        "db_fp",
    )

    def zero_descriptors():
        """Descriptor values used for missing molecules and for processing failures."""
        return dict.fromkeys(descriptor_keys, 0)

    # Helper to process a single molecule
    def process_molecule(mol):
        if mol is None:
            log.warning("Found a None molecule, skipping...")
            return zero_descriptors()

        try:
            # Use the more accurate CIP labeling algorithm (Cahn-Ingold-Prelog rules)
            rdCIPLabeler.AssignCIPLabels(mol)

            # Find all potential stereochemistry sites in the molecule
            stereo_info = Chem.FindPotentialStereo(mol)

            specified_centers = 0  # Number of chiral centers with defined stereochemistry
            r_cnt = 0  # Count of centers labelled "R" below
            s_cnt = 0  # Count of centers labelled "S" below
            stereo_atoms = []  # (atom index, "R"/"S") pairs

            specified_bonds = 0  # Number of double bonds with defined stereochemistry
            e_cnt = 0  # Count of bonds labelled "E" below
            z_cnt = 0  # Count of bonds labelled "Z" below
            stereo_bonds = []  # (bond index, "E"/"Z") pairs

            # NOTE(review): the labels below are derived from the StereoInfo descriptors
            # (Tet_CCW/Tet_CW, Bond_Trans/Bond_Cis), not read back from the CIP labels
            # assigned above — confirm these correspond to R/S and E/Z as intended.
            for element in stereo_info:
                # Only elements whose stereochemistry is explicitly defined are counted
                is_specified = element.specified == Chem.StereoSpecified.Specified

                # Handle tetrahedral chiral centers
                if element.type == Chem.StereoType.Atom_Tetrahedral:
                    atom_idx = element.centeredOn
                    if is_specified:
                        specified_centers += 1
                        if element.descriptor == Chem.StereoDescriptor.Tet_CCW:
                            r_cnt += 1
                            stereo_atoms.append((atom_idx, "R"))
                        elif element.descriptor == Chem.StereoDescriptor.Tet_CW:
                            s_cnt += 1
                            stereo_atoms.append((atom_idx, "S"))

                # Handle double bond stereochemistry
                elif element.type == Chem.StereoType.Bond_Double:
                    bond_idx = element.centeredOn
                    if is_specified:
                        specified_bonds += 1
                        if element.descriptor == Chem.StereoDescriptor.Bond_Trans:
                            e_cnt += 1
                            stereo_bonds.append((bond_idx, "E"))
                        elif element.descriptor == Chem.StereoDescriptor.Bond_Cis:
                            z_cnt += 1
                            stereo_bonds.append((bond_idx, "Z"))

            # Chiral fingerprint: bit i is set when the i-th labelled center
            # (ordered by atom index) is "R"
            chiral_fp = 0
            for i, (_, stereo) in enumerate(sorted(stereo_atoms, key=lambda x: x[0])):
                bit_val = 1 if stereo == "R" else 0
                chiral_fp += bit_val << i

            # Double bond fingerprint: bit i is set when the i-th labelled bond
            # (ordered by bond index) is "E"
            db_fp = 0
            for i, (_, stereo) in enumerate(sorted(stereo_bonds, key=lambda x: x[0])):
                bit_val = 1 if stereo == "E" else 0
                db_fp += bit_val << i

            return {
                "chiral_centers": specified_centers,
                "r_cnt": r_cnt,
                "s_cnt": s_cnt,
                "db_stereo": specified_bonds,
                "e_cnt": e_cnt,
                "z_cnt": z_cnt,
                "chiral_fp": chiral_fp,
                "db_fp": db_fp,
            }

        except Exception as e:
            # Best effort: a molecule that cannot be analyzed gets zeroed descriptors
            log.warning(f"Error processing stereochemistry: {str(e)}")
            return zero_descriptors()

    # Process all molecules and collect results
    results = [process_molecule(mol) for mol in df["molecule"]]

    # Add all descriptors to the output dataframe. Iterating the fixed key list (rather
    # than results[0]) keeps this working when the DataFrame has no rows.
    for key in descriptor_keys:
        output_df[key] = [r[key] for r in results]

    # Boolean flag indicating if the molecule has any stereochemistry defined
    output_df["has_stereo"] = (output_df["chiral_centers"] > 0) | (output_df["db_stereo"] > 0)

    return output_df

`contains_heavy_metals(mol)`

Check if a molecule contains any heavy metals (broad filter).

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	RDKit molecule object.	required

Returns:

Name	Type	Description
`bool`	`bool`	True if any heavy metals are detected, False otherwise.

Source code in src/workbench/utils/chem_utils.py

def contains_heavy_metals(mol: Mol) -> bool:
    """
    Report whether any atom of the molecule is on the heavy-metal list (broad filter).

    The list is: Zn, Cu, Fe, Mn, Co, Pb, Hg, Cd, As.

    Args:
        mol: RDKit molecule object.

    Returns:
        bool: True if any heavy metals are detected, False otherwise.
    """
    flagged_symbols = {"Zn", "Cu", "Fe", "Mn", "Co", "Pb", "Hg", "Cd", "As"}

    # Stop at the first atom whose element symbol is on the list
    for atom in mol.GetAtoms():
        if atom.GetSymbol() in flagged_symbols:
            return True
    return False

`contains_metalloenzyme_relevant_metals(mol)`

Check if a molecule contains metals relevant to metalloenzymes.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	RDKit molecule object.	required

Returns:

Name	Type	Description
`bool`	`bool`	True if metalloenzyme-relevant metals are detected, False otherwise.

Source code in src/workbench/utils/chem_utils.py

def contains_metalloenzyme_relevant_metals(mol: Mol) -> bool:
    """
    Report whether any atom of the molecule is a metal relevant to metalloenzymes.

    The metals checked are: Zn, Cu, Fe, Mn, Co.

    Args:
        mol: RDKit molecule object.

    Returns:
        bool: True if metalloenzyme-relevant metals are detected, False otherwise.
    """
    relevant_symbols = {"Zn", "Cu", "Fe", "Mn", "Co"}

    # Stop at the first atom whose element symbol is on the list
    for atom in mol.GetAtoms():
        if atom.GetSymbol() in relevant_symbols:
            return True
    return False

`contains_salts(mol)`

Check if a molecule contains common salts or counterions.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	RDKit molecule object.	required

Returns:

Name	Type	Description
`bool`	`bool`	True if salts are detected, False otherwise.

Source code in src/workbench/utils/chem_utils.py

def contains_salts(mol: Mol) -> bool:
    """
    Report whether the molecule matches any of a fixed list of salt/counterion patterns.

    Args:
        mol: RDKit molecule object.

    Returns:
        bool: True if salts are detected, False otherwise.
    """
    # SMARTS patterns for common inorganic salt fragments / counterions
    # NOTE(review): "[SO4--]" is not the usual way to write sulfate in SMARTS —
    #               confirm that it matches what is intended.
    salt_smarts = ("[Na+]", "[K+]", "[Cl-]", "[Mg+2]", "[Ca+2]", "[NH4+]", "[SO4--]")

    # Patterns are tried in order; evaluation stops at the first match
    return any(mol.HasSubstructMatch(Chem.MolFromSmarts(smarts)) for smarts in salt_smarts)

`custom_tautomer_canonicalization(mol)`

Domain-specific processing of a molecule to select the canonical tautomer.

This function enumerates all possible tautomers for a given molecule and applies custom logic to select the canonical form.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	The RDKit molecule for which the canonical tautomer is to be determined.	required

Returns:

Name	Type	Description
`str`	`str`	The SMILES string of the selected canonical tautomer.

Source code in src/workbench/utils/chem_utils.py

def custom_tautomer_canonicalization(mol: Mol) -> str:
    """Pick a canonical tautomer using a domain-specific score and return its SMILES.

    Every tautomer produced by RDKit's TautomerEnumerator is scored, and the first
    tautomer reaching the highest score is selected.

    Args:
        mol (Mol): The RDKit molecule for which the canonical tautomer is to be determined.

    Returns:
        str: The SMILES string of the selected canonical tautomer.
    """

    def _score(candidate) -> float:
        # Example scoring that balances three terms:
        #   - fewer hydrogen bond donors is better (membrane permeability)
        #   - lower molecular weight is better (drug-likeness)
        #   - more aromatic rings is better
        donors = CalcNumHBD(candidate)
        weight = CalcExactMolWt(candidate)
        n_aromatic = candidate.GetRingInfo().NumAromaticRings()
        return -donors - 0.01 * weight + n_aromatic * 2

    enumerator = TautomerEnumerator()

    best_tautomer = None
    best_score = float("-inf")
    for candidate in enumerator.Enumerate(mol):
        candidate_score = _score(candidate)
        # Strict comparison: on a tie the earlier tautomer is kept
        if candidate_score > best_score:
            best_tautomer, best_score = candidate, candidate_score

    return Chem.MolToSmiles(best_tautomer)

`df_to_sdf_file(df, output_file, smiles_col='smiles', id_col=None, include_cols=None, skip_invalid=True, generate_3d=True)`

Convert DataFrame with SMILES to SDF file.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	DataFrame containing SMILES and other data	required
`output_file`	`str`	Path to output SDF file	required
`smiles_col`	`str`	Column name containing SMILES strings	`'smiles'`
`id_col`	`Optional[str]`	Column to use as molecule ID/name	`None`
`include_cols`	`Optional[List[str]]`	Specific columns to include as properties (default: all except smiles and molecule columns)	`None`
`skip_invalid`	`bool`	Skip invalid SMILES instead of raising error	`True`
`generate_3d`	`bool`	Generate 3D coordinates and optimize geometry	`True`

Source code in src/workbench/utils/chem_utils.py

def df_to_sdf_file(
    df: pd.DataFrame,
    output_file: str,
    smiles_col: str = "smiles",
    id_col: Optional[str] = None,
    include_cols: Optional[List[str]] = None,
    skip_invalid: bool = True,
    generate_3d: bool = True,
):
    """
    Convert DataFrame with SMILES to SDF file.

    Each row is parsed from its SMILES, optionally embedded in 3D (with explicit
    hydrogens added) and MMFF-optimized, and then written with the selected columns
    attached as string properties. Rows whose SMILES is missing/not a string, cannot
    be parsed, or cannot be embedded are skipped, or raise when skip_invalid is False.

    Args:
        df: DataFrame containing SMILES and other data
        output_file: Path to output SDF file
        smiles_col: Column name containing SMILES strings
        id_col: Column to use as molecule ID/name
        include_cols: Specific columns to include as properties (default: all except smiles and molecule columns)
        skip_invalid: Skip invalid SMILES instead of raising error
        generate_3d: Generate 3D coordinates and optimize geometry

    Raises:
        ValueError: If skip_invalid is False and a row has an invalid SMILES or cannot
            be given 3D coordinates.
    """
    # The ID column and the property columns do not depend on the row: resolve them once
    use_id_col = bool(id_col) and id_col in df.columns
    if include_cols:
        cols_to_add = [col for col in include_cols if col in df.columns and col != smiles_col]
    else:
        # Auto-exclude common molecule column names and SMILES column
        mol_col_names = ["mol", "molecule", "rdkit_mol", "Mol"]
        cols_to_add = [col for col in df.columns if col != smiles_col and col not in mol_col_names]

    # Try progressively more aggressive embedding strategies
    embed_strategies = [
        {"maxAttempts": 1000, "randomSeed": 42},
        {"maxAttempts": 1000, "randomSeed": 42, "useRandomCoords": True},
        {"maxAttempts": 1000, "randomSeed": 42, "boxSizeMult": 5.0},
    ]

    written_count = 0

    with SDWriter(output_file) as writer:
        writer.SetForceV3000(True)
        for idx, row in df.iterrows():
            smiles = row[smiles_col]

            # Missing values (None/NaN) and other non-string entries are treated like any
            # other invalid SMILES so that skip_invalid is honored for them as well.
            mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
            if mol is None:
                if not skip_invalid:
                    raise ValueError(f"Invalid SMILES at row {idx}: {smiles}")
                continue

            # Generate 3D coordinates
            if generate_3d:
                mol = Chem.AddHs(mol)

                # EmbedMolecule returns -1 on failure; stop at the first strategy that works
                embedded = any(AllChem.EmbedMolecule(mol, **strategy) != -1 for strategy in embed_strategies)

                if not embedded:
                    if not skip_invalid:
                        raise ValueError(f"Could not generate 3D coords for row {idx}")
                    continue

                AllChem.MMFFOptimizeMolecule(mol)

            # Set molecule name/ID
            if use_id_col:
                mol.SetProp("_Name", str(row[id_col]))

            # Add properties (always stored as strings)
            for col in cols_to_add:
                mol.SetProp(col, str(row[col]))

            writer.write(mol)
            written_count += 1

    log.important(f"Wrote {written_count} molecules to SDF: {output_file}")

`extract_advanced_salt_features(mol)`

Extract comprehensive salt-related features from RDKit molecule

Source code in src/workbench/utils/chem_utils.py

def extract_advanced_salt_features(
    mol: Optional[Chem.Mol],
) -> Tuple[Optional[Dict[str, Union[int, float]]], Optional[Chem.Mol]]:
    """Extract comprehensive salt-related features from RDKit molecule

    The molecule is split into its disconnected fragments. The heaviest fragment that
    contains carbon is treated as the API (the heaviest fragment overall when none
    contains carbon); every other fragment is treated as salt.

    Args:
        mol: RDKit molecule, or None.

    Returns:
        A (features, api_mol) tuple. features starts from 0 for every column in
        _get_salt_feature_columns() and then sets "has_salt" and "mw_ratio"; when salt
        fragments exist it also sets "salt_to_api_ratio", "ionic_strength_proxy" and the
        output of _classify_salt_types(). (None, None) is returned when mol is None or
        has no atoms.
    """
    # A molecule without atoms (e.g. parsed from an empty SMILES) has no fragments and
    # zero weight; treat it like a missing molecule instead of failing further down.
    if mol is None or mol.GetNumAtoms() == 0:
        return None, None

    # Get fragments, heaviest first, keeping each weight so it is computed only once
    fragments = Chem.GetMolFrags(mol, asMols=True)
    fragment_weights = [(frag, Descriptors.MolWt(frag)) for frag in fragments]
    fragment_weights.sort(key=lambda x: x[1], reverse=True)

    # Identify API (heaviest carbon-containing fragment) vs salt fragments
    api_entry = None
    salt_entries = []
    for entry in fragment_weights:
        frag = entry[0]
        is_organic = any(atom.GetSymbol() == "C" for atom in frag.GetAtoms())
        if is_organic and api_entry is None:  # First organic fragment = API
            api_entry = entry
        else:
            salt_entries.append(entry)

    # Fallback: if no organic fragments, use largest
    if api_entry is None:
        api_entry = fragment_weights[0]
        salt_entries = fragment_weights[1:]

    api_mol, api_mw = api_entry
    salt_frags = [frag for frag, _ in salt_entries]

    # Initialize all features with default values, then fill in the basic ones
    features = {col: 0 for col in _get_salt_feature_columns()}
    features.update(
        {
            "has_salt": int(len(salt_frags) > 0),
            # Equals 1.0 when the molecule is a single fragment
            "mw_ratio": api_mw / Descriptors.MolWt(mol),
        }
    )

    if salt_frags:
        # Salt characterization
        total_salt_mw = sum(mw for _, mw in salt_entries)
        features.update(
            {
                "salt_to_api_ratio": total_salt_mw / api_mw,
                "ionic_strength_proxy": sum(abs(Chem.GetFormalCharge(frag)) for frag in salt_frags),
            }
        )

        # Salt type classification
        features.update(_classify_salt_types(salt_frags))

    return features, api_mol

`feature_resolution_issues(df, features, show_cols=None)`

Identify and print groups in a DataFrame where the given features have more than one unique SMILES, sorted by group size (largest number of unique SMILES first).

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	Input DataFrame containing SMILES strings.	required
`features`	`List[str]`	List of features to check.	required
`show_cols`	`Optional[List[str]]`	Columns to display; defaults to all columns.	`None`

Source code in src/workbench/utils/chem_utils.py

def feature_resolution_issues(df: pd.DataFrame, features: List[str], show_cols: Optional[List[str]] = None) -> None:
    """
    Report feature "collisions": rows whose feature values are identical but whose SMILES differ.

    Every colliding group is printed, starting with the group that has the most distinct SMILES.

    Args:
        df (pd.DataFrame): Input DataFrame; must have a 'smiles' column (matched case-insensitively).
        features (List[str]): Feature columns whose combined values define a group.
        show_cols (Optional[List[str]]): Columns to print for each group; defaults to all columns.

    Raises:
        ValueError: If the DataFrame has no 'smiles' column.
    """
    # Locate the SMILES column regardless of capitalization (first match wins)
    smiles_column = None
    for name in df.columns:
        if name.lower() == "smiles":
            smiles_column = name
            break
    if smiles_column is None:
        raise ValueError("Input DataFrame must have a 'smiles' column")

    if show_cols is None:
        show_cols = df.columns.tolist()

    # One row per distinct (SMILES, feature values) combination
    unique_df = df.drop_duplicates(subset=[smiles_column] + features)

    # Number of distinct SMILES per feature combination; keep only real collisions
    sizes = unique_df.groupby(features).size()
    collisions = sizes[sizes > 1].sort_values(ascending=False)

    # Largest collision groups are reported first
    for key, count in collisions.items():
        # Several features produce tuple keys; a single feature produces a scalar key
        if isinstance(key, tuple):
            in_group = (unique_df[features] == key).all(axis=1)
        else:
            in_group = unique_df[features[0]] == key

        print(f"Feature Group (unique SMILES: {count}):")
        print(unique_df[in_group][show_cols])
        print("\n")

`fingerprints_to_matrix(fingerprints, dtype=np.uint8)`

Convert bitstring fingerprints to numpy matrix.

Parameters:

Name	Type	Description	Default
`fingerprints`		pandas Series or list of bitstring fingerprints	required
`dtype`		numpy data type (uint8 is the default; np.bool_ is good for Jaccard computations)	`uint8`

Returns:

Type	Description
	dense numpy array of shape (n_molecules, n_bits)

Source code in src/workbench/utils/chem_utils.py

def fingerprints_to_matrix(fingerprints, dtype=np.uint8):
    """
    Build a dense bit matrix from a collection of bitstring fingerprints.

    Args:
        fingerprints: pandas Series or list of bitstring fingerprints
        dtype: numpy data type of the result (uint8 is the default; np.bool_ is good for Jaccard computations)

    Returns:
        dense numpy array of shape (n_molecules, n_bits)
    """
    # One row per fingerprint, one entry per character of the bitstring.
    # The result is always dense (a sparse variant may be supported in the future).
    rows = []
    for bitstring in fingerprints:
        rows.append(list(bitstring))
    return np.array(rows, dtype=dtype)

`geometric_mean(series)`

Computes the geometric mean manually to avoid using scipy.

Source code in src/workbench/utils/chem_utils.py

def geometric_mean(series: pd.Series) -> float:
    """Return the geometric mean of a Series as exp(mean(log(x))), so scipy is not needed."""
    log_values = np.log(series)
    mean_of_logs = log_values.mean()
    return np.exp(mean_of_logs)

`halogen_toxicity_score(mol)`

Calculate the halogen count and toxicity threshold for a molecule.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	RDKit molecule object.	required

Returns:

Type	Description
`(int, int)`	Tuple[int, int]: (halogen_count, halogen_threshold), where the threshold
`(int, int)`	scales with molecule size (20% of the atom count, but never less than 2).

Source code in src/workbench/utils/chem_utils.py

def halogen_toxicity_score(mol: Mol) -> (int, int):
    """
    Count the halogen atoms in a molecule and compute its size-dependent halogen threshold.

    Args:
        mol: RDKit molecule object.

    Returns:
        Tuple[int, int]: (halogen_count, halogen_threshold). The threshold is 20% of the
        atom count (truncated to an int), but never less than 2.
    """
    halogen_symbols = {"Cl", "Br", "I", "F"}

    # Tally every atom whose element symbol is a halogen
    halogen_count = 0
    for atom in mol.GetAtoms():
        if atom.GetSymbol() in halogen_symbols:
            halogen_count += 1

    # Bigger molecules may carry proportionally more halogens; the floor of 2
    # keeps the threshold meaningful for very small molecules.
    scaled_threshold = int(mol.GetNumAtoms() * 0.2)
    halogen_threshold = scaled_threshold if scaled_threshold > 2 else 2

    return halogen_count, halogen_threshold

`img_from_smiles(smiles, width=500, height=500, background='rgba(64, 64, 64, 1)')`

Generate an image of the molecule represented by the given SMILES string.

Parameters:

Name	Type	Description	Default
`smiles`	`str`	A SMILES string representing the molecule.	required
`width`	`int`	Width of the image in pixels. Default is 500.	`500`
`height`	`int`	Height of the image in pixels. Default is 500.	`500`
`background`	`str`	Background color of the image. Default is dark grey	`'rgba(64, 64, 64, 1)'`

Returns:

Name	Type	Description
`str`	`Optional[str]`	PIL image of the molecule or None if the SMILES string is invalid.

Source code in src/workbench/utils/chem_utils.py

def img_from_smiles(
    smiles: str, width: int = 500, height: int = 500, background: str = "rgba(64, 64, 64, 1)"
) -> Optional[str]:
    """
    Render the molecule described by a SMILES string as an image.

    Args:
        smiles (str): A SMILES string representing the molecule.
        width (int): Width of the image in pixels. Default is 500.
        height (int): Height of the image in pixels. Default is 500.
        background (str): Background color of the image. Default is dark grey

    Returns:
        The image object produced by RDKit's Draw.MolToImage (a PIL image), or None if the
        SMILES string is invalid. NOTE(review): the Optional[str] annotation does not match
        the returned image object; it is left untouched to keep the signature stable.
    """
    # Drawing options: switch to the dark palette for dark backgrounds, then set the background itself
    draw_options = Draw.MolDrawOptions()
    if is_dark(background):
        rdMolDraw2D.SetDarkMode(draw_options)
    draw_options.setBackgroundColour(rgba_to_tuple(background))

    # Parse the SMILES; warn and give up when RDKit cannot build a molecule from it
    molecule = Chem.MolFromSmiles(smiles)
    if not molecule:
        log.warning(f"Invalid SMILES: {smiles}")
        return None

    return Draw.MolToImage(molecule, options=draw_options, size=(width, height))

`is_druglike_compound(mol)`

Filter for drug-likeness and QSAR relevance based on Lipinski's Rule of Five. Returns False for molecules unlikely to be orally bioavailable.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	RDKit molecule object.	required

Returns:

Name	Type	Description
`bool`	`bool`	True if the molecule is drug-like, False otherwise.

Source code in src/workbench/utils/chem_utils.py

def is_druglike_compound(mol: Mol) -> bool:
    """
    Drug-likeness / QSAR-relevance filter built on Lipinski's Rule of Five.

    Molecules without any ring are held to a stricter set of limits than ring-containing ones.

    Args:
        mol: RDKit molecule object.

    Returns:
        bool: True if the molecule is drug-like, False otherwise.
    """
    mol_weight = Descriptors.MolWt(mol)
    log_p = Descriptors.MolLogP(mol)
    h_donors = Descriptors.NumHDonors(mol)
    h_acceptors = Descriptors.NumHAcceptors(mol)

    # Lipinski's Rule of Five: any single violation rejects the molecule
    violates_ro5 = mol_weight > 500 or log_p > 5 or h_donors > 5 or h_acceptors > 10
    if violates_ro5:
        return False

    # Ring-containing molecules only need to pass the Rule of Five
    if mol.GetRingInfo().NumRings() != 0:
        return True

    # Acyclic molecules are accepted only when small and within the stricter limits
    return bool(mol_weight <= 300 and log_p <= 3 and h_donors <= 3 and h_acceptors <= 3)

`log_to_category(log_series)`

Convert a pandas Series of log values to concentration categories.

Parameters: log_series (pd.Series): Series of logarithmic values (log10).

Returns: pd.Series: Series of concentration categories.

Source code in src/workbench/utils/chem_utils.py

def log_to_category(log_series: pd.Series) -> pd.Series:
    """
    Bin log10 values into the concentration categories "low", "medium" and "high".

    Args:
        log_series (pd.Series): Series of logarithmic values (log10).

    Returns:
        pd.Series: Series of concentration categories.
    """
    neg_inf, pos_inf = -float("inf"), float("inf")

    # Right-inclusive bins: (-inf, -5] -> low, (-5, -4] -> medium, (-4, inf) -> high
    edges = [neg_inf, -5, -4, pos_inf]
    names = ["low", "medium", "high"]
    return pd.cut(log_series, bins=edges, labels=names)

`log_to_micromolar(log_series)`

Convert a pandas Series of logarithmic values (log10) back to concentrations in µM (micromolar).

Parameters: log_series (pd.Series): Series of logarithmic values (log10).

Returns: pd.Series: Series of concentrations in micromolar.

Source code in src/workbench/utils/chem_utils.py

def log_to_micromolar(log_series: pd.Series) -> pd.Series:
    """
    Convert a pandas Series of logarithmic values (log10 of mol/L) back to concentrations in µM (micromolar).

    Args:
        log_series (pd.Series): Series of logarithmic values (log10). Integer and float dtypes
            are both accepted.

    Returns:
        pd.Series: Series of concentrations in micromolar.
    """
    # Use a float base: with an integer base, integer-typed input that contains negative
    # values raises "Integers to negative integer powers are not allowed".
    series_mol_per_l = 10.0**log_series  # log10(mol/L) -> mol/L
    return series_mol_per_l * 1e6  # mol/L -> µM

`micromolar_to_log(series_μM)`

Convert a pandas Series of concentrations in µM (micromolar) to their logarithmic values (log10).

Parameters: series_µM (pd.Series): Series of concentrations in micromolar.

Returns: pd.Series: Series of logarithmic values (log10).

Source code in src/workbench/utils/chem_utils.py

def micromolar_to_log(series_µM: pd.Series) -> pd.Series:
    """
    Convert concentrations in µM (micromolar) to log10 of the molar (mol/L) concentration.

    Args:
        series_µM (pd.Series): Series of concentrations in micromolar.

    Returns:
        pd.Series: Series of logarithmic values (log10).
    """
    # Floor the input at 1e-9 µM so zero or negative values never reach log10
    # (the floor value is aligned with another project)
    floored = series_µM.clip(lower=1e-9)

    # µM -> mol/L, then take the base-10 logarithm
    molar = floored * 1e-6
    return np.log10(molar)

`project_fingerprints(df, projection='UMAP')`

Project fingerprints onto a 2D plane using dimensionality reduction techniques.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	Input DataFrame containing fingerprint data.	required
`projection`	`str`	Dimensionality reduction technique to use (TSNE or UMAP).	`'UMAP'`

Returns:

Type	Description
`DataFrame`	pd.DataFrame: The input DataFrame with the projected coordinates added as 'x' and 'y' columns.

Source code in src/workbench/utils/chem_utils.py

def project_fingerprints(df: pd.DataFrame, projection: str = "UMAP") -> pd.DataFrame:
    """Project fingerprints onto a 2D plane using dimensionality reduction techniques.

    The input DataFrame is modified in place ('x' and 'y' columns are written to it) and
    also returned.

    Args:
        df (pd.DataFrame): Input DataFrame containing fingerprint data. The first column whose
            name contains "fingerprint" (case-insensitive) is used.
        projection (str): Dimensionality reduction technique to use (TSNE or UMAP). Any value
            other than "TSNE" selects UMAP; if UMAP is not installed, TSNE is used instead.

    Returns:
        pd.DataFrame: The input DataFrame with the projected coordinates added as 'x' and 'y' columns.

    Raises:
        ValueError: If the DataFrame has no fingerprint column.
    """
    # Check for the fingerprint column (case-insensitive)
    fingerprint_column = next((col for col in df.columns if "fingerprint" in col.lower()), None)
    if fingerprint_column is None:
        raise ValueError("Input DataFrame must have a fingerprint column")

    # Create a matrix of fingerprints
    X = fingerprints_to_matrix(df[fingerprint_column])

    # Every projection other than "TSNE" takes the UMAP branch below, so fall back
    # whenever that branch would be reached without UMAP being available.
    if projection != "TSNE" and umap is None:
        log.warning("UMAP is not available. Using TSNE instead.")
        projection = "TSNE"

    # Run the projection
    if projection == "TSNE":
        # scikit-learn requires perplexity < n_samples, so cap it for small datasets
        perplexity = min(30, max(1, len(X) - 1))
        tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
        embedding = tsne.fit_transform(X)
    else:
        reducer = umap.UMAP(metric="jaccard")
        embedding = reducer.fit_transform(X)

    # Add coordinates to DataFrame
    df["x"] = embedding[:, 0]
    df["y"] = embedding[:, 1]

    # If vertices disconnect from the manifold, they are given NaN values (so replace with 0)
    df["x"] = df["x"].fillna(0)
    df["y"] = df["y"].fillna(0)

    # Jitter: a small positive random offset so coincident points do not overlap exactly
    jitter_scale = 0.1
    df["x"] += np.random.uniform(0, jitter_scale, len(df))
    df["y"] += np.random.uniform(0, jitter_scale, len(df))

    return df

`remove_disconnected_fragments(mol)`

Remove disconnected fragments from a molecule, keeping the fragment with the most heavy atoms.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	RDKit molecule object.	required

Returns:

Name	Type	Description
`Mol`	`Mol`	The fragment with the most heavy atoms, or None if no such fragment exists.

Source code in src/workbench/utils/chem_utils.py

def remove_disconnected_fragments(mol: Chem.Mol) -> Chem.Mol:
    """
    Reduce a molecule to its single largest fragment, measured by heavy-atom count.

    Args:
        mol (Mol): RDKit molecule object.

    Returns:
        Mol: The fragment with the most heavy atoms, or None if the molecule is None,
        has no atoms, or yields no fragments.
    """
    if mol is None or mol.GetNumAtoms() == 0:
        return None

    fragments = Chem.GetMolFrags(mol, asMols=True)
    if not fragments:
        return None

    # Strict '>' keeps the earliest fragment when several tie for the most heavy atoms
    largest = fragments[0]
    for candidate in fragments[1:]:
        if candidate.GetNumHeavyAtoms() > largest.GetNumHeavyAtoms():
            largest = candidate
    return largest

`rollup_experimental_data(df, id, time, target, use_gmean=False)`

Rolls up a dataset by selecting the largest time per unique ID and averaging the target value if multiple records exist at that time. Supports both arithmetic and geometric mean.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	Input dataframe.	required
`id`	`str`	Column representing the unique molecule ID.	required
`time`	`str`	Column representing the time.	required
`target`	`str`	Column representing the target value.	required
`use_gmean`	`bool`	Whether to use the geometric mean instead of the arithmetic mean.	`False`

Returns:

Type	Description
`DataFrame`	pd.DataFrame: Rolled-up dataframe with all original columns retained.

Source code in src/workbench/utils/chem_utils.py

def rollup_experimental_data(
    df: pd.DataFrame, id: str, time: str, target: str, use_gmean: bool = False
) -> pd.DataFrame:
    """
    Collapse a dataset to one row per ID, taken at that ID's latest time.

    When an ID has several records at its latest time, the target values are averaged
    (arithmetic mean by default, geometric mean on request) and every other column keeps
    the value from the first of those records.

    Parameters:
        df (pd.DataFrame): Input dataframe.
        id (str): Column representing the unique molecule ID.
        time (str): Column representing the time.
        target (str): Column representing the target value.
        use_gmean (bool): Whether to use the geometric mean instead of the arithmetic mean.

    Returns:
        pd.DataFrame: Rolled-up dataframe with all original columns retained
        (ordered as id, time, the remaining columns, then the target).
    """
    # Keep only the rows recorded at each ID's latest time
    latest_time = df.groupby(id)[time].transform("max")
    latest_rows = df[df[time] == latest_time]

    # Pick the averaging function
    if use_gmean:
        average = geometric_mean
    else:
        average = np.mean

    def _rollup_target(values):
        """Average replicate measurements; pass a lone measurement through unchanged."""
        if len(values) > 1:
            return average(values)
        return values.iloc[0]

    # Every non-key column keeps its first value; the target column is averaged
    key_columns = (target, id, time)
    agg_spec = {}
    for col in df.columns:
        if col not in key_columns:
            agg_spec[col] = "first"
    agg_spec[target] = _rollup_target

    return latest_rows.groupby([id, time]).agg(agg_spec).reset_index()

`sdf_file_to_df(sdf_file, include_smiles=True, smiles_col='smiles', id_col=None, include_props=None, exclude_props=None)`

Convert SDF file to DataFrame.

Parameters:

Name	Type	Description	Default
`sdf_file`	`str`	Path to input SDF file	required
`include_smiles`	`bool`	Add SMILES column to output	`True`
`smiles_col`	`str`	Name for SMILES column	`'smiles'`
`id_col`	`Optional[str]`	Column name for molecule ID/name (uses _Name property)	`None`
`include_props`	`Optional[List[str]]`	Specific properties to include (default: all)	`None`
`exclude_props`	`Optional[List[str]]`	Properties to exclude from output	`None`

Returns:

Type	Description
`DataFrame`	DataFrame with molecules and their properties

Source code in src/workbench/utils/chem_utils.py

def sdf_file_to_df(
    sdf_file: str,
    include_smiles: bool = True,
    smiles_col: str = "smiles",
    id_col: Optional[str] = None,
    include_props: Optional[List[str]] = None,
    exclude_props: Optional[List[str]] = None,
) -> pd.DataFrame:
    """
    Read an SDF file into a DataFrame, one row per successfully parsed molecule.

    Args:
        sdf_file: Path to input SDF file
        include_smiles: Add SMILES column to output
        smiles_col: Name for SMILES column
        id_col: Column name for molecule ID/name (uses _Name property)
        include_props: Specific properties to include (default: all)
        exclude_props: Properties to exclude from output

    Returns:
        DataFrame with molecules and their properties. Molecules that cannot be parsed
        are skipped with a warning.
    """
    data = []

    for idx, mol in enumerate(Chem.SDMolSupplier(sdf_file)):
        # The supplier yields None for records it cannot parse
        if mol is None:
            log.warning(f"Could not parse molecule at index {idx}")
            continue

        row_data = {}

        # Optional SMILES and name/ID columns come first
        if include_smiles:
            row_data[smiles_col] = Chem.MolToSmiles(mol)
        if id_col and mol.HasProp("_Name"):
            row_data[id_col] = mol.GetProp("_Name")

        # Copy the molecule's properties, honouring the include/exclude filters
        for prop in mol.GetPropNames():
            if include_props and prop not in include_props:
                continue
            if exclude_props and prop in exclude_props:
                continue
            if prop == "_Name":  # the name is only ever emitted through id_col
                continue
            row_data[prop] = mol.GetProp(prop)

        data.append(row_data)

    df = pd.DataFrame(data)
    log.important(f"Read {len(df)} molecules from SDF: {sdf_file}")

    return df

`show(smiles, width=500, height=500)`

Displays an image of the molecule represented by the given SMILES string.

Parameters:

Name	Type	Description	Default
`smiles`	`str`	A SMILES string representing the molecule.	required
`width`	`int`	Width of the image in pixels. Default is 500.	`500`
`height`	`int`	Height of the image in pixels. Default is 500.	`500`

Returns: None

Source code in src/workbench/utils/chem_utils.py

def show(smiles: str, width: int = 500, height: int = 500) -> None:
    """
    Render the molecule for a SMILES string and open the image via its show() method.

    Nothing is displayed when no image could be generated (e.g. an invalid SMILES string).

    Args:
        smiles (str): A SMILES string representing the molecule.
        width (int): Width of the image in pixels. Default is 500.
        height (int): Height of the image in pixels. Default is 500.

    Returns:
        None
    """
    image = img_from_smiles(smiles, width, height)
    if not image:
        return
    image.show()

`standard_tautomer_canonicalization(mol)`

Standard processing of a molecule to select the canonical tautomer.

RDKit's TautomerEnumerator uses heuristics to select a canonical tautomer, such as preferring keto over enol forms and minimizing formal charges.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	The RDKit molecule for which the canonical tautomer is to be determined.	required

Returns:

Name	Type	Description
`str`	`str`	The SMILES string of the canonical tautomer.

Source code in src/workbench/utils/chem_utils.py

def standard_tautomer_canonicalization(mol: Mol) -> str:
    """Select the canonical tautomer of a molecule and return it as SMILES.

    The choice is delegated to RDKit's `TautomerEnumerator`, which uses heuristics
    such as preferring keto over enol forms and minimizing formal charges.

    Args:
        mol (Mol): The RDKit molecule for which the canonical tautomer is to be determined.

    Returns:
        str: The SMILES string of the canonical tautomer.
    """
    enumerator = TautomerEnumerator()
    return Chem.MolToSmiles(enumerator.Canonicalize(mol))

`svg_from_smiles(smiles, width=500, height=500, background='rgba(64, 64, 64, 1)')`

Generate an SVG image of the molecule represented by the given SMILES string.

Parameters:

Name	Type	Description	Default
`smiles`	`str`	A SMILES string representing the molecule.	required
`width`	`int`	Width of the image in pixels. Default is 500.	`500`
`height`	`int`	Height of the image in pixels. Default is 500.	`500`
`background`	`str`	Background color of the image. Default is dark grey.	`'rgba(64, 64, 64, 1)'`

Returns:

Type	Description
`Optional[str]`	Optional[str]: Encoded SVG string of the molecule or None if the SMILES string is invalid.

Source code in src/workbench/utils/chem_utils.py

def svg_from_smiles(
    smiles: str, width: int = 500, height: int = 500, background: str = "rgba(64, 64, 64, 1)"
) -> Optional[str]:
    """
    Generate an SVG image of the molecule represented by the given SMILES string.

    Args:
        smiles (str): A SMILES string representing the molecule.
        width (int): Width of the image in pixels. Default is 500.
        height (int): Height of the image in pixels. Default is 500.
        background (str): Background color of the image. Default is dark grey.

    Returns:
        Optional[str]: The SVG as a base64 data URI ("data:image/svg+xml;base64,..."),
        or None if the SMILES string is invalid (a warning is logged in that case).
    """
    # Convert the SMILES string to an RDKit molecule
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        # Report the failure the same way img_from_smiles does instead of failing silently
        log.warning(f"Invalid SMILES: {smiles}")
        return None

    # Compute 2D coordinates for the molecule
    AllChem.Compute2DCoords(mol)

    # Initialize the SVG drawer
    drawer = rdMolDraw2D.MolDraw2DSVG(width, height)

    # Configure drawing options (dark palette for dark backgrounds, then the background itself)
    options = drawer.drawOptions()
    if is_dark(background):
        rdMolDraw2D.SetDarkMode(options)
    options.setBackgroundColour(rgba_to_tuple(background))

    # Draw the molecule
    drawer.DrawMolecule(mol)
    drawer.FinishDrawing()

    # Base64-encode the SVG text and wrap it in a data URI
    svg = drawer.GetDrawingText()
    encoded_svg = base64.b64encode(svg.encode("utf-8")).decode("utf-8")
    return f"data:image/svg+xml;base64,{encoded_svg}"

`tautomerize_smiles(df)`

Perform tautomer enumeration and canonicalization on a DataFrame.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	Input DataFrame containing SMILES strings.	required

Returns:

Type	Description
`DataFrame`	pd.DataFrame: A new DataFrame in which 'smiles' holds the canonical tautomer SMILES, the input SMILES are kept in 'smiles_orig', and a 'smiles_canonical' column is added.

Source code in src/workbench/utils/chem_utils.py

def tautomerize_smiles(df: pd.DataFrame) -> pd.DataFrame:
    """
    Canonicalize the SMILES in a DataFrame and replace them with their canonical tautomers.

    Args:
        df (pd.DataFrame): Input DataFrame containing SMILES strings in a 'smiles' column.

    Returns:
        pd.DataFrame: DataFrame in which 'smiles' holds the canonical tautomer SMILES
        (pd.NA where tautomerization was not possible) and 'smiles_orig' holds the input
        SMILES; columns added by canonicalize() other than 'molecule' are kept.
    """
    # canonicalize() supplies the 'molecule' column of RDKit molecules used below
    df = canonicalize(df, remove_mol_col=False)

    def _tautomer_or_na(mol):
        """Return the canonical tautomer SMILES for one molecule, or pd.NA on any failure."""
        if not mol:
            return pd.NA
        try:
            # RDKit's standard tautomer canonicalization; swap in custom logic here if needed
            tautomer_smiles = standard_tautomer_canonicalization(mol)
        except Exception as e:
            # Best effort: one bad molecule must not abort the whole DataFrame
            log.warning(f"Tautomerization failed: {str(e)}")
            return pd.NA
        return tautomer_smiles

    # Tautomerize molecule by molecule
    df["smiles_tautomer"] = df["molecule"].map(_tautomer_or_na)

    # The RDKit molecule objects were only needed as an intermediate
    df.drop(columns=["molecule"], inplace=True)

    # Promote the tautomer SMILES to 'smiles' and keep the input under 'smiles_orig'
    column_renames = {"smiles": "smiles_orig", "smiles_tautomer": "smiles"}
    df.rename(columns=column_renames, inplace=True)

    return df

`toxic_elements(mol)`

Identifies toxic elements or specific forms of elements in a molecule.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	RDKit molecule object.	required

Returns:

Type	Description
`Optional[List[str]]`	Optional[List[str]]: List of toxic elements or specific forms if found, otherwise None.

Notes

Halogen toxicity logic integrates with halogen_toxicity_score and scales thresholds based on molecule size.

Source code in src/workbench/utils/chem_utils.py

def toxic_elements(mol: Mol) -> Optional[List[str]]:
    """
    Identifies toxic elements or specific forms of elements in a molecule.

    Args:
        mol: RDKit molecule object.

    Returns:
        Optional[List[str]]: Sorted list of toxic elements or specific forms (e.g. "N+") if
        found, otherwise None.

    Notes:
        Halogen toxicity logic integrates with `halogen_toxicity_score` and scales thresholds
        based on molecule size. Both the halogen check and the benign quaternary-ammonium
        check are molecule-level facts, so each is evaluated at most once per call.
    """
    # Always toxic elements (heavy metals and known toxic single elements)
    always_toxic = {"Pb", "Hg", "Cd", "As", "Be", "Tl", "Sb"}
    halogens = {"Cl", "Br", "I", "F"}
    toxic_found = set()

    # Lazily computed molecule-level results (None = not evaluated yet)
    has_benign_quaternary = None
    halogens_excessive = None

    for atom in mol.GetAtoms():
        symbol = atom.GetSymbol()

        if symbol in always_toxic:
            toxic_found.add(symbol)

        elif symbol == "N" and atom.GetFormalCharge() > 0:
            # Positively charged nitrogen is flagged unless the molecule contains a benign
            # quaternary ammonium (e.g., choline-like structures)
            if has_benign_quaternary is None:
                has_benign_quaternary = mol.HasSubstructMatch(Chem.MolFromSmarts("[N+](C)(C)(C)C"))
            if not has_benign_quaternary:
                toxic_found.add("N+")

        elif symbol in halogens:
            # Halogens are only flagged when the molecule as a whole is over-halogenated
            if halogens_excessive is None:
                halogen_count, halogen_threshold = halogen_toxicity_score(mol)
                halogens_excessive = halogen_count > halogen_threshold
            if halogens_excessive:
                toxic_found.add(symbol)

    # Sorted so the result does not depend on set iteration order
    return sorted(toxic_found) if toxic_found else None

`toxic_groups(mol)`

Check if a molecule contains known toxic functional groups using RDKit's functional groups and SMARTS patterns.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	The molecule to evaluate.	required

Returns:

Type	Description
`Optional[List[str]]`	Optional[List[str]]: List of SMARTS patterns for toxic groups if found, otherwise None.

Source code in src/workbench/utils/chem_utils.py

def toxic_groups(mol: Chem.Mol) -> Optional[List[str]]:
    """
    Look for known toxic functional groups in a molecule.

    Three sources are checked: selected groups from RDKit's functional-group hierarchy,
    the module's precompiled custom toxic SMARTS patterns, and positively charged nitrogen.
    A match against any exempt (stabilizing) pattern overrides everything else.

    Args:
        mol (rdkit.Chem.Mol): The molecule to evaluate.

    Returns:
        Optional[List[str]]: List of SMARTS patterns for toxic groups if found, otherwise None
        (also None whenever an exempt pattern matches).
    """
    matched_smarts = []

    # Groups taken from RDKit's functional group definitions
    for group_name in ("Nitro", "Azide", "Alcohol", "Aldehyde", "Halogen", "TerminalAlkyne"):
        group_node = next(node for node in fgroup_hierarchy if node.label == group_name)
        group_smarts = group_node.smarts
        if mol.HasSubstructMatch(Chem.MolFromSmarts(group_smarts)):
            matched_smarts.append(group_smarts)

    # Custom toxic patterns (compiled once at module level)
    matched_smarts.extend(
        smarts
        for smarts, compiled in zip(toxic_smarts_patterns, compiled_toxic_smarts)
        if mol.HasSubstructMatch(compiled)
    )

    # Charged nitrogen counts as toxic unless a benign quaternary ammonium is present
    has_charged_nitrogen = mol.HasSubstructMatch(Chem.MolFromSmarts("[N+]"))
    if has_charged_nitrogen and not mol.HasSubstructMatch(Chem.MolFromSmarts("C[N+](C)(C)C")):
        matched_smarts.append("[N+]")

    # Stabilizing functional groups exempt the whole molecule
    if any(mol.HasSubstructMatch(compiled) for compiled in compiled_exempt_smarts):
        return None

    return matched_smarts or None

Examples

Canonical Smiles

examples/chem_utils/canonicalize_smiles.py

"""Example: canonicalizing SMILES strings with workbench's chem_utils"""
import pandas as pd
from workbench.utils.chem_utils import canonicalize

# Each record pairs an input SMILES with the canonical SMILES expected back
test_data = [
    {"id": "Acetylacetone", "smiles": "CC(=O)CC(=O)C", "expected": "CC(=O)CC(C)=O"},
    {"id": "Imidazole", "smiles": "c1cnc[nH]1", "expected": "c1c[nH]cn1"},
    {"id": "Pyridone", "smiles": "C1=CC=NC(=O)C=C1", "expected": "O=c1cccccn1"},
    {"id": "Guanidine", "smiles": "C(=N)N=C(N)N", "expected": "N=CN=C(N)N"},
    {"id": "Catechol", "smiles": "c1cc(c(cc1)O)O", "expected": "Oc1ccccc1O"},
    {"id": "Formamide", "smiles": "C(=O)N", "expected": "NC=O"},
    {"id": "Urea", "smiles": "C(=O)(N)N", "expected": "NC(N)=O"},
    {"id": "Phenol", "smiles": "c1ccc(cc1)O", "expected": "Oc1ccccc1"},
]

# Convert test data to a DataFrame
df = pd.DataFrame(test_data)

# Perform canonicalization; the result gains a 'smiles_canonical' column
# that can be compared against 'expected'
result_df = canonicalize(df)
print(result_df)

Output

              id            smiles       expected smiles_canonical
0  Acetylacetone     CC(=O)CC(=O)C  CC(=O)CC(C)=O    CC(=O)CC(C)=O
1      Imidazole        c1cnc[nH]1     c1c[nH]cn1       c1c[nH]cn1
2       Pyridone  C1=CC=NC(=O)C=C1    O=c1cccccn1      O=c1cccccn1
3      Guanidine      C(=N)N=C(N)N     N=CN=C(N)N       N=CN=C(N)N
4       Catechol    c1cc(c(cc1)O)O     Oc1ccccc1O       Oc1ccccc1O
5      Formamide            C(=O)N           NC=O             NC=O
6           Urea         C(=O)(N)N        NC(N)=O          NC(N)=O
7         Phenol       c1ccc(cc1)O      Oc1ccccc1        Oc1ccccc1

Tautomerize Smiles

examples/chem_utils/tautomerize_smiles.py

"""Example for Tautomerizing SMILES strings"""
import pandas as pd
from workbench.utils.chem_utils import tautomerize_smiles

# Each record is (compound id, input SMILES, expected SMILES after tautomerization)
test_data = [
    # Hydroxybenzaldehydes: the expected SMILES is the same structure as the
    # input, written in canonical atom order.
    # NOTE(review): these SMILES appear to encode the meta- and para-hydroxy
    # isomers rather than the ortho isomer the ids name -- confirm the inputs.
    ("Salicylaldehyde (Keto)", "O=Cc1cccc(O)c1", "O=Cc1cccc(O)c1"),
    ("2-Hydroxybenzaldehyde (Enol)", "Oc1ccc(C=O)cc1", "O=Cc1ccc(O)cc1"),
    # Acetylacetone: the expected SMILES keeps both carbonyls (diketo form).
    ("Acetylacetone", "CC(=O)CC(=O)C", "CC(=O)CC(C)=O"),
    # Imidazole: the expected SMILES is the same ring written in canonical order.
    ("Imidazole", "c1cnc[nH]1", "c1c[nH]cn1"),
    # Pyridone: Kekule input, expected in aromatic notation with the carbonyl kept.
    # NOTE(review): this SMILES closes a seven-atom ring, whereas pyridone is
    # six-membered -- confirm the intended structure.
    ("Pyridone", "C1=CC=NC(=O)C=C1", "O=c1cccccn1"),
    # Guanidine: the only record whose expected SMILES is a different tautomer
    # from its plain canonical form (N=CN=C(N)N).
    ("Guanidine", "C(=N)N=C(N)N", "N=C(N)N=CN"),
    # Catechol, Formamide, Urea, Phenol: the expected SMILES is simply the
    # canonical form of the input; no tautomer change is expected.
    ("Catechol", "c1cc(c(cc1)O)O", "Oc1ccccc1O"),
    ("Formamide", "C(=O)N", "NC=O"),
    ("Urea", "C(=O)(N)N", "NC(N)=O"),
    ("Phenol", "c1ccc(cc1)O", "Oc1ccccc1"),
]

# Load the records into a DataFrame with the three columns in this order
df = pd.DataFrame(test_data, columns=["id", "smiles", "expected"])

# Tautomerize the SMILES and print the resulting DataFrame so it can be
# compared against the "expected" column by eye
result_df = tautomerize_smiles(df)
print(result_df)

Output

                             id       smiles_orig        expected smiles_canonical          smiles
0        Salicylaldehyde (Keto)    O=Cc1cccc(O)c1  O=Cc1cccc(O)c1   O=Cc1cccc(O)c1  O=Cc1cccc(O)c1
1  2-Hydroxybenzaldehyde (Enol)    Oc1ccc(C=O)cc1  O=Cc1ccc(O)cc1   O=Cc1ccc(O)cc1  O=Cc1ccc(O)cc1
2                 Acetylacetone     CC(=O)CC(=O)C   CC(=O)CC(C)=O    CC(=O)CC(C)=O   CC(=O)CC(C)=O
3                     Imidazole        c1cnc[nH]1      c1c[nH]cn1       c1c[nH]cn1      c1c[nH]cn1
4                      Pyridone  C1=CC=NC(=O)C=C1     O=c1cccccn1      O=c1cccccn1     O=c1cccccn1
5                     Guanidine      C(=N)N=C(N)N      N=C(N)N=CN       N=CN=C(N)N      N=C(N)N=CN
6                      Catechol    c1cc(c(cc1)O)O      Oc1ccccc1O       Oc1ccccc1O      Oc1ccccc1O
7                     Formamide            C(=O)N            NC=O             NC=O            NC=O
8                          Urea         C(=O)(N)N         NC(N)=O          NC(N)=O         NC(N)=O
9                        Phenol       c1ccc(cc1)O       Oc1ccccc1        Oc1ccccc1       Oc1ccccc1

Additional Resources

Workbench API Classes: API Classes
Consulting Available: SuperCowPowers LLC

Chemical Utilities

log = logging.getLogger('workbench') module-attribute

add_compound_tags(df, mol_column='molecule')

add_salt_features(df)

canonicalize(df, remove_mol_col=True)

compute_molecular_descriptors(df, tautomerize=True)

compute_morgan_fingerprints(df, radius=2, n_bits=2048, counts=True)

compute_stereochemistry_descriptors(df)

contains_heavy_metals(mol)

contains_metalloenzyme_relevant_metals(mol)

contains_salts(mol)

custom_tautomer_canonicalization(mol)

df_to_sdf_file(df, output_file, smiles_col='smiles', id_col=None, include_cols=None, skip_invalid=True, generate_3d=True)

extract_advanced_salt_features(mol)

feature_resolution_issues(df, features, show_cols=None)

fingerprints_to_matrix(fingerprints, dtype=np.uint8)

geometric_mean(series)

halogen_toxicity_score(mol)

img_from_smiles(smiles, width=500, height=500, background='rgba(64, 64, 64, 1)')

is_druglike_compound(mol)

log_to_category(log_series)

log_to_micromolar(log_series)

micromolar_to_log(series_μM)

project_fingerprints(df, projection='UMAP')

remove_disconnected_fragments(mol)

rollup_experimental_data(df, id, time, target, use_gmean=False)

sdf_file_to_df(sdf_file, include_smiles=True, smiles_col='smiles', id_col=None, include_props=None, exclude_props=None)

show(smiles, width=500, height=500)

standard_tautomer_canonicalization(mol)

svg_from_smiles(smiles, width=500, height=500, background='rgba(64, 64, 64, 1)')

tautomerize_smiles(df)

toxic_elements(mol)

toxic_groups(mol)

Examples

Canonical Smiles

Tautomerize Smiles

Additional Resources

`log = logging.getLogger('workbench')` `module-attribute`

`add_compound_tags(df, mol_column='molecule')`

`add_salt_features(df)`

`canonicalize(df, remove_mol_col=True)`

`compute_molecular_descriptors(df, tautomerize=True)`

`compute_morgan_fingerprints(df, radius=2, n_bits=2048, counts=True)`

`compute_stereochemistry_descriptors(df)`

`contains_heavy_metals(mol)`

`contains_metalloenzyme_relevant_metals(mol)`

`contains_salts(mol)`

`custom_tautomer_canonicalization(mol)`

`df_to_sdf_file(df, output_file, smiles_col='smiles', id_col=None, include_cols=None, skip_invalid=True, generate_3d=True)`

`extract_advanced_salt_features(mol)`

`feature_resolution_issues(df, features, show_cols=None)`

`fingerprints_to_matrix(fingerprints, dtype=np.uint8)`

`geometric_mean(series)`

`halogen_toxicity_score(mol)`

`img_from_smiles(smiles, width=500, height=500, background='rgba(64, 64, 64, 1)')`

`is_druglike_compound(mol)`

`log_to_category(log_series)`

`log_to_micromolar(log_series)`

`micromolar_to_log(series_μM)`

`project_fingerprints(df, projection='UMAP')`

`remove_disconnected_fragments(mol)`

`rollup_experimental_data(df, id, time, target, use_gmean=False)`

`sdf_file_to_df(sdf_file, include_smiles=True, smiles_col='smiles', id_col=None, include_props=None, exclude_props=None)`

`show(smiles, width=500, height=500)`

`standard_tautomer_canonicalization(mol)`

`svg_from_smiles(smiles, width=500, height=500, background='rgba(64, 64, 64, 1)')`

`tautomerize_smiles(df)`

`toxic_elements(mol)`

`toxic_groups(mol)`