Skip to content

Pandas Dataframe Algorithms

Pandas Dataframes

Pandas dataframes are obviously not going to scale as well as our Spark and SQL Algorithms, but for 'moderate' sized data these algorithms provide some nice functionality.

Pandas Dataframe Algorithms

Workbench has a growing set of algorithms and data processing tools for Pandas Dataframes. In general, these algorithms take a dataframe as input and give you back a dataframe with additional columns.

FeatureSpaceProximity

Bases: Proximity

Proximity computations for numeric feature spaces using Euclidean distance.

Implements the Proximity ABC contract
  • neighbors(id_or_ids) id-based lookups
  • neighbors_from_query_df novel-input lookups (query_df must contain the same feature columns this model was built with)

The distance column in results is standardized Euclidean distance (raw sklearn NearestNeighbors output). For visualization, call project_2d() explicitly.

Source code in src/workbench/algorithms/dataframe/feature_space_proximity.py
class FeatureSpaceProximity(Proximity):
    """Proximity computations for numeric feature spaces using Euclidean distance.

    Implements the Proximity ABC contract:
        - `neighbors(id_or_ids)`     id-based lookups
        - `neighbors_from_query_df`  novel-input lookups (query_df must contain the
                                     same feature columns this model was built with)

    The `distance` column in results is standardized Euclidean distance (raw sklearn
    NearestNeighbors output). For visualization, call `project_2d()` explicitly.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        id_column: str,
        features: List[str],
        target: Optional[str] = None,
        include_all_columns: bool = False,
    ):
        """Create a feature-space proximity model over ``df``.

        Args:
            df: DataFrame containing data for neighbor computations.
            id_column: Name of the column used as the identifier.
            features: Feature column names to use for neighbor computations.
            target: Name of the target column. Defaults to None.
            include_all_columns: Include all DataFrame columns in neighbor results.
                Defaults to False.
        """
        # Remember the list exactly as the caller gave it; `_prepare_data` reads it
        # to derive `self.features`. It is stored before the base initializer runs
        # (presumably the base class triggers `_prepare_data` — confirm in Proximity).
        self._raw_features = features

        base_kwargs = {
            "id_column": id_column,
            "features": features,
            "target": target,
            "include_all_columns": include_all_columns,
        }
        super().__init__(df, **base_kwargs)

    def _prepare_data(self) -> None:
        """Filter out non-numeric features and drop NaN rows."""
        self.features = self._validate_features(self.df, self._raw_features)
        self.df = self.df.dropna(subset=self.features).copy()

    def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
        """Remove non-numeric features and log warnings."""
        non_numeric = [f for f in features if f not in df.select_dtypes(include=["number"]).columns]
        if non_numeric:
            log.warning(f"Non-numeric features {non_numeric} aren't currently supported, excluding them")
        return [f for f in features if f not in non_numeric]

    def _build_model(self) -> None:
        """Fit a StandardScaler on the feature columns, then a NearestNeighbors index on the scaled values."""
        self.scaler = StandardScaler()
        feature_frame = self.df[self.features]
        scaled_features = self.scaler.fit_transform(feature_frame)

        # Default NearestNeighbors settings; queries are expected to go through
        # `_transform_features` so they are scaled the same way.
        neighbor_index = NearestNeighbors()
        self.nn = neighbor_index.fit(scaled_features)

    def _transform_features(self, df: pd.DataFrame) -> np.ndarray:
        """Scale the feature columns of ``df`` with the already-fitted scaler.

        ``df`` must provide every column listed in ``self.features``; this also
        applies to novel-input queries coming from `neighbors_from_query_df`.
        """
        feature_frame = df[self.features]
        return self.scaler.transform(feature_frame)

    def project_2d(self) -> pd.DataFrame:
        """Project the numeric features to 2D for visualization (UMAP).

        Returns the reference DataFrame with 'x' / 'y' columns added.
        """
        if len(self.features) >= 2:
            self.df = Projection2D().fit_transform(self.df, features=self.features)
        return self.df

__init__(df, id_column, features, target=None, include_all_columns=False)

Initialize the FeatureSpaceProximity class.

Parameters:

Name Type Description Default
df DataFrame

DataFrame containing data for neighbor computations.

required
id_column str

Name of the column used as the identifier.

required
features List[str]

List of feature column names to be used for neighbor computations.

required
target Optional[str]

Name of the target column. Defaults to None.

None
include_all_columns bool

Include all DataFrame columns in neighbor results. Defaults to False.

False
Source code in src/workbench/algorithms/dataframe/feature_space_proximity.py
def __init__(
    self,
    df: pd.DataFrame,
    id_column: str,
    features: List[str],
    target: Optional[str] = None,
    include_all_columns: bool = False,
):
    """Create a feature-space proximity model over ``df``.

    Args:
        df: DataFrame containing data for neighbor computations.
        id_column: Name of the column used as the identifier.
        features: Feature column names to use for neighbor computations.
        target: Name of the target column. Defaults to None.
        include_all_columns: Include all DataFrame columns in neighbor results.
            Defaults to False.
    """
    # Keep the caller's list untouched; it is stored before the base
    # initializer runs.
    self._raw_features = features

    base_kwargs = {
        "id_column": id_column,
        "features": features,
        "target": target,
        "include_all_columns": include_all_columns,
    }
    super().__init__(df, **base_kwargs)

project_2d()

Project the numeric features to 2D for visualization (UMAP).

Returns the reference DataFrame with 'x' / 'y' columns added.

Source code in src/workbench/algorithms/dataframe/feature_space_proximity.py
def project_2d(self) -> pd.DataFrame:
    """Project the numeric features to 2D for visualization.

    With fewer than two features no projection is run and the reference
    DataFrame is returned as-is. Otherwise the reference DataFrame is replaced
    by the output of `Projection2D().fit_transform` (which adds 'x' / 'y'
    columns) and that frame is returned.
    """
    if len(self.features) < 2:
        return self.df

    projector = Projection2D()
    self.df = projector.fit_transform(self.df, features=self.features)
    return self.df

FingerprintProximity

Bases: Proximity

Proximity computations using Tanimoto similarity on molecular fingerprints.

Implements the Proximity ABC contract
  • neighbors(id_or_ids) id-based lookups
  • neighbors_from_query_df novel-input lookups (query_df needs a 'smiles' or 'fingerprint' column)

Supports both binary and count fingerprints (auto-detected):
  • Binary: uses Jaccard distance (equivalent to 1 - Tanimoto for binary vectors)
  • Count: uses Ruzicka distance (weighted Tanimoto for count vectors), computed on-the-fly via sparse operations — supports novel queries and scales to large N.

Result DataFrames include a similarity = 1 - distance column as a FingerprintProximity-specific extra (in addition to the canonical distance).

Source code in src/workbench/algorithms/dataframe/fingerprint_proximity.py
class FingerprintProximity(Proximity):
    """Proximity computations using Tanimoto similarity on molecular fingerprints.

    Implements the Proximity ABC contract:
        - `neighbors(id_or_ids)`     id-based lookups
        - `neighbors_from_query_df`  novel-input lookups (query_df needs a 'smiles'
                                     or 'fingerprint' column)

    Supports both binary and count fingerprints (auto-detected):
        - Binary: uses Jaccard distance (equivalent to 1 - Tanimoto for binary vectors)
        - Count: uses Ruzicka distance (weighted Tanimoto for count vectors), computed
          on-the-fly via sparse operations — supports novel queries and scales to large N.

    Result DataFrames include a `similarity = 1 - distance` column as a
    FingerprintProximity-specific extra (in addition to the canonical `distance`).
    """

    def __init__(
        self,
        df: pd.DataFrame,
        id_column: str,
        fingerprint_column: Optional[str] = None,
        target: Optional[str] = None,
        include_all_columns: bool = False,
        radius: int = 2,
        n_bits: int = 2048,
    ) -> None:
        """Set up Tanimoto-similarity proximity over molecular fingerprints.

        Args:
            df: DataFrame containing fingerprints or SMILES.
            id_column: Name of the column used as an identifier.
            fingerprint_column: Name of the column holding the fingerprints.
                When None, an existing "fingerprint" column is used, or one is
                computed from SMILES.
            target: Name of the target column. Defaults to None.
            include_all_columns: Include all DataFrame columns in neighbor results.
                Defaults to False.
            radius: Radius for Morgan fingerprint computation (default: 2).
            n_bits: Number of bits for fingerprint (default: 2048).

        Raises:
            ValueError: If the named fingerprint column is missing, or no
                fingerprint column is named and the DataFrame has neither a
                "fingerprint" nor a SMILES column.
        """
        # Morgan settings are kept for any later fingerprint computation.
        self._fp_radius, self._fp_n_bits = radius, n_bits

        # Resolve (and validate) the column name before handing off to the base class.
        resolved_column = self._resolve_fingerprint_column_name(df, fingerprint_column)
        self.fingerprint_column = resolved_column

        base_kwargs = {
            "id_column": id_column,
            "features": [self.fingerprint_column],
            "target": target,
            "include_all_columns": include_all_columns,
        }
        super().__init__(df, **base_kwargs)

    @staticmethod
    def _resolve_fingerprint_column_name(df: pd.DataFrame, fingerprint_column: Optional[str]) -> str:
        """Determine the fingerprint column name, validating it exists or can be computed."""
        if fingerprint_column is not None:
            if fingerprint_column not in df.columns:
                raise ValueError(f"Fingerprint column '{fingerprint_column}' not found in DataFrame")
            return fingerprint_column

        if "fingerprint" in df.columns:
            log.info("Using existing 'fingerprint' column")
            return "fingerprint"

        # Will need to compute from SMILES - validate SMILES column exists
        smiles_column = next((col for col in df.columns if col.lower() == "smiles"), None)
        if smiles_column is None:
            raise ValueError(
                "No fingerprint column provided and no SMILES column found. "
                "Either provide a fingerprint_column or include a 'smiles' column in the DataFrame."
            )

        return "fingerprint"

    def _prepare_data(self) -> None:
        """Compute fingerprints from SMILES if needed."""
        if self.fingerprint_column not in self.df.columns:
            log.info(f"Computing Morgan fingerprints (radius={self._fp_radius}, n_bits={self._fp_n_bits})...")
            self.df = compute_morgan_fingerprints(self.df, radius=self._fp_radius, n_bits=self._fp_n_bits)

    def _build_model(self) -> None:
        """Fit the nearest-neighbor backend that matches the fingerprint type.

        Binary fingerprints: a sklearn NearestNeighbors (ball_tree) with the
            Jaccard metric, fitted on the fingerprint matrix kept in `self.X`.
        Count fingerprints: a sparse CSR float32 copy of the matrix plus its row
            sums are handed to `_SparseRuzickaNN`; `self.X` is set to None.

        In both cases an id → row-index map is cached in `self._id_to_row`.
        """
        matrix, self._is_count_fp = self._fingerprints_to_matrix(self.df)

        if not self._is_count_fp:
            log.info("Building NearestNeighbors model (Jaccard/Tanimoto for binary fingerprints)...")
            self.X = matrix
            self.nn = NearestNeighbors(metric="jaccard", algorithm="ball_tree").fit(self.X)
        else:
            log.info("Building NearestNeighbors model (sparse on-the-fly Ruzicka for count fingerprints)...")
            self._X_sparse = csr_matrix(matrix.astype(np.float32))
            per_row_totals = self._X_sparse.sum(axis=1)
            self._row_sums = np.asarray(per_row_totals).ravel().astype(np.float32)
            self.nn = _SparseRuzickaNN(self._X_sparse, self._row_sums)
            self.X = None  # dense matrix is not kept for count fingerprints

        # id → row position in the reference set, so `_transform_features` can
        # answer id-based queries by slicing rows instead of re-parsing
        # fingerprint strings (this keeps working after the artifact is slimmed
        # by UQModelV1._slim_proximity).
        reference_ids = self.df[self.id_column].values
        self._id_to_row = {row_id: position for position, row_id in enumerate(reference_ids)}

    def _transform_features(self, df: pd.DataFrame) -> Union[np.ndarray, csr_matrix]:
        """Transform features for querying the NN model.

        Three paths, in order of cost:
            1. Identity fast path: when df is the reference DataFrame itself,
               return the cached matrix directly.
            2. ID-based row lookup: when all IDs in `df[id_column]` are known in
               the reference set, slice rows from `_X_sparse` (or `self.X`) directly.
               No fingerprint parsing, no Morgan recomputation. This path works
               even after the artifact is slimmed (fingerprint column dropped).
            3. Novel-query path: parse fingerprints from `df`, computing Morgan
               from SMILES if needed.

        For count fingerprints the matrix is sparse CSR; for binary, dense.
        """
        # Path 1: reference DataFrame itself
        if df is self.df:
            return self._X_sparse if self._is_count_fp else self.X

        # Path 2: id-based row lookup. Cheap and works post-slim.
        if self.id_column in df.columns:
            ids = df[self.id_column].values
            id_to_row = getattr(self, "_id_to_row", None)
            if id_to_row is not None:
                try:
                    indices = np.fromiter((id_to_row[i] for i in ids), dtype=np.int64, count=len(ids))
                except KeyError:
                    indices = None
                if indices is not None:
                    if self._is_count_fp:
                        return self._X_sparse[indices]
                    return self.X[indices]

        # Path 3: novel-query path. Need fingerprints or SMILES.
        if self.fingerprint_column not in df.columns:
            if "smiles" not in df.columns and "SMILES" not in df.columns:
                raise ValueError(
                    f"Query DataFrame must contain either '{self.fingerprint_column}' " "or a 'smiles' column"
                )
            df = compute_morgan_fingerprints(df, radius=self._fp_radius, n_bits=self._fp_n_bits)

        matrix, _ = self._fingerprints_to_matrix(df)
        if self._is_count_fp:
            return csr_matrix(matrix.astype(np.float32))
        return matrix

    def _fingerprints_to_matrix(self, df: pd.DataFrame) -> tuple[np.ndarray, bool]:
        """Convert fingerprint strings to a numpy matrix.

        Supports two formats (auto-detected):
            - Bitstrings: "10110010..." → binary matrix (bool), is_count=False
            - Count vectors: "0,3,0,1,5,..." → count matrix (uint8), is_count=True
        """
        sample = str(df[self.fingerprint_column].iloc[0])
        if "," in sample:
            fingerprint_values = df[self.fingerprint_column].apply(
                lambda fp: np.array([int(x) for x in fp.split(",")], dtype=np.uint8)
            )
            return np.vstack(fingerprint_values), True
        else:
            fingerprint_bits = df[self.fingerprint_column].apply(
                lambda fp: np.array([int(bit) for bit in fp], dtype=np.bool_)
            )
            return np.vstack(fingerprint_bits), False

    def neighbors(
        self,
        id_or_ids,
        n_neighbors: Optional[int] = 5,
        min_similarity: Optional[float] = None,
        include_self: bool = True,
    ) -> pd.DataFrame:
        """Look up neighbors for ID(s) that are already part of the reference dataset.

        Args:
            id_or_ids: Single ID or list of IDs to look up
            n_neighbors: Number of neighbors to return (default: 5, ignored if min_similarity is set)
            min_similarity: If provided, find all neighbors with Tanimoto similarity >= this value (0-1)
            include_self: Whether to include self in results (default: True)

        Returns:
            DataFrame with columns: id_column, neighbor_id, similarity, [target], [in_model],
            and any other passthrough columns.
        """
        # The base class searches by distance, so a similarity floor becomes a
        # distance ceiling (distance = 1 - similarity).
        if min_similarity is None:
            radius = None
        else:
            radius = 1 - min_similarity

        neighbor_df = super().neighbors(
            id_or_ids=id_or_ids,
            n_neighbors=n_neighbors,
            radius=radius,
            include_self=include_self,
        )
        return self._add_similarity_column(neighbor_df)

    def neighbors_from_query_df(
        self,
        query_df: pd.DataFrame,
        n_neighbors: int = 5,
        min_similarity: Optional[float] = None,
    ) -> pd.DataFrame:
        """Return neighbors for novel queries not in the reference dataset.

        Args:
            query_df: DataFrame with either a 'smiles' or 'fingerprint' column. If a
                'query_id' column is present it's used to label results; otherwise
                positional indices are used.
            n_neighbors: Number of neighbors to return (default: 5, ignored if min_similarity is set)
            min_similarity: If provided, find all neighbors with Tanimoto similarity >= this value (0-1)

        Returns:
            DataFrame with columns: query_id, neighbor_id, similarity, [target], [in_model].
            Queries whose SMILES couldn't be parsed by RDKit are dropped with a
            warning — their rows simply don't appear in the result. Upstream
            consumers (residual_features._aggregate) reindex against the full
            input id list so missing queries surface as NaN rows there.
        """
        # Pre-validate SMILES. `compute_morgan_fingerprints` (called downstream
        # by _transform_features for novel queries) silently drops rows with
        # invalid SMILES, which then causes an array-length mismatch in
        # _neighbors_impl's result assembly. Drop bad rows here so the feature
        # matrix and query_ids stay aligned.
        smiles_col = next((c for c in ("smiles", "SMILES") if c in query_df.columns), None)
        if smiles_col is not None and self.fingerprint_column not in query_df.columns:
            from rdkit import Chem

            valid = query_df[smiles_col].apply(lambda s: Chem.MolFromSmiles(s) is not None)
            if not valid.all():
                n_bad = int((~valid).sum())
                bad_sample = query_df.loc[~valid, smiles_col].head(3).tolist()
                log.warning(
                    f"FingerprintProximity.neighbors_from_query_df: dropping {n_bad} "
                    f"row(s) with SMILES that RDKit can't parse "
                    f"(sample: {bad_sample}{'...' if n_bad > 3 else ''}). "
                    "These rows will be absent from the result."
                )
                query_df = query_df[valid].reset_index(drop=True)
                if len(query_df) == 0:
                    # All queries invalid — return an empty result rather than crash in the NN backend.
                    return pd.DataFrame(columns=["query_id", "neighbor_id", "similarity"])

        radius = 1 - min_similarity if min_similarity is not None else None
        result = super().neighbors_from_query_df(
            query_df=query_df,
            n_neighbors=n_neighbors,
            radius=radius,
        )
        return self._add_similarity_column(result)

    @staticmethod
    def _add_similarity_column(result_df: pd.DataFrame) -> pd.DataFrame:
        """Append `similarity = 1 - distance` and drop the raw distance column."""
        result_df["similarity"] = 1 - result_df["distance"]
        result_df.drop(columns=["distance"], inplace=True)
        # Re-sort: similarity descending (was ascending by distance).
        # Use the leading id column (first column) and similarity.
        id_col = result_df.columns[0]
        return result_df.sort_values([id_col, "similarity"], ascending=[True, False]).reset_index(drop=True)

    def project_2d(self) -> pd.DataFrame:
        """Compute a 2D UMAP layout of the reference fingerprints for visualization.

        Binary fingerprints: the fingerprint matrix is projected directly with the
        Jaccard metric.
        Count fingerprints: the full N×N Ruzicka distance matrix is materialized on
        demand and passed through UMAP's precomputed-metric path. Memory cost is
        O(N²) — transient.

        Returns the reference DataFrame with 'x' / 'y' columns added.

        Note: Projection2D is imported lazily so the module loads in script bundles
        that don't have UMAP / workbench's projection helper installed.
        """
        from workbench.algorithms.dataframe.projection_2d import Projection2D

        if not self._is_count_fp:
            self.df = Projection2D().fit_transform(self.df, feature_matrix=self.X, metric="jaccard")
            return self.df

        distances = self.nn.pairwise_ruzicka_matrix()

        # A small symmetric jitter (zero on the diagonal, fixed seed) breaks tied
        # eigenvalues in UMAP's spectral init.
        generator = np.random.default_rng(seed=0)
        size = distances.shape[0]
        jitter = generator.uniform(0.0, 1e-4, size=(size, size)).astype(np.float32)
        jitter = (jitter + jitter.T) / 2.0
        np.fill_diagonal(jitter, 0.0)

        perturbed = np.clip(distances + jitter, 0.0, 1.0)
        self.df = Projection2D().fit_transform(self.df, feature_matrix=perturbed, metric="precomputed")
        return self.df

__init__(df, id_column, fingerprint_column=None, target=None, include_all_columns=False, radius=2, n_bits=2048)

Initialize FingerprintProximity for Tanimoto similarity on molecular fingerprints.

Parameters:

Name Type Description Default
df DataFrame

DataFrame containing fingerprints or SMILES.

required
id_column str

Name of the column used as an identifier.

required
fingerprint_column Optional[str]

Name of the column containing fingerprints (bit strings). If None, looks for existing "fingerprint" column or computes from SMILES.

None
target Optional[str]

Name of the target column. Defaults to None.

None
include_all_columns bool

Include all DataFrame columns in neighbor results. Defaults to False.

False
radius int

Radius for Morgan fingerprint computation (default: 2).

2
n_bits int

Number of bits for fingerprint (default: 2048).

2048
Source code in src/workbench/algorithms/dataframe/fingerprint_proximity.py
def __init__(
    self,
    df: pd.DataFrame,
    id_column: str,
    fingerprint_column: Optional[str] = None,
    target: Optional[str] = None,
    include_all_columns: bool = False,
    radius: int = 2,
    n_bits: int = 2048,
) -> None:
    """Set up Tanimoto-similarity proximity over molecular fingerprints.

    Args:
        df: DataFrame containing fingerprints or SMILES.
        id_column: Name of the column used as an identifier.
        fingerprint_column: Name of the column holding the fingerprints.
            When None, an existing "fingerprint" column is used, or one is
            computed from SMILES.
        target: Name of the target column. Defaults to None.
        include_all_columns: Include all DataFrame columns in neighbor results.
            Defaults to False.
        radius: Radius for Morgan fingerprint computation (default: 2).
        n_bits: Number of bits for fingerprint (default: 2048).
    """
    # Morgan settings are kept for any later fingerprint computation.
    self._fp_radius, self._fp_n_bits = radius, n_bits

    # Resolve the column name before handing off to the base class.
    resolved_column = self._resolve_fingerprint_column_name(df, fingerprint_column)
    self.fingerprint_column = resolved_column

    base_kwargs = {
        "id_column": id_column,
        "features": [self.fingerprint_column],
        "target": target,
        "include_all_columns": include_all_columns,
    }
    super().__init__(df, **base_kwargs)

neighbors(id_or_ids, n_neighbors=5, min_similarity=None, include_self=True)

Return neighbors for ID(s) already in the reference dataset.

Parameters:

Name Type Description Default
id_or_ids

Single ID or list of IDs to look up

required
n_neighbors Optional[int]

Number of neighbors to return (default: 5, ignored if min_similarity is set)

5
min_similarity Optional[float]

If provided, find all neighbors with Tanimoto similarity >= this value (0-1)

None
include_self bool

Whether to include self in results (default: True)

True

Returns:

Type Description
DataFrame

DataFrame with columns: id_column, neighbor_id, similarity, [target], [in_model], and any other passthrough columns.

Source code in src/workbench/algorithms/dataframe/fingerprint_proximity.py
def neighbors(
    self,
    id_or_ids,
    n_neighbors: Optional[int] = 5,
    min_similarity: Optional[float] = None,
    include_self: bool = True,
) -> pd.DataFrame:
    """Look up neighbors for ID(s) that are already part of the reference dataset.

    Args:
        id_or_ids: Single ID or list of IDs to look up
        n_neighbors: Number of neighbors to return (default: 5, ignored if min_similarity is set)
        min_similarity: If provided, find all neighbors with Tanimoto similarity >= this value (0-1)
        include_self: Whether to include self in results (default: True)

    Returns:
        DataFrame with columns: id_column, neighbor_id, similarity, [target], [in_model],
        and any other passthrough columns.
    """
    # The base class searches by distance, so a similarity floor becomes a
    # distance ceiling (distance = 1 - similarity).
    if min_similarity is None:
        radius = None
    else:
        radius = 1 - min_similarity

    neighbor_df = super().neighbors(
        id_or_ids=id_or_ids,
        n_neighbors=n_neighbors,
        radius=radius,
        include_self=include_self,
    )
    return self._add_similarity_column(neighbor_df)

neighbors_from_query_df(query_df, n_neighbors=5, min_similarity=None)

Return neighbors for novel queries not in the reference dataset.

Parameters:

Name Type Description Default
query_df DataFrame

DataFrame with either a 'smiles' or 'fingerprint' column. If a 'query_id' column is present it's used to label results; otherwise positional indices are used.

required
n_neighbors int

Number of neighbors to return (default: 5, ignored if min_similarity is set)

5
min_similarity Optional[float]

If provided, find all neighbors with Tanimoto similarity >= this value (0-1)

None

Returns:

Type Description
DataFrame

DataFrame with columns: query_id, neighbor_id, similarity, [target], [in_model]. Queries whose SMILES couldn't be parsed by RDKit are dropped with a warning — their rows simply don't appear in the result. Upstream consumers (residual_features._aggregate) reindex against the full input id list so missing queries surface as NaN rows there.

Source code in src/workbench/algorithms/dataframe/fingerprint_proximity.py
def neighbors_from_query_df(
    self,
    query_df: pd.DataFrame,
    n_neighbors: int = 5,
    min_similarity: Optional[float] = None,
) -> pd.DataFrame:
    """Return neighbors for novel queries not in the reference dataset.

    Args:
        query_df: DataFrame with either a 'smiles' or 'fingerprint' column. If a
            'query_id' column is present it's used to label results; otherwise
            positional indices are used.
        n_neighbors: Number of neighbors to return (default: 5, ignored if min_similarity is set)
        min_similarity: If provided, find all neighbors with Tanimoto similarity >= this value (0-1)

    Returns:
        DataFrame with columns: query_id, neighbor_id, similarity, [target], [in_model].
        An empty ``query_df`` yields an empty result. Queries whose SMILES is
        missing / not a string, or can't be parsed by RDKit, are dropped with a
        warning — their rows simply don't appear in the result. Upstream
        consumers (residual_features._aggregate) reindex against the full
        input id list so missing queries surface as NaN rows there.
    """
    # Nothing to look up: return the same empty frame as the "all queries
    # invalid" case below instead of handing a zero-row query to the NN backend.
    if len(query_df) == 0:
        return pd.DataFrame(columns=["query_id", "neighbor_id", "similarity"])

    # Pre-validate SMILES. `compute_morgan_fingerprints` (called downstream
    # by _transform_features for novel queries) silently drops rows with
    # invalid SMILES, which then causes an array-length mismatch in
    # _neighbors_impl's result assembly. Drop bad rows here so the feature
    # matrix and query_ids stay aligned.
    smiles_col = next((c for c in ("smiles", "SMILES") if c in query_df.columns), None)
    if smiles_col is not None and self.fingerprint_column not in query_df.columns:
        from rdkit import Chem

        def _is_parseable(smiles) -> bool:
            # Missing values (None / NaN) are not strings: reject them up front
            # rather than passing a non-string to RDKit.
            return isinstance(smiles, str) and Chem.MolFromSmiles(smiles) is not None

        valid = query_df[smiles_col].apply(_is_parseable)
        if not valid.all():
            n_bad = int((~valid).sum())
            bad_sample = query_df.loc[~valid, smiles_col].head(3).tolist()
            log.warning(
                f"FingerprintProximity.neighbors_from_query_df: dropping {n_bad} "
                f"row(s) with SMILES that RDKit can't parse "
                f"(sample: {bad_sample}{'...' if n_bad > 3 else ''}). "
                "These rows will be absent from the result."
            )
            query_df = query_df[valid].reset_index(drop=True)
            if len(query_df) == 0:
                # All queries invalid — return an empty result rather than crash in the NN backend.
                return pd.DataFrame(columns=["query_id", "neighbor_id", "similarity"])

    # The base class works in distance space: similarity floor → distance ceiling.
    radius = 1 - min_similarity if min_similarity is not None else None
    result = super().neighbors_from_query_df(
        query_df=query_df,
        n_neighbors=n_neighbors,
        radius=radius,
    )
    return self._add_similarity_column(result)

project_2d()

Project the fingerprint matrix to 2D for visualization using UMAP.

For count fingerprints: lazily materializes the full N×N Ruzicka distance matrix for UMAP's precomputed-metric path. Memory cost is O(N²) — transient. For binary fingerprints: uses Jaccard distance directly on the fingerprint matrix.

Returns the reference DataFrame with 'x' / 'y' columns added.

Note: Projection2D is imported lazily so the module loads in script bundles that don't have UMAP / workbench's projection helper installed.

Source code in src/workbench/algorithms/dataframe/fingerprint_proximity.py
def project_2d(self) -> pd.DataFrame:
    """Compute a 2D UMAP layout of the reference fingerprints for visualization.

    Binary fingerprints: the fingerprint matrix is projected directly with the
    Jaccard metric.
    Count fingerprints: the full N×N Ruzicka distance matrix is materialized on
    demand and passed through UMAP's precomputed-metric path. Memory cost is
    O(N²) — transient.

    Returns the reference DataFrame with 'x' / 'y' columns added.

    Note: Projection2D is imported lazily so the module loads in script bundles
    that don't have UMAP / workbench's projection helper installed.
    """
    from workbench.algorithms.dataframe.projection_2d import Projection2D

    if not self._is_count_fp:
        self.df = Projection2D().fit_transform(self.df, feature_matrix=self.X, metric="jaccard")
        return self.df

    distances = self.nn.pairwise_ruzicka_matrix()

    # A small symmetric jitter (zero on the diagonal, fixed seed) breaks tied
    # eigenvalues in UMAP's spectral init.
    generator = np.random.default_rng(seed=0)
    size = distances.shape[0]
    jitter = generator.uniform(0.0, 1e-4, size=(size, size)).astype(np.float32)
    jitter = (jitter + jitter.T) / 2.0
    np.fill_diagonal(jitter, 0.0)

    perturbed = np.clip(distances + jitter, 0.0, 1.0)
    self.df = Projection2D().fit_transform(self.df, feature_matrix=perturbed, metric="precomputed")
    return self.df

Projection2D

Perform Dimensionality Reduction on a DataFrame using TSNE, MDS, PCA, or UMAP.

Source code in src/workbench/algorithms/dataframe/projection_2d.py
class Projection2D:
    """Perform Dimensionality Reduction on a DataFrame using TSNE, MDS, PCA, or UMAP."""

    def __init__(self):
        """Initialize the Projection2D class."""
        self.log = logging.getLogger("workbench")
        # Most recently selected projection model (assigned in fit_transform)
        self.projection_model = None

    def fit_transform(
        self,
        input_df: pd.DataFrame,
        features: list = None,
        feature_matrix: np.ndarray = None,
        metric: str = "euclidean",
        projection: str = "UMAP",
    ) -> pd.DataFrame:
        """Fit and transform a DataFrame using the selected dimensionality reduction method.

        This method creates a copy of the input DataFrame, processes the specified features
        for normalization and projection, and returns a new DataFrame with added 'x' and 'y' columns
        containing the projected 2D coordinates.

        If the inputs are unusable (feature_matrix length mismatch, fewer than two features,
        or an empty DataFrame) a critical message is logged and the copy is returned
        WITHOUT 'x' / 'y' columns.

        Args:
            input_df (pd.DataFrame): The DataFrame containing features to project.
            features (list, optional): List of feature column names. If None, numeric columns are auto-selected
                (columns whose label ends with "id" are skipped).
            feature_matrix (np.ndarray, optional): Pre-computed feature matrix. If provided, features is ignored
                and no scaling is applied (caller is responsible for appropriate preprocessing).
            metric (str, optional): Distance metric for UMAP (e.g., 'euclidean', 'jaccard'). Default 'euclidean'.
            projection (str, optional): The projection to use ('UMAP', 'TSNE', 'MDS' or 'PCA'). Default 'UMAP'.

        Returns:
            pd.DataFrame: A new DataFrame (a copy of input_df) with added 'x' and 'y' columns.
        """
        # Create a copy of the input DataFrame
        df = input_df.copy()

        # If a feature matrix is provided, use it directly (no scaling)
        if feature_matrix is not None:
            if len(feature_matrix) != len(df):
                self.log.critical("feature_matrix length must match DataFrame length.")
                return df
            X_processed = feature_matrix
        else:
            # Auto-identify numeric features if none are provided
            if features is None:
                # str() keeps this working for non-string column labels (e.g. integers),
                # which have no .endswith() method
                features = [
                    col for col in df.select_dtypes(include="number").columns if not str(col).endswith("id")
                ]
                self.log.info(f"Auto-identified numeric features: {features}")

            if len(features) < 2 or df.empty:
                self.log.critical("At least two numeric features are required, and DataFrame must not be empty.")
                return df

            # Process a copy of the feature data for projection:
            # missing values are filled with the column mean, then features are standardized
            X = df[features]
            X = X.apply(lambda col: col.fillna(col.mean()))
            X_processed = StandardScaler().fit_transform(X)

        # Select the projection method (using df for perplexity calculation)
        self.projection_model = self._get_projection_model(projection, df, metric=metric)

        # Apply the projection on the processed data
        projection_result = self.projection_model.fit_transform(X_processed)
        df[["x", "y"]] = projection_result

        # Resolve coincident points and return the new DataFrame
        return self.resolve_coincident_points(df)

    def _get_projection_model(self, projection: str, df: pd.DataFrame, metric: str = "euclidean"):
        """Select and return the appropriate projection model.

        Args:
            projection (str): The projection method ('TSNE', 'MDS', 'PCA', or 'UMAP').
            df (pd.DataFrame): The DataFrame being transformed (its length caps the TSNE
                perplexity and the UMAP n_neighbors).
            metric (str): Distance metric for UMAP (default 'euclidean'). Not passed to the
                other projection methods.

        Returns:
            A dimensionality reduction model instance. Falls back to TSNE (with a warning)
            when the method name is not recognized or UMAP is requested but not available.
        """
        if projection == "TSNE":
            perplexity = min(40, len(df) - 1)
            self.log.info(f"Projection: TSNE with perplexity {perplexity}")
            return TSNE(perplexity=perplexity)

        if projection == "MDS":
            self.log.info("Projection: MDS")
            return MDS(n_components=2, random_state=0)

        if projection == "PCA":
            self.log.info("Projection: PCA")
            return PCA(n_components=2)

        if projection == "UMAP" and UMAP_AVAILABLE:
            # n_neighbors is capped so it never exceeds the number of other rows
            n_neighbors = min(5, len(df) - 1)
            min_dist = 0.5
            if n_neighbors < 5:
                self.log.warning(
                    f"Dataset size ({len(df)}) smaller than default n_neighbors, using n_neighbors={n_neighbors}"
                )
            self.log.info(f"Projection: UMAP with metric={metric}, n_neighbors={n_neighbors}, min_dist={min_dist}")
            return umap.UMAP(n_components=2, metric=metric, n_neighbors=n_neighbors, min_dist=min_dist)

        self.log.warning(
            f"Projection method '{projection}' not recognized or UMAP not available. Falling back to TSNE."
        )
        return TSNE(perplexity=min(40, len(df) - 1))

    @staticmethod
    def resolve_coincident_points(df: pd.DataFrame) -> pd.DataFrame:
        """Resolve coincident points using random jitter

        Points whose 'x' and 'y' agree after rounding to 3 decimals are treated as
        coincident; every such point receives an independent uniform random offset of
        at most 2x the rounding precision on each axis. Rows are addressed by position,
        so DataFrames with a non-unique index are handled correctly.

        Args:
            df (pd.DataFrame): DataFrame with x and y coordinates. Modified in place.

        Returns:
            pd.DataFrame: DataFrame with resolved coincident points (the same object as df)
        """

        # Set jitter size based on rounding precision
        precision = 3
        jitter_amount = 10 ** (-precision) * 2  # 2x the rounding precision

        # Rounded coordinates, keyed by row position rather than index label
        rounded = pd.DataFrame(
            {
                "x_round": df["x"].round(precision).to_numpy(),
                "y_round": df["y"].round(precision).to_numpy(),
            }
        )

        # Flag every row that shares its rounded location with at least one other row
        coincident = rounded.duplicated(keep=False).to_numpy()
        if not coincident.any():
            return df

        # One independent (dx, dy) offset in [-jitter_amount, jitter_amount) per flagged row
        offsets = np.random.uniform(-jitter_amount, jitter_amount, size=(int(coincident.sum()), 2))

        for axis, column in enumerate(("x", "y")):
            values = df[column].to_numpy(copy=True)
            # Cast the offsets to the column dtype so the column keeps its original dtype
            values[coincident] += offsets[:, axis].astype(values.dtype)
            df[column] = values

        return df

__init__()

Initialize the Projection2D class.

Source code in src/workbench/algorithms/dataframe/projection_2d.py
def __init__(self):
    """Create a Projection2D with the "workbench" logger and no projection model."""
    # Nothing has been fitted yet, so there is no model to expose
    self.projection_model = None
    self.log = logging.getLogger("workbench")

fit_transform(input_df, features=None, feature_matrix=None, metric='euclidean', projection='UMAP')

Fit and transform a DataFrame using the selected dimensionality reduction method.

This method creates a copy of the input DataFrame, processes the specified features for normalization and projection, and returns a new DataFrame with added 'x' and 'y' columns containing the projected 2D coordinates.

Parameters:

Name Type Description Default
input_df DataFrame

The DataFrame containing features to project.

required
features list

List of feature column names. If None, numeric columns are auto-selected.

None
feature_matrix ndarray

Pre-computed feature matrix. If provided, features is ignored and no scaling is applied (caller is responsible for appropriate preprocessing).

None
metric str

Distance metric for UMAP (e.g., 'euclidean', 'jaccard'). Default 'euclidean'.

'euclidean'
projection str

The projection to use ('UMAP', 'TSNE', 'MDS' or 'PCA'). Default 'UMAP'.

'UMAP'

Returns:

Type Description
DataFrame

pd.DataFrame: A new DataFrame (a copy of input_df) with added 'x' and 'y' columns.

Source code in src/workbench/algorithms/dataframe/projection_2d.py
def fit_transform(
    self,
    input_df: pd.DataFrame,
    features: list = None,
    feature_matrix: np.ndarray = None,
    metric: str = "euclidean",
    projection: str = "UMAP",
) -> pd.DataFrame:
    """Fit and transform a DataFrame using the selected dimensionality reduction method.

    This method creates a copy of the input DataFrame, processes the specified features
    for normalization and projection, and returns a new DataFrame with added 'x' and 'y' columns
    containing the projected 2D coordinates.

    If the inputs are unusable (feature_matrix length mismatch, fewer than two features,
    or an empty DataFrame) a critical message is logged and the copy is returned
    WITHOUT 'x' / 'y' columns.

    Args:
        input_df (pd.DataFrame): The DataFrame containing features to project.
        features (list, optional): List of feature column names. If None, numeric columns are auto-selected
            (columns whose label ends with "id" are skipped).
        feature_matrix (np.ndarray, optional): Pre-computed feature matrix. If provided, features is ignored
            and no scaling is applied (caller is responsible for appropriate preprocessing).
        metric (str, optional): Distance metric for UMAP (e.g., 'euclidean', 'jaccard'). Default 'euclidean'.
        projection (str, optional): The projection to use ('UMAP', 'TSNE', 'MDS' or 'PCA'). Default 'UMAP'.

    Returns:
        pd.DataFrame: A new DataFrame (a copy of input_df) with added 'x' and 'y' columns.
    """
    # Create a copy of the input DataFrame
    df = input_df.copy()

    # If a feature matrix is provided, use it directly (no scaling)
    if feature_matrix is not None:
        if len(feature_matrix) != len(df):
            self.log.critical("feature_matrix length must match DataFrame length.")
            return df
        X_processed = feature_matrix
    else:
        # Auto-identify numeric features if none are provided
        if features is None:
            # str() keeps this working for non-string column labels (e.g. integers),
            # which have no .endswith() method
            features = [col for col in df.select_dtypes(include="number").columns if not str(col).endswith("id")]
            self.log.info(f"Auto-identified numeric features: {features}")

        if len(features) < 2 or df.empty:
            self.log.critical("At least two numeric features are required, and DataFrame must not be empty.")
            return df

        # Process a copy of the feature data for projection:
        # missing values are filled with the column mean, then features are standardized
        X = df[features]
        X = X.apply(lambda col: col.fillna(col.mean()))
        X_processed = StandardScaler().fit_transform(X)

    # Select the projection method (using df for perplexity calculation)
    self.projection_model = self._get_projection_model(projection, df, metric=metric)

    # Apply the projection on the processed data
    projection_result = self.projection_model.fit_transform(X_processed)
    df[["x", "y"]] = projection_result

    # Resolve coincident points and return the new DataFrame
    return self.resolve_coincident_points(df)

resolve_coincident_points(df) staticmethod

Resolve coincident points using random jitter

Parameters:

Name Type Description Default
df DataFrame

DataFrame with x and y coordinates.

required

Returns:

Type Description
DataFrame

pd.DataFrame: DataFrame with resolved coincident points

Source code in src/workbench/algorithms/dataframe/projection_2d.py
@staticmethod
def resolve_coincident_points(df: pd.DataFrame) -> pd.DataFrame:
    """Resolve coincident points using random jitter

    Points whose 'x' and 'y' agree after rounding to 3 decimals are treated as
    coincident; every such point receives an independent uniform random offset of
    at most 2x the rounding precision on each axis. Rows are addressed by position,
    so DataFrames with a non-unique index are handled correctly.

    Args:
        df (pd.DataFrame): DataFrame with x and y coordinates. Modified in place.

    Returns:
        pd.DataFrame: DataFrame with resolved coincident points (the same object as df)
    """

    # Set jitter size based on rounding precision
    precision = 3
    jitter_amount = 10 ** (-precision) * 2  # 2x the rounding precision

    # Rounded coordinates, keyed by row position rather than index label
    rounded = pd.DataFrame(
        {
            "x_round": df["x"].round(precision).to_numpy(),
            "y_round": df["y"].round(precision).to_numpy(),
        }
    )

    # Flag every row that shares its rounded location with at least one other row
    coincident = rounded.duplicated(keep=False).to_numpy()
    if not coincident.any():
        return df

    # One independent (dx, dy) offset in [-jitter_amount, jitter_amount) per flagged row
    offsets = np.random.uniform(-jitter_amount, jitter_amount, size=(int(coincident.sum()), 2))

    for axis, column in enumerate(("x", "y")):
        values = df[column].to_numpy(copy=True)
        # Cast the offsets to the column dtype so the column keeps its original dtype
        values[coincident] += offsets[:, axis].astype(values.dtype)
        df[column] = values

    return df

Questions?

The SuperCowPowers team is happy to answer any questions you may have about AWS and Workbench. Please contact us at workbench@supercowpowers.com or chat us up on Discord.