
Pandas Dataframe Algorithms

Pandas Dataframes

Pandas DataFrames won't scale as well as our Spark and SQL Algorithms, but for moderately sized data these algorithms provide some nice functionality.

Pandas Dataframe Algorithms

Workbench has a growing set of algorithms and data processing tools for Pandas DataFrames. In general, these algorithms take a DataFrame as input and give you back a DataFrame with additional columns.

FeatureSpaceProximity

Bases: Proximity

Proximity computations for numeric feature spaces using Euclidean distance.

Source code in src/workbench/algorithms/dataframe/feature_space_proximity.py
class FeatureSpaceProximity(Proximity):
    """Proximity computations for numeric feature spaces using Euclidean distance."""

    def __init__(
        self,
        df: pd.DataFrame,
        id_column: str,
        features: List[str],
        target: Optional[str] = None,
        include_all_columns: bool = False,
    ):
        """
        Initialize the FeatureSpaceProximity class.

        Args:
            df: DataFrame containing data for neighbor computations.
            id_column: Name of the column used as the identifier.
            features: List of feature column names to be used for neighbor computations.
            target: Name of the target column. Defaults to None.
            include_all_columns: Include all DataFrame columns in neighbor results. Defaults to False.
        """
        # Validate and filter features before calling parent init
        self._raw_features = features
        super().__init__(
            df, id_column=id_column, features=features, target=target, include_all_columns=include_all_columns
        )

    def _prepare_data(self) -> None:
        """Filter out non-numeric features and drop NaN rows."""
        # Validate features
        self.features = self._validate_features(self.df, self._raw_features)

        # Drop NaN rows for the features we're using
        self.df = self.df.dropna(subset=self.features).copy()

    def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
        """Remove non-numeric features and log warnings."""
        non_numeric = [f for f in features if f not in df.select_dtypes(include=["number"]).columns]
        if non_numeric:
            log.warning(f"Non-numeric features {non_numeric} aren't currently supported, excluding them")
        return [f for f in features if f not in non_numeric]

    def _build_model(self) -> None:
        """Standardize features and fit Nearest Neighbors model."""
        self.scaler = StandardScaler()
        X = self.scaler.fit_transform(self.df[self.features])
        self.nn = NearestNeighbors().fit(X)

    def _transform_features(self, df: pd.DataFrame) -> np.ndarray:
        """Transform features using the fitted scaler."""
        return self.scaler.transform(df[self.features])

    def _project_2d(self) -> None:
        """Project the numeric features to 2D for visualization."""
        if len(self.features) >= 2:
            self.df = Projection2D().fit_transform(self.df, features=self.features)

__init__(df, id_column, features, target=None, include_all_columns=False)

Initialize the FeatureSpaceProximity class.

Parameters:

df (DataFrame, required): DataFrame containing data for neighbor computations.
id_column (str, required): Name of the column used as the identifier.
features (List[str], required): List of feature column names to be used for neighbor computations.
target (Optional[str], default None): Name of the target column.
include_all_columns (bool, default False): Include all DataFrame columns in neighbor results.
Source code in src/workbench/algorithms/dataframe/feature_space_proximity.py
def __init__(
    self,
    df: pd.DataFrame,
    id_column: str,
    features: List[str],
    target: Optional[str] = None,
    include_all_columns: bool = False,
):
    """
    Initialize the FeatureSpaceProximity class.

    Args:
        df: DataFrame containing data for neighbor computations.
        id_column: Name of the column used as the identifier.
        features: List of feature column names to be used for neighbor computations.
        target: Name of the target column. Defaults to None.
        include_all_columns: Include all DataFrame columns in neighbor results. Defaults to False.
    """
    # Validate and filter features before calling parent init
    self._raw_features = features
    super().__init__(
        df, id_column=id_column, features=features, target=target, include_all_columns=include_all_columns
    )
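
Example usage (a minimal sketch, not from the Workbench docs: the import path is assumed from the source location above, the DataFrame contents are made up, and the neighbors() call comes from the parent Proximity class):

import pandas as pd
from workbench.algorithms.dataframe.feature_space_proximity import FeatureSpaceProximity

# Hypothetical data: an id column, two numeric features, and a target
df = pd.DataFrame({
    "id": ["a", "b", "c", "d", "e", "f"],
    "feat1": [0.1, 0.2, 0.9, 1.0, 0.5, 0.4],
    "feat2": [1.0, 1.1, 0.2, 0.1, 0.6, 0.7],
    "price": [10, 11, 30, 31, 20, 19],
})

prox = FeatureSpaceProximity(df, id_column="id", features=["feat1", "feat2"], target="price")

# Nearest neighbors for a single id (features are standardized, distances are Euclidean)
print(prox.neighbors("a", n_neighbors=3))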

FingerprintProximity

Bases: Proximity

Proximity computations for binary fingerprints using Tanimoto similarity.

Note: Tanimoto similarity is equivalent to Jaccard similarity for binary vectors. Tanimoto(A, B) = |A ∩ B| / |A ∪ B|
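
For reference, the identity above is easy to verify with NumPy on two small bit vectors (a standalone snippet, not part of the Workbench API):

import numpy as np

a = np.array([1, 1, 0, 1, 0, 0, 1, 0], dtype=bool)
b = np.array([1, 0, 0, 1, 1, 0, 1, 0], dtype=bool)

tanimoto = np.logical_and(a, b).sum() / np.logical_or(a, b).sum()  # 3 shared bits / 5 total set = 0.6
jaccard_distance = 1 - tanimoto  # what sklearn's "jaccard" metric reports (0.4)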

Source code in src/workbench/algorithms/dataframe/fingerprint_proximity.py
class FingerprintProximity(Proximity):
    """Proximity computations for binary fingerprints using Tanimoto similarity.

    Note: Tanimoto similarity is equivalent to Jaccard similarity for binary vectors.
    Tanimoto(A, B) = |A ∩ B| / |A ∪ B|
    """

    def __init__(
        self,
        df: pd.DataFrame,
        id_column: str,
        fingerprint_column: Optional[str] = None,
        target: Optional[str] = None,
        include_all_columns: bool = False,
        radius: int = 2,
        n_bits: int = 1024,
        counts: bool = False,
    ) -> None:
        """
        Initialize the FingerprintProximity class for binary fingerprint similarity.

        Args:
            df: DataFrame containing fingerprints or SMILES.
            id_column: Name of the column used as an identifier.
            fingerprint_column: Name of the column containing fingerprints (bit strings).
                If None, looks for existing "fingerprint" column or computes from SMILES.
            target: Name of the target column. Defaults to None.
            include_all_columns: Include all DataFrame columns in neighbor results. Defaults to False.
            radius: Radius for Morgan fingerprint computation (default: 2).
            n_bits: Number of bits for fingerprint (default: 1024).
            counts: Whether to use count simulation (default: False).
        """
        # Store fingerprint computation parameters
        self._fp_radius = radius
        self._fp_n_bits = n_bits
        self._fp_counts = counts

        # Store the requested fingerprint column (may be None)
        self._fingerprint_column_arg = fingerprint_column

        # Determine fingerprint column name (but don't compute yet - that happens in _prepare_data)
        self.fingerprint_column = self._resolve_fingerprint_column_name(df, fingerprint_column)

        # Call parent constructor with fingerprint_column as the only "feature"
        super().__init__(
            df,
            id_column=id_column,
            features=[self.fingerprint_column],
            target=target,
            include_all_columns=include_all_columns,
        )

    @staticmethod
    def _resolve_fingerprint_column_name(df: pd.DataFrame, fingerprint_column: Optional[str]) -> str:
        """
        Determine the fingerprint column name, validating it exists or can be computed.

        Args:
            df: Input DataFrame.
            fingerprint_column: Explicitly specified fingerprint column, or None.

        Returns:
            Name of the fingerprint column to use.

        Raises:
            ValueError: If no fingerprint column exists and no SMILES column found.
        """
        # If explicitly provided, validate it exists
        if fingerprint_column is not None:
            if fingerprint_column not in df.columns:
                raise ValueError(f"Fingerprint column '{fingerprint_column}' not found in DataFrame")
            return fingerprint_column

        # Check for existing "fingerprint" column
        if "fingerprint" in df.columns:
            log.info("Using existing 'fingerprint' column")
            return "fingerprint"

        # Will need to compute from SMILES - validate SMILES column exists
        smiles_column = next((col for col in df.columns if col.lower() == "smiles"), None)
        if smiles_column is None:
            raise ValueError(
                "No fingerprint column provided and no SMILES column found. "
                "Either provide a fingerprint_column or include a 'smiles' column in the DataFrame."
            )

        # Fingerprints will be computed in _prepare_data
        return "fingerprint"

    def _prepare_data(self) -> None:
        """Compute fingerprints from SMILES if needed."""
        # If fingerprint column doesn't exist yet, compute it
        if self.fingerprint_column not in self.df.columns:
            log.info(f"Computing Morgan fingerprints (radius={self._fp_radius}, n_bits={self._fp_n_bits})...")
            self.df = compute_morgan_fingerprints(
                self.df, radius=self._fp_radius, n_bits=self._fp_n_bits, counts=self._fp_counts
            )

    def _build_model(self) -> None:
        """
        Build the fingerprint proximity model for Tanimoto similarity.
        Converts fingerprint strings to binary arrays and initializes NearestNeighbors.

        Note: sklearn uses Jaccard distance internally (1 - Tanimoto similarity).
        We convert back to Tanimoto similarity in the output methods.
        """
        log.info("Converting fingerprints to binary feature matrix...")

        # Convert fingerprint strings to binary arrays and store for later use
        self.X = self._fingerprints_to_matrix(self.df)

        # sklearn uses Jaccard distance = 1 - Tanimoto similarity
        # We convert to Tanimoto similarity in neighbors() and _precompute_metrics()
        log.info("Building NearestNeighbors model (Jaccard/Tanimoto metric, BallTree)...")
        self.nn = NearestNeighbors(metric="jaccard", algorithm="ball_tree").fit(self.X)

    def _transform_features(self, df: pd.DataFrame) -> np.ndarray:
        """
        Transform fingerprints to binary matrix for querying.

        Args:
            df: DataFrame containing fingerprints to transform.

        Returns:
            Binary feature matrix for the fingerprints.
        """
        return self._fingerprints_to_matrix(df)

    def _fingerprints_to_matrix(self, df: pd.DataFrame) -> np.ndarray:
        """
        Convert fingerprint strings to a binary numpy matrix.

        Args:
            df: DataFrame containing fingerprint column.

        Returns:
            2D numpy array of binary fingerprint bits.
        """
        fingerprint_bits = df[self.fingerprint_column].apply(
            lambda fp: np.array([int(bit) for bit in fp], dtype=np.bool_)
        )
        return np.vstack(fingerprint_bits)

    def _precompute_metrics(self) -> None:
        """Precompute metrics, adding Tanimoto similarity alongside distance."""
        # Call parent to compute nn_distance (Jaccard), nn_id, nn_target, nn_target_diff
        super()._precompute_metrics()

        # Add Tanimoto similarity (keep nn_distance for internal use by target_gradients)
        self.df["nn_similarity"] = 1 - self.df["nn_distance"]

    def _set_core_columns(self) -> None:
        """Set core columns using nn_similarity instead of nn_distance."""
        self.core_columns = [self.id_column, "nn_similarity", "nn_id"]
        if self.target:
            self.core_columns.extend([self.target, "nn_target", "nn_target_diff"])

    def _project_2d(self) -> None:
        """Project the fingerprint matrix to 2D for visualization using UMAP with Jaccard metric."""
        self.df = Projection2D().fit_transform(self.df, feature_matrix=self.X, metric="jaccard")

    def isolated(self, top_percent: float = 1.0) -> pd.DataFrame:
        """
        Find isolated data points based on Tanimoto similarity to nearest neighbor.

        Args:
            top_percent: Percentage of most isolated data points to return (e.g., 1.0 returns top 1%)

        Returns:
            DataFrame of observations with lowest Tanimoto similarity, sorted ascending
        """
        # For Tanimoto similarity, isolated means LOW similarity to nearest neighbor
        percentile = top_percent
        threshold = np.percentile(self.df["nn_similarity"], percentile)
        isolated = self.df[self.df["nn_similarity"] <= threshold].copy()
        isolated = isolated.sort_values("nn_similarity", ascending=True).reset_index(drop=True)
        return isolated if self.include_all_columns else isolated[self.core_columns]

    def proximity_stats(self) -> pd.DataFrame:
        """
        Return distribution statistics for nearest neighbor Tanimoto similarity.

        Returns:
            DataFrame with similarity distribution statistics (count, mean, std, percentiles)
        """
        return (
            self.df["nn_similarity"]
            .describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
            .to_frame()
        )

    def neighbors(
        self,
        id_or_ids: Union[str, int, List[Union[str, int]]],
        n_neighbors: Optional[int] = 5,
        min_similarity: Optional[float] = None,
        include_self: bool = True,
    ) -> pd.DataFrame:
        """
        Return neighbors for ID(s) from the existing dataset.

        Args:
            id_or_ids: Single ID or list of IDs to look up
            n_neighbors: Number of neighbors to return (default: 5, ignored if min_similarity is set)
            min_similarity: If provided, find all neighbors with Tanimoto similarity >= this value (0-1)
            include_self: Whether to include self in results (default: True)

        Returns:
            DataFrame containing neighbors with Tanimoto similarity scores
        """
        # Convert min_similarity to radius (Jaccard distance = 1 - Tanimoto similarity)
        radius = 1 - min_similarity if min_similarity is not None else None

        # Call parent method (returns Jaccard distance)
        neighbors_df = super().neighbors(
            id_or_ids=id_or_ids,
            n_neighbors=n_neighbors,
            radius=radius,
            include_self=include_self,
        )

        # Convert Jaccard distance to Tanimoto similarity
        neighbors_df["similarity"] = 1 - neighbors_df["distance"]
        neighbors_df.drop(columns=["distance"], inplace=True)

        return neighbors_df

__init__(df, id_column, fingerprint_column=None, target=None, include_all_columns=False, radius=2, n_bits=1024, counts=False)

Initialize the FingerprintProximity class for binary fingerprint similarity.

Parameters:

df (DataFrame, required): DataFrame containing fingerprints or SMILES.
id_column (str, required): Name of the column used as an identifier.
fingerprint_column (Optional[str], default None): Name of the column containing fingerprints (bit strings). If None, looks for an existing "fingerprint" column or computes from SMILES.
target (Optional[str], default None): Name of the target column.
include_all_columns (bool, default False): Include all DataFrame columns in neighbor results.
radius (int, default 2): Radius for Morgan fingerprint computation.
n_bits (int, default 1024): Number of bits for the fingerprint.
counts (bool, default False): Whether to use count simulation.
Source code in src/workbench/algorithms/dataframe/fingerprint_proximity.py
def __init__(
    self,
    df: pd.DataFrame,
    id_column: str,
    fingerprint_column: Optional[str] = None,
    target: Optional[str] = None,
    include_all_columns: bool = False,
    radius: int = 2,
    n_bits: int = 1024,
    counts: bool = False,
) -> None:
    """
    Initialize the FingerprintProximity class for binary fingerprint similarity.

    Args:
        df: DataFrame containing fingerprints or SMILES.
        id_column: Name of the column used as an identifier.
        fingerprint_column: Name of the column containing fingerprints (bit strings).
            If None, looks for existing "fingerprint" column or computes from SMILES.
        target: Name of the target column. Defaults to None.
        include_all_columns: Include all DataFrame columns in neighbor results. Defaults to False.
        radius: Radius for Morgan fingerprint computation (default: 2).
        n_bits: Number of bits for fingerprint (default: 1024).
        counts: Whether to use count simulation (default: False).
    """
    # Store fingerprint computation parameters
    self._fp_radius = radius
    self._fp_n_bits = n_bits
    self._fp_counts = counts

    # Store the requested fingerprint column (may be None)
    self._fingerprint_column_arg = fingerprint_column

    # Determine fingerprint column name (but don't compute yet - that happens in _prepare_data)
    self.fingerprint_column = self._resolve_fingerprint_column_name(df, fingerprint_column)

    # Call parent constructor with fingerprint_column as the only "feature"
    super().__init__(
        df,
        id_column=id_column,
        features=[self.fingerprint_column],
        target=target,
        include_all_columns=include_all_columns,
    )

isolated(top_percent=1.0)

Find isolated data points based on Tanimoto similarity to nearest neighbor.

Parameters:

top_percent (float, default 1.0): Percentage of most isolated data points to return (e.g., 1.0 returns the top 1%).

Returns:

DataFrame: Observations with the lowest Tanimoto similarity, sorted ascending.

Source code in src/workbench/algorithms/dataframe/fingerprint_proximity.py
def isolated(self, top_percent: float = 1.0) -> pd.DataFrame:
    """
    Find isolated data points based on Tanimoto similarity to nearest neighbor.

    Args:
        top_percent: Percentage of most isolated data points to return (e.g., 1.0 returns top 1%)

    Returns:
        DataFrame of observations with lowest Tanimoto similarity, sorted ascending
    """
    # For Tanimoto similarity, isolated means LOW similarity to nearest neighbor
    percentile = top_percent
    threshold = np.percentile(self.df["nn_similarity"], percentile)
    isolated = self.df[self.df["nn_similarity"] <= threshold].copy()
    isolated = isolated.sort_values("nn_similarity", ascending=True).reset_index(drop=True)
    return isolated if self.include_all_columns else isolated[self.core_columns]
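
A quick usage sketch (fp_prox is a hypothetical FingerprintProximity instance): top_percent=1.0 keeps the rows whose nearest-neighbor similarity falls at or below the 1st percentile.

# Bottom 1% by Tanimoto similarity to nearest neighbor
outliers = fp_prox.isolated(top_percent=1.0)

# A looser cut: bottom 5%
outliers_5pct = fp_prox.isolated(top_percent=5.0)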

neighbors(id_or_ids, n_neighbors=5, min_similarity=None, include_self=True)

Return neighbors for ID(s) from the existing dataset.

Parameters:

id_or_ids (Union[str, int, List[Union[str, int]]], required): Single ID or list of IDs to look up.
n_neighbors (Optional[int], default 5): Number of neighbors to return (ignored if min_similarity is set).
min_similarity (Optional[float], default None): If provided, find all neighbors with Tanimoto similarity >= this value (0-1).
include_self (bool, default True): Whether to include self in results.

Returns:

DataFrame: Neighbors with Tanimoto similarity scores.

Source code in src/workbench/algorithms/dataframe/fingerprint_proximity.py
def neighbors(
    self,
    id_or_ids: Union[str, int, List[Union[str, int]]],
    n_neighbors: Optional[int] = 5,
    min_similarity: Optional[float] = None,
    include_self: bool = True,
) -> pd.DataFrame:
    """
    Return neighbors for ID(s) from the existing dataset.

    Args:
        id_or_ids: Single ID or list of IDs to look up
        n_neighbors: Number of neighbors to return (default: 5, ignored if min_similarity is set)
        min_similarity: If provided, find all neighbors with Tanimoto similarity >= this value (0-1)
        include_self: Whether to include self in results (default: True)

    Returns:
        DataFrame containing neighbors with Tanimoto similarity scores
    """
    # Convert min_similarity to radius (Jaccard distance = 1 - Tanimoto similarity)
    radius = 1 - min_similarity if min_similarity is not None else None

    # Call parent method (returns Jaccard distance)
    neighbors_df = super().neighbors(
        id_or_ids=id_or_ids,
        n_neighbors=n_neighbors,
        radius=radius,
        include_self=include_self,
    )

    # Convert Jaccard distance to Tanimoto similarity
    neighbors_df["similarity"] = 1 - neighbors_df["distance"]
    neighbors_df.drop(columns=["distance"], inplace=True)

    return neighbors_df
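
A quick usage sketch (fp_prox and the molecule ids are hypothetical): passing min_similarity=0.7 is converted internally to a Jaccard radius of 1 - 0.7 = 0.3.

# All neighbors with Tanimoto similarity >= 0.7
hits = fp_prox.neighbors("mol_42", min_similarity=0.7)

# Default behavior: 5 nearest neighbors (including the query itself) for multiple ids
top5 = fp_prox.neighbors(["mol_42", "mol_99"], n_neighbors=5)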

proximity_stats()

Return distribution statistics for nearest neighbor Tanimoto similarity.

Returns:

DataFrame: Similarity distribution statistics (count, mean, std, percentiles).

Source code in src/workbench/algorithms/dataframe/fingerprint_proximity.py
def proximity_stats(self) -> pd.DataFrame:
    """
    Return distribution statistics for nearest neighbor Tanimoto similarity.

    Returns:
        DataFrame with similarity distribution statistics (count, mean, std, percentiles)
    """
    return (
        self.df["nn_similarity"]
        .describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
        .to_frame()
    )
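
Putting it together, a minimal end-to-end sketch (the import path is assumed from the source location above, the SMILES and column names are made up, and fingerprint computation from SMILES requires RDKit):

import pandas as pd
from workbench.algorithms.dataframe.fingerprint_proximity import FingerprintProximity

# Hypothetical data: an id column, SMILES strings, and a target
df = pd.DataFrame({
    "id": ["mol_1", "mol_2", "mol_3", "mol_4"],
    "smiles": ["CCO", "CCN", "CCC", "c1ccccc1"],
    "logS": [-0.2, -0.5, -0.8, -2.0],
})

# No fingerprint_column given, so Morgan fingerprints (radius=2, 1024 bits) are computed from SMILES
fp_prox = FingerprintProximity(df, id_column="id", target="logS")

print(fp_prox.proximity_stats())                  # nearest-neighbor Tanimoto similarity distribution
print(fp_prox.neighbors("mol_1", n_neighbors=2))  # closest molecules by Tanimoto similarity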

Projection2D

Perform Dimensionality Reduction on a DataFrame using TSNE, MDS, PCA, or UMAP.

Source code in src/workbench/algorithms/dataframe/projection_2d.py
class Projection2D:
    """Perform Dimensionality Reduction on a DataFrame using TSNE, MDS, PCA, or UMAP."""

    def __init__(self):
        """Initialize the Projection2D class."""
        self.log = logging.getLogger("workbench")
        self.projection_model = None

    def fit_transform(
        self,
        input_df: pd.DataFrame,
        features: list = None,
        feature_matrix: np.ndarray = None,
        metric: str = "euclidean",
        projection: str = "UMAP",
    ) -> pd.DataFrame:
        """Fit and transform a DataFrame using the selected dimensionality reduction method.

        This method creates a copy of the input DataFrame, processes the specified features
        for normalization and projection, and returns a new DataFrame with added 'x' and 'y' columns
        containing the projected 2D coordinates.

        Args:
            input_df (pd.DataFrame): The DataFrame containing features to project.
            features (list, optional): List of feature column names. If None, numeric columns are auto-selected.
            feature_matrix (np.ndarray, optional): Pre-computed feature matrix. If provided, features is ignored
                and no scaling is applied (caller is responsible for appropriate preprocessing).
            metric (str, optional): Distance metric for UMAP (e.g., 'euclidean', 'jaccard'). Default 'euclidean'.
            projection (str, optional): The projection to use ('UMAP', 'TSNE', 'MDS' or 'PCA'). Default 'UMAP'.

        Returns:
            pd.DataFrame: A new DataFrame (a copy of input_df) with added 'x' and 'y' columns.
        """
        # Create a copy of the input DataFrame
        df = input_df.copy()

        # If a feature matrix is provided, use it directly (no scaling)
        if feature_matrix is not None:
            if len(feature_matrix) != len(df):
                self.log.critical("feature_matrix length must match DataFrame length.")
                return df
            X_processed = feature_matrix
        else:
            # Auto-identify numeric features if none are provided
            if features is None:
                features = [col for col in df.select_dtypes(include="number").columns if not col.endswith("id")]
                self.log.info(f"Auto-identified numeric features: {features}")

            if len(features) < 2 or df.empty:
                self.log.critical("At least two numeric features are required, and DataFrame must not be empty.")
                return df

            # Process a copy of the feature data for projection
            X = df[features]
            X = X.apply(lambda col: col.fillna(col.mean()))
            X_processed = StandardScaler().fit_transform(X)

        # Select the projection method (using df for perplexity calculation)
        self.projection_model = self._get_projection_model(projection, df, metric=metric)

        # Apply the projection on the processed data
        projection_result = self.projection_model.fit_transform(X_processed)
        df[["x", "y"]] = projection_result

        # Resolve coincident points and return the new DataFrame
        return self.resolve_coincident_points(df)

    def _get_projection_model(self, projection: str, df: pd.DataFrame, metric: str = "euclidean"):
        """Select and return the appropriate projection model.

        Args:
            projection (str): The projection method ('TSNE', 'MDS', 'PCA', or 'UMAP').
            df (pd.DataFrame): The DataFrame being transformed (used for computing perplexity).
            metric (str): Distance metric for UMAP (default 'euclidean').

        Returns:
            A dimensionality reduction model instance.
        """
        if projection == "TSNE":
            perplexity = min(40, len(df) - 1)
            self.log.info(f"Projection: TSNE with perplexity {perplexity}")
            return TSNE(perplexity=perplexity)

        if projection == "MDS":
            self.log.info("Projection: MDS")
            return MDS(n_components=2, random_state=0)

        if projection == "PCA":
            self.log.info("Projection: PCA")
            return PCA(n_components=2)

        if projection == "UMAP" and UMAP_AVAILABLE:
            self.log.info(f"Projection: UMAP with metric={metric}")
            return umap.UMAP(n_components=2, metric=metric)

        self.log.warning(
            f"Projection method '{projection}' not recognized or UMAP not available. Falling back to TSNE."
        )
        return TSNE(perplexity=min(40, len(df) - 1))

    @staticmethod
    def resolve_coincident_points(df: pd.DataFrame) -> pd.DataFrame:
        """Resolve coincident points using random jitter

        Args:
            df (pd.DataFrame): DataFrame with x and y coordinates.

        Returns:
            pd.DataFrame: DataFrame with resolved coincident points
        """

        # Set jitter size based on rounding precision
        precision = 3
        jitter_amount = 10 ** (-precision) * 2  # 2x the rounding precision

        # Create rounded values for grouping
        rounded = pd.DataFrame(
            {"x_round": df["x"].round(precision), "y_round": df["y"].round(precision), "idx": df.index}
        )

        # Find duplicates
        duplicated = rounded.duplicated(subset=["x_round", "y_round"], keep=False)
        if not duplicated.any():
            return df

        # Get the dtypes of the columns
        x_dtype = df["x"].dtype
        y_dtype = df["y"].dtype

        # Process each group
        for (x_round, y_round), group in rounded[duplicated].groupby(["x_round", "y_round"]):
            indices = group["idx"].values
            if len(indices) <= 1:
                continue

            # Apply random jitter to all points
            for i, idx in enumerate(indices):
                # Generate and apply properly typed offsets
                dx = np.array(jitter_amount * (np.random.random() * 2 - 1), dtype=x_dtype)
                dy = np.array(jitter_amount * (np.random.random() * 2 - 1), dtype=y_dtype)
                df.loc[idx, "x"] += dx
                df.loc[idx, "y"] += dy

        return df

__init__()

Initialize the Projection2D class.

Source code in src/workbench/algorithms/dataframe/projection_2d.py
def __init__(self):
    """Initialize the Projection2D class."""
    self.log = logging.getLogger("workbench")
    self.projection_model = None

fit_transform(input_df, features=None, feature_matrix=None, metric='euclidean', projection='UMAP')

Fit and transform a DataFrame using the selected dimensionality reduction method.

This method creates a copy of the input DataFrame, processes the specified features for normalization and projection, and returns a new DataFrame with added 'x' and 'y' columns containing the projected 2D coordinates.

Parameters:

input_df (DataFrame, required): The DataFrame containing features to project.
features (list, default None): List of feature column names. If None, numeric columns are auto-selected.
feature_matrix (ndarray, default None): Pre-computed feature matrix. If provided, features is ignored and no scaling is applied (the caller is responsible for appropriate preprocessing).
metric (str, default 'euclidean'): Distance metric for UMAP (e.g., 'euclidean', 'jaccard').
projection (str, default 'UMAP'): The projection to use ('UMAP', 'TSNE', 'MDS', or 'PCA').

Returns:

pd.DataFrame: A new DataFrame (a copy of input_df) with added 'x' and 'y' columns.

Source code in src/workbench/algorithms/dataframe/projection_2d.py
def fit_transform(
    self,
    input_df: pd.DataFrame,
    features: list = None,
    feature_matrix: np.ndarray = None,
    metric: str = "euclidean",
    projection: str = "UMAP",
) -> pd.DataFrame:
    """Fit and transform a DataFrame using the selected dimensionality reduction method.

    This method creates a copy of the input DataFrame, processes the specified features
    for normalization and projection, and returns a new DataFrame with added 'x' and 'y' columns
    containing the projected 2D coordinates.

    Args:
        input_df (pd.DataFrame): The DataFrame containing features to project.
        features (list, optional): List of feature column names. If None, numeric columns are auto-selected.
        feature_matrix (np.ndarray, optional): Pre-computed feature matrix. If provided, features is ignored
            and no scaling is applied (caller is responsible for appropriate preprocessing).
        metric (str, optional): Distance metric for UMAP (e.g., 'euclidean', 'jaccard'). Default 'euclidean'.
        projection (str, optional): The projection to use ('UMAP', 'TSNE', 'MDS' or 'PCA'). Default 'UMAP'.

    Returns:
        pd.DataFrame: A new DataFrame (a copy of input_df) with added 'x' and 'y' columns.
    """
    # Create a copy of the input DataFrame
    df = input_df.copy()

    # If a feature matrix is provided, use it directly (no scaling)
    if feature_matrix is not None:
        if len(feature_matrix) != len(df):
            self.log.critical("feature_matrix length must match DataFrame length.")
            return df
        X_processed = feature_matrix
    else:
        # Auto-identify numeric features if none are provided
        if features is None:
            features = [col for col in df.select_dtypes(include="number").columns if not col.endswith("id")]
            self.log.info(f"Auto-identified numeric features: {features}")

        if len(features) < 2 or df.empty:
            self.log.critical("At least two numeric features are required, and DataFrame must not be empty.")
            return df

        # Process a copy of the feature data for projection
        X = df[features]
        X = X.apply(lambda col: col.fillna(col.mean()))
        X_processed = StandardScaler().fit_transform(X)

    # Select the projection method (using df for perplexity calculation)
    self.projection_model = self._get_projection_model(projection, df, metric=metric)

    # Apply the projection on the processed data
    projection_result = self.projection_model.fit_transform(X_processed)
    df[["x", "y"]] = projection_result

    # Resolve coincident points and return the new DataFrame
    return self.resolve_coincident_points(df)

resolve_coincident_points(df) staticmethod

Resolve coincident points using random jitter

Parameters:

df (DataFrame, required): DataFrame with x and y coordinates.

Returns:

pd.DataFrame: DataFrame with resolved coincident points.

Source code in src/workbench/algorithms/dataframe/projection_2d.py
@staticmethod
def resolve_coincident_points(df: pd.DataFrame) -> pd.DataFrame:
    """Resolve coincident points using random jitter

    Args:
        df (pd.DataFrame): DataFrame with x and y coordinates.

    Returns:
        pd.DataFrame: DataFrame with resolved coincident points
    """

    # Set jitter size based on rounding precision
    precision = 3
    jitter_amount = 10 ** (-precision) * 2  # 2x the rounding precision

    # Create rounded values for grouping
    rounded = pd.DataFrame(
        {"x_round": df["x"].round(precision), "y_round": df["y"].round(precision), "idx": df.index}
    )

    # Find duplicates
    duplicated = rounded.duplicated(subset=["x_round", "y_round"], keep=False)
    if not duplicated.any():
        return df

    # Get the dtypes of the columns
    x_dtype = df["x"].dtype
    y_dtype = df["y"].dtype

    # Process each group
    for (x_round, y_round), group in rounded[duplicated].groupby(["x_round", "y_round"]):
        indices = group["idx"].values
        if len(indices) <= 1:
            continue

        # Apply random jitter to all points
        for i, idx in enumerate(indices):
            # Generate and apply properly typed offsets
            dx = np.array(jitter_amount * (np.random.random() * 2 - 1), dtype=x_dtype)
            dy = np.array(jitter_amount * (np.random.random() * 2 - 1), dtype=y_dtype)
            df.loc[idx, "x"] += dx
            df.loc[idx, "y"] += dy

    return df
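
Example usage (a minimal sketch: the import path is assumed from the source location above and the data is random):

import numpy as np
import pandas as pd
from workbench.algorithms.dataframe.projection_2d import Projection2D

# Hypothetical data: 50 rows with three numeric feature columns
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.normal(size=(50, 3)), columns=["feat1", "feat2", "feat3"])

# Adds 'x' and 'y' columns; falls back to TSNE if UMAP isn't installed
df_2d = Projection2D().fit_transform(df, features=["feat1", "feat2", "feat3"], projection="UMAP")
print(df_2d[["x", "y"]].head())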

Questions?

The SuperCowPowers team is happy to answer any questions you may have about AWS and Workbench. Please contact us at workbench@supercowpowers.com or chat us up on Discord.