Skip to content

Pandas Dataframe Algorithms

Pandas Dataframes

Pandas DataFrames will not scale as well as our Spark and SQL algorithms, but for moderately sized data these algorithms provide some nice functionality.

Pandas Dataframe Algorithms

Workbench has a growing set of algorithms and data processing tools for Pandas DataFrames. In general, these algorithms take a DataFrame as input and return a DataFrame with additional columns.

FeatureSpaceProximity

Bases: Proximity

Source code in src/workbench/algorithms/dataframe/feature_space_proximity.py
class FeatureSpaceProximity(Proximity):
    """Proximity model driven by a Workbench model's features, with a 2D projection of the data."""

    def __init__(self, model: Model, n_neighbors: int = 10) -> None:
        """
        Build the proximity model from the data behind a Workbench model.

        Args:
            model (Model): A Workbench model object.
            n_neighbors (int): Number of neighbors to compute. Defaults to 10.
        """

        # Feature columns and target come straight from the model
        features = model.features()
        target = model.target()

        # The model's input names the feature set that holds the data
        fs = FeatureSet(model.get_input())

        # Prefer an existing inference view for this model; create one when it is missing
        view_name = f"inf_{model.uuid.replace('-', '_')}"
        data_view = fs.view(view_name) if view_name in fs.views() else InferenceView.create(model)
        self.df = data_view.pull_dataframe()

        # Hand the data to the parent class, which owns the neighbor machinery
        super().__init__(
            self.df,
            id_column=fs.id_column,
            features=features,
            target=target,
            n_neighbors=n_neighbors,
        )

        # Replace the stored frame with the 2D-projected version
        projector = Projection2D()
        self.df = projector.fit_transform(self.df, features=features)

__init__(model, n_neighbors=10)

Initialize the FeatureSpaceProximity class.

Parameters:

Name Type Description Default
model Model

A Workbench model object.

required
n_neighbors int

Number of neighbors to compute. Defaults to 10.

10
Source code in src/workbench/algorithms/dataframe/feature_space_proximity.py
def __init__(self, model: Model, n_neighbors: int = 10) -> None:
    """
    Build the proximity model from the data behind a Workbench model.

    Args:
        model (Model): A Workbench model object.
        n_neighbors (int): Number of neighbors to compute. Defaults to 10.
    """

    # Feature columns and target come straight from the model
    features = model.features()
    target = model.target()

    # The model's input names the feature set that holds the data
    fs = FeatureSet(model.get_input())

    # Prefer an existing inference view for this model; create one when it is missing
    view_name = f"inf_{model.uuid.replace('-', '_')}"
    data_view = fs.view(view_name) if view_name in fs.views() else InferenceView.create(model)
    self.df = data_view.pull_dataframe()

    # Hand the data to the parent class, which owns the neighbor machinery
    super().__init__(
        self.df,
        id_column=fs.id_column,
        features=features,
        target=target,
        n_neighbors=n_neighbors,
    )

    # Replace the stored frame with the 2D-projected version
    projector = Projection2D()
    self.df = projector.fit_transform(self.df, features=features)

FingerprintProximity

Bases: Proximity

Source code in src/workbench/algorithms/dataframe/fingerprint_proximity.py
class FingerprintProximity(Proximity):
    """Nearest-neighbor lookups over bit-string fingerprints, reported as Jaccard similarity."""

    def __init__(
        self, df: pd.DataFrame, id_column: Union[int, str], fingerprint_column: str, n_neighbors: int = 5
    ) -> None:
        """
        Initialize the FingerprintProximity class for binary fingerprint similarity.

        Args:
            df (pd.DataFrame): DataFrame containing fingerprints.
            id_column (Union[int, str]): Name of the column used as an identifier.
            fingerprint_column (str): Name of the column containing fingerprints.
            n_neighbors (int): Default number of neighbors to compute.
        """
        # Stored before the parent constructor runs; build_proximity_model() reads it
        # (presumably the parent constructor triggers that build -- confirm against Proximity)
        self.fingerprint_column = fingerprint_column

        # Call the parent class constructor
        super().__init__(df, id_column=id_column, features=[fingerprint_column], n_neighbors=n_neighbors)

    # Override the build_proximity_model method
    def build_proximity_model(self) -> None:
        """
        Prepare the fingerprint data for nearest neighbor calculations.
        Converts fingerprint strings to binary arrays and initializes NearestNeighbors.
        """
        log.info("Converting fingerprints to binary feature matrix...")
        self.proximity_type = ProximityType.SIMILARITY

        # Use the same conversion as query time so the fitted matrix and any query
        # matrix are always encoded identically
        self.X = self.prep_features_for_query(self.df)

        # Use Jaccard similarity for binary fingerprints
        log.info("Computing NearestNeighbors with Jaccard metric...")
        # One neighbor more than requested is fitted
        # (presumably to leave room for dropping the query point itself -- confirm against Proximity)
        self.nn = NearestNeighbors(metric="jaccard", n_neighbors=self.n_neighbors + 1).fit(self.X)

    # Override the prep_features_for_query method
    def prep_features_for_query(self, query_df: pd.DataFrame) -> np.ndarray:
        """
        Prepare the query DataFrame by converting fingerprints to binary arrays.

        Args:
            query_df (pd.DataFrame): DataFrame containing query fingerprints.

        Returns:
            np.ndarray: Binary feature matrix for the query fingerprints (one row per fingerprint).
        """
        # Each fingerprint is iterated character by character; every character becomes one boolean
        fingerprint_bits = query_df[self.fingerprint_column].apply(
            lambda fp: np.array([int(bit) for bit in fp], dtype=np.bool_)
        )
        return np.vstack(fingerprint_bits)

    def all_neighbors(
        self,
        min_similarity: float = None,
        include_self: bool = False,
        add_columns: List[str] = None,
    ) -> pd.DataFrame:
        """
        Find neighbors for all fingerprints in the dataset.

        Args:
            min_similarity: Minimum similarity threshold (0-1)
            include_self: Whether to include self in results
            add_columns: Additional columns to include in results

        Returns:
            DataFrame containing neighbors and similarities
        """

        # Query with the stored DataFrame itself, via this class's neighbors()
        return self.neighbors(
            query_df=self.df,
            min_similarity=min_similarity,
            include_self=include_self,
            add_columns=add_columns,
        )

    def neighbors(
        self,
        query_df: pd.DataFrame,
        min_similarity: float = None,
        include_self: bool = False,
        add_columns: List[str] = None,
    ) -> pd.DataFrame:
        """
        Find neighbors for each row in the query DataFrame.

        Args:
            query_df: DataFrame containing query fingerprints
            min_similarity: Minimum similarity threshold (0-1)
            include_self: Whether to include self in results (if present)
            add_columns: Additional columns to include in results

        Returns:
            DataFrame containing neighbors and similarities

        Note: The query DataFrame must include the feature columns. The id_column is optional.
        """

        # The parent works in distances: similarity s corresponds to distance 1 - s.
        # radius stays None when no threshold was given.
        radius = 1 - min_similarity if min_similarity is not None else None

        # Call the parent class method to find neighbors
        neighbors_df = super().neighbors(
            query_df=query_df,
            radius=radius,
            include_self=include_self,
            add_columns=add_columns,
        )

        # Convert distances to similarity and remove the distance column from the result
        neighbors_df["similarity"] = 1 - neighbors_df["distance"]
        neighbors_df.drop(columns=["distance"], inplace=True)
        return neighbors_df

__init__(df, id_column, fingerprint_column, n_neighbors=5)

Initialize the FingerprintProximity class for binary fingerprint similarity.

Parameters:

Name Type Description Default
df DataFrame

DataFrame containing fingerprints.

required
id_column Union[int, str]

Name of the column used as an identifier.

required
fingerprint_column str

Name of the column containing fingerprints.

required
n_neighbors int

Default number of neighbors to compute.

5
Source code in src/workbench/algorithms/dataframe/fingerprint_proximity.py
def __init__(
    self, df: pd.DataFrame, id_column: Union[int, str], fingerprint_column: str, n_neighbors: int = 5
) -> None:
    """
    Set up fingerprint-based proximity for a DataFrame of binary fingerprints.

    Args:
        df (pd.DataFrame): DataFrame containing fingerprints.
        id_column (Union[int, str]): Name of the column used as an identifier.
        fingerprint_column (str): Name of the column containing fingerprints.
        n_neighbors (int): Default number of neighbors to compute.
    """
    # Remember the fingerprint column before handing control to the parent class
    self.fingerprint_column = fingerprint_column

    # The fingerprint column is the only feature the parent class is given
    feature_columns = [fingerprint_column]
    super().__init__(
        df,
        id_column=id_column,
        features=feature_columns,
        n_neighbors=n_neighbors,
    )

all_neighbors(min_similarity=None, include_self=False, add_columns=None)

Find neighbors for all fingerprints in the dataset.

Parameters:

Name Type Description Default
min_similarity float

Minimum similarity threshold (0-1)

None
include_self bool

Whether to include self in results

False
add_columns List[str]

Additional columns to include in results

None

Returns:

Type Description
DataFrame

DataFrame containing neighbors and similarities

Source code in src/workbench/algorithms/dataframe/fingerprint_proximity.py
def all_neighbors(
    self,
    min_similarity: float = None,
    include_self: bool = False,
    add_columns: List[str] = None,
) -> pd.DataFrame:
    """
    Run a neighbor query for every fingerprint in the stored DataFrame.

    Args:
        min_similarity: Minimum similarity threshold (0-1)
        include_self: Whether to include self in results
        add_columns: Additional columns to include in results

    Returns:
        DataFrame containing neighbors and similarities
    """

    # The stored data doubles as the query set; the remaining options are forwarded unchanged
    query_options = {
        "min_similarity": min_similarity,
        "include_self": include_self,
        "add_columns": add_columns,
    }
    return self.neighbors(query_df=self.df, **query_options)

build_proximity_model()

Prepare the fingerprint data for nearest neighbor calculations. Converts fingerprint strings to binary arrays and initializes NearestNeighbors.

Source code in src/workbench/algorithms/dataframe/fingerprint_proximity.py
def build_proximity_model(self) -> None:
    """
    Build the fitted nearest-neighbor model from the stored fingerprints.
    Each fingerprint string is expanded into a boolean row, and NearestNeighbors is fit on the stack.
    """
    log.info("Converting fingerprints to binary feature matrix...")
    self.proximity_type = ProximityType.SIMILARITY

    # One boolean array per fingerprint, one element per character of the string
    fingerprints = self.df[self.fingerprint_column]
    bit_rows = [np.array([int(ch) for ch in fp], dtype=np.bool_) for fp in fingerprints]
    self.X = np.vstack(bit_rows)

    # Jaccard is the metric used for the binary fingerprint rows
    log.info("Computing NearestNeighbors with Jaccard metric...")
    neighbor_model = NearestNeighbors(metric="jaccard", n_neighbors=self.n_neighbors + 1)
    self.nn = neighbor_model.fit(self.X)

neighbors(query_df, min_similarity=None, include_self=False, add_columns=None)

Find neighbors for each row in the query DataFrame.

Parameters:

Name Type Description Default
query_df DataFrame

DataFrame containing query fingerprints

required
min_similarity float

Minimum similarity threshold (0-1)

None
include_self bool

Whether to include self in results (if present)

False
add_columns List[str]

Additional columns to include in results

None

Returns:

Type Description
DataFrame

DataFrame containing neighbors and similarities

Note: The query DataFrame must include the feature columns. The id_column is optional.

Source code in src/workbench/algorithms/dataframe/fingerprint_proximity.py
def neighbors(
    self,
    query_df: pd.DataFrame,
    min_similarity: float = None,
    include_self: bool = False,
    add_columns: List[str] = None,
) -> pd.DataFrame:
    """
    Look up neighbors for every row of the query DataFrame and report similarities.

    Args:
        query_df: DataFrame containing query fingerprints
        min_similarity: Minimum similarity threshold (0-1)
        include_self: Whether to include self in results (if present)
        add_columns: Additional columns to include in results

    Returns:
        DataFrame containing neighbors and similarities

    Note: The query DataFrame must include the feature columns. The id_column is optional.
    """

    # The parent method takes a distance radius: similarity s corresponds to distance 1 - s
    if min_similarity is None:
        radius = None
    else:
        radius = 1 - min_similarity

    # Delegate the actual search to the parent class
    neighbors_df = super().neighbors(
        query_df=query_df, radius=radius, include_self=include_self, add_columns=add_columns
    )

    # Swap the distance column for a similarity column
    neighbors_df["similarity"] = 1 - neighbors_df.pop("distance")
    return neighbors_df

prep_features_for_query(query_df)

Prepare the query DataFrame by converting fingerprints to binary arrays.

Parameters:

Name Type Description Default
query_df DataFrame

DataFrame containing query fingerprints.

required

Returns:

Type Description
ndarray

np.ndarray: Binary feature matrix for the query fingerprints.

Source code in src/workbench/algorithms/dataframe/fingerprint_proximity.py
def prep_features_for_query(self, query_df: pd.DataFrame) -> np.ndarray:
    """
    Turn the fingerprint column of a query DataFrame into a boolean matrix.

    Args:
        query_df (pd.DataFrame): DataFrame containing query fingerprints.

    Returns:
        np.ndarray: Binary feature matrix for the query fingerprints.
    """
    # One boolean array per fingerprint, one element per character of the string
    fingerprints = query_df[self.fingerprint_column]
    bit_rows = [np.array([int(ch) for ch in fp], dtype=np.bool_) for fp in fingerprints]
    return np.vstack(bit_rows)

Projection2D

Perform Dimensionality Reduction on a DataFrame using TSNE, MDS, PCA, or UMAP.

Source code in src/workbench/algorithms/dataframe/projection_2d.py
class Projection2D:
    """Perform Dimensionality Reduction on a DataFrame using TSNE, MDS, PCA, or UMAP."""

    def __init__(self):
        """Initialize the Projection2D class."""
        self.log = logging.getLogger("workbench")
        # Holds the projection model chosen by the most recent fit_transform() call
        self.projection_model = None

    def fit_transform(self, input_df: pd.DataFrame, features: list = None, projection: str = "UMAP") -> pd.DataFrame:
        """Fit and transform a DataFrame using the selected dimensionality reduction method.

        This method creates a copy of the input DataFrame, processes the specified features
        for normalization and projection, and returns a new DataFrame with added 'x' and 'y' columns
        containing the projected 2D coordinates.

        If fewer than two features are available, or the DataFrame is empty, a critical message
        is logged and the copy is returned without 'x' and 'y' columns.

        Args:
            input_df (pd.DataFrame): The DataFrame containing features to project.
            features (list, optional): List of feature column names. If None, numeric columns are auto-selected.
            projection (str, optional): The projection to use ('UMAP', 'TSNE', 'MDS' or 'PCA'). Default 'UMAP'.

        Returns:
            pd.DataFrame: A new DataFrame (a copy of input_df) with added 'x' and 'y' columns.
        """
        # Create a copy of the input DataFrame
        df = input_df.copy()

        # Auto-identify numeric features if none are provided
        if features is None:
            # Column labels are not guaranteed to be strings (e.g. integer labels), so
            # stringify before the suffix test instead of calling .endswith on the label
            features = [col for col in df.select_dtypes(include="number").columns if not str(col).endswith("id")]
            self.log.info(f"Auto-identified numeric features: {features}")

        if len(features) < 2 or df.empty:
            self.log.critical("At least two numeric features are required, and DataFrame must not be empty.")
            return df

        # Process a copy of the feature data for projection
        X = df[features]
        X = X.apply(lambda col: col.fillna(col.mean()))
        # A column that is entirely NaN has a NaN mean, so the fill above leaves it NaN;
        # zero-fill whatever remains so no NaN reaches the scaler or the projection model
        X = X.fillna(0.0)
        X_scaled = StandardScaler().fit_transform(X)

        # Select the projection method (using df for perplexity calculation)
        self.projection_model = self._get_projection_model(projection, df)

        # Apply the projection on the normalized data
        projection_result = self.projection_model.fit_transform(X_scaled)
        df[["x", "y"]] = projection_result

        # Resolve coincident points by adding jitter and return the new DataFrame
        return self.resolve_coincident_points(df)

    def _get_projection_model(self, projection: str, df: pd.DataFrame):
        """Select and return the appropriate projection model.

        Args:
            projection (str): The projection method ('TSNE', 'MDS', 'PCA', or 'UMAP').
            df (pd.DataFrame): The DataFrame being transformed (used for computing perplexity).

        Returns:
            A dimensionality reduction model instance. TSNE is returned as the fallback when
            the method name is not recognized or UMAP is requested but not available.
        """
        if projection == "TSNE":
            # Perplexity is capped at 40 and kept below the number of rows
            perplexity = min(40, len(df) - 1)
            self.log.info(f"Projection: TSNE with perplexity {perplexity}")
            return TSNE(perplexity=perplexity)

        if projection == "MDS":
            self.log.info("Projection: MDS")
            return MDS(n_components=2, random_state=0)

        if projection == "PCA":
            self.log.info("Projection: PCA")
            return PCA(n_components=2)

        if projection == "UMAP" and UMAP_AVAILABLE:
            self.log.info("Projection: UMAP")
            return umap.UMAP(n_components=2)

        self.log.warning(
            f"Projection method '{projection}' not recognized or UMAP not available. Falling back to TSNE."
        )
        return TSNE(perplexity=min(40, len(df) - 1))

    @staticmethod
    def resolve_coincident_points(df: pd.DataFrame) -> pd.DataFrame:
        """Resolve coincident points in a DataFrame by adding jitter.

        Gaussian noise is added to every row's 'x' and 'y' value; the noise scale is 0.5% of
        each column's value range. The DataFrame is modified in place and also returned.
        No random seed is set here, so results vary between calls.

        Args:
            df (pd.DataFrame): The DataFrame containing x and y projection coordinates.

        Returns:
            pd.DataFrame: The DataFrame with resolved coincident points.
        """
        jitter_x = (df["x"].max() - df["x"].min()) * 0.005
        jitter_y = (df["y"].max() - df["y"].min()) * 0.005
        df["x"] += np.random.normal(0, jitter_x, len(df))
        df["y"] += np.random.normal(0, jitter_y, len(df))
        return df

__init__()

Initialize the Projection2D class.

Source code in src/workbench/algorithms/dataframe/projection_2d.py
def __init__(self):
    """Create a Projection2D with no projection model selected yet."""
    # Starts empty; fit_transform() assigns the model it selects
    self.projection_model = None
    # Messages go to the "workbench" logger
    self.log = logging.getLogger("workbench")

fit_transform(input_df, features=None, projection='UMAP')

Fit and transform a DataFrame using the selected dimensionality reduction method.

This method creates a copy of the input DataFrame, processes the specified features for normalization and projection, and returns a new DataFrame with added 'x' and 'y' columns containing the projected 2D coordinates.

Parameters:

Name Type Description Default
input_df DataFrame

The DataFrame containing features to project.

required
features list

List of feature column names. If None, numeric columns are auto-selected.

None
projection str

The projection to use ('UMAP', 'TSNE', 'MDS' or 'PCA'). Default 'UMAP'.

'UMAP'

Returns:

Type Description
DataFrame

pd.DataFrame: A new DataFrame (a copy of input_df) with added 'x' and 'y' columns.

Source code in src/workbench/algorithms/dataframe/projection_2d.py
def fit_transform(self, input_df: pd.DataFrame, features: list = None, projection: str = "UMAP") -> pd.DataFrame:
    """Fit and transform a DataFrame using the selected dimensionality reduction method.

    This method creates a copy of the input DataFrame, processes the specified features
    for normalization and projection, and returns a new DataFrame with added 'x' and 'y' columns
    containing the projected 2D coordinates.

    If fewer than two features are available, or the DataFrame is empty, a critical message
    is logged and the copy is returned without 'x' and 'y' columns.

    Args:
        input_df (pd.DataFrame): The DataFrame containing features to project.
        features (list, optional): List of feature column names. If None, numeric columns are auto-selected.
        projection (str, optional): The projection to use ('UMAP', 'TSNE', 'MDS' or 'PCA'). Default 'UMAP'.

    Returns:
        pd.DataFrame: A new DataFrame (a copy of input_df) with added 'x' and 'y' columns.
    """
    # Create a copy of the input DataFrame
    df = input_df.copy()

    # Auto-identify numeric features if none are provided
    if features is None:
        # Column labels are not guaranteed to be strings (e.g. integer labels), so
        # stringify before the suffix test instead of calling .endswith on the label
        features = [col for col in df.select_dtypes(include="number").columns if not str(col).endswith("id")]
        self.log.info(f"Auto-identified numeric features: {features}")

    if len(features) < 2 or df.empty:
        self.log.critical("At least two numeric features are required, and DataFrame must not be empty.")
        return df

    # Process a copy of the feature data for projection
    X = df[features]
    X = X.apply(lambda col: col.fillna(col.mean()))
    # A column that is entirely NaN has a NaN mean, so the fill above leaves it NaN;
    # zero-fill whatever remains so no NaN reaches the scaler or the projection model
    X = X.fillna(0.0)
    X_scaled = StandardScaler().fit_transform(X)

    # Select the projection method (using df for perplexity calculation)
    self.projection_model = self._get_projection_model(projection, df)

    # Apply the projection on the normalized data
    projection_result = self.projection_model.fit_transform(X_scaled)
    df[["x", "y"]] = projection_result

    # Resolve coincident points by adding jitter and return the new DataFrame
    return self.resolve_coincident_points(df)

resolve_coincident_points(df) staticmethod

Resolve coincident points in a DataFrame by adding jitter.

Parameters:

Name Type Description Default
df DataFrame

The DataFrame containing x and y projection coordinates.

required

Returns:

Type Description
DataFrame

pd.DataFrame: The DataFrame with resolved coincident points.

Source code in src/workbench/algorithms/dataframe/projection_2d.py
@staticmethod
def resolve_coincident_points(df: pd.DataFrame) -> pd.DataFrame:
    """Separate overlapping points by jittering the projected coordinates.

    Gaussian noise is added to every row's 'x' and 'y' value; the noise scale is 0.5% of
    each column's value range. The DataFrame is modified in place and also returned.

    Args:
        df (pd.DataFrame): The DataFrame containing x and y projection coordinates.

    Returns:
        pd.DataFrame: The DataFrame with resolved coincident points.
    """
    # 'x' is handled before 'y', so the random draws happen in that order
    for axis in ("x", "y"):
        coords = df[axis]
        noise_scale = (coords.max() - coords.min()) * 0.005
        df[axis] = coords + np.random.normal(0, noise_scale, len(df))
    return df

Questions?

The SuperCowPowers team is happy to answer any questions you may have about AWS and Workbench. Please contact us at workbench@supercowpowers.com or chat with us on Discord.