
Pandas Dataframe Algorithms

Pandas Dataframes

Pandas DataFrames won't scale as well as our Spark and SQL Algorithms, but for moderately sized data these algorithms provide some nice functionality.

Pandas Dataframe Algorithms

Workbench has a growing set of algorithms and data processing tools for Pandas DataFrames. In general, these algorithms take a DataFrame as input and give you back a DataFrame with additional columns.

FeatureSpaceProximity

Bases: Proximity

Proximity computations for numeric feature spaces using Euclidean distance.

Source code in src/workbench/algorithms/dataframe/feature_space_proximity.py
class FeatureSpaceProximity(Proximity):
    """Proximity computations for numeric feature spaces using Euclidean distance."""

    def __init__(
        self,
        df: pd.DataFrame,
        id_column: str,
        features: List[str],
        target: Optional[str] = None,
        include_all_columns: bool = False,
    ):
        """
        Initialize the FeatureSpaceProximity class.

        Args:
            df: DataFrame containing data for neighbor computations.
            id_column: Name of the column used as the identifier.
            features: List of feature column names to be used for neighbor computations.
            target: Name of the target column. Defaults to None.
            include_all_columns: Include all DataFrame columns in neighbor results. Defaults to False.
        """
        # Validate and filter features before calling parent init
        self._raw_features = features
        super().__init__(
            df, id_column=id_column, features=features, target=target, include_all_columns=include_all_columns
        )

    def _prepare_data(self) -> None:
        """Filter out non-numeric features and drop NaN rows."""
        # Validate features
        self.features = self._validate_features(self.df, self._raw_features)

        # Drop NaN rows for the features we're using
        self.df = self.df.dropna(subset=self.features).copy()

    def _validate_features(self, df: pd.DataFrame, features: List[str]) -> List[str]:
        """Remove non-numeric features and log warnings."""
        non_numeric = [f for f in features if f not in df.select_dtypes(include=["number"]).columns]
        if non_numeric:
            log.warning(f"Non-numeric features {non_numeric} aren't currently supported, excluding them")
        return [f for f in features if f not in non_numeric]

    def _build_model(self) -> None:
        """Standardize features and fit Nearest Neighbors model."""
        self.scaler = StandardScaler()
        X = self.scaler.fit_transform(self.df[self.features])
        self.nn = NearestNeighbors().fit(X)

    def _transform_features(self, df: pd.DataFrame) -> np.ndarray:
        """Transform features using the fitted scaler."""
        return self.scaler.transform(df[self.features])

    def _project_2d(self) -> None:
        """Project the numeric features to 2D for visualization."""
        if len(self.features) >= 2:
            self.df = Projection2D().fit_transform(self.df, features=self.features)

__init__(df, id_column, features, target=None, include_all_columns=False)

Initialize the FeatureSpaceProximity class.

Parameters:

df (DataFrame, required): DataFrame containing data for neighbor computations.
id_column (str, required): Name of the column used as the identifier.
features (List[str], required): List of feature column names to be used for neighbor computations.
target (Optional[str], default None): Name of the target column.
include_all_columns (bool, default False): Include all DataFrame columns in neighbor results.
Source code in src/workbench/algorithms/dataframe/feature_space_proximity.py
def __init__(
    self,
    df: pd.DataFrame,
    id_column: str,
    features: List[str],
    target: Optional[str] = None,
    include_all_columns: bool = False,
):
    """
    Initialize the FeatureSpaceProximity class.

    Args:
        df: DataFrame containing data for neighbor computations.
        id_column: Name of the column used as the identifier.
        features: List of feature column names to be used for neighbor computations.
        target: Name of the target column. Defaults to None.
        include_all_columns: Include all DataFrame columns in neighbor results. Defaults to False.
    """
    # Validate and filter features before calling parent init
    self._raw_features = features
    super().__init__(
        df, id_column=id_column, features=features, target=target, include_all_columns=include_all_columns
    )
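
Example usage (a minimal sketch, not from the Workbench docs: the import path is assumed from the source location above, the DataFrame contents are made up, and the neighbors() call comes from the parent Proximity class):

import pandas as pd
from workbench.algorithms.dataframe.feature_space_proximity import FeatureSpaceProximity

# Hypothetical data: an id column, two numeric features, and a target
df = pd.DataFrame({
    "id": ["a", "b", "c", "d", "e", "f"],
    "feat1": [0.1, 0.2, 0.9, 1.0, 0.5, 0.4],
    "feat2": [1.0, 1.1, 0.2, 0.1, 0.6, 0.7],
    "price": [10, 11, 30, 31, 20, 19],
})

prox = FeatureSpaceProximity(df, id_column="id", features=["feat1", "feat2"], target="price")

# Nearest neighbors for a single id (features are standardized, distances are Euclidean)
print(prox.neighbors("a", n_neighbors=3))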

FingerprintProximity

Bases: Proximity

Proximity computations for binary fingerprints using Tanimoto similarity.

Note: Tanimoto similarity is equivalent to Jaccard similarity for binary vectors. Tanimoto(A, B) = |A ∩ B| / |A ∪ B|
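
For reference, the identity above is easy to verify with NumPy on two small bit vectors (a standalone snippet, not part of the Workbench API):

import numpy as np

a = np.array([1, 1, 0, 1, 0, 0, 1, 0], dtype=bool)
b = np.array([1, 0, 0, 1, 1, 0, 1, 0], dtype=bool)

tanimoto = np.logical_and(a, b).sum() / np.logical_or(a, b).sum()  # 3 shared bits / 5 total set = 0.6
jaccard_distance = 1 - tanimoto  # what sklearn's "jaccard" metric reports (0.4)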

Source code in src/workbench/algorithms/dataframe/fingerprint_proximity.py
class FingerprintProximity(Proximity):
    """Proximity computations for binary fingerprints using Tanimoto similarity.

    Note: Tanimoto similarity is equivalent to Jaccard similarity for binary vectors.
    Tanimoto(A, B) = |A ∩ B| / |A ∪ B|
    """

    def __init__(
        self,
        df: pd.DataFrame,
        id_column: str,
        fingerprint_column: Optional[str] = None,
        target: Optional[str] = None,
        include_all_columns: bool = False,
        radius: int = 2,
        n_bits: int = 1024,
        counts: bool = False,
    ) -> None:
        """
        Initialize the FingerprintProximity class for binary fingerprint similarity.

        Args:
            df: DataFrame containing fingerprints or SMILES.
            id_column: Name of the column used as an identifier.
            fingerprint_column: Name of the column containing fingerprints (bit strings).
                If None, looks for existing "fingerprint" column or computes from SMILES.
            target: Name of the target column. Defaults to None.
            include_all_columns: Include all DataFrame columns in neighbor results. Defaults to False.
            radius: Radius for Morgan fingerprint computation (default: 2).
            n_bits: Number of bits for fingerprint (default: 1024).
            counts: Whether to use count simulation (default: False).
        """
        # Store fingerprint computation parameters
        self._fp_radius = radius
        self._fp_n_bits = n_bits
        self._fp_counts = counts

        # Store the requested fingerprint column (may be None)
        self._fingerprint_column_arg = fingerprint_column

        # Determine fingerprint column name (but don't compute yet - that happens in _prepare_data)
        self.fingerprint_column = self._resolve_fingerprint_column_name(df, fingerprint_column)

        # Call parent constructor with fingerprint_column as the only "feature"
        super().__init__(
            df,
            id_column=id_column,
            features=[self.fingerprint_column],
            target=target,
            include_all_columns=include_all_columns,
        )

    @staticmethod
    def _resolve_fingerprint_column_name(df: pd.DataFrame, fingerprint_column: Optional[str]) -> str:
        """
        Determine the fingerprint column name, validating it exists or can be computed.

        Args:
            df: Input DataFrame.
            fingerprint_column: Explicitly specified fingerprint column, or None.

        Returns:
            Name of the fingerprint column to use.

        Raises:
            ValueError: If no fingerprint column exists and no SMILES column found.
        """
        # If explicitly provided, validate it exists
        if fingerprint_column is not None:
            if fingerprint_column not in df.columns:
                raise ValueError(f"Fingerprint column '{fingerprint_column}' not found in DataFrame")
            return fingerprint_column

        # Check for existing "fingerprint" column
        if "fingerprint" in df.columns:
            log.info("Using existing 'fingerprint' column")
            return "fingerprint"

        # Will need to compute from SMILES - validate SMILES column exists
        smiles_column = next((col for col in df.columns if col.lower() == "smiles"), None)
        if smiles_column is None:
            raise ValueError(
                "No fingerprint column provided and no SMILES column found. "
                "Either provide a fingerprint_column or include a 'smiles' column in the DataFrame."
            )

        # Fingerprints will be computed in _prepare_data
        return "fingerprint"

    def _prepare_data(self) -> None:
        """Compute fingerprints from SMILES if needed."""
        # If fingerprint column doesn't exist yet, compute it
        if self.fingerprint_column not in self.df.columns:
            log.info(f"Computing Morgan fingerprints (radius={self._fp_radius}, n_bits={self._fp_n_bits})...")
            self.df = compute_morgan_fingerprints(
                self.df, radius=self._fp_radius, n_bits=self._fp_n_bits, counts=self._fp_counts
            )

    def _build_model(self) -> None:
        """
        Build the fingerprint proximity model for Tanimoto similarity.
        Converts fingerprint strings to binary arrays and initializes NearestNeighbors.

        Note: sklearn uses Jaccard distance internally (1 - Tanimoto similarity).
        We convert back to Tanimoto similarity in the output methods.
        """
        log.info("Converting fingerprints to binary feature matrix...")

        # Convert fingerprint strings to binary arrays and store for later use
        self.X = self._fingerprints_to_matrix(self.df)

        # sklearn uses Jaccard distance = 1 - Tanimoto similarity
        # We convert to Tanimoto similarity in neighbors() and _precompute_metrics()
        log.info("Building NearestNeighbors model (Jaccard/Tanimoto metric, BallTree)...")
        self.nn = NearestNeighbors(metric="jaccard", algorithm="ball_tree").fit(self.X)

    def _transform_features(self, df: pd.DataFrame) -> np.ndarray:
        """
        Transform fingerprints to binary matrix for querying.

        Args:
            df: DataFrame containing fingerprints to transform.

        Returns:
            Binary feature matrix for the fingerprints.
        """
        return self._fingerprints_to_matrix(df)

    def _fingerprints_to_matrix(self, df: pd.DataFrame) -> np.ndarray:
        """
        Convert fingerprint strings to a binary numpy matrix.

        Args:
            df: DataFrame containing fingerprint column.

        Returns:
            2D numpy array of binary fingerprint bits.
        """
        fingerprint_bits = df[self.fingerprint_column].apply(
            lambda fp: np.array([int(bit) for bit in fp], dtype=np.bool_)
        )
        return np.vstack(fingerprint_bits)

    def _precompute_metrics(self) -> None:
        """Precompute metrics, adding Tanimoto similarity alongside distance."""
        # Call parent to compute nn_distance (Jaccard), nn_id, nn_target, nn_target_diff
        super()._precompute_metrics()

        # Add Tanimoto similarity (keep nn_distance for internal use by target_gradients)
        self.df["nn_similarity"] = 1 - self.df["nn_distance"]

    def _set_core_columns(self) -> None:
        """Set core columns using nn_similarity instead of nn_distance."""
        self.core_columns = [self.id_column, "nn_similarity", "nn_id"]
        if self.target:
            self.core_columns.extend([self.target, "nn_target", "nn_target_diff"])

    def _project_2d(self) -> None:
        """Project the fingerprint matrix to 2D for visualization using UMAP with Jaccard metric."""
        self.df = Projection2D().fit_transform(self.df, feature_matrix=self.X, metric="jaccard")

    def isolated(self, top_percent: float = 1.0) -> pd.DataFrame:
        """
        Find isolated data points based on Tanimoto similarity to nearest neighbor.

        Args:
            top_percent: Percentage of most isolated data points to return (e.g., 1.0 returns top 1%)

        Returns:
            DataFrame of observations with lowest Tanimoto similarity, sorted ascending
        """
        # For Tanimoto similarity, isolated means LOW similarity to nearest neighbor
        percentile = top_percent
        threshold = np.percentile(self.df["nn_similarity"], percentile)
        isolated = self.df[self.df["nn_similarity"] <= threshold].copy()
        isolated = isolated.sort_values("nn_similarity", ascending=True).reset_index(drop=True)
        return isolated if self.include_all_columns else isolated[self.core_columns]

    def proximity_stats(self) -> pd.DataFrame:
        """
        Return distribution statistics for nearest neighbor Tanimoto similarity.

        Returns:
            DataFrame with similarity distribution statistics (count, mean, std, percentiles)
        """
        return (
            self.df["nn_similarity"]
            .describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
            .to_frame()
        )

    def neighbors(
        self,
        id_or_ids: Union[str, int, List[Union[str, int]]],
        n_neighbors: Optional[int] = 5,
        min_similarity: Optional[float] = None,
        include_self: bool = True,
    ) -> pd.DataFrame:
        """
        Return neighbors for ID(s) from the existing dataset.

        Args:
            id_or_ids: Single ID or list of IDs to look up
            n_neighbors: Number of neighbors to return (default: 5, ignored if min_similarity is set)
            min_similarity: If provided, find all neighbors with Tanimoto similarity >= this value (0-1)
            include_self: Whether to include self in results (default: True)

        Returns:
            DataFrame containing neighbors with Tanimoto similarity scores
        """
        # Convert min_similarity to radius (Jaccard distance = 1 - Tanimoto similarity)
        radius = 1 - min_similarity if min_similarity is not None else None

        # Call parent method (returns Jaccard distance)
        neighbors_df = super().neighbors(
            id_or_ids=id_or_ids,
            n_neighbors=n_neighbors,
            radius=radius,
            include_self=include_self,
        )

        # Convert Jaccard distance to Tanimoto similarity
        neighbors_df["similarity"] = 1 - neighbors_df["distance"]
        neighbors_df.drop(columns=["distance"], inplace=True)

        return neighbors_df

__init__(df, id_column, fingerprint_column=None, target=None, include_all_columns=False, radius=2, n_bits=1024, counts=False)

Initialize the FingerprintProximity class for binary fingerprint similarity.

Parameters:

df (DataFrame, required): DataFrame containing fingerprints or SMILES.
id_column (str, required): Name of the column used as an identifier.
fingerprint_column (Optional[str], default None): Name of the column containing fingerprints (bit strings). If None, looks for an existing "fingerprint" column or computes from SMILES.
target (Optional[str], default None): Name of the target column.
include_all_columns (bool, default False): Include all DataFrame columns in neighbor results.
radius (int, default 2): Radius for Morgan fingerprint computation.
n_bits (int, default 1024): Number of bits for the fingerprint.
counts (bool, default False): Whether to use count simulation.
Source code in src/workbench/algorithms/dataframe/fingerprint_proximity.py
def __init__(
    self,
    df: pd.DataFrame,
    id_column: str,
    fingerprint_column: Optional[str] = None,
    target: Optional[str] = None,
    include_all_columns: bool = False,
    radius: int = 2,
    n_bits: int = 1024,
    counts: bool = False,
) -> None:
    """
    Initialize the FingerprintProximity class for binary fingerprint similarity.

    Args:
        df: DataFrame containing fingerprints or SMILES.
        id_column: Name of the column used as an identifier.
        fingerprint_column: Name of the column containing fingerprints (bit strings).
            If None, looks for existing "fingerprint" column or computes from SMILES.
        target: Name of the target column. Defaults to None.
        include_all_columns: Include all DataFrame columns in neighbor results. Defaults to False.
        radius: Radius for Morgan fingerprint computation (default: 2).
        n_bits: Number of bits for fingerprint (default: 1024).
        counts: Whether to use count simulation (default: False).
    """
    # Store fingerprint computation parameters
    self._fp_radius = radius
    self._fp_n_bits = n_bits
    self._fp_counts = counts

    # Store the requested fingerprint column (may be None)
    self._fingerprint_column_arg = fingerprint_column

    # Determine fingerprint column name (but don't compute yet - that happens in _prepare_data)
    self.fingerprint_column = self._resolve_fingerprint_column_name(df, fingerprint_column)

    # Call parent constructor with fingerprint_column as the only "feature"
    super().__init__(
        df,
        id_column=id_column,
        features=[self.fingerprint_column],
        target=target,
        include_all_columns=include_all_columns,
    )

isolated(top_percent=1.0)

Find isolated data points based on Tanimoto similarity to nearest neighbor.

Parameters:

top_percent (float, default 1.0): Percentage of most isolated data points to return (e.g., 1.0 returns the top 1%).

Returns:

DataFrame: Observations with the lowest Tanimoto similarity, sorted ascending.

Source code in src/workbench/algorithms/dataframe/fingerprint_proximity.py
def isolated(self, top_percent: float = 1.0) -> pd.DataFrame:
    """
    Find isolated data points based on Tanimoto similarity to nearest neighbor.

    Args:
        top_percent: Percentage of most isolated data points to return (e.g., 1.0 returns top 1%)

    Returns:
        DataFrame of observations with lowest Tanimoto similarity, sorted ascending
    """
    # For Tanimoto similarity, isolated means LOW similarity to nearest neighbor
    percentile = top_percent
    threshold = np.percentile(self.df["nn_similarity"], percentile)
    isolated = self.df[self.df["nn_similarity"] <= threshold].copy()
    isolated = isolated.sort_values("nn_similarity", ascending=True).reset_index(drop=True)
    return isolated if self.include_all_columns else isolated[self.core_columns]
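
A quick usage sketch (fp_prox is a hypothetical FingerprintProximity instance): top_percent=1.0 keeps the rows whose nearest-neighbor similarity falls at or below the 1st percentile.

# Bottom 1% by Tanimoto similarity to nearest neighbor
outliers = fp_prox.isolated(top_percent=1.0)

# A looser cut: bottom 5%
outliers_5pct = fp_prox.isolated(top_percent=5.0)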

neighbors(id_or_ids, n_neighbors=5, min_similarity=None, include_self=True)

Return neighbors for ID(s) from the existing dataset.

Parameters:

id_or_ids (Union[str, int, List[Union[str, int]]], required): Single ID or list of IDs to look up.
n_neighbors (Optional[int], default 5): Number of neighbors to return (ignored if min_similarity is set).
min_similarity (Optional[float], default None): If provided, find all neighbors with Tanimoto similarity >= this value (0-1).
include_self (bool, default True): Whether to include self in results.

Returns:

DataFrame: Neighbors with Tanimoto similarity scores.

Source code in src/workbench/algorithms/dataframe/fingerprint_proximity.py
def neighbors(
    self,
    id_or_ids: Union[str, int, List[Union[str, int]]],
    n_neighbors: Optional[int] = 5,
    min_similarity: Optional[float] = None,
    include_self: bool = True,
) -> pd.DataFrame:
    """
    Return neighbors for ID(s) from the existing dataset.

    Args:
        id_or_ids: Single ID or list of IDs to look up
        n_neighbors: Number of neighbors to return (default: 5, ignored if min_similarity is set)
        min_similarity: If provided, find all neighbors with Tanimoto similarity >= this value (0-1)
        include_self: Whether to include self in results (default: True)

    Returns:
        DataFrame containing neighbors with Tanimoto similarity scores
    """
    # Convert min_similarity to radius (Jaccard distance = 1 - Tanimoto similarity)
    radius = 1 - min_similarity if min_similarity is not None else None

    # Call parent method (returns Jaccard distance)
    neighbors_df = super().neighbors(
        id_or_ids=id_or_ids,
        n_neighbors=n_neighbors,
        radius=radius,
        include_self=include_self,
    )

    # Convert Jaccard distance to Tanimoto similarity
    neighbors_df["similarity"] = 1 - neighbors_df["distance"]
    neighbors_df.drop(columns=["distance"], inplace=True)

    return neighbors_df
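
A quick usage sketch (fp_prox and the molecule ids are hypothetical): passing min_similarity=0.7 is converted internally to a Jaccard radius of 1 - 0.7 = 0.3.

# All neighbors with Tanimoto similarity >= 0.7
hits = fp_prox.neighbors("mol_42", min_similarity=0.7)

# Default behavior: 5 nearest neighbors (including the query itself) for multiple ids
top5 = fp_prox.neighbors(["mol_42", "mol_99"], n_neighbors=5)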

proximity_stats()

Return distribution statistics for nearest neighbor Tanimoto similarity.

Returns:

DataFrame: Similarity distribution statistics (count, mean, std, percentiles).

Source code in src/workbench/algorithms/dataframe/fingerprint_proximity.py
def proximity_stats(self) -> pd.DataFrame:
    """
    Return distribution statistics for nearest neighbor Tanimoto similarity.

    Returns:
        DataFrame with similarity distribution statistics (count, mean, std, percentiles)
    """
    return (
        self.df["nn_similarity"]
        .describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
        .to_frame()
    )
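
Putting it together, a minimal end-to-end sketch (the import path is assumed from the source location above, the SMILES and column names are made up, and fingerprint computation from SMILES requires RDKit):

import pandas as pd
from workbench.algorithms.dataframe.fingerprint_proximity import FingerprintProximity

# Hypothetical data: an id column, SMILES strings, and a target
df = pd.DataFrame({
    "id": ["mol_1", "mol_2", "mol_3", "mol_4"],
    "smiles": ["CCO", "CCN", "CCC", "c1ccccc1"],
    "logS": [-0.2, -0.5, -0.8, -2.0],
})

# No fingerprint_column given, so Morgan fingerprints (radius=2, 1024 bits) are computed from SMILES
fp_prox = FingerprintProximity(df, id_column="id", target="logS")

print(fp_prox.proximity_stats())                  # nearest-neighbor Tanimoto similarity distribution
print(fp_prox.neighbors("mol_1", n_neighbors=2))  # closest molecules by Tanimoto similarity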

Projection2D

Perform Dimensionality Reduction on a DataFrame using TSNE, MDS, PCA, or UMAP.

Source code in src/workbench/algorithms/dataframe/projection_2d.py
class Projection2D:
    """Perform Dimensionality Reduction on a DataFrame using TSNE, MDS, PCA, or UMAP."""

    def __init__(self):
        """Initialize the Projection2D class."""
        self.log = logging.getLogger("workbench")
        self.projection_model = None

    def fit_transform(
        self,
        input_df: pd.DataFrame,
        features: list = None,
        feature_matrix: np.ndarray = None,
        metric: str = "euclidean",
        projection: str = "UMAP",
    ) -> pd.DataFrame:
        """Fit and transform a DataFrame using the selected dimensionality reduction method.

        This method creates a copy of the input DataFrame, processes the specified features
        for normalization and projection, and returns a new DataFrame with added 'x' and 'y' columns
        containing the projected 2D coordinates.

        Args:
            input_df (pd.DataFrame): The DataFrame containing features to project.
            features (list, optional): List of feature column names. If None, numeric columns are auto-selected.
            feature_matrix (np.ndarray, optional): Pre-computed feature matrix. If provided, features is ignored
                and no scaling is applied (caller is responsible for appropriate preprocessing).
            metric (str, optional): Distance metric for UMAP (e.g., 'euclidean', 'jaccard'). Default 'euclidean'.
            projection (str, optional): The projection to use ('UMAP', 'TSNE', 'MDS' or 'PCA'). Default 'UMAP'.

        Returns:
            pd.DataFrame: A new DataFrame (a copy of input_df) with added 'x' and 'y' columns.
        """
        # Create a copy of the input DataFrame
        df = input_df.copy()

        # If a feature matrix is provided, use it directly (no scaling)
        if feature_matrix is not None:
            if len(feature_matrix) != len(df):
                self.log.critical("feature_matrix length must match DataFrame length.")
                return df
            X_processed = feature_matrix
        else:
            # Auto-identify numeric features if none are provided
            if features is None:
                features = [col for col in df.select_dtypes(include="number").columns if not col.endswith("id")]
                self.log.info(f"Auto-identified numeric features: {features}")

            if len(features) < 2 or df.empty:
                self.log.critical("At least two numeric features are required, and DataFrame must not be empty.")
                return df

            # Process a copy of the feature data for projection
            X = df[features]
            X = X.apply(lambda col: col.fillna(col.mean()))
            X_processed = StandardScaler().fit_transform(X)

        # Select the projection method (using df for perplexity calculation)
        self.projection_model = self._get_projection_model(projection, df, metric=metric)

        # Apply the projection on the processed data
        projection_result = self.projection_model.fit_transform(X_processed)
        df[["x", "y"]] = projection_result

        # Resolve coincident points and return the new DataFrame
        return self.resolve_coincident_points(df)

    def _get_projection_model(self, projection: str, df: pd.DataFrame, metric: str = "euclidean"):
        """Select and return the appropriate projection model.

        Args:
            projection (str): The projection method ('TSNE', 'MDS', 'PCA', or 'UMAP').
            df (pd.DataFrame): The DataFrame being transformed (used for computing perplexity).
            metric (str): Distance metric for UMAP (default 'euclidean').

        Returns:
            A dimensionality reduction model instance.
        """
        if projection == "TSNE":
            perplexity = min(40, len(df) - 1)
            self.log.info(f"Projection: TSNE with perplexity {perplexity}")
            return TSNE(perplexity=perplexity)

        if projection == "MDS":
            self.log.info("Projection: MDS")
            return MDS(n_components=2, random_state=0)

        if projection == "PCA":
            self.log.info("Projection: PCA")
            return PCA(n_components=2)

        if projection == "UMAP" and UMAP_AVAILABLE:
            self.log.info(f"Projection: UMAP with metric={metric}")
            return umap.UMAP(n_components=2, metric=metric)

        self.log.warning(
            f"Projection method '{projection}' not recognized or UMAP not available. Falling back to TSNE."
        )
        return TSNE(perplexity=min(40, len(df) - 1))

    @staticmethod
    def resolve_coincident_points(df: pd.DataFrame) -> pd.DataFrame:
        """Resolve coincident points using random jitter

        Args:
            df (pd.DataFrame): DataFrame with x and y coordinates.

        Returns:
            pd.DataFrame: DataFrame with resolved coincident points
        """

        # Set jitter size based on rounding precision
        precision = 3
        jitter_amount = 10 ** (-precision) * 2  # 2x the rounding precision

        # Create rounded values for grouping
        rounded = pd.DataFrame(
            {"x_round": df["x"].round(precision), "y_round": df["y"].round(precision), "idx": df.index}
        )

        # Find duplicates
        duplicated = rounded.duplicated(subset=["x_round", "y_round"], keep=False)
        if not duplicated.any():
            return df

        # Get the dtypes of the columns
        x_dtype = df["x"].dtype
        y_dtype = df["y"].dtype

        # Process each group
        for (x_round, y_round), group in rounded[duplicated].groupby(["x_round", "y_round"]):
            indices = group["idx"].values
            if len(indices) <= 1:
                continue

            # Apply random jitter to all points
            for i, idx in enumerate(indices):
                # Generate and apply properly typed offsets
                dx = np.array(jitter_amount * (np.random.random() * 2 - 1), dtype=x_dtype)
                dy = np.array(jitter_amount * (np.random.random() * 2 - 1), dtype=y_dtype)
                df.loc[idx, "x"] += dx
                df.loc[idx, "y"] += dy

        return df

__init__()

Initialize the Projection2D class.

Source code in src/workbench/algorithms/dataframe/projection_2d.py
def __init__(self):
    """Initialize the Projection2D class."""
    self.log = logging.getLogger("workbench")
    self.projection_model = None

fit_transform(input_df, features=None, feature_matrix=None, metric='euclidean', projection='UMAP')

Fit and transform a DataFrame using the selected dimensionality reduction method.

This method creates a copy of the input DataFrame, processes the specified features for normalization and projection, and returns a new DataFrame with added 'x' and 'y' columns containing the projected 2D coordinates.

Parameters:

input_df (DataFrame, required): The DataFrame containing features to project.
features (list, default None): List of feature column names. If None, numeric columns are auto-selected.
feature_matrix (ndarray, default None): Pre-computed feature matrix. If provided, features is ignored and no scaling is applied (the caller is responsible for appropriate preprocessing).
metric (str, default 'euclidean'): Distance metric for UMAP (e.g., 'euclidean', 'jaccard').
projection (str, default 'UMAP'): The projection to use ('UMAP', 'TSNE', 'MDS', or 'PCA').

Returns:

pd.DataFrame: A new DataFrame (a copy of input_df) with added 'x' and 'y' columns.

Source code in src/workbench/algorithms/dataframe/projection_2d.py
def fit_transform(
    self,
    input_df: pd.DataFrame,
    features: list = None,
    feature_matrix: np.ndarray = None,
    metric: str = "euclidean",
    projection: str = "UMAP",
) -> pd.DataFrame:
    """Fit and transform a DataFrame using the selected dimensionality reduction method.

    This method creates a copy of the input DataFrame, processes the specified features
    for normalization and projection, and returns a new DataFrame with added 'x' and 'y' columns
    containing the projected 2D coordinates.

    Args:
        input_df (pd.DataFrame): The DataFrame containing features to project.
        features (list, optional): List of feature column names. If None, numeric columns are auto-selected.
        feature_matrix (np.ndarray, optional): Pre-computed feature matrix. If provided, features is ignored
            and no scaling is applied (caller is responsible for appropriate preprocessing).
        metric (str, optional): Distance metric for UMAP (e.g., 'euclidean', 'jaccard'). Default 'euclidean'.
        projection (str, optional): The projection to use ('UMAP', 'TSNE', 'MDS' or 'PCA'). Default 'UMAP'.

    Returns:
        pd.DataFrame: A new DataFrame (a copy of input_df) with added 'x' and 'y' columns.
    """
    # Create a copy of the input DataFrame
    df = input_df.copy()

    # If a feature matrix is provided, use it directly (no scaling)
    if feature_matrix is not None:
        if len(feature_matrix) != len(df):
            self.log.critical("feature_matrix length must match DataFrame length.")
            return df
        X_processed = feature_matrix
    else:
        # Auto-identify numeric features if none are provided
        if features is None:
            features = [col for col in df.select_dtypes(include="number").columns if not col.endswith("id")]
            self.log.info(f"Auto-identified numeric features: {features}")

        if len(features) < 2 or df.empty:
            self.log.critical("At least two numeric features are required, and DataFrame must not be empty.")
            return df

        # Process a copy of the feature data for projection
        X = df[features]
        X = X.apply(lambda col: col.fillna(col.mean()))
        X_processed = StandardScaler().fit_transform(X)

    # Select the projection method (using df for perplexity calculation)
    self.projection_model = self._get_projection_model(projection, df, metric=metric)

    # Apply the projection on the processed data
    projection_result = self.projection_model.fit_transform(X_processed)
    df[["x", "y"]] = projection_result

    # Resolve coincident points and return the new DataFrame
    return self.resolve_coincident_points(df)

resolve_coincident_points(df) staticmethod

Resolve coincident points using random jitter

Parameters:

df (DataFrame, required): DataFrame with x and y coordinates.

Returns:

pd.DataFrame: DataFrame with resolved coincident points.

Source code in src/workbench/algorithms/dataframe/projection_2d.py
@staticmethod
def resolve_coincident_points(df: pd.DataFrame) -> pd.DataFrame:
    """Resolve coincident points using random jitter

    Args:
        df (pd.DataFrame): DataFrame with x and y coordinates.

    Returns:
        pd.DataFrame: DataFrame with resolved coincident points
    """

    # Set jitter size based on rounding precision
    precision = 3
    jitter_amount = 10 ** (-precision) * 2  # 2x the rounding precision

    # Create rounded values for grouping
    rounded = pd.DataFrame(
        {"x_round": df["x"].round(precision), "y_round": df["y"].round(precision), "idx": df.index}
    )

    # Find duplicates
    duplicated = rounded.duplicated(subset=["x_round", "y_round"], keep=False)
    if not duplicated.any():
        return df

    # Get the dtypes of the columns
    x_dtype = df["x"].dtype
    y_dtype = df["y"].dtype

    # Process each group
    for (x_round, y_round), group in rounded[duplicated].groupby(["x_round", "y_round"]):
        indices = group["idx"].values
        if len(indices) <= 1:
            continue

        # Apply random jitter to all points
        for i, idx in enumerate(indices):
            # Generate and apply properly typed offsets
            dx = np.array(jitter_amount * (np.random.random() * 2 - 1), dtype=x_dtype)
            dy = np.array(jitter_amount * (np.random.random() * 2 - 1), dtype=y_dtype)
            df.loc[idx, "x"] += dx
            df.loc[idx, "y"] += dy

    return df
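
Example usage (a minimal sketch: the import path is assumed from the source location above and the data is random):

import numpy as np
import pandas as pd
from workbench.algorithms.dataframe.projection_2d import Projection2D

# Hypothetical data: 50 rows with three numeric feature columns
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.normal(size=(50, 3)), columns=["feat1", "feat2", "feat3"])

# Adds 'x' and 'y' columns; falls back to TSNE if UMAP isn't installed
df_2d = Projection2D().fit_transform(df, features=["feat1", "feat2", "feat3"], projection="UMAP")
print(df_2d[["x", "y"]].head())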

Questions?

The SuperCowPowers team is happy to answer any questions you may have about AWS and Workbench. Please contact us at workbench@supercowpowers.com or chat us up on Discord.