Skip to content

Graph Algorithms

Graph Algorithms

WIP: These classes are currently actively being developed and are subject to change in both API and functionality over time.

Graph Algorithms

Docs TBD

ProximityGraph

Build a NetworkX graph using the Proximity class.

Source code in src/workbench/algorithms/graph/light/proximity_graph.py
class ProximityGraph:
    """Build a NetworkX graph using the Proximity class."""

    def __init__(self):
        """Build a NetworkX graph using the Proximity class."""

        # The graph is stored as NetworkX graph
        self._nx_graph = None

        # GraphStore
        self.graph_store = GraphStore()

    def build_graph(self, proximity_instance: Proximity) -> None:
        """
        Build a NetworkX graph using a Proximity class.

        Args:
            proximity_instance (Proximity): An instance of a Proximity class to compute neighbors.
        """
        # Retrieve all neighbors and their distances
        prox = proximity_instance
        node_df = prox.df
        id_column = prox.id_column
        log.info("Retrieving all neighbors...")
        all_neighbors_df = prox.all_neighbors()

        # Add nodes with attributes (features)
        log.info("Adding nodes to the proximity graph...")
        self._nx_graph = nx.Graph()

        # Check for duplicate IDs in the node DataFrame
        if not node_df[id_column].is_unique:
            log.error(f"Column '{id_column}' contains duplicate values. Using first occurrence for each ID...")
            node_df = node_df.drop_duplicates(subset=[id_column], keep="first")

        # Set the id_column as index and add nodes
        self._nx_graph.add_nodes_from(node_df.set_index(id_column, drop=False).to_dict("index").items())

        # Determine edge weights based on proximity type
        if prox.proximity_type == ProximityType.SIMILARITY:
            all_neighbors_df["weight"] = all_neighbors_df["similarity"]
        elif prox.proximity_type == ProximityType.DISTANCE:
            # Normalize and invert distance
            max_distance = all_neighbors_df["distance"].max()
            all_neighbors_df["weight"] = 1.0 - all_neighbors_df["distance"] / max_distance

        # Add edges to the graph
        log.info("Adding edges to the graph...")
        min_edges = 2
        min_weight = 0.8

        # Group by source ID and process each group
        for source_id, group in all_neighbors_df.groupby(id_column):
            # Sort by weight (assuming higher is better)
            sorted_group = group.sort_values("weight", ascending=False)

            # Take all edges up to min_edges (or all if fewer)
            actual_min_edges = min(len(sorted_group), min_edges)
            top_edges = sorted_group.iloc[:actual_min_edges]

            # Also take any additional neighbors above min_weight (beyond the top edges)
            high_weight_edges = sorted_group.iloc[actual_min_edges:][
                sorted_group.iloc[actual_min_edges:]["weight"] > min_weight
            ]

            # Combine both sets
            edges_to_add = pd.concat([top_edges, high_weight_edges])

            # Add all edges at once
            self._nx_graph.add_edges_from(
                [(source_id, row["neighbor_id"], {"weight": row["weight"]}) for _, row in edges_to_add.iterrows()]
            )

    @property
    def nx_graph(self) -> nx.Graph:
        """
        Get the NetworkX graph object.

        Returns:
            nx.Graph: The NetworkX graph object.
        """
        return self._nx_graph

    def load_graph(self, graph_path: str):
        """
        Load a graph from the GraphStore.

        Args:
            graph_path (str): The path to the graph in GraphStore.
        """
        self._nx_graph = self.graph_store.get(graph_path)

    def store_graph(self, graph_path: str):
        """
        Store the graph in the GraphStore.

        Args:
            graph_path (str): The path to store the graph in GraphStore.
        """
        self.graph_store.upsert(graph_path, self._nx_graph)

    def get_neighborhood(self, node_id: Union[str, int], radius: int = 1) -> nx.Graph:
        """
        Get a subgraph containing nodes within a given number of hops around a specific node.

        Args:
            node_id: The ID of the node to center the neighborhood around.
            radius: The number of hops to consider around the node (default: 1).

        Returns:
            nx.Graph: A subgraph containing the specified neighborhood.
        """
        # Use NetworkX's ego_graph to extract the neighborhood within the given radius
        if node_id in self._nx_graph:
            return nx.ego_graph(self._nx_graph, node_id, radius=radius)
        else:
            raise ValueError(f"Node ID '{node_id}' not found in the graph.")

nx_graph property

Get the NetworkX graph object.

Returns:

Type Description
Graph

nx.Graph: The NetworkX graph object.

__init__()

Build a NetworkX graph using the Proximity class.

Source code in src/workbench/algorithms/graph/light/proximity_graph.py
def __init__(self):
    """Build a NetworkX graph using the Proximity class."""

    # The graph is stored as NetworkX graph
    self._nx_graph = None

    # GraphStore
    self.graph_store = GraphStore()

build_graph(proximity_instance)

Build a NetworkX graph using a Proximity class.

Parameters:

Name Type Description Default
proximity_instance Proximity

An instance of a Proximity class to compute neighbors.

required
Source code in src/workbench/algorithms/graph/light/proximity_graph.py
def build_graph(self, proximity_instance: Proximity) -> None:
    """
    Build a NetworkX graph using a Proximity class.

    Args:
        proximity_instance (Proximity): An instance of a Proximity class to compute neighbors.
    """
    # Retrieve all neighbors and their distances
    prox = proximity_instance
    node_df = prox.df
    id_column = prox.id_column
    log.info("Retrieving all neighbors...")
    all_neighbors_df = prox.all_neighbors()

    # Add nodes with attributes (features)
    log.info("Adding nodes to the proximity graph...")
    self._nx_graph = nx.Graph()

    # Check for duplicate IDs in the node DataFrame
    if not node_df[id_column].is_unique:
        log.error(f"Column '{id_column}' contains duplicate values. Using first occurrence for each ID...")
        node_df = node_df.drop_duplicates(subset=[id_column], keep="first")

    # Set the id_column as index and add nodes
    self._nx_graph.add_nodes_from(node_df.set_index(id_column, drop=False).to_dict("index").items())

    # Determine edge weights based on proximity type
    if prox.proximity_type == ProximityType.SIMILARITY:
        all_neighbors_df["weight"] = all_neighbors_df["similarity"]
    elif prox.proximity_type == ProximityType.DISTANCE:
        # Normalize and invert distance
        max_distance = all_neighbors_df["distance"].max()
        all_neighbors_df["weight"] = 1.0 - all_neighbors_df["distance"] / max_distance

    # Add edges to the graph
    log.info("Adding edges to the graph...")
    min_edges = 2
    min_weight = 0.8

    # Group by source ID and process each group
    for source_id, group in all_neighbors_df.groupby(id_column):
        # Sort by weight (assuming higher is better)
        sorted_group = group.sort_values("weight", ascending=False)

        # Take all edges up to min_edges (or all if fewer)
        actual_min_edges = min(len(sorted_group), min_edges)
        top_edges = sorted_group.iloc[:actual_min_edges]

        # Also take any additional neighbors above min_weight (beyond the top edges)
        high_weight_edges = sorted_group.iloc[actual_min_edges:][
            sorted_group.iloc[actual_min_edges:]["weight"] > min_weight
        ]

        # Combine both sets
        edges_to_add = pd.concat([top_edges, high_weight_edges])

        # Add all edges at once
        self._nx_graph.add_edges_from(
            [(source_id, row["neighbor_id"], {"weight": row["weight"]}) for _, row in edges_to_add.iterrows()]
        )

get_neighborhood(node_id, radius=1)

Get a subgraph containing nodes within a given number of hops around a specific node.

Parameters:

Name Type Description Default
node_id Union[str, int]

The ID of the node to center the neighborhood around.

required
radius int

The number of hops to consider around the node (default: 1).

1

Returns:

Type Description
Graph

nx.Graph: A subgraph containing the specified neighborhood.

Source code in src/workbench/algorithms/graph/light/proximity_graph.py
def get_neighborhood(self, node_id: Union[str, int], radius: int = 1) -> nx.Graph:
    """
    Get a subgraph containing nodes within a given number of hops around a specific node.

    Args:
        node_id: The ID of the node to center the neighborhood around.
        radius: The number of hops to consider around the node (default: 1).

    Returns:
        nx.Graph: A subgraph containing the specified neighborhood.
    """
    # Use NetworkX's ego_graph to extract the neighborhood within the given radius
    if node_id in self._nx_graph:
        return nx.ego_graph(self._nx_graph, node_id, radius=radius)
    else:
        raise ValueError(f"Node ID '{node_id}' not found in the graph.")

load_graph(graph_path)

Load a graph from the GraphStore.

Parameters:

Name Type Description Default
graph_path str

The path to the graph in GraphStore.

required
Source code in src/workbench/algorithms/graph/light/proximity_graph.py
def load_graph(self, graph_path: str):
    """
    Load a graph from the GraphStore.

    Args:
        graph_path (str): The path to the graph in GraphStore.
    """
    self._nx_graph = self.graph_store.get(graph_path)

store_graph(graph_path)

Store the graph in the GraphStore.

Parameters:

Name Type Description Default
graph_path str

The path to store the graph in GraphStore.

required
Source code in src/workbench/algorithms/graph/light/proximity_graph.py
def store_graph(self, graph_path: str):
    """
    Store the graph in the GraphStore.

    Args:
        graph_path (str): The path to store the graph in GraphStore.
    """
    self.graph_store.upsert(graph_path, self._nx_graph)

show_graph(graph, id_column)

Display the graph using Plotly.

Source code in src/workbench/algorithms/graph/light/proximity_graph.py
def show_graph(graph, id_column):
    """Display the graph using Plotly."""
    graph_plot = GraphPlot()
    properties = graph_plot.update_properties(graph, labels=id_column, hover_text="all")
    fig = properties[0]
    fig.update_layout(paper_bgcolor="rgb(30,30,30)", plot_bgcolor="rgb(30,30,30)")
    fig.show()

Questions?

The SuperCowPowers team is happy to answer any questions you may have about AWS and Workbench. Please contact us at workbench@supercowpowers.com or on chat us up on Discord