Graph Algorithms
Graph Algorithms
WIP: These classes are currently actively being developed and are subject to change in both API and functionality over time.
Graph Algorithms
Docs TBD
ProximityGraph
Build a NetworkX graph using the Proximity class.
Source code in src/workbench/algorithms/graph/light/proximity_graph.py
| class ProximityGraph:
"""Build a NetworkX graph using the Proximity class."""
def __init__(self):
"""Build a NetworkX graph using the Proximity class."""
# The graph is stored as NetworkX graph
self._nx_graph = None
# GraphStore
self.graph_store = GraphStore()
def build_graph(self, proximity_instance: Proximity) -> None:
"""
Build a NetworkX graph using a Proximity class.
Args:
proximity_instance (Proximity): An instance of a Proximity class to compute neighbors.
"""
# Retrieve all neighbors and their distances
prox = proximity_instance
node_df = prox.df
id_column = prox.id_column
log.info("Retrieving all neighbors...")
all_neighbors_df = prox.all_neighbors()
# Add nodes with attributes (features)
log.info("Adding nodes to the proximity graph...")
self._nx_graph = nx.Graph()
# Check for duplicate IDs in the node DataFrame
if not node_df[id_column].is_unique:
log.error(f"Column '{id_column}' contains duplicate values. Using first occurrence for each ID...")
node_df = node_df.drop_duplicates(subset=[id_column], keep="first")
# Set the id_column as index and add nodes
self._nx_graph.add_nodes_from(node_df.set_index(id_column, drop=False).to_dict("index").items())
# Determine edge weights based on proximity type
if prox.proximity_type == ProximityType.SIMILARITY:
all_neighbors_df["weight"] = all_neighbors_df["similarity"]
elif prox.proximity_type == ProximityType.DISTANCE:
# Normalize and invert distance
max_distance = all_neighbors_df["distance"].max()
all_neighbors_df["weight"] = 1.0 - all_neighbors_df["distance"] / max_distance
# Add edges to the graph
log.info("Adding edges to the graph...")
min_edges = 2
min_weight = 0.8
# Group by source ID and process each group
for source_id, group in all_neighbors_df.groupby(id_column):
# Sort by weight (assuming higher is better)
sorted_group = group.sort_values("weight", ascending=False)
# Take all edges up to min_edges (or all if fewer)
actual_min_edges = min(len(sorted_group), min_edges)
top_edges = sorted_group.iloc[:actual_min_edges]
# Also take any additional neighbors above min_weight (beyond the top edges)
high_weight_edges = sorted_group.iloc[actual_min_edges:][
sorted_group.iloc[actual_min_edges:]["weight"] > min_weight
]
# Combine both sets
edges_to_add = pd.concat([top_edges, high_weight_edges])
# Add all edges at once
self._nx_graph.add_edges_from(
[(source_id, row["neighbor_id"], {"weight": row["weight"]}) for _, row in edges_to_add.iterrows()]
)
@property
def nx_graph(self) -> nx.Graph:
"""
Get the NetworkX graph object.
Returns:
nx.Graph: The NetworkX graph object.
"""
return self._nx_graph
def load_graph(self, graph_path: str):
"""
Load a graph from the GraphStore.
Args:
graph_path (str): The path to the graph in GraphStore.
"""
self._nx_graph = self.graph_store.get(graph_path)
def store_graph(self, graph_path: str):
"""
Store the graph in the GraphStore.
Args:
graph_path (str): The path to store the graph in GraphStore.
"""
self.graph_store.upsert(graph_path, self._nx_graph)
def get_neighborhood(self, node_id: Union[str, int], radius: int = 1) -> nx.Graph:
"""
Get a subgraph containing nodes within a given number of hops around a specific node.
Args:
node_id: The ID of the node to center the neighborhood around.
radius: The number of hops to consider around the node (default: 1).
Returns:
nx.Graph: A subgraph containing the specified neighborhood.
"""
# Use NetworkX's ego_graph to extract the neighborhood within the given radius
if node_id in self._nx_graph:
return nx.ego_graph(self._nx_graph, node_id, radius=radius)
else:
raise ValueError(f"Node ID '{node_id}' not found in the graph.")
|
nx_graph
property
Get the NetworkX graph object.
Returns:
Type |
Description |
Graph
|
nx.Graph: The NetworkX graph object.
|
__init__()
Build a NetworkX graph using the Proximity class.
Source code in src/workbench/algorithms/graph/light/proximity_graph.py
| def __init__(self):
"""Build a NetworkX graph using the Proximity class."""
# The graph is stored as NetworkX graph
self._nx_graph = None
# GraphStore
self.graph_store = GraphStore()
|
build_graph(proximity_instance)
Build a NetworkX graph using a Proximity class.
Parameters:
Name |
Type |
Description |
Default |
proximity_instance
|
Proximity
|
An instance of a Proximity class to compute neighbors.
|
required
|
Source code in src/workbench/algorithms/graph/light/proximity_graph.py
| def build_graph(self, proximity_instance: Proximity) -> None:
"""
Build a NetworkX graph using a Proximity class.
Args:
proximity_instance (Proximity): An instance of a Proximity class to compute neighbors.
"""
# Retrieve all neighbors and their distances
prox = proximity_instance
node_df = prox.df
id_column = prox.id_column
log.info("Retrieving all neighbors...")
all_neighbors_df = prox.all_neighbors()
# Add nodes with attributes (features)
log.info("Adding nodes to the proximity graph...")
self._nx_graph = nx.Graph()
# Check for duplicate IDs in the node DataFrame
if not node_df[id_column].is_unique:
log.error(f"Column '{id_column}' contains duplicate values. Using first occurrence for each ID...")
node_df = node_df.drop_duplicates(subset=[id_column], keep="first")
# Set the id_column as index and add nodes
self._nx_graph.add_nodes_from(node_df.set_index(id_column, drop=False).to_dict("index").items())
# Determine edge weights based on proximity type
if prox.proximity_type == ProximityType.SIMILARITY:
all_neighbors_df["weight"] = all_neighbors_df["similarity"]
elif prox.proximity_type == ProximityType.DISTANCE:
# Normalize and invert distance
max_distance = all_neighbors_df["distance"].max()
all_neighbors_df["weight"] = 1.0 - all_neighbors_df["distance"] / max_distance
# Add edges to the graph
log.info("Adding edges to the graph...")
min_edges = 2
min_weight = 0.8
# Group by source ID and process each group
for source_id, group in all_neighbors_df.groupby(id_column):
# Sort by weight (assuming higher is better)
sorted_group = group.sort_values("weight", ascending=False)
# Take all edges up to min_edges (or all if fewer)
actual_min_edges = min(len(sorted_group), min_edges)
top_edges = sorted_group.iloc[:actual_min_edges]
# Also take any additional neighbors above min_weight (beyond the top edges)
high_weight_edges = sorted_group.iloc[actual_min_edges:][
sorted_group.iloc[actual_min_edges:]["weight"] > min_weight
]
# Combine both sets
edges_to_add = pd.concat([top_edges, high_weight_edges])
# Add all edges at once
self._nx_graph.add_edges_from(
[(source_id, row["neighbor_id"], {"weight": row["weight"]}) for _, row in edges_to_add.iterrows()]
)
|
get_neighborhood(node_id, radius=1)
Get a subgraph containing nodes within a given number of hops around a specific node.
Parameters:
Name |
Type |
Description |
Default |
node_id
|
Union[str, int]
|
The ID of the node to center the neighborhood around.
|
required
|
radius
|
int
|
The number of hops to consider around the node (default: 1).
|
1
|
Returns:
Type |
Description |
Graph
|
nx.Graph: A subgraph containing the specified neighborhood.
|
Source code in src/workbench/algorithms/graph/light/proximity_graph.py
| def get_neighborhood(self, node_id: Union[str, int], radius: int = 1) -> nx.Graph:
"""
Get a subgraph containing nodes within a given number of hops around a specific node.
Args:
node_id: The ID of the node to center the neighborhood around.
radius: The number of hops to consider around the node (default: 1).
Returns:
nx.Graph: A subgraph containing the specified neighborhood.
"""
# Use NetworkX's ego_graph to extract the neighborhood within the given radius
if node_id in self._nx_graph:
return nx.ego_graph(self._nx_graph, node_id, radius=radius)
else:
raise ValueError(f"Node ID '{node_id}' not found in the graph.")
|
load_graph(graph_path)
Load a graph from the GraphStore.
Parameters:
Name |
Type |
Description |
Default |
graph_path
|
str
|
The path to the graph in GraphStore.
|
required
|
Source code in src/workbench/algorithms/graph/light/proximity_graph.py
| def load_graph(self, graph_path: str):
"""
Load a graph from the GraphStore.
Args:
graph_path (str): The path to the graph in GraphStore.
"""
self._nx_graph = self.graph_store.get(graph_path)
|
store_graph(graph_path)
Store the graph in the GraphStore.
Parameters:
Name |
Type |
Description |
Default |
graph_path
|
str
|
The path to store the graph in GraphStore.
|
required
|
Source code in src/workbench/algorithms/graph/light/proximity_graph.py
| def store_graph(self, graph_path: str):
"""
Store the graph in the GraphStore.
Args:
graph_path (str): The path to store the graph in GraphStore.
"""
self.graph_store.upsert(graph_path, self._nx_graph)
|
show_graph(graph, id_column)
Display the graph using Plotly.
Source code in src/workbench/algorithms/graph/light/proximity_graph.py
| def show_graph(graph, id_column):
"""Display the graph using Plotly."""
graph_plot = GraphPlot()
properties = graph_plot.update_properties(graph, labels=id_column, hover_text="all")
fig = properties[0]
fig.update_layout(paper_bgcolor="rgb(30,30,30)", plot_bgcolor="rgb(30,30,30)")
fig.show()
|
Questions?

The SuperCowPowers team is happy to answer any questions you may have about AWS and Workbench. Please contact us at workbench@supercowpowers.com or on chat us up on Discord