Thank you, @matthewmcneely, for providing this information; it is greatly appreciated. The system is performing effectively for the majority of the data we are analyzing. However, I have identified a few inconsistencies that need to be addressed.
Here is the code we are currently using:
import networkx as nx
from pydgraph import convert
from typing import List
class NetworkXPathAnalyzer:
def __init__(self):
self.graph = None
self.node_types = {}
def load_graph_from_dgraph(self,
node_types: List[str],
edge_predicates: List[str],
max_depth: int = 10) -> nx.MultiDiGraph:
"""
Load graph data from Dgraph into NetworkX for analysis.
More efficient than multiple shortest path queries.
"""
# Build comprehensive query using @recurse
type_filter = " OR ".join([f"type({nt})" for nt in node_types])
edge_filter = "\n ".join(edge_predicates + [f"~{ep}" for ep in edge_predicates])
query = f"""
{{
nodes(func: has(dgraph.type)) @filter({type_filter}) @recurse(depth: {max_depth}) {{
id:uid
dgraph.type
{edge_filter}
}}
}}
"""
print("Loading graph data from Dgraph...")
res = dgraphManager.query(query) # Our method to run query
# Use pydgraph's extract_dict to convert to nodes and edges
nodes = {}
edges = []
convert.extract_dict(nodes=nodes, edges=edges, data=res)
# Create NetworkX graph
self.graph = nx.MultiDiGraph()
# Add nodes with type information
for node_id, node_data in nodes.items():
node_type = node_data.get('dgraph.type', ['Unknown'])[0] if isinstance(
node_data.get('dgraph.type'), list) else node_data.get('dgraph.type', 'Unknown')
self.graph.add_node(node_id, node_type=node_type)
self.node_types[node_id] = node_type
# Add edges
for edge in edges:
self.graph.add_edge(edge['src'], edge['dst'], predicate=edge.get('predicate', 'unknown'))
print(f"Loaded graph with {self.graph.number_of_nodes()} nodes and {self.graph.number_of_edges()} edges")
return self.graph
def find_paths_between_types(self,
source_type: str,
target_type: str,
max_paths_to_analyze: int = 10000,
count_self_loops_as_one: bool = True):
"""
Find shortest and longest paths between node types using bidirectional_dijkstra.
Much more efficient than multiple Dgraph queries.
"""
if self.graph is None:
raise ValueError("Graph not loaded. Call load_graph_from_dgraph first.")
# Get nodes of specified types
source_nodes = [node for node, data in self.graph.nodes(data=True)
if data.get('node_type') == source_type]
target_nodes = [node for node, data in self.graph.nodes(data=True)
if data.get('node_type') == target_type]
if not source_nodes or not target_nodes:
return {"error": f"No nodes found for types {source_type} or {target_type}"}
# Calculate all shortest paths using bidirectional_dijkstra
all_path_lengths = []
paths_analyzed = 0
for source in source_nodes:
if paths_analyzed >= max_paths_to_analyze:
break
for target in target_nodes:
if paths_analyzed >= max_paths_to_analyze:
break
try:
# Handle self-loops
if source == target:
if count_self_loops_as_one:
all_path_lengths.append(1)
else:
all_path_lengths.append(0)
paths_analyzed += 1
continue
# Use bidirectional_dijkstra for efficient pathfinding
path_length, path = nx.bidirectional_dijkstra(self.graph, source, target)
# bidirectional_dijkstra returns (length, path)
# For unweighted graphs, length equals number of edges
all_path_lengths.append(int(path_length))
paths_analyzed += 1
except nx.NetworkXNoPath:
# No path exists between these nodes
continue
except Exception as e:
print(f"Error processing path from {source} to {target}: {e}")
continue
if not all_path_lengths:
return {
"shortest_path_length": 0,
"longest_path_length": 0,
"message": f"No paths found between {source_type} and {target_type}"
}
# Calculate comprehensive statistics
return self._calculate_path_statistics(all_path_lengths, source_type, target_type)
def _calculate_path_statistics(self, path_lengths: List[int],
source_type: str, target_type: str):
"""Calculate comprehensive path statistics."""
return {
"source_type": source_type,
"target_type": target_type,
"shortest_path_length": min(path_lengths),
"longest_path_length": max(path_lengths),
}
Specifically, in both the query and the code, we expected the shortest path length to be 5, but the result returned was 8. Furthermore, when adjusting the depth of the @recurse query (e.g., increasing from 10 → 20 → 30 or decreasing from 10 → 5 → 2), the shortest length reported came back as 7, whereas we were anticipating 5. we tried both methods of networkx 1st shortest_path_length and 2nd bidirectional_dijkstra Both methods return 7 and 8, respectively, but not 5 as expected.
It seems the path calculation is being influenced by something we may not be accounting for correctly—either in the query recursion depth, how the edges are being extracted/added to the graph, or in the way NetworkX handles shortest paths within this setup.
It’s crucial that we clarify these discrepancies, and I am confident that addressing them will improve the accuracy of the results. I’d greatly appreciate any insights or suggestions on what might be causing the divergence between the expected and actual shortest path values.
Thank you again for your guidance and support!