From 71bda1589cb96ec7d5e4468098a7dc5f8e7fe171 Mon Sep 17 00:00:00 2001 From: Naadir Jeewa Date: Sun, 8 Mar 2026 16:35:11 +0000 Subject: [PATCH 1/6] refactor: remove deprecated GraphFormat, GraphSourceType, GraphDataset, RunConfiguration These types were superseded by GraphSource, RunConfig, and RunMode in core::config/backend but never removed. Since there is no published release, drop them without deprecation period. Co-Authored-By: Claude Opus 4.6 --- src/core/types.rs | 129 ++++++++++++++++++++++++---------------------- 1 file changed, 68 insertions(+), 61 deletions(-) diff --git a/src/core/types.rs b/src/core/types.rs index fd2222a..accb043 100644 --- a/src/core/types.rs +++ b/src/core/types.rs @@ -14,64 +14,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#[derive(Clone, Debug, PartialEq)] -pub enum GraphFormat { - EdgeList, - CsrBinary, -} - -#[derive(Clone, Debug, PartialEq)] -pub enum GraphSourceType { - File, - Neo4jSnapshot, -} - -#[derive(Clone, Debug, PartialEq)] -pub struct GraphDataset { - pub dataset_id: String, - pub source_uri: String, - pub is_weighted: bool, - pub node_count: usize, - pub edge_count: usize, - pub checksum: String, - pub format: GraphFormat, - pub source_type: GraphSourceType, - pub source_snapshot_id: Option, -} - -#[derive(Clone, Debug, PartialEq)] -pub enum RunMode { - Deterministic, - Throughput, -} - -#[derive(Clone, Debug, PartialEq)] -pub struct RunConfiguration { - pub config_id: String, - pub mode: RunMode, - pub acceleration_enabled: bool, - pub seed: Option, - pub max_iterations: usize, - pub quality_tolerance: f64, - pub pinned_profile_id: Option, - pub graph_source: GraphSourceType, -} - -impl Default for RunConfiguration { - fn default() -> Self { - Self { - config_id: "default".to_string(), - mode: RunMode::Deterministic, - acceleration_enabled: false, - seed: None, - max_iterations: 10, - quality_tolerance: 0.001, - pinned_profile_id: None, - graph_source: GraphSourceType::File, - } - } -} - +#[cfg_attr(feature = 
"serde", derive(serde::Serialize, serde::Deserialize))] #[derive(Clone, Debug, PartialEq)] pub enum RunStatus { Running, @@ -79,6 +22,7 @@ pub enum RunStatus { Failed, } +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] #[derive(Clone, Debug, PartialEq)] pub enum BackendType { PureRust, @@ -87,6 +31,7 @@ pub enum BackendType { RocmAccel, } +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] #[derive(Clone, Debug, PartialEq)] pub struct RunExecution { pub run_id: String, @@ -96,10 +41,11 @@ pub struct RunExecution { pub completed_at: Option, pub status: RunStatus, pub backend: BackendType, - pub graph_source_resolved: GraphSourceType, + pub graph_source_resolved: crate::core::backend::GraphSource, pub fallback_reason: Option, } +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] #[derive(Clone, Debug, PartialEq)] pub struct PartitionResult { pub run_id: String, @@ -113,6 +59,7 @@ pub struct PartitionResult { pub iteration_count: usize, } +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] #[derive(Clone, Debug, PartialEq)] pub struct ValidationReport { pub run_id: String, @@ -123,6 +70,7 @@ pub struct ValidationReport { pub notes: Option, } +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] #[derive(Clone, Debug, PartialEq)] pub struct RunOutcome { pub execution: RunExecution, @@ -130,21 +78,80 @@ pub struct RunOutcome { pub validation: Option, } +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] #[derive(Clone, Debug, PartialEq)] pub struct GraphInput { - pub dataset_id: String, + pub dataset_id: Option, pub node_count: usize, pub edges: Vec<(usize, usize, Option)>, } impl GraphInput { + /// Create a `GraphInput` from a weighted edge list. Computes `node_count` + /// automatically from the maximum node index. 
+ pub fn from_edges(edges: Vec<(usize, usize, Option)>) -> Self { + let node_count = edges + .iter() + .flat_map(|(u, v, _)| [*u, *v]) + .max() + .map(|m| m + 1) + .unwrap_or(0); + Self { + dataset_id: None, + node_count, + edges, + } + } + + /// Create a `GraphInput` from unweighted edges. Computes `node_count` + /// automatically from the maximum node index. + pub fn from_unweighted_edges(edges: Vec<(usize, usize)>) -> Self { + Self::from_edges(edges.into_iter().map(|(u, v)| (u, v, None)).collect()) + } + + /// Create an empty graph, optionally with a dataset identifier. pub fn empty(dataset_id: impl Into) -> Self { Self { - dataset_id: dataset_id.into(), + dataset_id: Some(dataset_id.into()), node_count: 0, edges: Vec::new(), } } + + /// Validate that all edge endpoints are within bounds. + pub fn validate(&self) -> Result<(), crate::core::error::HitLeidenError> { + for (i, (u, v, _)) in self.edges.iter().enumerate() { + if *u >= self.node_count || *v >= self.node_count { + return Err(crate::core::error::HitLeidenError::InvalidInput(format!( + "edge {} ({}, {}) references node >= node_count {}", + i, u, v, self.node_count + ))); + } + } + Ok(()) + } +} + +impl RunOutcome { + /// Extract the partition result, returning an error if no partition was produced. 
+ pub fn into_partition(self) -> Result { + self.partition.ok_or_else(|| { + crate::core::error::HitLeidenError::InvalidInput("no partition produced".into()) + }) + } +} + +impl Default for PartitionResult { + fn default() -> Self { + Self { + run_id: String::new(), + node_to_community: Vec::new(), + hierarchy_levels: Vec::new(), + community_count: 0, + quality_score: 0.0, + iteration_count: 0, + } + } } // --- Benchmark & Profiling types (feature: profiling) --- From 3c859db83b0235da40009b8e7b0796c7946de532 Mon Sep 17 00:00:00 2001 From: Naadir Jeewa Date: Sun, 8 Mar 2026 16:36:40 +0000 Subject: [PATCH 2/6] refactor: remove unused refinement_gamma parameter The refinement_gamma parameter was accepted but unused (prefixed with _) in refine_singleton_merge. Remove it from RunConfig, multilevel_leiden, refine_singleton_merge, CLI, and all test call sites. Incorporates the intent of PR #3 with additional missed call sites fixed. Co-Authored-By: Claude Opus 4.6 --- src/cli/run.rs | 3 +- src/core/algorithm/hit_leiden.rs | 65 +++++++++++++------------------- src/core/config.rs | 40 ++++++++++++++++---- 3 files changed, 60 insertions(+), 48 deletions(-) diff --git a/src/cli/run.rs b/src/cli/run.rs index f67ac15..873beda 100644 --- a/src/cli/run.rs +++ b/src/cli/run.rs @@ -30,13 +30,12 @@ pub fn run_from_cli( let config = RunConfig { mode, - graph_source: GraphSource::File, // Assuming file for now + graph_source: Some(GraphSource::File), acceleration: AccelerationTarget::PureRust, quality_tolerance: 0.001, max_iterations: 10, pinned_profile: None, resolution: 1.0, - refinement_gamma: 0.05, }; crate::run(graph, &config) diff --git a/src/core/algorithm/hit_leiden.rs b/src/core/algorithm/hit_leiden.rs index 981f28a..db772a7 100644 --- a/src/core/algorithm/hit_leiden.rs +++ b/src/core/algorithm/hit_leiden.rs @@ -56,7 +56,6 @@ pub fn run(graph: &GraphInput, config: &RunConfig) -> Result Result Result crate::core::types::GraphSourceType::File, - 
crate::core::backend::GraphSource::Neo4jSnapshot => { - crate::core::types::GraphSourceType::Neo4jSnapshot - } - crate::core::backend::GraphSource::LiveNeo4j => { - crate::core::types::GraphSourceType::Neo4jSnapshot - } // Fallback - }, + graph_source_resolved: resolution_meta.source_resolved, fallback_reason: resolution_meta.fallback_reason, }; @@ -119,7 +111,7 @@ pub fn run(graph: &GraphInput, config: &RunConfig) -> Result f64 { +pub(crate) fn compute_modularity(graph: &GraphInput, node_to_community: &[usize]) -> f64 { let n = graph.node_count; // Compute node degrees from edge list @@ -165,11 +157,10 @@ fn compute_modularity(graph: &GraphInput, node_to_community: &[usize]) -> f64 { /// subcommunities (connected components), aggregate based on subcommunities, and repeat. /// The refinement step prevents mega-communities by ensuring the coarsened graph /// represents subcommunity-level structure, following the standard Leiden approach. -fn multilevel_leiden( +pub(crate) fn multilevel_leiden( state: &mut PartitionState, graph: &GraphInput, gamma: f64, - refinement_gamma: f64, mode: crate::core::config::RunMode, max_levels: usize, ) -> (usize, Vec>) { @@ -197,12 +188,10 @@ fn multilevel_leiden( // Refinement: within each community, merge singletons into subcommunities. // Uses the SAME resolution as movement for the quality function. - // The refinement_gamma (0.05) is only for the connectivity criterion. 
let mut subcommunities = refine_singleton_merge( &state.supergraphs[0], &state.community_mapping_per_level[0], gamma, - refinement_gamma, ); // The community assignment (for final output) comes from movement @@ -279,7 +268,6 @@ fn multilevel_leiden( &state.supergraphs[0], &state.node_to_comm, gamma, - refinement_gamma, ); let new_subcomm_count = count_unique(&subcommunities); @@ -360,7 +348,7 @@ fn count_unique(v: &[usize]) -> usize { /// Deterministically rewrite community labels to contiguous IDs [0..k-1] /// by scanning nodes in index order and assigning first-seen labels. -fn canonicalize_community_ids_in_place(node_to_community: &mut [usize]) { +pub(crate) fn canonicalize_community_ids_in_place(node_to_community: &mut [usize]) { let mut remap: HashMap = HashMap::new(); let mut next_id = 0usize; @@ -455,7 +443,6 @@ fn refine_singleton_merge( graph: &crate::core::graph::in_memory::InMemoryGraph, node_to_community: &[usize], gamma: f64, - _refinement_gamma: f64, ) -> Vec { let n = graph.node_count; if n == 0 { @@ -1809,7 +1796,7 @@ mod tests { /// Helper: build a GraphInput from an edge list with unit weights. fn graph(node_count: usize, edges: &[(usize, usize)]) -> GraphInput { GraphInput { - dataset_id: "test".to_string(), + dataset_id: Some("test".to_string()), node_count, edges: edges.iter().map(|&(u, v)| (u, v, Some(1.0))).collect(), } @@ -1818,7 +1805,7 @@ mod tests { /// Helper: build a GraphInput with explicit weights. 
fn weighted_graph(node_count: usize, edges: &[(usize, usize, f64)]) -> GraphInput { GraphInput { - dataset_id: "test".to_string(), + dataset_id: Some("test".to_string()), node_count, edges: edges.iter().map(|&(u, v, w)| (u, v, Some(w))).collect(), } @@ -1905,7 +1892,7 @@ mod tests { #[test] fn test_should_skip_aggregation_when_no_delta_and_no_refinement() { let delta = GraphInput { - dataset_id: "test".to_string(), + dataset_id: Some("test".to_string()), node_count: 4, edges: vec![], }; @@ -1916,7 +1903,7 @@ mod tests { #[test] fn test_should_not_skip_aggregation_when_delta_or_refinement_exists() { let delta_non_empty = GraphInput { - dataset_id: "test".to_string(), + dataset_id: Some("test".to_string()), node_count: 4, edges: vec![(0, 1, Some(1.0))], }; @@ -1924,7 +1911,7 @@ mod tests { assert!(!should_skip_aggregation(&delta_non_empty, &refined_empty)); let delta_empty = GraphInput { - dataset_id: "test".to_string(), + dataset_id: Some("test".to_string()), node_count: 4, edges: vec![], }; @@ -2138,7 +2125,7 @@ mod tests { let inmem = InMemoryGraph::from(&g); let delta = GraphInput { - dataset_id: "test".to_string(), + dataset_id: Some("test".to_string()), node_count: 3, edges: vec![], }; @@ -2243,7 +2230,7 @@ mod tests { fn test_modularity_empty_graph() { // No edges => Q = 0 let g = GraphInput { - dataset_id: "test".to_string(), + dataset_id: Some("test".to_string()), node_count: 3, edges: vec![], }; @@ -2373,7 +2360,7 @@ mod tests { ); let mut state = PartitionState::identity(6); - let (iters, hierarchy) = multilevel_leiden(&mut state, &g, 1.0, 0.05, mode, 10); + let (iters, hierarchy) = multilevel_leiden(&mut state, &g, 1.0, mode, 10); assert!(iters > 0, "should take at least 1 iteration"); assert!( @@ -2400,13 +2387,13 @@ mod tests { dual_mode_test!(test_multilevel_leiden_single_node, |mode| { // Single node graph, no edges. 
let g = GraphInput { - dataset_id: "test".to_string(), + dataset_id: Some("test".to_string()), node_count: 1, edges: vec![], }; let mut state = PartitionState::identity(1); - let (iters, hierarchy) = multilevel_leiden(&mut state, &g, 1.0, 0.05, mode, 10); + let (iters, hierarchy) = multilevel_leiden(&mut state, &g, 1.0, mode, 10); assert_eq!(state.node_to_comm, vec![0]); assert!(iters >= 1, "should still complete at least 1 iteration"); @@ -2416,13 +2403,13 @@ mod tests { dual_mode_test!(test_multilevel_leiden_disconnected_components, |mode| { // 4 disconnected nodes: each should be its own community. let g = GraphInput { - dataset_id: "test".to_string(), + dataset_id: Some("test".to_string()), node_count: 4, edges: vec![], }; let mut state = PartitionState::identity(4); - let (_iters, _hierarchy) = multilevel_leiden(&mut state, &g, 1.0, 0.05, mode, 10); + let (_iters, _hierarchy) = multilevel_leiden(&mut state, &g, 1.0, mode, 10); let comm_count = count_unique(&state.node_to_comm); assert_eq!( @@ -2437,7 +2424,7 @@ mod tests { let g = graph(6, &[(0, 1), (1, 2), (0, 2), (3, 4), (4, 5), (3, 5), (2, 3)]); let mut state = PartitionState::identity(6); - let (_iters, hierarchy) = multilevel_leiden(&mut state, &g, 1.0, 0.05, mode, 10); + let (_iters, hierarchy) = multilevel_leiden(&mut state, &g, 1.0, mode, 10); // Each level should have exactly node_count entries for (i, level) in hierarchy.iter().enumerate() { @@ -2976,7 +2963,7 @@ mod tests { refined.set(2, true); // Only node 2 was refined let delta = GraphInput { - dataset_id: "test".to_string(), + dataset_id: Some("test".to_string()), node_count: 3, edges: vec![], }; @@ -3124,7 +3111,7 @@ mod tests { ); let mut state = PartitionState::identity(6); state.supergraphs.push(InMemoryGraph::from(&GraphInput { - dataset_id: "test".to_string(), + dataset_id: Some("test".to_string()), node_count: 6, edges: vec![], })); @@ -3158,7 +3145,7 @@ mod tests { ); let mut state = PartitionState::identity(6); 
state.supergraphs.push(InMemoryGraph::from(&GraphInput { - dataset_id: "test".to_string(), + dataset_id: Some("test".to_string()), node_count: 6, edges: vec![], })); @@ -3190,7 +3177,7 @@ mod tests { ); let mut state = PartitionState::identity(4); state.supergraphs.push(InMemoryGraph::from(&GraphInput { - dataset_id: "test".to_string(), + dataset_id: Some("test".to_string()), node_count: 4, edges: vec![], })); @@ -3233,7 +3220,7 @@ mod tests { ); let mut state = PartitionState::identity(8); state.supergraphs.push(InMemoryGraph::from(&GraphInput { - dataset_id: "test".to_string(), + dataset_id: Some("test".to_string()), node_count: 8, edges: vec![], })); @@ -3343,7 +3330,7 @@ mod tests { fn test_connected_components_single_node() { // Single node with no edges: 1 component of size 1. let g = GraphInput { - dataset_id: "test".to_string(), + dataset_id: Some("test".to_string()), node_count: 1, edges: vec![], }; diff --git a/src/core/config.rs b/src/core/config.rs index b8b81fb..074fa55 100644 --- a/src/core/config.rs +++ b/src/core/config.rs @@ -16,16 +16,21 @@ use crate::core::backend::{AccelerationTarget, GraphSource}; +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum RunMode { Deterministic, Throughput, } +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] #[derive(Clone, Debug, PartialEq)] pub struct RunConfig { pub mode: RunMode, - pub graph_source: GraphSource, + /// Graph source hint. Only meaningful for CLI / orchestrator usage. + /// Library consumers using `run()` or `run_simple()` with in-memory data + /// can leave this as `None`. + pub graph_source: Option, pub acceleration: AccelerationTarget, pub quality_tolerance: f64, pub max_iterations: usize, @@ -34,28 +39,24 @@ pub struct RunConfig { /// Default 1.0 matches the HIT-Leiden paper (standard modularity). /// Used for both movement and refinement quality functions. 
pub resolution: f64, - /// Refinement connectivity criterion gamma. Controls which nodes participate - /// in refinement merging (must satisfy cut_size >= gamma * v_total * (S - v_total)). - /// Default 0.05. NOT used for quality function. - pub refinement_gamma: f64, } impl Default for RunConfig { fn default() -> Self { Self { mode: RunMode::Deterministic, - graph_source: GraphSource::File, + graph_source: None, acceleration: AccelerationTarget::PureRust, quality_tolerance: 0.001, max_iterations: 10, pinned_profile: None, resolution: 1.0, - refinement_gamma: 0.05, } } } impl RunConfig { + /// Validate the configuration. pub fn validate(&self) -> Result<(), String> { if self.max_iterations == 0 { return Err("max_iterations must be > 0".to_string()); @@ -65,4 +66,29 @@ impl RunConfig { } Ok(()) } + + /// Set the run mode. + pub fn with_mode(mut self, mode: RunMode) -> Self { + self.mode = mode; + self + } + + /// Set the resolution parameter (gamma). + pub fn with_resolution(mut self, r: f64) -> Self { + self.resolution = r; + self + } + + /// Set the maximum number of iterations. + pub fn with_max_iterations(mut self, n: usize) -> Self { + self.max_iterations = n; + self + } + + /// Set the quality tolerance for convergence. 
+ pub fn with_quality_tolerance(mut self, t: f64) -> Self { + self.quality_tolerance = t; + self + } + } From 7f29b992bf24882a56acef00e5aa3c84420d6a17 Mon Sep 17 00:00:00 2001 From: Naadir Jeewa Date: Sun, 8 Mar 2026 16:36:51 +0000 Subject: [PATCH 3/6] feat: improve library usability with serde, builders, and run_simple - Add conditional serde derives behind "serde" feature flag - Make graph_source Optional in RunConfig (library users don't need it) - Add builder methods: with_mode, with_resolution, with_max_iterations, with_quality_tolerance - Add run_simple() entry point for edges-in, partition-out usage - Re-export PartitionResult, PartitionState, CommunityRelation, CommunityState from crate root - Gate CLI module behind "cli" feature Co-Authored-By: Claude Opus 4.6 --- Cargo.toml | 13 +++---- benchmarks/criterion/hit_leiden_suite.rs | 2 +- src/benchmark/dynamic_graph.rs | 8 ++-- src/benchmark/hit_leiden_incremental.rs | 4 +- src/benchmark/runner.rs | 2 +- src/bin/profile_incremental.rs | 2 +- src/bin/profile_run.rs | 2 +- src/core/backend.rs | 2 + src/core/error.rs | 2 + src/core/graph/in_memory.rs | 1 + src/core/graph/neo4j_snapshot.rs | 2 +- src/core/mod.rs | 1 + src/core/partition/state.rs | 1 + src/core/runtime/resolver.rs | 2 +- src/lib.rs | 38 ++++++++++++++++++- src/main.rs | 9 +++-- tests/contract/test_run_validate.rs | 2 +- ...test_connected_graph_not_all_singletons.rs | 2 +- .../test_default_config_minimal_args.rs | 6 +-- .../test_deterministic_identity.rs | 2 +- .../integration/test_neo4j_snapshot_parity.rs | 2 +- .../test_throughput_equivalence.rs | 2 +- tests/property/test_partition_invariants.rs | 2 +- 23 files changed, 76 insertions(+), 33 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 7c1472f..51e04a7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,9 +21,7 @@ edition = "2021" license = "MIT OR Apache-2.0" [dependencies] -clap = { version = "4.5", features = [ - "derive", -] } +clap = { version = "4.5", features = ["derive"], optional = 
true } ahash = "0.8" rayon = "1.10" smallvec = "1.13" @@ -52,13 +50,13 @@ anyhow = "1.0.102" lender = "0.6.2" [features] -default = [ -] +default = [] +cli = ["dep:clap"] +serde = ["dep:serde", "dep:serde_json"] profiling = [ + "serde", "webgraph", "lender", - "serde", - "serde_json", "plotly", "inferno", "prost", @@ -71,6 +69,7 @@ profiling = [ [[bin]] name = "hit-leiden" path = "src/main.rs" +required-features = ["cli"] [[bin]] name = "profile_run" diff --git a/benchmarks/criterion/hit_leiden_suite.rs b/benchmarks/criterion/hit_leiden_suite.rs index 006b39f..0274f74 100644 --- a/benchmarks/criterion/hit_leiden_suite.rs +++ b/benchmarks/criterion/hit_leiden_suite.rs @@ -53,7 +53,7 @@ fn load_uk_2007() -> GraphInput { ); GraphInput { - dataset_id: "uk-2007-05@100000".to_string(), + dataset_id: Some("uk-2007-05@100000".to_string()), node_count: num_nodes, edges, } diff --git a/src/benchmark/dynamic_graph.rs b/src/benchmark/dynamic_graph.rs index e7dfd8c..09ebee5 100644 --- a/src/benchmark/dynamic_graph.rs +++ b/src/benchmark/dynamic_graph.rs @@ -59,7 +59,7 @@ impl DynamicGraphBuilder { cumulative_edges.extend_from_slice(chunk); batches.push(GraphInput { - dataset_id: format!("batch_{}", idx), + dataset_id: Some(format!("batch_{}", idx)), node_count: self.node_count, edges: cumulative_edges .iter() @@ -106,7 +106,7 @@ impl DynamicGraphBuilder { cumulative_edges.extend_from_slice(&shuffled[start..end]); update_batches.push(GraphInput { - dataset_id: format!("paper_batch_{}", round), + dataset_id: Some(format!("paper_batch_{}", round)), node_count: self.node_count, edges: cumulative_edges .iter() @@ -117,7 +117,7 @@ impl DynamicGraphBuilder { IncrementalSplit { initial_graph: GraphInput { - dataset_id: "paper_initial".to_string(), + dataset_id: Some("paper_initial".to_string()), node_count: self.node_count, edges: initial_edges .iter() @@ -138,7 +138,7 @@ mod tests { #[test] fn paper_split_uses_initial_ratio_and_fixed_rounds() { let graph = GraphInput { - dataset_id: 
"test".to_string(), + dataset_id: Some("test".to_string()), node_count: 100, edges: (0..100).map(|i| (i, (i + 1) % 100, None::)).collect(), }; diff --git a/src/benchmark/hit_leiden_incremental.rs b/src/benchmark/hit_leiden_incremental.rs index cceda73..de0f4f1 100644 --- a/src/benchmark/hit_leiden_incremental.rs +++ b/src/benchmark/hit_leiden_incremental.rs @@ -225,7 +225,7 @@ pub fn run_incremental_with_config( let _ = ig_disk_cache::save( cache_dir, &ig_disk_cache::CachedIgraphLeiden { - dataset_id: batch_graph.dataset_id.clone(), + dataset_id: batch_graph.dataset_id.clone().unwrap_or_default(), node_count: batch_graph.node_count, edge_count: batch_graph.edges.len(), time_ms: ig.time_ms, @@ -296,7 +296,7 @@ fn load_cached_igraph_result( } let cache_dir = config.cache_dir.as_ref()?; let first_batch = batches.first()?; - let cached = ig_disk_cache::load(cache_dir, &first_batch.dataset_id, first_batch.edges.len())?; + let cached = ig_disk_cache::load(cache_dir, first_batch.dataset_id.as_deref().unwrap_or(""), first_batch.edges.len())?; Some(IgResult { time_ms: cached.time_ms, modularity: cached.modularity, diff --git a/src/benchmark/runner.rs b/src/benchmark/runner.rs index 7405158..413e35f 100644 --- a/src/benchmark/runner.rs +++ b/src/benchmark/runner.rs @@ -119,7 +119,7 @@ pub mod benchmark_runner { let benchmark_run = BenchmarkRun { timestamp: timestamp.clone(), - dataset_id: config.graph.dataset_id.clone(), + dataset_id: config.graph.dataset_id.clone().unwrap_or_default(), timeout_seconds: config.timeout_seconds, truncated: outcome.truncated, batches: outcome.batches, diff --git a/src/bin/profile_incremental.rs b/src/bin/profile_incremental.rs index 40f73ee..2016072 100644 --- a/src/bin/profile_incremental.rs +++ b/src/bin/profile_incremental.rs @@ -58,7 +58,7 @@ fn load_graph() -> GraphInput { edges.len() ); GraphInput { - dataset_id: "uk-2007-05@100000".to_string(), + dataset_id: Some("uk-2007-05@100000".to_string()), node_count: num_nodes, edges, } diff 
--git a/src/bin/profile_run.rs b/src/bin/profile_run.rs index 8b85245..fa2534d 100644 --- a/src/bin/profile_run.rs +++ b/src/bin/profile_run.rs @@ -54,7 +54,7 @@ fn load_graph() -> GraphInput { edges.len() ); GraphInput { - dataset_id: "uk-2007-05@100000".to_string(), + dataset_id: Some("uk-2007-05@100000".to_string()), node_count: num_nodes, edges, } diff --git a/src/core/backend.rs b/src/core/backend.rs index 54cadb7..553f045 100644 --- a/src/core/backend.rs +++ b/src/core/backend.rs @@ -14,6 +14,7 @@ // // SPDX-License-Identifier: Apache-2.0 +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum GraphSource { File, @@ -21,6 +22,7 @@ pub enum GraphSource { LiveNeo4j, } +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum AccelerationTarget { PureRust, diff --git a/src/core/error.rs b/src/core/error.rs index bb83257..1197f58 100644 --- a/src/core/error.rs +++ b/src/core/error.rs @@ -24,4 +24,6 @@ pub enum HitLeidenError { Backend(String), #[error("acceleration error: {0}")] Acceleration(String), + #[error("validation failed: {0}")] + ValidationFailed(String), } diff --git a/src/core/graph/in_memory.rs b/src/core/graph/in_memory.rs index 8d73ba0..d6e7e04 100644 --- a/src/core/graph/in_memory.rs +++ b/src/core/graph/in_memory.rs @@ -17,6 +17,7 @@ use crate::core::types::GraphInput; use ahash::{HashMap, HashMapExt}; +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] #[derive(Clone, Debug, PartialEq)] pub struct InMemoryGraph { pub node_count: usize, diff --git a/src/core/graph/neo4j_snapshot.rs b/src/core/graph/neo4j_snapshot.rs index 36b460d..d7585f9 100644 --- a/src/core/graph/neo4j_snapshot.rs +++ b/src/core/graph/neo4j_snapshot.rs @@ -28,7 +28,7 @@ pub fn project_from_neo4j( projection_config: &ProjectionConfig, ) -> Result { Ok(GraphInput { - dataset_id: format!("neo4j:{}", 
projection_config.snapshot_id), + dataset_id: Some(format!("neo4j:{}", projection_config.snapshot_id)), node_count: 0, edges: Vec::new(), }) diff --git a/src/core/mod.rs b/src/core/mod.rs index f4655ec..9b8c3ad 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -16,6 +16,7 @@ pub mod algorithm; pub mod backend; +pub mod community_state; pub mod config; pub mod error; pub mod graph; diff --git a/src/core/partition/state.rs b/src/core/partition/state.rs index c4c4c33..182faf5 100644 --- a/src/core/partition/state.rs +++ b/src/core/partition/state.rs @@ -16,6 +16,7 @@ use crate::core::graph::in_memory::InMemoryGraph; +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] #[derive(Clone, Debug, PartialEq)] pub struct PartitionState { pub node_to_comm: Vec, diff --git a/src/core/runtime/resolver.rs b/src/core/runtime/resolver.rs index d749692..08837be 100644 --- a/src/core/runtime/resolver.rs +++ b/src/core/runtime/resolver.rs @@ -19,7 +19,7 @@ use crate::core::config::RunConfig; pub fn resolve(config: &RunConfig) -> ResolutionMetadata { ResolutionMetadata { - source_resolved: config.graph_source, + source_resolved: config.graph_source.unwrap_or(GraphSource::File), accel_resolved: config.acceleration, fallback_reason: None, } diff --git a/src/lib.rs b/src/lib.rs index 93fd45b..afca108 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -15,6 +15,7 @@ // SPDX-License-Identifier: Apache-2.0 pub mod benchmark; +#[cfg(feature = "cli")] pub mod cli; pub mod core; @@ -22,12 +23,47 @@ pub use core::backend::{AccelerationTarget, GraphSource}; pub use core::config::{RunConfig, RunMode}; pub use core::error::HitLeidenError; pub use core::report::{BenchmarkOutcome, ValidationOutcome}; -pub use core::types::{GraphInput, RunOutcome}; +pub use core::community_state::{CommunityRelation, CommunityState}; +pub use core::partition::state::PartitionState; +pub use core::types::{GraphInput, PartitionResult, RunOutcome}; +/// Run community detection on the given graph with 
the specified configuration. +/// +/// # Examples +/// +/// ```no_run +/// use hit_leiden::{GraphInput, RunConfig, RunMode}; +/// +/// let graph = GraphInput::from_unweighted_edges(vec![(0, 1), (1, 2), (2, 0)]); +/// let config = RunConfig::default().with_mode(RunMode::Deterministic); +/// let outcome = hit_leiden::run(&graph, &config).unwrap(); +/// let partition = outcome.into_partition().unwrap(); +/// println!("communities: {:?}", partition.node_to_community); +/// ``` pub fn run(graph: &GraphInput, config: &RunConfig) -> Result { core::algorithm::hit_leiden::run(graph, config) } +/// Simplified entry point: edges in, partition out. Uses default configuration. +/// +/// # Examples +/// +/// ```no_run +/// use hit_leiden::PartitionResult; +/// +/// let edges = vec![(0, 1, None), (1, 2, None), (2, 0, None)]; +/// let partition: PartitionResult = hit_leiden::run_simple(edges).unwrap(); +/// println!("communities: {:?}", partition.node_to_community); +/// ``` +pub fn run_simple( + edges: Vec<(usize, usize, Option)>, +) -> Result { + let graph = GraphInput::from_edges(edges); + let config = RunConfig::default(); + let outcome = run(&graph, &config)?; + outcome.into_partition() +} + pub fn project_from_neo4j( source_config: &core::graph::neo4j_snapshot::Neo4jSourceConfig, projection_config: &core::graph::neo4j_mapping::ProjectionConfig, diff --git a/src/main.rs b/src/main.rs index b1ae7f2..4944ee5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -220,10 +220,11 @@ fn load_webgraph( ); Ok(hit_leiden::core::types::GraphInput { - dataset_id: path - .file_stem() - .map(|s| s.to_string_lossy().to_string()) - .unwrap_or_else(|| "unknown".to_string()), + dataset_id: Some( + path.file_stem() + .map(|s| s.to_string_lossy().to_string()) + .unwrap_or_else(|| "unknown".to_string()), + ), node_count: num_nodes, edges, }) diff --git a/tests/contract/test_run_validate.rs b/tests/contract/test_run_validate.rs index ca21a5e..95ba1d3 100644 --- a/tests/contract/test_run_validate.rs +++ 
b/tests/contract/test_run_validate.rs @@ -19,7 +19,7 @@ use hit_leiden::{run, validate, GraphInput, RunConfig}; #[test] fn run_and_validate_contract() { let graph = GraphInput { - dataset_id: "d1".to_string(), + dataset_id: Some("d1".to_string()), node_count: 3, edges: vec![(0, 1, None), (1, 2, None)], }; diff --git a/tests/integration/test_connected_graph_not_all_singletons.rs b/tests/integration/test_connected_graph_not_all_singletons.rs index ce44c2d..307a35e 100644 --- a/tests/integration/test_connected_graph_not_all_singletons.rs +++ b/tests/integration/test_connected_graph_not_all_singletons.rs @@ -21,7 +21,7 @@ use hit_leiden::{run, GraphInput, RunConfig}; #[test] fn connected_graph_not_all_singletons() { let graph = GraphInput { - dataset_id: "connected-1".to_string(), + dataset_id: Some("connected-1".to_string()), node_count: 6, edges: vec![ (0, 1, Some(1.0)), diff --git a/tests/integration/test_default_config_minimal_args.rs b/tests/integration/test_default_config_minimal_args.rs index e5cacfe..e0ed165 100644 --- a/tests/integration/test_default_config_minimal_args.rs +++ b/tests/integration/test_default_config_minimal_args.rs @@ -14,15 +14,15 @@ // // SPDX-License-Identifier: Apache-2.0 -use hit_leiden::{cli::run::run_default, GraphInput}; +use hit_leiden::{run, GraphInput, RunConfig}; #[test] fn default_run_with_minimal_required_graph_source() { let graph = GraphInput { - dataset_id: "min".into(), + dataset_id: Some("min".to_string()), node_count: 1, edges: vec![], }; - let out = run_default(&graph).expect("default run should succeed"); + let out = run(&graph, &RunConfig::default()).expect("default run should succeed"); assert_eq!(out.partition.unwrap().node_to_community.len(), 1); } diff --git a/tests/integration/test_deterministic_identity.rs b/tests/integration/test_deterministic_identity.rs index aa329ec..ee08ba4 100644 --- a/tests/integration/test_deterministic_identity.rs +++ b/tests/integration/test_deterministic_identity.rs @@ -19,7 +19,7 @@ use 
hit_leiden::{run, GraphInput, RunConfig}; #[test] fn deterministic_replay_identity() { let graph = GraphInput { - dataset_id: "d2".to_string(), + dataset_id: Some("d2".to_string()), node_count: 4, edges: vec![(0, 1, None), (2, 3, None)], }; diff --git a/tests/integration/test_neo4j_snapshot_parity.rs b/tests/integration/test_neo4j_snapshot_parity.rs index b920bb8..e61c134 100644 --- a/tests/integration/test_neo4j_snapshot_parity.rs +++ b/tests/integration/test_neo4j_snapshot_parity.rs @@ -29,5 +29,5 @@ fn neo4j_projection_parity_shape() { batched: true, }; let graph = project_from_neo4j(&source, &proj).expect("projection"); - assert!(graph.dataset_id.starts_with("neo4j:")); + assert!(graph.dataset_id.as_deref().unwrap_or("").starts_with("neo4j:")); } diff --git a/tests/integration/test_throughput_equivalence.rs b/tests/integration/test_throughput_equivalence.rs index c395473..edf2071 100644 --- a/tests/integration/test_throughput_equivalence.rs +++ b/tests/integration/test_throughput_equivalence.rs @@ -19,7 +19,7 @@ use hit_leiden::{core::config::RunMode, run, validate, GraphInput, RunConfig}; #[test] fn throughput_equivalence_bounds() { let graph = GraphInput { - dataset_id: "d3".to_string(), + dataset_id: Some("d3".to_string()), node_count: 2, edges: vec![(0, 1, Some(1.0))], }; diff --git a/tests/property/test_partition_invariants.rs b/tests/property/test_partition_invariants.rs index 6e01506..0e187af 100644 --- a/tests/property/test_partition_invariants.rs +++ b/tests/property/test_partition_invariants.rs @@ -21,7 +21,7 @@ proptest! 
{ #[test] fn partition_len_matches_nodes(node_count in 0usize..50) { let graph = GraphInput { - dataset_id: "p1".to_string(), + dataset_id: Some("p1".to_string()), node_count, edges: vec![], }; From 3c727beaa579860cc7414d7a24fbb8ad17cf1990 Mon Sep 17 00:00:00 2001 From: Naadir Jeewa Date: Sun, 8 Mar 2026 16:37:01 +0000 Subject: [PATCH 4/6] feat: add CommunityState for incremental updates with hierarchy CommunityState wraps PartitionState to provide a persistent, serializable community detection state that supports incremental graph updates via delta edge lists. Key features: - initial() runs full Leiden and captures hierarchy levels - update() applies delta edges incrementally, updating level 0 - hierarchy_levels() exposes the multi-level community structure - community_tree() derives parent-child relationships via majority vote - into_parts()/from_parts() for decomposed storage and restoration Hierarchy support enables DRIFT search (top-down traversal of community tree for progressive query refinement). Co-Authored-By: Claude Opus 4.6 --- src/core/community_state.rs | 379 ++++++++++++++++++++++++++++++++++++ 1 file changed, 379 insertions(+) create mode 100644 src/core/community_state.rs diff --git a/src/core/community_state.rs b/src/core/community_state.rs new file mode 100644 index 0000000..6cbe446 --- /dev/null +++ b/src/core/community_state.rs @@ -0,0 +1,379 @@ +// Copyright 2026 naadir jeewa +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 + +use crate::core::algorithm::hit_leiden::{ + canonicalize_community_ids_in_place, compute_modularity, +}; +use crate::core::config::RunConfig; +use crate::core::error::HitLeidenError; +use crate::core::partition::state::PartitionState; +use crate::core::types::{GraphInput, PartitionResult}; +use ahash::{HashMap, HashMapExt, HashSet, HashSetExt}; + +/// Persistent community detection state that can be serialized, stored, +/// and later restored to apply incremental updates. +/// +/// # Usage +/// +/// ```no_run +/// use hit_leiden::{CommunityState, GraphInput, RunConfig}; +/// +/// // Initial run +/// let edges = GraphInput::from_unweighted_edges(vec![(0, 1), (1, 2), (2, 0)]); +/// let (state, partition) = CommunityState::initial(&edges, &RunConfig::default())?; +/// +/// // Persist state (requires `serde` feature) +/// // let bytes = serde_json::to_vec(&state)?; +/// +/// // Later: restore and apply delta +/// // let state: CommunityState = serde_json::from_slice(&bytes)?; +/// let delta = GraphInput::from_edges(vec![(2, 3, Some(1.0)), (3, 4, Some(1.0))]); +/// let (updated_state, updated_partition) = state.update(&delta, &RunConfig::default())?; +/// # Ok::<(), hit_leiden::HitLeidenError>(()) +/// ``` +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +#[derive(Clone, Debug)] +pub struct CommunityState { + /// Internal partition state (community mappings, supergraphs, hierarchy). + partition_state: PartitionState, + /// The current full graph as an edge list, needed to compute modularity + /// and to reconstruct `GraphInput` for the algorithm. + current_edges: Vec<(usize, usize, Option<f64>)>, + /// Current node count. + node_count: usize, + /// Hierarchy levels from community detection. Level 0 = finest (most + /// communities), last = coarsest (fewest). Each entry maps node index + /// to community ID at that level.
Stored so incremental updates can + /// return meaningful hierarchy data without full recompute. + hierarchy_levels: Vec<Vec<usize>>, +} + +/// A parent-child relationship between communities at successive hierarchy levels. +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct CommunityRelation { + /// Community ID at the finer (child) level. + pub child_community: usize, + /// The finer level index. + pub child_level: usize, + /// Community ID at the coarser (parent) level. + pub parent_community: usize, + /// The coarser level index. + pub parent_level: usize, +} + +impl CommunityState { + /// Run initial community detection on a graph and return the persistent + /// state alongside the partition result. + pub fn initial( + graph: &GraphInput, + config: &RunConfig, + ) -> Result<(Self, PartitionResult), HitLeidenError> { + config + .validate() + .map_err(|e| HitLeidenError::InvalidInput(e.to_string()))?; + graph.validate()?; + + let mut partition_state = PartitionState::identity(graph.node_count); + + let (iteration_count, hierarchy_levels) = + crate::core::algorithm::hit_leiden::multilevel_leiden( + &mut partition_state, + graph, + config.resolution, + config.mode, + config.max_iterations, + ); + + canonicalize_community_ids_in_place(&mut partition_state.node_to_comm); + + let community_count = count_communities(&partition_state.node_to_comm); + let quality_score = compute_modularity(graph, &partition_state.node_to_comm); + + let partition = PartitionResult { + run_id: String::new(), + node_to_community: partition_state.node_to_comm.clone(), + hierarchy_levels, + community_count, + quality_score, + iteration_count, + }; + + let state = Self { + partition_state, + current_edges: graph.edges.clone(), + node_count: graph.node_count, + hierarchy_levels: partition.hierarchy_levels.clone(), + }; + + Ok((state, partition)) + } + + /// Apply a delta graph to update the community structure incrementally.
+ /// + /// Delta edges use weight to encode changes: + /// - Positive weight (or `None` = 1.0): add or strengthen an edge + /// - Negative weight: remove or weaken an edge + /// + /// Consumes `self` and returns the updated state + new partition. + pub fn update( + mut self, + delta: &GraphInput, + config: &RunConfig, + ) -> Result<(Self, PartitionResult), HitLeidenError> { + config + .validate() + .map_err(|e| HitLeidenError::InvalidInput(e.to_string()))?; + + // Expand node count if delta introduces new nodes + let new_node_count = self.node_count.max(delta.node_count); + if new_node_count > self.node_count { + self.expand_to(new_node_count); + } + + // Run incremental HIT-Leiden + let iteration_count = crate::core::algorithm::hit_leiden::hit_leiden( + &mut self.partition_state, + delta, + config.resolution, + config.mode, + ); + + canonicalize_community_ids_in_place(&mut self.partition_state.node_to_comm); + + // Update stored edges by applying the delta + self.apply_edge_delta(delta); + + // Update the finest hierarchy level (level 0) with the new partition. + // Coarser levels are approximately stable for small incremental changes. 
+ if !self.hierarchy_levels.is_empty() { + self.hierarchy_levels[0] = self.partition_state.node_to_comm.clone(); + // Extend hierarchy levels if node count grew + for level in &mut self.hierarchy_levels { + if level.len() < self.node_count { + let old_len = level.len(); + level.resize(self.node_count, 0); + for i in old_len..self.node_count { + level[i] = i; + } + } + } + } else { + self.hierarchy_levels = vec![self.partition_state.node_to_comm.clone()]; + } + + let full_graph = GraphInput { + dataset_id: None, + node_count: self.node_count, + edges: self.current_edges.clone(), + }; + + let community_count = count_communities(&self.partition_state.node_to_comm); + let quality_score = + compute_modularity(&full_graph, &self.partition_state.node_to_comm); + + let partition = PartitionResult { + run_id: String::new(), + node_to_community: self.partition_state.node_to_comm.clone(), + hierarchy_levels: self.hierarchy_levels.clone(), + community_count, + quality_score, + iteration_count, + }; + + Ok((self, partition)) + } + + /// Get the current node count. + pub fn node_count(&self) -> usize { + self.node_count + } + + /// Get the current edge count. + pub fn edge_count(&self) -> usize { + self.current_edges.len() + } + + /// Decompose the state into its constituent parts for storage. + /// + /// Returns `(partition_state, edges, node_count, hierarchy_levels)`. + /// The caller is responsible for storing the `PartitionState` (the only + /// unique data); edges and node count can typically be derived from an + /// external source (e.g. a database edge table). + pub fn into_parts( + self, + ) -> ( + PartitionState, + Vec<(usize, usize, Option<f64>)>, + usize, + Vec<Vec<usize>>, + ) { + ( + self.partition_state, + self.current_edges, + self.node_count, + self.hierarchy_levels, + ) + } + + /// Reconstruct a `CommunityState` from previously decomposed parts.
+ /// + /// Use this when restoring from persisted storage where the + /// `PartitionState` was stored separately from the graph edges. + pub fn from_parts( + partition_state: PartitionState, + edges: Vec<(usize, usize, Option<f64>)>, + node_count: usize, + hierarchy_levels: Vec<Vec<usize>>, + ) -> Self { + Self { + partition_state, + current_edges: edges, + node_count, + hierarchy_levels, + } + } + + /// Borrow the internal partition state (for serialization without consuming). + pub fn partition_state(&self) -> &PartitionState { + &self.partition_state + } + + /// Get the stored hierarchy levels. + /// + /// Level 0 = finest (most communities), last = coarsest (fewest). + /// Each entry maps node index → community ID at that level. + pub fn hierarchy_levels(&self) -> &[Vec<usize>] { + &self.hierarchy_levels + } + + /// Derive parent-child relationships between communities at successive + /// hierarchy levels. + /// + /// For each community at level L, determines its parent at level L+1 by + /// majority vote: the parent is whichever community at L+1 contains the + /// most nodes from the child community at L.
+ pub fn community_tree(&self) -> Vec<CommunityRelation> { + let mut relations = Vec::new(); + for level in 0..self.hierarchy_levels.len().saturating_sub(1) { + let child_assignments = &self.hierarchy_levels[level]; + let parent_assignments = &self.hierarchy_levels[level + 1]; + + // Group nodes by their child community, count parent community votes + let mut child_to_parent_votes: HashMap<usize, HashMap<usize, usize>> = + HashMap::new(); + let len = child_assignments.len().min(parent_assignments.len()); + for i in 0..len { + let child_comm = child_assignments[i]; + let parent_comm = parent_assignments[i]; + *child_to_parent_votes + .entry(child_comm) + .or_insert_with(HashMap::new) + .entry(parent_comm) + .or_insert(0) += 1; + } + + // For each child community, pick the parent with the most votes + for (child_comm, votes) in &child_to_parent_votes { + if let Some((&parent_comm, _)) = votes.iter().max_by_key(|(_, &count)| count) { + relations.push(CommunityRelation { + child_community: *child_comm, + child_level: level, + parent_community: parent_comm, + parent_level: level + 1, + }); + } + } + } + relations + } + + /// Expand state vectors to accommodate new nodes.
+ fn expand_to(&mut self, new_count: usize) { + let old = self.node_count; + self.node_count = new_count; + + // Extend partition state vectors + self.partition_state.node_to_comm.resize(new_count, 0); + for i in old..new_count { + self.partition_state.node_to_comm[i] = i; + } + self.partition_state.comm_weights.resize(new_count, 0.0); + self.partition_state.node_weights.resize(new_count, 0.0); + + for mapping in &mut self.partition_state.community_mapping_per_level { + let prev_len = mapping.len(); + mapping.resize(new_count, 0); + for i in prev_len..new_count { + mapping[i] = i; + } + } + for mapping in &mut self.partition_state.refined_community_mapping_per_level { + let prev_len = mapping.len(); + mapping.resize(new_count, 0); + for i in prev_len..new_count { + mapping[i] = i; + } + } + for mapping in &mut self.partition_state.previous_subcommunity_mapping_per_level { + let prev_len = mapping.len(); + mapping.resize(new_count, 0); + for i in prev_len..new_count { + mapping[i] = i; + } + } + for mapping in &mut self.partition_state.current_subcommunity_mapping_per_level { + let prev_len = mapping.len(); + mapping.resize(new_count, 0); + for i in prev_len..new_count { + mapping[i] = i; + } + } + } + + /// Apply delta edges to the stored edge list. 
+ fn apply_edge_delta(&mut self, delta: &GraphInput) { + // Build a map of canonical edges -> weight + let mut edge_map: HashMap<(usize, usize), f64> = + HashMap::with_capacity(self.current_edges.len()); + for &(u, v, w) in &self.current_edges { + let key = if u <= v { (u, v) } else { (v, u) }; + *edge_map.entry(key).or_insert(0.0) += w.unwrap_or(1.0); + } + + // Apply delta + for &(u, v, w) in &delta.edges { + let key = if u <= v { (u, v) } else { (v, u) }; + *edge_map.entry(key).or_insert(0.0) += w.unwrap_or(1.0); + } + + // Rebuild edge list, filtering out zero/negative weight edges + self.current_edges = edge_map + .into_iter() + .filter(|(_, w)| *w > 1e-12) + .map(|((u, v), w)| (u, v, Some(w))) + .collect(); + } +} + +/// Count unique community IDs. +fn count_communities(node_to_comm: &[usize]) -> usize { + let mut seen = HashSet::new(); + for &c in node_to_comm { + seen.insert(c); + } + seen.len() +} From 51d8072f793108faf7b111a66a8cec6cdba06039 Mon Sep 17 00:00:00 2001 From: Naadir Jeewa Date: Sun, 8 Mar 2026 16:37:07 +0000 Subject: [PATCH 5/6] test: add integration tests for CommunityState hierarchy features Tests cover: - hierarchy_levels populated after initial() - hierarchy_levels preserved and extended after update() - community_tree() returns valid parent-child relations - into_parts/from_parts roundtrip preserves hierarchy - incremental update works after restore from parts Co-Authored-By: Claude Opus 4.6 --- .../test_incremental_community_state.rs | 249 ++++++++++++++++++ tests/integration_tests.rs | 2 + 2 files changed, 251 insertions(+) create mode 100644 tests/integration/test_incremental_community_state.rs diff --git a/tests/integration/test_incremental_community_state.rs b/tests/integration/test_incremental_community_state.rs new file mode 100644 index 0000000..5592f7b --- /dev/null +++ b/tests/integration/test_incremental_community_state.rs @@ -0,0 +1,249 @@ +// Copyright 2026 naadir jeewa +// +// Licensed under the Apache License, Version 2.0 
(the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +use hit_leiden::{CommunityState, GraphInput, RunConfig}; + +#[test] +fn initial_then_incremental_update() { + // Two disjoint triangles (no edge connects them yet) + let graph = GraphInput::from_edges(vec![ + (0, 1, Some(1.0)), + (1, 2, Some(1.0)), + (2, 0, Some(1.0)), + (3, 4, Some(1.0)), + (4, 5, Some(1.0)), + (5, 3, Some(1.0)), + ]); + + let config = RunConfig::default(); + let (state, partition) = CommunityState::initial(&graph, &config).expect("initial run"); + + assert_eq!(partition.node_to_community.len(), 6); + assert!(partition.community_count >= 1); + assert_eq!(state.node_count(), 6); + assert_eq!(state.edge_count(), 6); + + // Add a bridge between the two triangles and two new nodes (6 and 7) + let delta = GraphInput::from_edges(vec![ + (2, 3, Some(1.0)), + (5, 6, Some(1.0)), + (6, 7, Some(1.0)), + (7, 5, Some(1.0)), + ]); + + let (updated_state, updated_partition) = + state.update(&delta, &config).expect("incremental update"); + + // Should now have 8 nodes + assert_eq!(updated_partition.node_to_community.len(), 8); + assert_eq!(updated_state.node_count(), 8); + // Original 6 edges + 4 new = 10 edges (canonical, deduplicated) + assert!(updated_state.edge_count() >= 9); +} + +#[test] +fn edge_removal_via_negative_weight() { + let graph = GraphInput::from_edges(vec![ + (0, 1, Some(1.0)), + (1, 2, Some(1.0)), + (2, 0, Some(1.0)), + ]); + + let config = RunConfig::default(); + let (state, _) = CommunityState::initial(&graph,
&config).expect("initial run"); + assert_eq!(state.edge_count(), 3); + + // Remove edge (0,1) by adding negative weight + let delta = GraphInput { + dataset_id: None, + node_count: 3, + edges: vec![(0, 1, Some(-1.0))], + }; + + let (updated_state, updated_partition) = + state.update(&delta, &config).expect("update with removal"); + + assert_eq!(updated_partition.node_to_community.len(), 3); + // One edge removed + assert_eq!(updated_state.edge_count(), 2); +} + +#[test] +fn hierarchy_levels_populated_after_initial() { + let graph = GraphInput::from_unweighted_edges(vec![ + (0, 1), + (1, 2), + (2, 0), + (3, 4), + (4, 5), + (5, 3), + ]); + + let (state, partition) = + CommunityState::initial(&graph, &RunConfig::default()).expect("initial run"); + + let levels = state.hierarchy_levels(); + assert!(!levels.is_empty(), "hierarchy_levels should be non-empty after initial()"); + + // Level 0 (finest) should have one entry per node + assert_eq!(levels[0].len(), 6); + + // PartitionResult should also carry the same hierarchy + assert_eq!(partition.hierarchy_levels.len(), levels.len()); + assert_eq!(partition.hierarchy_levels[0], levels[0]); +} + +#[test] +fn hierarchy_levels_preserved_after_update() { + let graph = GraphInput::from_unweighted_edges(vec![ + (0, 1), + (1, 2), + (2, 0), + (3, 4), + (4, 5), + (5, 3), + ]); + + let config = RunConfig::default(); + let (state, _) = CommunityState::initial(&graph, &config).expect("initial run"); + let initial_level_count = state.hierarchy_levels().len(); + + // Add a bridge and new nodes + let delta = GraphInput::from_edges(vec![ + (2, 3, Some(1.0)), + (5, 6, Some(1.0)), + (6, 7, Some(1.0)), + ]); + + let (updated_state, updated_partition) = + state.update(&delta, &config).expect("incremental update"); + + let levels = updated_state.hierarchy_levels(); + assert!(!levels.is_empty(), "hierarchy_levels should be non-empty after update()"); + // Level count should be at least what we had initially + assert!( + levels.len() >= 
initial_level_count, + "should not lose hierarchy levels after update" + ); + + // Level 0 should cover all 8 nodes + assert_eq!(levels[0].len(), 8); + // All levels should cover all 8 nodes + for (i, level) in levels.iter().enumerate() { + assert_eq!( + level.len(), + 8, + "level {} should have entries for all 8 nodes", + i + ); + } + + // PartitionResult should match + assert_eq!(updated_partition.hierarchy_levels.len(), levels.len()); +} + +#[test] +fn community_tree_returns_valid_relations() { + // Use a larger graph to get multiple hierarchy levels + let graph = GraphInput::from_unweighted_edges(vec![ + (0, 1), + (1, 2), + (2, 0), + (3, 4), + (4, 5), + (5, 3), + (6, 7), + (7, 8), + (8, 6), + ]); + + let (state, _) = + CommunityState::initial(&graph, &RunConfig::default()).expect("initial run"); + + let tree = state.community_tree(); + let levels = state.hierarchy_levels(); + + if levels.len() > 1 { + // If we have multiple levels, we should have some relations + assert!( + !tree.is_empty(), + "community_tree() should return relations when multiple levels exist" + ); + + for rel in &tree { + // Parent level should be exactly one above child level + assert_eq!( + rel.parent_level, + rel.child_level + 1, + "parent_level should be child_level + 1" + ); + // Levels should be within bounds + assert!( + rel.child_level < levels.len(), + "child_level out of bounds" + ); + assert!( + rel.parent_level < levels.len(), + "parent_level out of bounds" + ); + } + } + // If only 1 level, tree should be empty (no parent-child to derive) + if levels.len() <= 1 { + assert!( + tree.is_empty(), + "community_tree() should be empty with only one level" + ); + } +} + +#[test] +fn into_parts_from_parts_roundtrip_preserves_hierarchy() { + let graph = GraphInput::from_unweighted_edges(vec![ + (0, 1), + (1, 2), + (2, 0), + (3, 4), + (4, 5), + (5, 3), + ]); + + let config = RunConfig::default(); + let (state, _) = CommunityState::initial(&graph, &config).expect("initial run"); + + let 
original_levels = state.hierarchy_levels().to_vec(); + let original_node_count = state.node_count(); + let original_edge_count = state.edge_count(); + + // Decompose + let (partition_state, edges, node_count, hierarchy_levels) = state.into_parts(); + + assert_eq!(node_count, original_node_count); + assert_eq!(hierarchy_levels, original_levels); + + // Reconstruct + let restored = CommunityState::from_parts(partition_state, edges, node_count, hierarchy_levels); + + assert_eq!(restored.node_count(), original_node_count); + assert_eq!(restored.edge_count(), original_edge_count); + assert_eq!(restored.hierarchy_levels(), &original_levels[..]); + + // Restored state should still support incremental updates + let delta = GraphInput::from_edges(vec![(2, 3, Some(1.0))]); + let (updated, partition) = restored.update(&delta, &config).expect("update after restore"); + assert_eq!(partition.node_to_community.len(), 6); + assert!(!updated.hierarchy_levels().is_empty()); +} diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index f5fd9d8..ea744be 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -28,3 +28,5 @@ mod test_neo4j_snapshot_parity; mod test_release_gate_live_query_ineligible; #[path = "integration/test_throughput_equivalence.rs"] mod test_throughput_equivalence; +#[path = "integration/test_incremental_community_state.rs"] +mod test_incremental_community_state; From f3fa1aa25eecc896bf19cbaadf57247ca6ff4aa9 Mon Sep 17 00:00:00 2001 From: Naadir Jeewa Date: Sun, 8 Mar 2026 16:37:13 +0000 Subject: [PATCH 6/6] docs: add library usage examples and hierarchy documentation to README Add Quick Start sections covering: - Simple edges-in/partition-out usage with run_simple() - Configured runs with RunConfig builders - Weighted edges and dataset identifiers - Incremental updates with CommunityState - Hierarchy access for DRIFT search - CLI usage Co-Authored-By: Claude Opus 4.6 --- README.md | 119 
++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 116 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 6ed40d5..0490630 100644 --- a/README.md +++ b/README.md @@ -30,12 +30,125 @@ downstream summarisation to keep pace with document ingestion. ## Quick Start +### Library usage + +Add to your `Cargo.toml`: + +```toml +[dependencies] +hit_leiden = { path = "path/to/hit-leiden/usability" } +# Optional: enable serde support without pulling in profiling deps +# hit_leiden = { path = "...", features = ["serde"] } +``` + +Simplest possible usage — edges in, partition out: + +```rust +use hit_leiden::PartitionResult; + +let edges = vec![(0, 1, None), (1, 2, None), (2, 0, None), (3, 4, None), (4, 5, None), (5, 3, None)]; +let partition: PartitionResult = hit_leiden::run_simple(edges)?; + +for (node, &community) in partition.node_to_community.iter().enumerate() { + println!("node {} -> community {}", node, community); +} +``` + +With configuration: + +```rust +use hit_leiden::{GraphInput, RunConfig, RunMode}; + +let graph = GraphInput::from_unweighted_edges(vec![(0, 1), (1, 2), (2, 0)]); +let config = RunConfig::default() + .with_mode(RunMode::Deterministic) + .with_resolution(1.0) + .with_max_iterations(10); + +let outcome = hit_leiden::run(&graph, &config)?; +let partition = outcome.into_partition()?; +println!("{} communities found (Q={:.4})", partition.community_count, partition.quality_score); +``` + +With weighted edges and a dataset identifier: + +```rust +use hit_leiden::GraphInput; + +let graph = GraphInput { + dataset_id: Some("my-knowledge-graph".to_string()), + node_count: 4, + edges: vec![(0, 1, Some(1.0)), (1, 2, Some(0.5)), (2, 3, Some(1.0))], +}; + +let partition = hit_leiden::run_simple(graph.edges.clone())?; +``` + +Incremental updates — persist state, restore it later, and apply a delta: + +```rust +use hit_leiden::{CommunityState, GraphInput, RunConfig}; + +// Initial community detection +let edges = 
GraphInput::from_unweighted_edges(vec![ + (0, 1), (1, 2), (2, 0), (3, 4), (4, 5), (5, 3), +]); +let (state, partition) = CommunityState::initial(&edges, &RunConfig::default())?; + +// Persist state (enable the `serde` feature) +// let bytes = serde_json::to_vec(&state)?; + +// Later: restore and apply a delta graph +// let state: CommunityState = serde_json::from_slice(&bytes)?; +let delta = GraphInput::from_edges(vec![ + (2, 3, Some(1.0)), // bridge the two triangles + (5, 6, Some(1.0)), // add new nodes +]); +let (updated_state, updated_partition) = state.update(&delta, &RunConfig::default())?; +println!("{} communities", updated_partition.community_count); +``` + +Hierarchy access — traverse the community tree for [DRIFT search](https://microsoft.github.io/graphrag/query/drift_search/): + +```rust +use hit_leiden::{CommunityState, GraphInput, RunConfig}; + +let edges = GraphInput::from_unweighted_edges(vec![ + (0, 1), (1, 2), (2, 0), (3, 4), (4, 5), (5, 3), +]); +let (state, partition) = CommunityState::initial(&edges, &RunConfig::default())?; + +// hierarchy_levels: level 0 = finest, last = coarsest +// Each entry maps node index → community ID at that level +let levels = state.hierarchy_levels(); +println!("{} hierarchy levels", levels.len()); + +// Derive parent-child relationships between communities at successive levels +let tree = state.community_tree(); +for rel in &tree { + println!( + "community {} (level {}) -> parent {} (level {})", + rel.child_community, rel.child_level, + rel.parent_community, rel.parent_level, + ); +} + +// Decomposed storage: persist only the unique algorithm state +let (partition_state, edges_out, node_count, hierarchy) = state.into_parts(); +// Store partition_state via serde; edges and hierarchy can be derived from DB + +// Restore later +let restored = CommunityState::from_parts(partition_state, edges_out, node_count, hierarchy); +``` + +### CLI usage + ```sh -# Build -cargo build --release +# Build (CLI requires the `cli` 
feature) +cargo build --release --features cli # Run on an edge list (one "src dst [weight]" per line) -cargo run --release -- run --source file --path graph.txt +cargo run --release --features cli -- run --graph-source graph.txt # Run benchmarks cargo bench