From 16839e4a27eef850b0f3c2ca14d3ffd49e35e825 Mon Sep 17 00:00:00 2001 From: hczphn Date: Sat, 21 Feb 2026 22:12:07 +0000 Subject: [PATCH 1/5] Add unchecked device memory export, LogUp query count API, lightweight prove, and configurable CPU count - Add `export_device_memories_unchecked()` for exporting device memories without state assertion, enabling memory optimization workflows where context is dropped before proving - Add `prove_lightweight()` to ExpanderNoOverSubscribe, allowing prove without holding computation_graph or prover_setup references - Add `final_check_with_query_count()` to LogUpSingleKeyTable and LogUpRangeProofTable for hint-free logup verification with externally provided query counts - Support `ZKML_NUM_CPUS` env var to override physical CPU detection for MPI process count Co-Authored-By: Claude Opus 4.6 --- circuit-std-rs/src/logup.rs | 48 +++++++++++++++++++ expander_compiler/src/zkcuda/context.rs | 11 +++++ .../api_no_oversubscribe.rs | 12 +++++ .../expander_parallelized/client_utils.rs | 6 ++- 4 files changed, 76 insertions(+), 1 deletion(-) diff --git a/circuit-std-rs/src/logup.rs b/circuit-std-rs/src/logup.rs index a88ba102..d99b8b7c 100644 --- a/circuit-std-rs/src/logup.rs +++ b/circuit-std-rs/src/logup.rs @@ -328,6 +328,35 @@ impl LogUpSingleKeyTable { assert_eq_rational(builder, &v_table, &v_query); } + + pub fn final_check_with_query_count>( + &mut self, + builder: &mut B, + query_count: &[Variable], + ) { + if self.table.is_empty() || self.query_keys.is_empty() { + panic!("empty table or empty query"); + } + + let value_len = self.table[0].len(); + + let alpha = builder.get_random_value(); + let randomness = get_column_randomness(builder, value_len); + + let table_combined = combine_columns(builder, &self.table, &randomness); + let v_table = logup_poly_val(builder, &table_combined, query_count, &alpha); + + let query_combined = combine_columns(builder, &self.query_results, &randomness); + let one = builder.constant(1); + let v_query = logup_poly_val( + builder, + &query_combined, + &vec![one; query_combined.len()], + &alpha, + ); + + assert_eq_rational(builder, &v_table, &v_query); + } } pub struct LogUpRangeProofTable { @@ -455,6 +484,25 @@ impl LogUpRangeProofTable { ); assert_eq_rational(builder, &v_table, &v_query); } + + pub fn final_check_with_query_count>( + &mut self, + builder: &mut B, + query_count: &[Variable], + ) { + let alpha = builder.get_random_value(); + + let v_table = logup_poly_val(builder, &self.table_keys, query_count, &alpha); + + let one = builder.constant(1); + let v_query = logup_poly_val( + builder, + &self.query_keys, + &vec![one; self.query_keys.len()], + &alpha, + ); + assert_eq_rational(builder, &v_table, &v_query); + } } pub fn query_count_hint(inputs: &[F], outputs: &mut [F]) -> Result<(), Error> { diff --git a/expander_compiler/src/zkcuda/context.rs b/expander_compiler/src/zkcuda/context.rs index 40081f72..a929ffac 100644 --- a/expander_compiler/src/zkcuda/context.rs +++ b/expander_compiler/src/zkcuda/context.rs @@ -888,6 +888,17 @@ impl>> Context { ContextState::WitnessDone, "Please finish computation graph and witness solving before exporting device memories." ); + self.export_device_memories_impl() + } + + /// Export device memories without checking the context state. + /// Use this when you need to export memories outside the normal workflow, + /// e.g., for memory optimization where you want to export and then drop the context. + pub fn export_device_memories_unchecked(&self) -> Vec>> { + self.export_device_memories_impl() + } + + fn export_device_memories_impl(&self) -> Vec>> { self.device_memories .iter() .map(|dm| { diff --git a/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/api_no_oversubscribe.rs b/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/api_no_oversubscribe.rs index 7d7fed98..6a559fa1 100644 --- a/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/api_no_oversubscribe.rs +++ b/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/api_no_oversubscribe.rs @@ -73,3 +73,15 @@ where wait_async(ClientHttpHelper::request_exit()) } } + +impl ExpanderNoOverSubscribe +where + as ExpanderPCS>>::Commitment: + AsRef< as ExpanderPCS>>::Commitment>, +{ + /// Lightweight prove that doesn't require computation_graph or prover_setup. + /// Use this after setup() to allow releasing those large data structures before proving. + pub fn prove_lightweight(device_memories: Vec>>) { + client_send_witness_and_prove::(device_memories); + } +} diff --git a/expander_compiler/src/zkcuda/proving_system/expander_parallelized/client_utils.rs b/expander_compiler/src/zkcuda/proving_system/expander_parallelized/client_utils.rs index 42315b39..bf4a07c4 100644 --- a/expander_compiler/src/zkcuda/proving_system/expander_parallelized/client_utils.rs +++ b/expander_compiler/src/zkcuda/proving_system/expander_parallelized/client_utils.rs @@ -112,7 +112,11 @@ where let mpi_size = if allow_oversubscribe { max_parallel_count } else { - let num_cpus = prev_power_of_two(num_cpus::get_physical()); + let num_cpus = std::env::var("ZKML_NUM_CPUS") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or_else(num_cpus::get_physical); + let num_cpus = prev_power_of_two(num_cpus); if max_parallel_count > num_cpus { num_cpus } else { From 639a18bd279ba649d2692488b80840e054ae6b9e Mon Sep 17 00:00:00 2001 From: hczphn Date: Sun, 22 Feb 2026 01:25:37 +0000 Subject: [PATCH 2/5] Reduce peak memory during prove by releasing witness shared memory early Add witness_ack shared memory signaling between client and server: - Client resets a 1-byte ack signal before writing witness - Server signals ack after reading witness into MPI shared memory - Client polls for ack, then immediately releases witness shared memory and calls malloc_trim to return memory to OS - Prove request runs concurrently via tokio async, so witness memory is freed while proving is in progress - Skip reading PCS setup from shared memory (return default) since the client does not need it after setup Co-Authored-By: Claude Opus 4.6 --- .../expander_parallelized/client_utils.rs | 39 ++++++++++++++-- .../expander_parallelized/server_ctrl.rs | 3 ++ .../shared_memory_utils.rs | 44 +++++++++++++++++++ 3 files changed, 83 insertions(+), 3 deletions(-) diff --git a/expander_compiler/src/zkcuda/proving_system/expander_parallelized/client_utils.rs b/expander_compiler/src/zkcuda/proving_system/expander_parallelized/client_utils.rs index bf4a07c4..a83b7219 100644 --- a/expander_compiler/src/zkcuda/proving_system/expander_parallelized/client_utils.rs +++ b/expander_compiler/src/zkcuda/proving_system/expander_parallelized/client_utils.rs @@ -87,7 +87,7 @@ where C: GKREngine, ECCConfig: Config, { - let setup_timer = Timer::new("setup", true); + let setup_timer = Timer::new("new setup", true); println!("Starting server with binary: {server_binary}"); let mut bytes = vec![]; @@ -140,7 +140,11 @@ where setup_timer.stop(); - SharedMemoryEngine::read_pcs_setup_from_shared_memory() + // Skip reading PCS setup from shared memory; return default to reduce memory + ( + ExpanderProverSetup::default(), + ExpanderVerifierSetup::default(), + ) } pub fn client_send_witness_and_prove( @@ -152,8 +156,37 @@ where { let timer = Timer::new("prove", true); + // Reset ack signal, then write witness + SharedMemoryEngine::reset_witness_ack(); SharedMemoryEngine::write_witness_to_shared_memory::(device_memories); - wait_async(ClientHttpHelper::request_prove()); + + extern "C" { + fn malloc_trim(pad: usize) -> i32; + } + unsafe { + malloc_trim(0); + } + + // Async: send prove request + poll for witness ack to release shared memory early + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { + let prove_handle = tokio::spawn(async { + ClientHttpHelper::request_prove().await; + }); + + // Poll witness_ack; once server confirms read, release witness shared memory + tokio::task::spawn_blocking(|| { + SharedMemoryEngine::wait_for_witness_read_complete(); + unsafe { + super::shared_memory_utils::SHARED_MEMORY.witness = None; + malloc_trim(0); + } + }) + .await + .expect("Witness cleanup task failed"); + + prove_handle.await.expect("Prove task failed"); + }); let proof = SharedMemoryEngine::read_proof_from_shared_memory(); diff --git a/expander_compiler/src/zkcuda/proving_system/expander_parallelized/server_ctrl.rs b/expander_compiler/src/zkcuda/proving_system/expander_parallelized/server_ctrl.rs index 27919a50..f51dd509 100644 --- a/expander_compiler/src/zkcuda/proving_system/expander_parallelized/server_ctrl.rs +++ b/expander_compiler/src/zkcuda/proving_system/expander_parallelized/server_ctrl.rs @@ -149,6 +149,9 @@ where let mut witness_win = state.wt_shared_memory_win.lock().await; S::setup_shared_witness(&state.global_mpi_config, &mut witness, &mut witness_win); + // Signal client: witness has been read, shared memory can be released + SharedMemoryEngine::signal_witness_read_complete(); + let prover_setup_guard = state.prover_setup.lock().await; let computation_graph = state.computation_graph.lock().await; diff --git a/expander_compiler/src/zkcuda/proving_system/expander_parallelized/shared_memory_utils.rs b/expander_compiler/src/zkcuda/proving_system/expander_parallelized/shared_memory_utils.rs index 648f33a8..27889463 100644 --- a/expander_compiler/src/zkcuda/proving_system/expander_parallelized/shared_memory_utils.rs +++ b/expander_compiler/src/zkcuda/proving_system/expander_parallelized/shared_memory_utils.rs @@ -18,12 +18,15 @@ pub struct SharedMemory { pub pcs_setup: Option, pub witness: Option, pub proof: Option, + /// 1-byte signal: 0 = witness not read, 1 = server finished reading witness + pub witness_ack: Option, } pub static mut SHARED_MEMORY: SharedMemory = SharedMemory { pcs_setup: None, witness: None, proof: None, + witness_ack: None, }; pub struct SharedMemoryEngine {} @@ -106,6 +109,47 @@ impl SharedMemoryEngine { Self::read_object_from_shared_memory("pcs_setup", 0) } + /// Client: reset witness_ack to 0 (call before writing witness) + pub fn reset_witness_ack() { + unsafe { + Self::allocate_shared_memory_if_necessary( + &mut SHARED_MEMORY.witness_ack, + "witness_ack", + 1, + ); + let ptr = SHARED_MEMORY.witness_ack.as_mut().unwrap().as_ptr(); + std::ptr::write_volatile(ptr, 0u8); + } + } + + /// Server: set witness_ack to 1 (call after reading witness) + pub fn signal_witness_read_complete() { + let shmem = ShmemConf::new() + .flink("witness_ack") + .open() + .expect("Failed to open witness_ack shared memory"); + unsafe { + std::ptr::write_volatile(shmem.as_ptr(), 1u8); + } + } + + /// Client: poll until witness_ack becomes 1 + pub fn wait_for_witness_read_complete() { + unsafe { + let ptr = SHARED_MEMORY + .witness_ack + .as_ref() + .expect("witness_ack not initialized, call reset_witness_ack first") + .as_ptr() as *const u8; + loop { + if std::ptr::read_volatile(ptr) != 0 { + break; + } + std::thread::sleep(std::time::Duration::from_millis(500)); + } + } + } + pub fn write_witness_to_shared_memory(values: Vec>) { let total_size = std::mem::size_of::() + values From 95038f40f1d13bb421612ee10afc7f9fc68aa182 Mon Sep 17 00:00:00 2001 From: hczphn Date: Sun, 22 Feb 2026 01:33:11 +0000 Subject: [PATCH 3/5] Address review feedback: platform-guard malloc_trim, reduce polling interval - Wrap malloc_trim calls with #[cfg(all(target_os = "linux", target_env = "gnu"))] to avoid linker errors on non-glibc platforms - Reduce witness_ack polling interval from 500ms to 10ms for faster response Co-Authored-By: Claude Opus 4.6 --- .../expander_parallelized/client_utils.rs | 19 ++++++++++++++----- .../shared_memory_utils.rs | 2 +- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/expander_compiler/src/zkcuda/proving_system/expander_parallelized/client_utils.rs b/expander_compiler/src/zkcuda/proving_system/expander_parallelized/client_utils.rs index a83b7219..fe1159d8 100644 --- a/expander_compiler/src/zkcuda/proving_system/expander_parallelized/client_utils.rs +++ b/expander_compiler/src/zkcuda/proving_system/expander_parallelized/client_utils.rs @@ -160,11 +160,14 @@ where SharedMemoryEngine::reset_witness_ack(); SharedMemoryEngine::write_witness_to_shared_memory::(device_memories); - extern "C" { - fn malloc_trim(pad: usize) -> i32; - } - unsafe { - malloc_trim(0); + #[cfg(all(target_os = "linux", target_env = "gnu"))] + { + extern "C" { + fn malloc_trim(pad: usize) -> i32; + } + unsafe { + malloc_trim(0); + } } // Async: send prove request + poll for witness ack to release shared memory early @@ -179,6 +182,12 @@ where SharedMemoryEngine::wait_for_witness_read_complete(); unsafe { super::shared_memory_utils::SHARED_MEMORY.witness = None; + } + #[cfg(all(target_os = "linux", target_env = "gnu"))] + unsafe { + extern "C" { + fn malloc_trim(pad: usize) -> i32; + } malloc_trim(0); } }) diff --git a/expander_compiler/src/zkcuda/proving_system/expander_parallelized/shared_memory_utils.rs b/expander_compiler/src/zkcuda/proving_system/expander_parallelized/shared_memory_utils.rs index 27889463..c5584674 100644 --- a/expander_compiler/src/zkcuda/proving_system/expander_parallelized/shared_memory_utils.rs +++ b/expander_compiler/src/zkcuda/proving_system/expander_parallelized/shared_memory_utils.rs @@ -145,7 +145,7 @@ impl SharedMemoryEngine { if std::ptr::read_volatile(ptr) != 0 { break; } - std::thread::sleep(std::time::Duration::from_millis(500)); + std::thread::sleep(std::time::Duration::from_millis(10)); } } } From 2465d52ecca2a093e71a5471243fabb800976feb Mon Sep 17 00:00:00 2001 From: hczphn Date: Sun, 22 Feb 2026 01:47:54 +0000 Subject: [PATCH 4/5] Address review: add polling timeout, revert debug label, remove redundant malloc_trim - Add 5-minute timeout to wait_for_witness_read_complete to prevent indefinite hang if the server crashes - Revert timer label from "new setup" back to "setup" - Remove duplicate malloc_trim inside spawn_blocking (shared memory is mmap-managed, not glibc heap) Co-Authored-By: Claude Opus 4.6 --- .../expander_parallelized/client_utils.rs | 9 +-------- .../expander_parallelized/shared_memory_utils.rs | 11 ++++++++++- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/expander_compiler/src/zkcuda/proving_system/expander_parallelized/client_utils.rs b/expander_compiler/src/zkcuda/proving_system/expander_parallelized/client_utils.rs index fe1159d8..41edcd36 100644 --- a/expander_compiler/src/zkcuda/proving_system/expander_parallelized/client_utils.rs +++ b/expander_compiler/src/zkcuda/proving_system/expander_parallelized/client_utils.rs @@ -87,7 +87,7 @@ where C: GKREngine, ECCConfig: Config, { - let setup_timer = Timer::new("new setup", true); + let setup_timer = Timer::new("setup", true); println!("Starting server with binary: {server_binary}"); let mut bytes = vec![]; @@ -183,13 +183,6 @@ where unsafe { super::shared_memory_utils::SHARED_MEMORY.witness = None; } - #[cfg(all(target_os = "linux", target_env = "gnu"))] - unsafe { - extern "C" { - fn malloc_trim(pad: usize) -> i32; - } - malloc_trim(0); - } }) .await .expect("Witness cleanup task failed"); diff --git a/expander_compiler/src/zkcuda/proving_system/expander_parallelized/shared_memory_utils.rs b/expander_compiler/src/zkcuda/proving_system/expander_parallelized/shared_memory_utils.rs index c5584674..b03aa639 100644 --- a/expander_compiler/src/zkcuda/proving_system/expander_parallelized/shared_memory_utils.rs +++ b/expander_compiler/src/zkcuda/proving_system/expander_parallelized/shared_memory_utils.rs @@ -133,8 +133,11 @@ impl SharedMemoryEngine { } } - /// Client: poll until witness_ack becomes 1 + /// Client: poll until witness_ack becomes 1, with a timeout to avoid hanging + /// if the server crashes. pub fn wait_for_witness_read_complete() { + const TIMEOUT: std::time::Duration = std::time::Duration::from_secs(300); + let start = std::time::Instant::now(); unsafe { let ptr = SHARED_MEMORY .witness_ack @@ -145,6 +148,12 @@ impl SharedMemoryEngine { if std::ptr::read_volatile(ptr) != 0 { break; } + if start.elapsed() > TIMEOUT { + panic!( + "Timed out waiting for server to read witness ({}s)", + TIMEOUT.as_secs() + ); + } std::thread::sleep(std::time::Duration::from_millis(10)); } } From dc557cbfbb36f2445dce5b94f0f5799cf04e749f Mon Sep 17 00:00:00 2001 From: hczphn Date: Sun, 22 Feb 2026 02:28:14 +0000 Subject: [PATCH 5/5] fix: restore verifier setup from shared memory to fix verify panic The previous optimization skipped reading PCS setup from shared memory and returned empty defaults, which caused verify to panic on v_keys lookup (unwrap on None). Co-Authored-By: Claude Opus 4.6 --- .../expander_parallelized/client_utils.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/expander_compiler/src/zkcuda/proving_system/expander_parallelized/client_utils.rs b/expander_compiler/src/zkcuda/proving_system/expander_parallelized/client_utils.rs index 41edcd36..64b39c03 100644 --- a/expander_compiler/src/zkcuda/proving_system/expander_parallelized/client_utils.rs +++ b/expander_compiler/src/zkcuda/proving_system/expander_parallelized/client_utils.rs @@ -140,11 +140,11 @@ where setup_timer.stop(); - // Skip reading PCS setup from shared memory; return default to reduce memory - ( - ExpanderProverSetup::default(), - ExpanderVerifierSetup::default(), - ) + // Prover setup not needed on client side (server does the proving). + // Verifier setup is required for verification, so read it from shared memory. + let (_prover_setup, verifier_setup) = + SharedMemoryEngine::read_pcs_setup_from_shared_memory::(); + (ExpanderProverSetup::default(), verifier_setup) } pub fn client_send_witness_and_prove(