Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion qdp/qdp-core/src/encoding/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,8 @@ pub(crate) fn stream_encode<E: ChunkEncoder>(
let num_samples = reader_core.total_rows;

// Allocate output state vector
let total_state_vector = GpuStateVector::new_batch(&engine.device, num_samples, num_qubits)?;
let total_state_vector =
GpuStateVector::new_batch(&engine.device, num_samples, num_qubits, engine.precision())?;
const PIPELINE_EVENT_SLOTS: usize = 2;
let ctx = PipelineContext::new(&engine.device, PIPELINE_EVENT_SLOTS)?;

Expand Down
111 changes: 107 additions & 4 deletions qdp/qdp-core/src/gpu/encodings/amplitude.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ use crate::gpu::memory::{ensure_device_memory_available, map_allocation_error};
use cudarc::driver::{DevicePtr, DevicePtrMut};
#[cfg(target_os = "linux")]
use qdp_kernels::{
launch_amplitude_encode, launch_amplitude_encode_batch, launch_l2_norm, launch_l2_norm_batch,
launch_l2_norm_f32,
launch_amplitude_encode, launch_amplitude_encode_batch, launch_amplitude_encode_batch_f32,
launch_l2_norm, launch_l2_norm_batch, launch_l2_norm_batch_f32, launch_l2_norm_f32,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this file around lines 251–276 and 351–376, amplitude_encode_batch_kernel / _f32 compute input_base = sample_idx * input_len and then do reinterpret_cast<const double2*>(input_batch + input_base) + elem_pair / float2. For odd input_len and sample_idx > 0 this base pointer is only 8‑byte (double) / 4‑byte (float) aligned, not 16‑byte, so the double2/float2 loads are potentially misaligned. This alignment pattern already existed in the original f64 batch kernel and this PR copies it into the new f32 batch path; please either enforce even input_len at the Rust call‑site or rework the kernels to index from a properly aligned double2* / float2* base pointer with a scalar fallback.

};
#[cfg(target_os = "linux")]
use std::ffi::c_void;
Expand Down Expand Up @@ -206,7 +206,7 @@ impl QuantumEncoder for AmplitudeEncoder {
// Allocate single large GPU buffer for all states
let batch_state_vector = {
crate::profile_scope!("GPU::AllocBatch");
GpuStateVector::new_batch(device, num_samples, num_qubits)?
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
};

// Upload input data to GPU
Expand Down Expand Up @@ -386,7 +386,7 @@ impl QuantumEncoder for AmplitudeEncoder {
let input_batch_d = input_batch_d as *const f64;
let batch_state_vector = {
crate::profile_scope!("GPU::AllocBatch");
GpuStateVector::new_batch(device, num_samples, num_qubits)?
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
};
let inv_norms_gpu = {
crate::profile_scope!("GPU::BatchNormKernel");
Expand Down Expand Up @@ -457,6 +457,109 @@ impl QuantumEncoder for AmplitudeEncoder {
Ok(batch_state_vector)
}

/// Encode an f32 batch directly from an existing GPU pointer (zero copy).
///
/// Allocates a float32 batch state vector, computes per-sample L2 norms on
/// the GPU, validates them on the host, then launches the batched f32
/// amplitude-encoding kernel on `stream` and synchronizes before returning.
///
/// # Errors
/// Returns `InvalidInput` for zero samples, zero/oversized/odd sample sizes,
/// or non-finite/zero norms; `MemoryAllocation`/`KernelLaunch`/`Cuda` on
/// GPU failures.
///
/// # Safety
/// Caller must ensure `input_batch_d` points to valid GPU memory with at
/// least `num_samples * sample_size` f32 elements on the same device as
/// `device`, and that `stream` is a valid CUDA stream or null.
#[cfg(target_os = "linux")]
unsafe fn encode_batch_from_gpu_ptr_f32(
    &self,
    device: &Arc<CudaDevice>,
    input_batch_d: *const f32,
    num_samples: usize,
    sample_size: usize,
    num_qubits: usize,
    stream: *mut c_void,
) -> Result<GpuStateVector> {
    let state_len = 1 << num_qubits;
    if num_samples == 0 {
        return Err(MahoutError::InvalidInput(
            "Number of samples cannot be zero".into(),
        ));
    }
    if sample_size == 0 {
        return Err(MahoutError::InvalidInput(
            "Sample size cannot be zero".into(),
        ));
    }
    if sample_size > state_len {
        return Err(MahoutError::InvalidInput(format!(
            "Sample size {} exceeds state vector size {} (2^{} qubits)",
            sample_size, state_len, num_qubits
        )));
    }
    // The batched kernels read inputs through vectorized float2 loads whose
    // base pointer is `input_batch + sample_idx * sample_size`. For odd
    // sample_size and sample_idx > 0 that base is only 4-byte aligned, so a
    // float2 (8-byte) load would be misaligned. Enforce even sample sizes
    // for multi-sample batches at this call-site; a single sample always
    // starts at the (aligned) buffer base and is safe.
    if num_samples > 1 && sample_size % 2 != 0 {
        return Err(MahoutError::InvalidInput(format!(
            "Sample size {} must be even for batched f32 amplitude encoding (vectorized float2 loads require 8-byte aligned per-sample bases)",
            sample_size
        )));
    }
    // Output buffer: num_samples * 2^num_qubits complex<f32> amplitudes.
    let batch_state_vector = {
        crate::profile_scope!("GPU::AllocBatchF32");
        GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float32)?
    };
    // Per-sample inverse L2 norms, computed fully on-device.
    let inv_norms_gpu = {
        crate::profile_scope!("GPU::BatchNormKernelF32");
        use cudarc::driver::DevicePtrMut;
        let mut buffer = device.alloc_zeros::<f32>(num_samples).map_err(|e| {
            MahoutError::MemoryAllocation(format!(
                "Failed to allocate f32 norm buffer: {:?}",
                e
            ))
        })?;
        let ret = unsafe {
            launch_l2_norm_batch_f32(
                input_batch_d,
                num_samples,
                sample_size,
                *buffer.device_ptr_mut() as *mut f32,
                stream,
            )
        };
        if ret != 0 {
            return Err(MahoutError::KernelLaunch(format!(
                "Norm reduction kernel (f32 batch) failed with CUDA error code: {} ({})",
                ret,
                cuda_error_to_string(ret)
            )));
        }
        buffer
    };
    // Host-side validation: a zero or non-finite norm would poison the
    // encoded state, so reject the whole batch early.
    {
        crate::profile_scope!("GPU::NormValidationF32");
        let host_inv_norms = device.dtoh_sync_copy(&inv_norms_gpu).map_err(|e| {
            MahoutError::Cuda(format!("Failed to copy f32 norms to host: {:?}", e))
        })?;
        if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
            return Err(MahoutError::InvalidInput(
                "One or more samples have zero or invalid norm (f32 batch)".to_string(),
            ));
        }
    }
    // Launch the batched amplitude-encode kernel into the f32 state buffer.
    {
        crate::profile_scope!("GPU::BatchKernelLaunchF32");
        use cudarc::driver::DevicePtr;
        let state_ptr = batch_state_vector.ptr_f32().ok_or_else(|| {
            MahoutError::InvalidInput(
                "Batch state vector precision mismatch (expected float32 buffer)".to_string(),
            )
        })?;
        let ret = unsafe {
            launch_amplitude_encode_batch_f32(
                input_batch_d,
                state_ptr as *mut c_void,
                *inv_norms_gpu.device_ptr() as *const f32,
                num_samples,
                sample_size,
                state_len,
                stream,
            )
        };
        if ret != 0 {
            return Err(MahoutError::KernelLaunch(format!(
                "Amplitude encode batch (f32) kernel failed with CUDA error code: {} ({})",
                ret,
                cuda_error_to_string(ret)
            )));
        }
    }
    // Ensure all work on `stream` has completed before handing the buffer out.
    {
        crate::profile_scope!("GPU::Synchronize");
        sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
    }
    Ok(batch_state_vector)
}

/// Canonical identifier for this encoding scheme.
fn name(&self) -> &'static str { "amplitude" }
Expand Down
6 changes: 3 additions & 3 deletions qdp/qdp-core/src/gpu/encodings/angle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ impl QuantumEncoder for AngleEncoder {

let batch_state_vector = {
crate::profile_scope!("GPU::AllocBatch");
GpuStateVector::new_batch(device, num_samples, num_qubits)?
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
};

let input_bytes = std::mem::size_of_val(batch_data);
Expand Down Expand Up @@ -337,7 +337,7 @@ impl QuantumEncoder for AngleEncoder {
}
let batch_state_vector = {
crate::profile_scope!("GPU::AllocBatch");
GpuStateVector::new_batch(device, num_samples, num_qubits)?
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
};
let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
MahoutError::InvalidInput(
Expand Down Expand Up @@ -412,7 +412,7 @@ impl AngleEncoder {
) -> Result<GpuStateVector> {
let batch_state_vector = {
crate::profile_scope!("GPU::AllocBatch");
GpuStateVector::new_batch(device, num_samples, num_qubits)?
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
};

let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
Expand Down
4 changes: 2 additions & 2 deletions qdp/qdp-core/src/gpu/encodings/basis.rs
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ impl QuantumEncoder for BasisEncoder {
// Allocate batch state vector
let batch_state_vector = {
crate::profile_scope!("GPU::AllocBatch");
GpuStateVector::new_batch(device, num_samples, num_qubits)?
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
};

// Upload basis indices to GPU
Expand Down Expand Up @@ -298,7 +298,7 @@ impl QuantumEncoder for BasisEncoder {
let basis_indices_d = input_batch_d as *const usize;
let batch_state_vector = {
crate::profile_scope!("GPU::AllocBatch");
GpuStateVector::new_batch(device, num_samples, num_qubits)?
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
};
let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
MahoutError::InvalidInput(
Expand Down
2 changes: 1 addition & 1 deletion qdp/qdp-core/src/gpu/encodings/iqp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ impl QuantumEncoder for IqpEncoder {

let batch_state_vector = {
crate::profile_scope!("GPU::AllocBatch");
GpuStateVector::new_batch(device, num_samples, num_qubits)?
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
};

let input_bytes = std::mem::size_of_val(batch_data);
Expand Down
22 changes: 22 additions & 0 deletions qdp/qdp-core/src/gpu/encodings/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,28 @@ pub trait QuantumEncoder: Send + Sync {
self.name()
)))
}

/// Encode f32 batch from existing GPU pointer (zero copy). Default: not supported.
///
/// Encoders that implement a device-resident f32 batch path override this;
/// the default implementation simply reports the capability as missing.
///
/// # Safety
/// Caller must ensure `input_batch_d` points to valid GPU memory with at least
/// `num_samples * sample_size` f32 elements on the same device as `device`,
/// and `stream` is a valid CUDA stream or null.
#[cfg(target_os = "linux")]
unsafe fn encode_batch_from_gpu_ptr_f32(
    &self,
    _device: &Arc<CudaDevice>,
    _input_batch_d: *const f32,
    _num_samples: usize,
    _sample_size: usize,
    _num_qubits: usize,
    _stream: *mut c_void,
) -> Result<GpuStateVector> {
    let message = format!(
        "encode_batch_from_gpu_ptr_f32 not supported for {}",
        self.name()
    );
    Err(MahoutError::NotImplemented(message))
}
}

// Encoding implementations
Expand Down
110 changes: 75 additions & 35 deletions qdp/qdp-core/src/gpu/memory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,13 @@ impl GpuStateVector {

/// Create GPU state vector for a batch of samples
/// Allocates num_samples * 2^qubits complex numbers on GPU
pub fn new_batch(_device: &Arc<CudaDevice>, num_samples: usize, qubits: usize) -> Result<Self> {
#[cfg(target_os = "linux")]
pub fn new_batch(
_device: &Arc<CudaDevice>,
num_samples: usize,
qubits: usize,
precision: Precision,
) -> Result<Self> {
let single_state_size: usize = 1usize << qubits;
let total_elements = num_samples.checked_mul(single_state_size).ok_or_else(|| {
MahoutError::MemoryAllocation(format!(
Expand All @@ -353,50 +359,84 @@ impl GpuStateVector {
))
})?;

#[cfg(target_os = "linux")]
{
let requested_bytes = total_elements
.checked_mul(std::mem::size_of::<CuDoubleComplex>())
.ok_or_else(|| {
MahoutError::MemoryAllocation(format!(
"Requested GPU allocation size overflow (elements={})",
total_elements
))
})?;
let buffer = match precision {
Precision::Float32 => {
let requested_bytes = total_elements
.checked_mul(std::mem::size_of::<CuComplex>())
.ok_or_else(|| {
MahoutError::MemoryAllocation(format!(
"Requested GPU allocation size overflow (elements={})",
total_elements
))
})?;

// Pre-flight check
ensure_device_memory_available(
requested_bytes,
"batch state vector allocation",
Some(qubits),
)?;
ensure_device_memory_available(
requested_bytes,
"batch state vector allocation (f32)",
Some(qubits),
)?;

let slice =
unsafe { _device.alloc::<CuDoubleComplex>(total_elements) }.map_err(|e| {
let slice = unsafe { _device.alloc::<CuComplex>(total_elements) }.map_err(|e| {
map_allocation_error(
requested_bytes,
"batch state vector allocation",
"batch state vector allocation (f32)",
Some(qubits),
e,
)
})?;

Ok(Self {
buffer: Arc::new(BufferStorage::F64(GpuBufferRaw { slice })),
num_qubits: qubits,
size_elements: total_elements,
num_samples: Some(num_samples),
device_id: _device.ordinal(),
})
}
BufferStorage::F32(GpuBufferRaw { slice })
}
Precision::Float64 => {
let requested_bytes = total_elements
.checked_mul(std::mem::size_of::<CuDoubleComplex>())
.ok_or_else(|| {
MahoutError::MemoryAllocation(format!(
"Requested GPU allocation size overflow (elements={})",
total_elements
))
})?;

#[cfg(not(target_os = "linux"))]
{
Err(MahoutError::Cuda(
"CUDA is only available on Linux. This build does not support GPU operations."
.to_string(),
))
}
ensure_device_memory_available(
requested_bytes,
"batch state vector allocation",
Some(qubits),
)?;

let slice =
unsafe { _device.alloc::<CuDoubleComplex>(total_elements) }.map_err(|e| {
map_allocation_error(
requested_bytes,
"batch state vector allocation",
Some(qubits),
e,
)
})?;

BufferStorage::F64(GpuBufferRaw { slice })
}
};

Ok(Self {
buffer: Arc::new(buffer),
num_qubits: qubits,
size_elements: total_elements,
num_samples: Some(num_samples),
device_id: _device.ordinal(),
})
}

/// Non-Linux stub: batch state vectors require CUDA, which this build
/// does not include, so every call fails with a descriptive error.
#[cfg(not(target_os = "linux"))]
pub fn new_batch(
    _device: &Arc<CudaDevice>,
    _num_samples: usize,
    _qubits: usize,
    _precision: Precision,
) -> Result<Self> {
    let message =
        "CUDA is only available on Linux. This build does not support GPU operations.";
    Err(MahoutError::Cuda(message.to_string()))
}

/// Convert the state vector to the requested precision (GPU-side).
Expand Down
Loading