Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion qdp/qdp-core/src/encoding/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,8 @@ pub(crate) fn stream_encode<E: ChunkEncoder>(
let num_samples = reader_core.total_rows;

// Allocate output state vector
let total_state_vector = GpuStateVector::new_batch(&engine.device, num_samples, num_qubits)?;
let total_state_vector =
GpuStateVector::new_batch(&engine.device, num_samples, num_qubits, engine.precision())?;
const PIPELINE_EVENT_SLOTS: usize = 2;
let ctx = PipelineContext::new(&engine.device, PIPELINE_EVENT_SLOTS)?;

Expand Down
111 changes: 107 additions & 4 deletions qdp/qdp-core/src/gpu/encodings/amplitude.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ use crate::gpu::memory::{ensure_device_memory_available, map_allocation_error};
use cudarc::driver::{DevicePtr, DevicePtrMut};
#[cfg(target_os = "linux")]
use qdp_kernels::{
launch_amplitude_encode, launch_amplitude_encode_batch, launch_l2_norm, launch_l2_norm_batch,
launch_l2_norm_f32,
launch_amplitude_encode, launch_amplitude_encode_batch, launch_amplitude_encode_batch_f32,
launch_l2_norm, launch_l2_norm_batch, launch_l2_norm_batch_f32, launch_l2_norm_f32,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this file around lines 251–276 and 351–376, amplitude_encode_batch_kernel / _f32 compute input_base = sample_idx * input_len and then do reinterpret_cast<const double2*>(input_batch + input_base) + elem_pair / float2. For odd input_len and sample_idx > 0 this base pointer is only 8‑byte (double) / 4‑byte (float) aligned, not 16‑byte, so the double2/float2 loads are potentially misaligned. This alignment pattern already existed in the original f64 batch kernel and this PR copies it into the new f32 batch path; please either enforce even input_len at the Rust call‑site or rework the kernels to index from a properly aligned double2* / float2* base pointer with a scalar fallback.

};
#[cfg(target_os = "linux")]
use std::ffi::c_void;
Expand Down Expand Up @@ -206,7 +206,7 @@ impl QuantumEncoder for AmplitudeEncoder {
// Allocate single large GPU buffer for all states
let batch_state_vector = {
crate::profile_scope!("GPU::AllocBatch");
GpuStateVector::new_batch(device, num_samples, num_qubits)?
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
};

// Upload input data to GPU
Expand Down Expand Up @@ -386,7 +386,7 @@ impl QuantumEncoder for AmplitudeEncoder {
let input_batch_d = input_batch_d as *const f64;
let batch_state_vector = {
crate::profile_scope!("GPU::AllocBatch");
GpuStateVector::new_batch(device, num_samples, num_qubits)?
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
};
let inv_norms_gpu = {
crate::profile_scope!("GPU::BatchNormKernel");
Expand Down Expand Up @@ -457,6 +457,109 @@ impl QuantumEncoder for AmplitudeEncoder {
Ok(batch_state_vector)
}

/// Encode an f32 batch directly from an existing GPU pointer (zero copy).
///
/// Allocates a float32 batch state vector, computes per-sample L2 norms on
/// the GPU, validates them on the host, then launches the batched f32
/// amplitude-encoding kernel on `stream` and synchronizes before returning.
///
/// # Errors
/// Returns `InvalidInput` for zero samples, zero/oversized/odd sample sizes,
/// or non-finite/zero norms; `MemoryAllocation`/`KernelLaunch`/`Cuda` on
/// GPU failures.
///
/// # Safety
/// Caller must ensure `input_batch_d` points to valid GPU memory with at
/// least `num_samples * sample_size` f32 elements on the same device as
/// `device`, and that `stream` is a valid CUDA stream or null.
#[cfg(target_os = "linux")]
unsafe fn encode_batch_from_gpu_ptr_f32(
    &self,
    device: &Arc<CudaDevice>,
    input_batch_d: *const f32,
    num_samples: usize,
    sample_size: usize,
    num_qubits: usize,
    stream: *mut c_void,
) -> Result<GpuStateVector> {
    let state_len = 1 << num_qubits;
    if num_samples == 0 {
        return Err(MahoutError::InvalidInput(
            "Number of samples cannot be zero".into(),
        ));
    }
    if sample_size == 0 {
        return Err(MahoutError::InvalidInput(
            "Sample size cannot be zero".into(),
        ));
    }
    if sample_size > state_len {
        return Err(MahoutError::InvalidInput(format!(
            "Sample size {} exceeds state vector size {} (2^{} qubits)",
            sample_size, state_len, num_qubits
        )));
    }
    // The batched kernels read inputs through vectorized float2 loads whose
    // base pointer is `input_batch + sample_idx * sample_size`. For odd
    // sample_size and sample_idx > 0 that base is only 4-byte aligned, so a
    // float2 (8-byte) load would be misaligned. Enforce even sample sizes
    // for multi-sample batches at this call-site; a single sample always
    // starts at the (aligned) buffer base and is safe.
    if num_samples > 1 && sample_size % 2 != 0 {
        return Err(MahoutError::InvalidInput(format!(
            "Sample size {} must be even for batched f32 amplitude encoding (vectorized float2 loads require 8-byte aligned per-sample bases)",
            sample_size
        )));
    }
    // Output buffer: num_samples * 2^num_qubits complex<f32> amplitudes.
    let batch_state_vector = {
        crate::profile_scope!("GPU::AllocBatchF32");
        GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float32)?
    };
    // Per-sample inverse L2 norms, computed fully on-device.
    let inv_norms_gpu = {
        crate::profile_scope!("GPU::BatchNormKernelF32");
        use cudarc::driver::DevicePtrMut;
        let mut buffer = device.alloc_zeros::<f32>(num_samples).map_err(|e| {
            MahoutError::MemoryAllocation(format!(
                "Failed to allocate f32 norm buffer: {:?}",
                e
            ))
        })?;
        let ret = unsafe {
            launch_l2_norm_batch_f32(
                input_batch_d,
                num_samples,
                sample_size,
                *buffer.device_ptr_mut() as *mut f32,
                stream,
            )
        };
        if ret != 0 {
            return Err(MahoutError::KernelLaunch(format!(
                "Norm reduction kernel (f32 batch) failed with CUDA error code: {} ({})",
                ret,
                cuda_error_to_string(ret)
            )));
        }
        buffer
    };
    // Host-side validation: a zero or non-finite norm would poison the
    // encoded state, so reject the whole batch early.
    {
        crate::profile_scope!("GPU::NormValidationF32");
        let host_inv_norms = device.dtoh_sync_copy(&inv_norms_gpu).map_err(|e| {
            MahoutError::Cuda(format!("Failed to copy f32 norms to host: {:?}", e))
        })?;
        if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
            return Err(MahoutError::InvalidInput(
                "One or more samples have zero or invalid norm (f32 batch)".to_string(),
            ));
        }
    }
    // Launch the batched amplitude-encode kernel into the f32 state buffer.
    {
        crate::profile_scope!("GPU::BatchKernelLaunchF32");
        use cudarc::driver::DevicePtr;
        let state_ptr = batch_state_vector.ptr_f32().ok_or_else(|| {
            MahoutError::InvalidInput(
                "Batch state vector precision mismatch (expected float32 buffer)".to_string(),
            )
        })?;
        let ret = unsafe {
            launch_amplitude_encode_batch_f32(
                input_batch_d,
                state_ptr as *mut c_void,
                *inv_norms_gpu.device_ptr() as *const f32,
                num_samples,
                sample_size,
                state_len,
                stream,
            )
        };
        if ret != 0 {
            return Err(MahoutError::KernelLaunch(format!(
                "Amplitude encode batch (f32) kernel failed with CUDA error code: {} ({})",
                ret,
                cuda_error_to_string(ret)
            )));
        }
    }
    // Ensure all work on `stream` has completed before handing the buffer out.
    {
        crate::profile_scope!("GPU::Synchronize");
        sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
    }
    Ok(batch_state_vector)
}

/// Canonical identifier for this encoding scheme.
fn name(&self) -> &'static str { "amplitude" }
Expand Down
6 changes: 3 additions & 3 deletions qdp/qdp-core/src/gpu/encodings/angle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ impl QuantumEncoder for AngleEncoder {

let batch_state_vector = {
crate::profile_scope!("GPU::AllocBatch");
GpuStateVector::new_batch(device, num_samples, num_qubits)?
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
};

let input_bytes = std::mem::size_of_val(batch_data);
Expand Down Expand Up @@ -337,7 +337,7 @@ impl QuantumEncoder for AngleEncoder {
}
let batch_state_vector = {
crate::profile_scope!("GPU::AllocBatch");
GpuStateVector::new_batch(device, num_samples, num_qubits)?
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
};
let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
MahoutError::InvalidInput(
Expand Down Expand Up @@ -412,7 +412,7 @@ impl AngleEncoder {
) -> Result<GpuStateVector> {
let batch_state_vector = {
crate::profile_scope!("GPU::AllocBatch");
GpuStateVector::new_batch(device, num_samples, num_qubits)?
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
};

let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
Expand Down
4 changes: 2 additions & 2 deletions qdp/qdp-core/src/gpu/encodings/basis.rs
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ impl QuantumEncoder for BasisEncoder {
// Allocate batch state vector
let batch_state_vector = {
crate::profile_scope!("GPU::AllocBatch");
GpuStateVector::new_batch(device, num_samples, num_qubits)?
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
};

// Upload basis indices to GPU
Expand Down Expand Up @@ -298,7 +298,7 @@ impl QuantumEncoder for BasisEncoder {
let basis_indices_d = input_batch_d as *const usize;
let batch_state_vector = {
crate::profile_scope!("GPU::AllocBatch");
GpuStateVector::new_batch(device, num_samples, num_qubits)?
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
};
let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
MahoutError::InvalidInput(
Expand Down
2 changes: 1 addition & 1 deletion qdp/qdp-core/src/gpu/encodings/iqp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ impl QuantumEncoder for IqpEncoder {

let batch_state_vector = {
crate::profile_scope!("GPU::AllocBatch");
GpuStateVector::new_batch(device, num_samples, num_qubits)?
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float64)?
};

let input_bytes = std::mem::size_of_val(batch_data);
Expand Down
22 changes: 22 additions & 0 deletions qdp/qdp-core/src/gpu/encodings/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,28 @@ pub trait QuantumEncoder: Send + Sync {
self.name()
)))
}

/// Encode f32 batch from existing GPU pointer (zero copy). Default: not supported.
///
/// Encoders that implement a device-resident f32 batch path override this;
/// the default implementation simply reports the capability as missing.
///
/// # Safety
/// Caller must ensure `input_batch_d` points to valid GPU memory with at least
/// `num_samples * sample_size` f32 elements on the same device as `device`,
/// and `stream` is a valid CUDA stream or null.
#[cfg(target_os = "linux")]
unsafe fn encode_batch_from_gpu_ptr_f32(
    &self,
    _device: &Arc<CudaDevice>,
    _input_batch_d: *const f32,
    _num_samples: usize,
    _sample_size: usize,
    _num_qubits: usize,
    _stream: *mut c_void,
) -> Result<GpuStateVector> {
    let message = format!(
        "encode_batch_from_gpu_ptr_f32 not supported for {}",
        self.name()
    );
    Err(MahoutError::NotImplemented(message))
}
}

// Encoding implementations
Expand Down
110 changes: 75 additions & 35 deletions qdp/qdp-core/src/gpu/memory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,13 @@ impl GpuStateVector {

/// Create GPU state vector for a batch of samples
/// Allocates num_samples * 2^qubits complex numbers on GPU
pub fn new_batch(_device: &Arc<CudaDevice>, num_samples: usize, qubits: usize) -> Result<Self> {
#[cfg(target_os = "linux")]
pub fn new_batch(
_device: &Arc<CudaDevice>,
num_samples: usize,
qubits: usize,
precision: Precision,
) -> Result<Self> {
let single_state_size: usize = 1usize << qubits;
let total_elements = num_samples.checked_mul(single_state_size).ok_or_else(|| {
MahoutError::MemoryAllocation(format!(
Expand All @@ -353,50 +359,84 @@ impl GpuStateVector {
))
})?;

#[cfg(target_os = "linux")]
{
let requested_bytes = total_elements
.checked_mul(std::mem::size_of::<CuDoubleComplex>())
.ok_or_else(|| {
MahoutError::MemoryAllocation(format!(
"Requested GPU allocation size overflow (elements={})",
total_elements
))
})?;
let buffer = match precision {
Precision::Float32 => {
let requested_bytes = total_elements
.checked_mul(std::mem::size_of::<CuComplex>())
.ok_or_else(|| {
MahoutError::MemoryAllocation(format!(
"Requested GPU allocation size overflow (elements={})",
total_elements
))
})?;

// Pre-flight check
ensure_device_memory_available(
requested_bytes,
"batch state vector allocation",
Some(qubits),
)?;
ensure_device_memory_available(
requested_bytes,
"batch state vector allocation (f32)",
Some(qubits),
)?;

let slice =
unsafe { _device.alloc::<CuDoubleComplex>(total_elements) }.map_err(|e| {
let slice = unsafe { _device.alloc::<CuComplex>(total_elements) }.map_err(|e| {
map_allocation_error(
requested_bytes,
"batch state vector allocation",
"batch state vector allocation (f32)",
Some(qubits),
e,
)
})?;

Ok(Self {
buffer: Arc::new(BufferStorage::F64(GpuBufferRaw { slice })),
num_qubits: qubits,
size_elements: total_elements,
num_samples: Some(num_samples),
device_id: _device.ordinal(),
})
}
BufferStorage::F32(GpuBufferRaw { slice })
}
Precision::Float64 => {
let requested_bytes = total_elements
.checked_mul(std::mem::size_of::<CuDoubleComplex>())
.ok_or_else(|| {
MahoutError::MemoryAllocation(format!(
"Requested GPU allocation size overflow (elements={})",
total_elements
))
})?;

#[cfg(not(target_os = "linux"))]
{
Err(MahoutError::Cuda(
"CUDA is only available on Linux. This build does not support GPU operations."
.to_string(),
))
}
ensure_device_memory_available(
requested_bytes,
"batch state vector allocation",
Some(qubits),
)?;

let slice =
unsafe { _device.alloc::<CuDoubleComplex>(total_elements) }.map_err(|e| {
map_allocation_error(
requested_bytes,
"batch state vector allocation",
Some(qubits),
e,
)
})?;

BufferStorage::F64(GpuBufferRaw { slice })
}
};

Ok(Self {
buffer: Arc::new(buffer),
num_qubits: qubits,
size_elements: total_elements,
num_samples: Some(num_samples),
device_id: _device.ordinal(),
})
}

/// Non-Linux stub: batch state vectors require CUDA, which this build
/// does not include, so every call fails with a descriptive error.
#[cfg(not(target_os = "linux"))]
pub fn new_batch(
    _device: &Arc<CudaDevice>,
    _num_samples: usize,
    _qubits: usize,
    _precision: Precision,
) -> Result<Self> {
    let message =
        "CUDA is only available on Linux. This build does not support GPU operations.";
    Err(MahoutError::Cuda(message.to_string()))
}

/// Convert the state vector to the requested precision (GPU-side).
Expand Down
Loading