From b6e9ab83cbd9d8519653d894dc7bc1379748b904 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sat, 18 Apr 2026 16:56:04 +0100 Subject: [PATCH 1/3] Revert "Polyfill interleave() until the upstream fearless_simd PR is merged" This reverts commit af675daeed9ae0ad10715ef8c984ba8a7c54de2b. --- Cargo.lock | 3 +-- Cargo.toml | 5 ++++- src/kernels/codelets.rs | 30 ++++++++---------------------- 3 files changed, 13 insertions(+), 25 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 702d8cb..9ec8a79 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -380,8 +380,7 @@ checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" [[package]] name = "fearless_simd" version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76258897e51fd156ee03b6246ea53f3e0eb395d0b327e9961c4fc4c8b2fa151a" +source = "git+https://github.com/Shnatsel/fearless_simd.git?branch=interleave#7642be30901bc4a3702ebe51dfb0e4f81dadcd99" [[package]] name = "fftw" diff --git a/Cargo.toml b/Cargo.toml index aa13898..091e7b8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,4 +61,7 @@ debug = true [package.metadata.docs.rs] all-features = true -[lints.rust] \ No newline at end of file +[lints.rust] + +[patch.crates-io] +fearless_simd = {git = "https://github.com/Shnatsel/fearless_simd.git", branch = "interleave"} \ No newline at end of file diff --git a/src/kernels/codelets.rs b/src/kernels/codelets.rs index 765dfb9..cd828f3 100644 --- a/src/kernels/codelets.rs +++ b/src/kernels/codelets.rs @@ -7,20 +7,6 @@ use fearless_simd::{ f32x4, f32x8, f64x4, Simd, SimdBase, SimdCombine, SimdFloat, SimdFrom, SimdSplit, }; -/// Equivalent to `a.interleave(b)` — returns `(a.zip_low(b), a.zip_high(b))`. -/// Slow polyfill for -#[inline(always)] -fn interleave_f64x4(a: f64x4, b: f64x4) -> (f64x4, f64x4) { - (a.zip_low(b), a.zip_high(b)) -} - -/// Equivalent to `a.interleave(b)` — returns `(a.zip_low(b), a.zip_high(b))`. -/// Slow polyfill for -#[inline(always)] -fn interleave_f32x4(a: f32x4, b: f32x4) -> (f32x4, f32x4) { - (a.zip_low(b), a.zip_high(b)) -} - /// FFT-16 codelet for `f64`: executes stages 0-3 (chunk_size 2 through 16) in a single function. /// /// Register-resident implementation: all 16 complex values are loaded into f64x4 vectors, @@ -47,10 +33,10 @@ fn fft_dit_codelet_16_simd_f64(simd: S, reals: &mut [f64], imags: &mut for (re, im) in reals.chunks_exact_mut(16).zip(imags.chunks_exact_mut(16)) { macro_rules! transpose4x4_f64 { ($g0:expr, $g1:expr, $g2:expr, $g3:expr) => {{ - let (t0, t1) = interleave_f64x4($g0, $g2); - let (t2, t3) = interleave_f64x4($g1, $g3); - let (r0, r1) = interleave_f64x4(t0, t2); - let (r2, r3) = interleave_f64x4(t1, t3); + let (t0, t1) = $g0.interleave($g2); + let (t2, t3) = $g1.interleave($g3); + let (r0, r1) = t0.interleave(t2); + let (r2, r3) = t1.interleave(t3); (r0, r1, r2, r3) }}; } @@ -250,10 +236,10 @@ fn fft_dit_codelet_32_simd_f32(simd: S, reals: &mut [f32], imags: &mut { macro_rules! transpose4x4 { ($g0:expr, $g1:expr, $g2:expr, $g3:expr) => {{ - let (t0, t1) = interleave_f32x4($g0, $g2); - let (t2, t3) = interleave_f32x4($g1, $g3); - let (r0, r1) = interleave_f32x4(t0, t2); - let (r2, r3) = interleave_f32x4(t1, t3); + let (t0, t1) = $g0.interleave($g2); + let (t2, t3) = $g1.interleave($g3); + let (r0, r1) = t0.interleave(t2); + let (r2, r3) = t1.interleave(t3); (r0, r1, r2, r3) }}; } From c69b5f6ed8fb6ed28996bd491225c1fcecc3334b Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sat, 18 Apr 2026 18:25:24 +0100 Subject: [PATCH 2/3] Repoint to fearless_simd main now that my PR is merged --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9ec8a79..1e50ee7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -380,7 +380,7 @@ checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" [[package]] name = "fearless_simd" version = "0.4.0" -source = "git+https://github.com/Shnatsel/fearless_simd.git?branch=interleave#7642be30901bc4a3702ebe51dfb0e4f81dadcd99" +source = "git+https://github.com/linebender/fearless_simd#8fcafea970000a3122467afc7ec51c24a0387482" [[package]] name = "fftw" diff --git a/Cargo.toml b/Cargo.toml index 091e7b8..9b89135 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -64,4 +64,4 @@ all-features = true [lints.rust] [patch.crates-io] -fearless_simd = {git = "https://github.com/Shnatsel/fearless_simd.git", branch = "interleave"} \ No newline at end of file +fearless_simd = {git = "https://github.com/linebender/fearless_simd"} \ No newline at end of file From b53f44851b68fa1476a0c5af1a4166dc3a31b3a9 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sat, 18 Apr 2026 18:34:02 +0100 Subject: [PATCH 3/3] Use interleave() instead of zip_low()/zip_high() in BRAVO --- benches/bit_reversal.rs | 6 ++---- src/algorithms/bravo.rs | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/benches/bit_reversal.rs b/benches/bit_reversal.rs index 53904ae..84331fa 100644 --- a/benches/bit_reversal.rs +++ b/benches/bit_reversal.rs @@ -89,8 +89,7 @@ mod old_bravo { let idx1 = i + offset + stride; let vec0 = chunks_a[idx0]; let vec1 = chunks_a[idx1]; - chunks_a[idx0] = vec0.zip_low(vec1); - chunks_a[idx1] = vec0.zip_high(vec1); + (chunks_a[idx0], chunks_a[idx1]) = vec0.interleave(vec1); } i += stride * 2; } @@ -117,8 +116,7 @@ mod old_bravo { let idx1 = i + offset + stride; let vec0 = chunks_b[idx0]; let vec1 = chunks_b[idx1]; - chunks_b[idx0] = vec0.zip_low(vec1); - chunks_b[idx1] = vec0.zip_high(vec1); + (chunks_b[idx0], chunks_b[idx1]) = vec0.interleave(vec1); } i += stride * 2; } diff --git a/src/algorithms/bravo.rs b/src/algorithms/bravo.rs index beae7d7..360385f 100644 --- a/src/algorithms/bravo.rs +++ b/src/algorithms/bravo.rs @@ -137,8 +137,7 @@ macro_rules! impl_bit_rev_bravo { let idx1 = i + offset + stride; let vec0 = chunks_a[idx0]; let vec1 = chunks_a[idx1]; - chunks_a[idx0] = vec0.zip_low(vec1); - chunks_a[idx1] = vec0.zip_high(vec1); + (chunks_a[idx0], chunks_a[idx1]) = vec0.interleave(vec1); } i += stride * 2; } @@ -167,8 +166,7 @@ macro_rules! impl_bit_rev_bravo { let idx1 = i + offset + stride; let vec0 = chunks_b[idx0]; let vec1 = chunks_b[idx1]; - chunks_b[idx0] = vec0.zip_low(vec1); - chunks_b[idx1] = vec0.zip_high(vec1); + (chunks_b[idx0], chunks_b[idx1]) = vec0.interleave(vec1); } i += stride * 2; }