From 7fec806dc68d17e6a9c90acf0e1f489111a0d635 Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Sun, 15 Feb 2026 00:22:53 +0100 Subject: [PATCH 1/5] lock stdout when printing an intrinsic test failure --- crates/intrinsic-test/src/common/compare.rs | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/crates/intrinsic-test/src/common/compare.rs b/crates/intrinsic-test/src/common/compare.rs index 5214349171..c22d7fd4ec 100644 --- a/crates/intrinsic-test/src/common/compare.rs +++ b/crates/intrinsic-test/src/common/compare.rs @@ -109,13 +109,26 @@ pub fn compare_outputs( } }) .inspect(|(intrinsic, diffs)| { - println!("Difference for intrinsic: {intrinsic}"); + use std::io::Write; + + let stdout = std::io::stdout(); + let mut out = stdout.lock(); + + writeln!(out, "Difference for intrinsic: {intrinsic}").unwrap(); diffs.into_iter().for_each(|diff| match diff { - diff::Result::Left(c) => println!("C: {c}"), - diff::Result::Right(rust) => println!("Rust: {rust}"), + diff::Result::Left(c) => { + writeln!(out, "C: {c}").unwrap(); + } + diff::Result::Right(rust) => { + writeln!(out, "Rust: {rust}").unwrap(); + } _ => (), }); - println!("****************************************************************"); + writeln!( + out, + "****************************************************************" + ) + .unwrap(); }) .count(); From 7ee9c7776d098916b375d0a3f44454e9c83bc6f5 Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Sat, 14 Feb 2026 22:11:00 +0100 Subject: [PATCH 2/5] use `intrinsics::simd` for aarch64 deinterleaving loads --- .../src/arm_shared/neon/generated.rs | 72 +++---------------- crates/core_arch/src/macros.rs | 69 ++++++++++++++++++ .../spec/neon/arm_shared.spec.yml | 26 +++---- 3 files changed, 87 insertions(+), 80 deletions(-) diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index b6951907eb..7b4f69a375 100644 ---
a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -22079,14 +22079,7 @@ pub unsafe fn vld3q_f16(a: *const f16) -> float16x8x3_t { #[cfg(not(target_arch = "arm"))] #[cfg_attr(test, assert_instr(ld3))] pub unsafe fn vld3_f32(a: *const f32) -> float32x2x3_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld3.v2f32.p0" - )] - fn _vld3_f32(ptr: *const float32x2_t) -> float32x2x3_t; - } - _vld3_f32(a as _) + crate::core_arch::macros::deinterleaving_load!(f32, 2, 3, a) } #[doc = "Load multiple 3-element structures to three registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_f32)"] @@ -22098,14 +22091,7 @@ pub unsafe fn vld3_f32(a: *const f32) -> float32x2x3_t { #[cfg(not(target_arch = "arm"))] #[cfg_attr(test, assert_instr(ld3))] pub unsafe fn vld3q_f32(a: *const f32) -> float32x4x3_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld3.v4f32.p0" - )] - fn _vld3q_f32(ptr: *const float32x4_t) -> float32x4x3_t; - } - _vld3q_f32(a as _) + crate::core_arch::macros::deinterleaving_load!(f32, 4, 3, a) } #[doc = "Load multiple 3-element structures to three registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_s8)"] @@ -22117,14 +22103,7 @@ pub unsafe fn vld3q_f32(a: *const f32) -> float32x4x3_t { #[cfg(not(target_arch = "arm"))] #[cfg_attr(test, assert_instr(ld3))] pub unsafe fn vld3_s8(a: *const i8) -> int8x8x3_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld3.v8i8.p0" - )] - fn _vld3_s8(ptr: *const int8x8_t) -> int8x8x3_t; - } - _vld3_s8(a as _) + crate::core_arch::macros::deinterleaving_load!(i8, 8, 3, a) } #[doc = "Load 
multiple 3-element structures to three registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_s8)"] @@ -22136,14 +22115,7 @@ pub unsafe fn vld3_s8(a: *const i8) -> int8x8x3_t { #[cfg(not(target_arch = "arm"))] #[cfg_attr(test, assert_instr(ld3))] pub unsafe fn vld3q_s8(a: *const i8) -> int8x16x3_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld3.v16i8.p0" - )] - fn _vld3q_s8(ptr: *const int8x16_t) -> int8x16x3_t; - } - _vld3q_s8(a as _) + crate::core_arch::macros::deinterleaving_load!(i8, 16, 3, a) } #[doc = "Load multiple 3-element structures to three registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_s16)"] @@ -22155,14 +22127,7 @@ pub unsafe fn vld3q_s8(a: *const i8) -> int8x16x3_t { #[cfg(not(target_arch = "arm"))] #[cfg_attr(test, assert_instr(ld3))] pub unsafe fn vld3_s16(a: *const i16) -> int16x4x3_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld3.v4i16.p0" - )] - fn _vld3_s16(ptr: *const int16x4_t) -> int16x4x3_t; - } - _vld3_s16(a as _) + crate::core_arch::macros::deinterleaving_load!(i16, 4, 3, a) } #[doc = "Load multiple 3-element structures to three registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_s16)"] @@ -22174,14 +22139,7 @@ pub unsafe fn vld3_s16(a: *const i16) -> int16x4x3_t { #[cfg(not(target_arch = "arm"))] #[cfg_attr(test, assert_instr(ld3))] pub unsafe fn vld3q_s16(a: *const i16) -> int16x8x3_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld3.v8i16.p0" - )] - fn _vld3q_s16(ptr: *const int16x8_t) -> int16x8x3_t; - } - _vld3q_s16(a as _) + 
crate::core_arch::macros::deinterleaving_load!(i16, 8, 3, a) } #[doc = "Load multiple 3-element structures to three registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_s32)"] @@ -22193,14 +22151,7 @@ pub unsafe fn vld3q_s16(a: *const i16) -> int16x8x3_t { #[cfg(not(target_arch = "arm"))] #[cfg_attr(test, assert_instr(ld3))] pub unsafe fn vld3_s32(a: *const i32) -> int32x2x3_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld3.v2i32.p0" - )] - fn _vld3_s32(ptr: *const int32x2_t) -> int32x2x3_t; - } - _vld3_s32(a as _) + crate::core_arch::macros::deinterleaving_load!(i32, 2, 3, a) } #[doc = "Load multiple 3-element structures to three registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_s32)"] @@ -22212,14 +22163,7 @@ pub unsafe fn vld3_s32(a: *const i32) -> int32x2x3_t { #[cfg(not(target_arch = "arm"))] #[cfg_attr(test, assert_instr(ld3))] pub unsafe fn vld3q_s32(a: *const i32) -> int32x4x3_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld3.v4i32.p0" - )] - fn _vld3q_s32(ptr: *const int32x4_t) -> int32x4x3_t; - } - _vld3q_s32(a as _) + crate::core_arch::macros::deinterleaving_load!(i32, 4, 3, a) } #[doc = "Load multiple 3-element structures to three registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_f32)"] diff --git a/crates/core_arch/src/macros.rs b/crates/core_arch/src/macros.rs index 353829633f..d40ce51c74 100644 --- a/crates/core_arch/src/macros.rs +++ b/crates/core_arch/src/macros.rs @@ -186,3 +186,72 @@ macro_rules! 
simd_masked_store { $crate::intrinsics::simd::simd_masked_store::<_, _, _, { $align }>($mask, $ptr, $default) }; } + +pub(crate) const fn deinterleave_mask<const LANES: usize, const N: usize, const K: usize>() +-> [u32; LANES] { + // Produces: [K, K+N, K+2N, ...] + let mut out = [0u32; LANES]; + let mut i = 0usize; + while i < LANES { + out[i] = (i * N + K) as u32; + i += 1; + } + out +} + +#[allow(unused)] +macro_rules! deinterleaving_load { + ($elem:ty, $lanes:literal, 2, $ptr:expr) => {{ + use $crate::core_arch::macros::deinterleave_mask; + use $crate::core_arch::simd::Simd; + use $crate::{mem::transmute, ptr}; + + type V = Simd<$elem, $lanes>; + type W = Simd<$elem, { $lanes * 2 }>; + + let w: W = ptr::read_unaligned($ptr as *const W); + + let v0: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 2, 0>()); + let v1: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 2, 1>()); + + transmute((v0, v1)) + }}; + + ($elem:ty, $lanes:literal, 3, $ptr:expr) => {{ + use $crate::core_arch::macros::deinterleave_mask; + use $crate::core_arch::simd::Simd; + use $crate::{mem::transmute, ptr}; + + type V = Simd<$elem, $lanes>; + type W = Simd<$elem, { $lanes * 3 }>; + + let w: W = ptr::read_unaligned($ptr as *const W); + + let v0: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 3, 0>()); + let v1: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 3, 1>()); + let v2: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 3, 2>()); + + transmute((v0, v1, v2)) + }}; + + ($elem:ty, $lanes:literal, 4, $ptr:expr) => {{ + use $crate::core_arch::macros::deinterleave_mask; + use $crate::core_arch::simd::Simd; + use $crate::{mem::transmute, ptr}; + + type V = Simd<$elem, $lanes>; + type W = Simd<$elem, { $lanes * 4 }>; + + let w: W = ptr::read_unaligned($ptr as *const W); + + let v0: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 4, 0>()); + let v1: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 4, 1>()); + let v2: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 4, 2>()); + let v3: V = simd_shuffle!(w, w, 
deinterleave_mask::<$lanes, 4, 3>()); + + transmute((v0, v1, v2, v3)) + }}; +} + +#[allow(unused)] +pub(crate) use deinterleaving_load; diff --git a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml index 3f7adbc278..3b2e9f25ae 100644 --- a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml @@ -3875,23 +3875,17 @@ intrinsics: safety: unsafe: [neon] types: - - ['*const i8', int8x8x3_t, '*const int8x8_t', i8] - - ['*const i16', int16x4x3_t, '*const int16x4_t', i16] - - ['*const i32', int32x2x3_t, '*const int32x2_t', i32] - - ['*const i8', int8x16x3_t, '*const int8x16_t', i8] - - ['*const i16', int16x8x3_t, '*const int16x8_t', i16] - - ['*const i32', int32x4x3_t, '*const int32x4_t', i32] - - ['*const f32', float32x2x3_t, '*const float32x2_t', f32] - - ['*const f32', float32x4x3_t, '*const float32x4_t', f32] + - ['*const i8', int8x8x3_t, i8, "8"] + - ['*const i16', int16x4x3_t, i16, "4"] + - ['*const i32', int32x2x3_t, i32, "2"] + - ['*const i8', int8x16x3_t, i8, "16"] + - ['*const i16', int16x8x3_t, i16, "8"] + - ['*const i32', int32x4x3_t, i32, "4"] + - ['*const f32', float32x2x3_t, f32, "2"] + - ['*const f32', float32x4x3_t, f32, "4"] compose: - - LLVMLink: - name: 'vld3{neon_type[1].nox}' - arguments: - - 'ptr: {type[2]}' - links: - - link: 'llvm.aarch64.neon.ld3.v{neon_type[1].lane}{type[3]}.p0' - arch: aarch64,arm64ec - - FnCall: ['_vld3{neon_type[1].nox}', ['a as _']] + - FnCall: ["crate::core_arch::macros::deinterleaving_load!", [{ Type: "{type[2]}" }, "{type[3]}", "3", a], [], true] + - name: "vld3{neon_type[1].nox}" doc: Load multiple 3-element structures to three registers From f2a3de279aa3706c20aec0bb8cb75de755b8a0e8 Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Sat, 14 Feb 2026 22:46:20 +0100 Subject: [PATCH 3/5] neon `ld3` --- .../src/arm_shared/neon/generated.rs | 27 ++--------------- .../spec/neon/arm_shared.spec.yml | 30 
+++++++------------ 2 files changed, 13 insertions(+), 44 deletions(-) diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index 7b4f69a375..33213e58ff 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -22036,14 +22036,7 @@ pub unsafe fn vld3q_f16(a: *const f16) -> float16x8x3_t { #[unstable(feature = "stdarch_neon_f16", issue = "136306")] #[cfg(not(target_arch = "arm64ec"))] pub unsafe fn vld3_f16(a: *const f16) -> float16x4x3_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld3.v4f16.p0" - )] - fn _vld3_f16(ptr: *const f16) -> float16x4x3_t; - } - _vld3_f16(a as _) + crate::core_arch::macros::deinterleaving_load!(f16, 4, 3, a) } #[doc = "Load single 3-element structure and replicate to all lanes of two registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_f16)"] @@ -22060,14 +22053,7 @@ pub unsafe fn vld3_f16(a: *const f16) -> float16x4x3_t { #[unstable(feature = "stdarch_neon_f16", issue = "136306")] #[cfg(not(target_arch = "arm64ec"))] pub unsafe fn vld3q_f16(a: *const f16) -> float16x8x3_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld3.v8f16.p0" - )] - fn _vld3q_f16(ptr: *const f16) -> float16x8x3_t; - } - _vld3q_f16(a as _) + crate::core_arch::macros::deinterleaving_load!(f16, 8, 3, a) } #[doc = "Load multiple 3-element structures to three registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_f32)"] @@ -22983,14 +22969,7 @@ pub unsafe fn vld3_p64(a: *const p64) -> poly64x1x3_t { #[cfg(not(target_arch = "arm"))] #[cfg_attr(test, assert_instr(nop))] pub unsafe fn vld3_s64(a: *const i64) -> int64x1x3_t { - unsafe extern 
"unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld3.v1i64.p0" - )] - fn _vld3_s64(ptr: *const int64x1_t) -> int64x1x3_t; - } - _vld3_s64(a as _) + crate::ptr::read_unaligned(a.cast()) } #[doc = "Load multiple 3-element structures to three registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_s64)"] diff --git a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml index 3b2e9f25ae..968d5f99de 100644 --- a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml @@ -3669,19 +3669,11 @@ intrinsics: safety: unsafe: [neon] types: - - ["*const f16", float16x4x3_t, f16] - - ["*const f16", float16x8x3_t, f16] + - ["*const f16", float16x4x3_t, f16, "4"] + - ["*const f16", float16x8x3_t, f16, "8"] compose: - - LLVMLink: - name: "vld3.{neon_type[1]}" - arguments: - - "ptr: {type[0]}" - links: - - link: "llvm.aarch64.neon.ld3.v{neon_type[1].lane}{type[2]}.p0" - arch: aarch64,arm64ec - - FnCall: - - "_vld3{neon_type[1].nox}" - - - "a as _" + - FnCall: ["crate::core_arch::macros::deinterleaving_load!", [{ Type: "{type[2]}" }, "{type[3]}", "3", a], [], true] + - name: "vld3{neon_type[1].dup_nox}" doc: Load single 3-element structure and replicate to all lanes of two registers @@ -3900,14 +3892,12 @@ intrinsics: types: - ['*const i64', int64x1x3_t, '*const int64x1_t', i64] compose: - - LLVMLink: - name: "vld3{neon_type[1].nox}" - arguments: - - 'ptr: {type[2]}' - links: - - link: 'llvm.aarch64.neon.ld3.v{neon_type[1].lane}{type[3]}.p0' - arch: aarch64,arm64ec - - FnCall: ['_vld3{neon_type[1].nox}', ['a as _']] + - FnCall: + - 'crate::ptr::read_unaligned' + - - MethodCall: + - a + - cast + - [] - name: "vld3{neon_type[1].nox}" doc: Load multiple 3-element structures to three registers From 1ccf7b7a61ba0367a102449527c03fd65acd2549 Mon Sep 17 
00:00:00 2001 From: Folkert de Vries Date: Sat, 14 Feb 2026 23:03:39 +0100 Subject: [PATCH 4/5] neon `ld4` --- .../src/arm_shared/neon/generated.rs | 99 +++---------------- .../spec/neon/arm_shared.spec.yml | 54 ++++------ 2 files changed, 29 insertions(+), 124 deletions(-) diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index 33213e58ff..45c83b880e 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -24336,14 +24336,7 @@ pub unsafe fn vld4q_f16(a: *const f16) -> float16x8x4_t { #[unstable(feature = "stdarch_neon_f16", issue = "136306")] #[cfg(not(target_arch = "arm64ec"))] pub unsafe fn vld4_f16(a: *const f16) -> float16x4x4_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld4.v4f16.p0" - )] - fn _vld4_f16(ptr: *const f16) -> float16x4x4_t; - } - _vld4_f16(a as _) + crate::core_arch::macros::deinterleaving_load!(f16, 4, 4, a) } #[doc = "Load single 4-element structure and replicate to all lanes of two registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_f16)"] @@ -24359,14 +24352,7 @@ pub unsafe fn vld4_f16(a: *const f16) -> float16x4x4_t { #[unstable(feature = "stdarch_neon_f16", issue = "136306")] #[cfg(not(target_arch = "arm64ec"))] pub unsafe fn vld4q_f16(a: *const f16) -> float16x8x4_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld4.v8f16.p0" - )] - fn _vld4q_f16(ptr: *const f16) -> float16x8x4_t; - } - _vld4q_f16(a as _) + crate::core_arch::macros::deinterleaving_load!(f16, 8, 4, a) } #[doc = "Load multiple 4-element structures to four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_f32)"] @@ -24378,14 +24364,7 @@ 
pub unsafe fn vld4q_f16(a: *const f16) -> float16x8x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(ld4))] pub unsafe fn vld4_f32(a: *const f32) -> float32x2x4_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld4.v2f32.p0" - )] - fn _vld4_f32(ptr: *const float32x2_t) -> float32x2x4_t; - } - _vld4_f32(a as _) + crate::core_arch::macros::deinterleaving_load!(f32, 2, 4, a) } #[doc = "Load multiple 4-element structures to four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_f32)"] @@ -24397,14 +24376,7 @@ pub unsafe fn vld4_f32(a: *const f32) -> float32x2x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(ld4))] pub unsafe fn vld4q_f32(a: *const f32) -> float32x4x4_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld4.v4f32.p0" - )] - fn _vld4q_f32(ptr: *const float32x4_t) -> float32x4x4_t; - } - _vld4q_f32(a as _) + crate::core_arch::macros::deinterleaving_load!(f32, 4, 4, a) } #[doc = "Load multiple 4-element structures to four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_s8)"] @@ -24416,14 +24388,7 @@ pub unsafe fn vld4q_f32(a: *const f32) -> float32x4x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(ld4))] pub unsafe fn vld4_s8(a: *const i8) -> int8x8x4_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld4.v8i8.p0" - )] - fn _vld4_s8(ptr: *const int8x8_t) -> int8x8x4_t; - } - _vld4_s8(a as _) + crate::core_arch::macros::deinterleaving_load!(i8, 8, 4, a) } #[doc = "Load multiple 4-element structures to four registers"] #[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_s8)"] @@ -24435,14 +24400,7 @@ pub unsafe fn vld4_s8(a: *const i8) -> int8x8x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(ld4))] pub unsafe fn vld4q_s8(a: *const i8) -> int8x16x4_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld4.v16i8.p0" - )] - fn _vld4q_s8(ptr: *const int8x16_t) -> int8x16x4_t; - } - _vld4q_s8(a as _) + crate::core_arch::macros::deinterleaving_load!(i8, 16, 4, a) } #[doc = "Load multiple 4-element structures to four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_s16)"] @@ -24454,14 +24412,7 @@ pub unsafe fn vld4q_s8(a: *const i8) -> int8x16x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(ld4))] pub unsafe fn vld4_s16(a: *const i16) -> int16x4x4_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld4.v4i16.p0" - )] - fn _vld4_s16(ptr: *const int16x4_t) -> int16x4x4_t; - } - _vld4_s16(a as _) + crate::core_arch::macros::deinterleaving_load!(i16, 4, 4, a) } #[doc = "Load multiple 4-element structures to four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_s16)"] @@ -24473,14 +24424,7 @@ pub unsafe fn vld4_s16(a: *const i16) -> int16x4x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(ld4))] pub unsafe fn vld4q_s16(a: *const i16) -> int16x8x4_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld4.v8i16.p0" - )] - fn _vld4q_s16(ptr: *const int16x8_t) -> int16x8x4_t; - } - _vld4q_s16(a as _) + 
crate::core_arch::macros::deinterleaving_load!(i16, 8, 4, a) } #[doc = "Load multiple 4-element structures to four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_s32)"] @@ -24492,14 +24436,7 @@ pub unsafe fn vld4q_s16(a: *const i16) -> int16x8x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(ld4))] pub unsafe fn vld4_s32(a: *const i32) -> int32x2x4_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld4.v2i32.p0" - )] - fn _vld4_s32(ptr: *const int32x2_t) -> int32x2x4_t; - } - _vld4_s32(a as _) + crate::core_arch::macros::deinterleaving_load!(i32, 2, 4, a) } #[doc = "Load multiple 4-element structures to four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_s32)"] @@ -24511,14 +24448,7 @@ pub unsafe fn vld4_s32(a: *const i32) -> int32x2x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(ld4))] pub unsafe fn vld4q_s32(a: *const i32) -> int32x4x4_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld4.v4i32.p0" - )] - fn _vld4q_s32(ptr: *const int32x4_t) -> int32x4x4_t; - } - _vld4q_s32(a as _) + crate::core_arch::macros::deinterleaving_load!(i32, 4, 4, a) } #[doc = "Load multiple 4-element structures to four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_f32)"] @@ -25379,14 +25309,7 @@ pub unsafe fn vld4_p64(a: *const p64) -> poly64x1x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(nop))] pub unsafe fn vld4_s64(a: *const i64) -> int64x1x4_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = 
"llvm.aarch64.neon.ld4.v1i64.p0" - )] - fn _vld4_s64(ptr: *const int64x1_t) -> int64x1x4_t; - } - _vld4_s64(a as _) + crate::ptr::read_unaligned(a.cast()) } #[doc = "Load multiple 4-element structures to four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_s64)"] diff --git a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml index 968d5f99de..8e10fff984 100644 --- a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml @@ -4357,23 +4357,16 @@ intrinsics: safety: unsafe: [neon] types: - - ['*const i8', int8x8x4_t, i8, '*const int8x8_t'] - - ['*const i32', int32x4x4_t, i32, '*const int32x4_t'] - - ['*const i16', int16x4x4_t, i16, '*const int16x4_t'] - - ['*const i32', int32x2x4_t, i32, '*const int32x2_t'] - - ['*const i8', int8x16x4_t, i8, '*const int8x16_t'] - - ['*const i16', int16x8x4_t, i16, '*const int16x8_t'] - - ['*const f32', float32x2x4_t, f32, '*const float32x2_t'] - - ['*const f32', float32x4x4_t, f32, '*const float32x4_t'] + - ['*const i8', int8x8x4_t, i8, "8"] + - ['*const i32', int32x4x4_t, i32, "4"] + - ['*const i16', int16x4x4_t, i16, "4"] + - ['*const i32', int32x2x4_t, i32, "2"] + - ['*const i8', int8x16x4_t, i8, "16"] + - ['*const i16', int16x8x4_t, i16, "8"] + - ['*const f32', float32x2x4_t, f32, "2"] + - ['*const f32', float32x4x4_t, f32, "4"] compose: - - LLVMLink: - name: 'vld4{neon_type[1].nox}' - arguments: - - 'ptr: {type[3]}' - links: - - link: 'llvm.aarch64.neon.ld4.v{neon_type[1].lane}{type[2]}.p0' - arch: aarch64,arm64ec - - FnCall: ['_vld4{neon_type[1].nox}', ['a as _']] + - FnCall: ["crate::core_arch::macros::deinterleaving_load!", [{ Type: "{type[2]}" }, "{type[3]}", "4", a], [], true] - name: "vld4{neon_type[1].nox}" doc: Load multiple 4-element structures to four registers @@ -4386,14 +4379,12 @@ intrinsics: types: - ['*const i64', int64x1x4_t, i64, 
'*const int64x1_t'] compose: - - LLVMLink: - name: 'vld4{neon_type[1].nox}' - arguments: - - 'ptr: {type[3]}' - links: - - link: 'llvm.aarch64.neon.ld4.v{neon_type[1].lane}{type[2]}.p0' - arch: aarch64,arm64ec - - FnCall: ['_vld4{neon_type[1].nox}', ['a as _']] + - FnCall: + - 'crate::ptr::read_unaligned' + - - MethodCall: + - a + - cast + - [] - name: "vld4{neon_type[1].lane_nox}" doc: Load multiple 4-element structures to four registers @@ -12418,19 +12409,10 @@ intrinsics: safety: unsafe: [neon] types: - - ["*const f16", float16x4x4_t, f16] - - ["*const f16", float16x8x4_t, f16] + - ["*const f16", float16x4x4_t, f16, "4"] + - ["*const f16", float16x8x4_t, f16, "8"] compose: - - LLVMLink: - name: "vld4.{neon_type[1]}" - arguments: - - "ptr: {type[0]}" - links: - - link: "llvm.aarch64.neon.ld4.v{neon_type[1].lane}{type[2]}.p0" - arch: aarch64,arm64ec - - FnCall: - - "_vld4{neon_type[1].nox}" - - - "a as _" + - FnCall: ["crate::core_arch::macros::deinterleaving_load!", [{ Type: "{type[2]}" }, "{type[3]}", "4", a], [], true] - name: "vld4{neon_type[1].dup_nox}" doc: Load single 4-element structure and replicate to all lanes of two registers From 8ba832904a299714f4b7254831c65fb38b7b72da Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Sat, 14 Feb 2026 23:09:35 +0100 Subject: [PATCH 5/5] neon `ld1` --- .../core_arch/src/aarch64/neon/generated.rs | 27 ++---------- .../spec/neon/aarch64.spec.yml | 42 ++++++++----------- 2 files changed, 20 insertions(+), 49 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 9a8a9ad59e..119f903de7 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -11652,14 +11652,7 @@ pub unsafe fn vld2q_dup_s64(a: *const i64) -> int64x2x2_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(nop))] pub unsafe fn vld2_f64(a: *const f64) -> float64x1x2_t { - unsafe extern 
"unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld2.v1f64.p0" - )] - fn _vld2_f64(ptr: *const float64x1_t) -> float64x1x2_t; - } - _vld2_f64(a as _) + crate::ptr::read_unaligned(a.cast()) } #[doc = "Load multiple 2-element structures to two registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_lane_f64)"] @@ -12031,14 +12024,7 @@ pub unsafe fn vld3q_dup_s64(a: *const i64) -> int64x2x3_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(nop))] pub unsafe fn vld3_f64(a: *const f64) -> float64x1x3_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld3.v1f64.p0" - )] - fn _vld3_f64(ptr: *const float64x1_t) -> float64x1x3_t; - } - _vld3_f64(a as _) + crate::ptr::read_unaligned(a.cast()) } #[doc = "Load multiple 3-element structures to three registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_lane_f64)"] @@ -12442,14 +12428,7 @@ pub unsafe fn vld4q_dup_s64(a: *const i64) -> int64x2x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(nop))] pub unsafe fn vld4_f64(a: *const f64) -> float64x1x4_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld4.v1f64.p0" - )] - fn _vld4_f64(ptr: *const float64x1_t) -> float64x1x4_t; - } - _vld4_f64(a as _) + crate::ptr::read_unaligned(a.cast()) } #[doc = "Load multiple 4-element structures to four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_lane_f64)"] diff --git a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml index a10403de41..b81f04ebc0 100644 --- 
a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml @@ -3698,16 +3698,12 @@ intrinsics: types: - ["*const f64", float64x1x2_t, f64, float64x1_t] compose: - - LLVMLink: - name: "vld2.{neon_type[1]}" - arguments: - - "ptr: *const {neon_type[3]}" - links: - - link: "llvm.aarch64.neon.ld2.v{neon_type[1].lane}{type[2]}.p0" - arch: aarch64,arm64ec - FnCall: - - "_vld2{neon_type[1].nox}" - - - "a as _" + - 'crate::ptr::read_unaligned' + - - MethodCall: + - a + - cast + - [] - name: "vld2{neon_type[1].nox}" doc: Load multiple 2-element structures to two registers @@ -4057,14 +4053,12 @@ intrinsics: types: - ['*const f64', float64x1x3_t, '*const float64x1_t', f64] compose: - - LLVMLink: - name: 'vld3{neon_type[1].nox}' - arguments: - - 'ptr: {type[2]}' - links: - - link: 'llvm.aarch64.neon.ld3.v{neon_type[1].lane}{type[3]}.p0' - arch: aarch64,arm64ec - - FnCall: ['_vld3{neon_type[1].nox}', ['a as _']] + - FnCall: + - 'crate::ptr::read_unaligned' + - - MethodCall: + - a + - cast + - [] - name: "vld3{neon_type[1].nox}" doc: Load multiple 3-element structures to three registers @@ -4203,14 +4197,12 @@ intrinsics: types: - ['*const f64', float64x1x4_t, f64, '*const float64x1_t'] compose: - - LLVMLink: - name: 'vld4{neon_type[1].nox}' - arguments: - - 'ptr: {type[3]}' - links: - - link: 'llvm.aarch64.neon.ld4.v{neon_type[1].lane}{type[2]}.p0' - arch: aarch64,arm64ec - - FnCall: ['_vld4{neon_type[1].nox}', ['a as _']] + - FnCall: + - 'crate::ptr::read_unaligned' + - - MethodCall: + - a + - cast + - [] - name: "vld4{neon_type[1].nox}" doc: Load multiple 4-element structures to four registers