diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 9a8a9ad59e..119f903de7 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -11652,14 +11652,7 @@ pub unsafe fn vld2q_dup_s64(a: *const i64) -> int64x2x2_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(nop))] pub unsafe fn vld2_f64(a: *const f64) -> float64x1x2_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld2.v1f64.p0" - )] - fn _vld2_f64(ptr: *const float64x1_t) -> float64x1x2_t; - } - _vld2_f64(a as _) + crate::ptr::read_unaligned(a.cast()) } #[doc = "Load multiple 2-element structures to two registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_lane_f64)"] @@ -12031,14 +12024,7 @@ pub unsafe fn vld3q_dup_s64(a: *const i64) -> int64x2x3_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(nop))] pub unsafe fn vld3_f64(a: *const f64) -> float64x1x3_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld3.v1f64.p0" - )] - fn _vld3_f64(ptr: *const float64x1_t) -> float64x1x3_t; - } - _vld3_f64(a as _) + crate::ptr::read_unaligned(a.cast()) } #[doc = "Load multiple 3-element structures to three registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_lane_f64)"] @@ -12442,14 +12428,7 @@ pub unsafe fn vld4q_dup_s64(a: *const i64) -> int64x2x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(nop))] pub unsafe fn vld4_f64(a: *const f64) -> float64x1x4_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = 
"llvm.aarch64.neon.ld4.v1f64.p0" - )] - fn _vld4_f64(ptr: *const float64x1_t) -> float64x1x4_t; - } - _vld4_f64(a as _) + crate::ptr::read_unaligned(a.cast()) } #[doc = "Load multiple 4-element structures to four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_lane_f64)"] diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index b6951907eb..45c83b880e 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -22036,14 +22036,7 @@ pub unsafe fn vld3q_f16(a: *const f16) -> float16x8x3_t { #[unstable(feature = "stdarch_neon_f16", issue = "136306")] #[cfg(not(target_arch = "arm64ec"))] pub unsafe fn vld3_f16(a: *const f16) -> float16x4x3_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld3.v4f16.p0" - )] - fn _vld3_f16(ptr: *const f16) -> float16x4x3_t; - } - _vld3_f16(a as _) + crate::core_arch::macros::deinterleaving_load!(f16, 4, 3, a) } #[doc = "Load single 3-element structure and replicate to all lanes of two registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_f16)"] @@ -22060,14 +22053,7 @@ pub unsafe fn vld3_f16(a: *const f16) -> float16x4x3_t { #[unstable(feature = "stdarch_neon_f16", issue = "136306")] #[cfg(not(target_arch = "arm64ec"))] pub unsafe fn vld3q_f16(a: *const f16) -> float16x8x3_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld3.v8f16.p0" - )] - fn _vld3q_f16(ptr: *const f16) -> float16x8x3_t; - } - _vld3q_f16(a as _) + crate::core_arch::macros::deinterleaving_load!(f16, 8, 3, a) } #[doc = "Load multiple 3-element structures to three registers"] #[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_f32)"] @@ -22079,14 +22065,7 @@ pub unsafe fn vld3q_f16(a: *const f16) -> float16x8x3_t { #[cfg(not(target_arch = "arm"))] #[cfg_attr(test, assert_instr(ld3))] pub unsafe fn vld3_f32(a: *const f32) -> float32x2x3_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld3.v2f32.p0" - )] - fn _vld3_f32(ptr: *const float32x2_t) -> float32x2x3_t; - } - _vld3_f32(a as _) + crate::core_arch::macros::deinterleaving_load!(f32, 2, 3, a) } #[doc = "Load multiple 3-element structures to three registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_f32)"] @@ -22098,14 +22077,7 @@ pub unsafe fn vld3_f32(a: *const f32) -> float32x2x3_t { #[cfg(not(target_arch = "arm"))] #[cfg_attr(test, assert_instr(ld3))] pub unsafe fn vld3q_f32(a: *const f32) -> float32x4x3_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld3.v4f32.p0" - )] - fn _vld3q_f32(ptr: *const float32x4_t) -> float32x4x3_t; - } - _vld3q_f32(a as _) + crate::core_arch::macros::deinterleaving_load!(f32, 4, 3, a) } #[doc = "Load multiple 3-element structures to three registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_s8)"] @@ -22117,14 +22089,7 @@ pub unsafe fn vld3q_f32(a: *const f32) -> float32x4x3_t { #[cfg(not(target_arch = "arm"))] #[cfg_attr(test, assert_instr(ld3))] pub unsafe fn vld3_s8(a: *const i8) -> int8x8x3_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld3.v8i8.p0" - )] - fn _vld3_s8(ptr: *const int8x8_t) -> int8x8x3_t; - } - _vld3_s8(a as _) + crate::core_arch::macros::deinterleaving_load!(i8, 8, 3, a) } #[doc = "Load multiple 3-element 
structures to three registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_s8)"] @@ -22136,14 +22101,7 @@ pub unsafe fn vld3_s8(a: *const i8) -> int8x8x3_t { #[cfg(not(target_arch = "arm"))] #[cfg_attr(test, assert_instr(ld3))] pub unsafe fn vld3q_s8(a: *const i8) -> int8x16x3_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld3.v16i8.p0" - )] - fn _vld3q_s8(ptr: *const int8x16_t) -> int8x16x3_t; - } - _vld3q_s8(a as _) + crate::core_arch::macros::deinterleaving_load!(i8, 16, 3, a) } #[doc = "Load multiple 3-element structures to three registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_s16)"] @@ -22155,14 +22113,7 @@ pub unsafe fn vld3q_s8(a: *const i8) -> int8x16x3_t { #[cfg(not(target_arch = "arm"))] #[cfg_attr(test, assert_instr(ld3))] pub unsafe fn vld3_s16(a: *const i16) -> int16x4x3_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld3.v4i16.p0" - )] - fn _vld3_s16(ptr: *const int16x4_t) -> int16x4x3_t; - } - _vld3_s16(a as _) + crate::core_arch::macros::deinterleaving_load!(i16, 4, 3, a) } #[doc = "Load multiple 3-element structures to three registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_s16)"] @@ -22174,14 +22125,7 @@ pub unsafe fn vld3_s16(a: *const i16) -> int16x4x3_t { #[cfg(not(target_arch = "arm"))] #[cfg_attr(test, assert_instr(ld3))] pub unsafe fn vld3q_s16(a: *const i16) -> int16x8x3_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld3.v8i16.p0" - )] - fn _vld3q_s16(ptr: *const int16x8_t) -> int16x8x3_t; - } - _vld3q_s16(a as _) + crate::core_arch::macros::deinterleaving_load!(i16, 8, 3, a) } 
#[doc = "Load multiple 3-element structures to three registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_s32)"] @@ -22193,14 +22137,7 @@ pub unsafe fn vld3q_s16(a: *const i16) -> int16x8x3_t { #[cfg(not(target_arch = "arm"))] #[cfg_attr(test, assert_instr(ld3))] pub unsafe fn vld3_s32(a: *const i32) -> int32x2x3_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld3.v2i32.p0" - )] - fn _vld3_s32(ptr: *const int32x2_t) -> int32x2x3_t; - } - _vld3_s32(a as _) + crate::core_arch::macros::deinterleaving_load!(i32, 2, 3, a) } #[doc = "Load multiple 3-element structures to three registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_s32)"] @@ -22212,14 +22149,7 @@ pub unsafe fn vld3_s32(a: *const i32) -> int32x2x3_t { #[cfg(not(target_arch = "arm"))] #[cfg_attr(test, assert_instr(ld3))] pub unsafe fn vld3q_s32(a: *const i32) -> int32x4x3_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld3.v4i32.p0" - )] - fn _vld3q_s32(ptr: *const int32x4_t) -> int32x4x3_t; - } - _vld3q_s32(a as _) + crate::core_arch::macros::deinterleaving_load!(i32, 4, 3, a) } #[doc = "Load multiple 3-element structures to three registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_f32)"] @@ -23039,14 +22969,7 @@ pub unsafe fn vld3_p64(a: *const p64) -> poly64x1x3_t { #[cfg(not(target_arch = "arm"))] #[cfg_attr(test, assert_instr(nop))] pub unsafe fn vld3_s64(a: *const i64) -> int64x1x3_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld3.v1i64.p0" - )] - fn _vld3_s64(ptr: *const int64x1_t) -> int64x1x3_t; - } - _vld3_s64(a as _) + 
crate::ptr::read_unaligned(a.cast()) } #[doc = "Load multiple 3-element structures to three registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_s64)"] @@ -24413,14 +24336,7 @@ pub unsafe fn vld4q_f16(a: *const f16) -> float16x8x4_t { #[unstable(feature = "stdarch_neon_f16", issue = "136306")] #[cfg(not(target_arch = "arm64ec"))] pub unsafe fn vld4_f16(a: *const f16) -> float16x4x4_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld4.v4f16.p0" - )] - fn _vld4_f16(ptr: *const f16) -> float16x4x4_t; - } - _vld4_f16(a as _) + crate::core_arch::macros::deinterleaving_load!(f16, 4, 4, a) } #[doc = "Load single 4-element structure and replicate to all lanes of two registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_f16)"] @@ -24436,14 +24352,7 @@ pub unsafe fn vld4_f16(a: *const f16) -> float16x4x4_t { #[unstable(feature = "stdarch_neon_f16", issue = "136306")] #[cfg(not(target_arch = "arm64ec"))] pub unsafe fn vld4q_f16(a: *const f16) -> float16x8x4_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld4.v8f16.p0" - )] - fn _vld4q_f16(ptr: *const f16) -> float16x8x4_t; - } - _vld4q_f16(a as _) + crate::core_arch::macros::deinterleaving_load!(f16, 8, 4, a) } #[doc = "Load multiple 4-element structures to four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_f32)"] @@ -24455,14 +24364,7 @@ pub unsafe fn vld4q_f16(a: *const f16) -> float16x8x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(ld4))] pub unsafe fn vld4_f32(a: *const f32) -> float32x2x4_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name 
= "llvm.aarch64.neon.ld4.v2f32.p0" - )] - fn _vld4_f32(ptr: *const float32x2_t) -> float32x2x4_t; - } - _vld4_f32(a as _) + crate::core_arch::macros::deinterleaving_load!(f32, 2, 4, a) } #[doc = "Load multiple 4-element structures to four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_f32)"] @@ -24474,14 +24376,7 @@ pub unsafe fn vld4_f32(a: *const f32) -> float32x2x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(ld4))] pub unsafe fn vld4q_f32(a: *const f32) -> float32x4x4_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld4.v4f32.p0" - )] - fn _vld4q_f32(ptr: *const float32x4_t) -> float32x4x4_t; - } - _vld4q_f32(a as _) + crate::core_arch::macros::deinterleaving_load!(f32, 4, 4, a) } #[doc = "Load multiple 4-element structures to four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_s8)"] @@ -24493,14 +24388,7 @@ pub unsafe fn vld4q_f32(a: *const f32) -> float32x4x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(ld4))] pub unsafe fn vld4_s8(a: *const i8) -> int8x8x4_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld4.v8i8.p0" - )] - fn _vld4_s8(ptr: *const int8x8_t) -> int8x8x4_t; - } - _vld4_s8(a as _) + crate::core_arch::macros::deinterleaving_load!(i8, 8, 4, a) } #[doc = "Load multiple 4-element structures to four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_s8)"] @@ -24512,14 +24400,7 @@ pub unsafe fn vld4_s8(a: *const i8) -> int8x8x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(ld4))] pub unsafe fn vld4q_s8(a: *const i8) -> int8x16x4_t { - 
unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld4.v16i8.p0" - )] - fn _vld4q_s8(ptr: *const int8x16_t) -> int8x16x4_t; - } - _vld4q_s8(a as _) + crate::core_arch::macros::deinterleaving_load!(i8, 16, 4, a) } #[doc = "Load multiple 4-element structures to four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_s16)"] @@ -24531,14 +24412,7 @@ pub unsafe fn vld4q_s8(a: *const i8) -> int8x16x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(ld4))] pub unsafe fn vld4_s16(a: *const i16) -> int16x4x4_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld4.v4i16.p0" - )] - fn _vld4_s16(ptr: *const int16x4_t) -> int16x4x4_t; - } - _vld4_s16(a as _) + crate::core_arch::macros::deinterleaving_load!(i16, 4, 4, a) } #[doc = "Load multiple 4-element structures to four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_s16)"] @@ -24550,14 +24424,7 @@ pub unsafe fn vld4_s16(a: *const i16) -> int16x4x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(ld4))] pub unsafe fn vld4q_s16(a: *const i16) -> int16x8x4_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld4.v8i16.p0" - )] - fn _vld4q_s16(ptr: *const int16x8_t) -> int16x8x4_t; - } - _vld4q_s16(a as _) + crate::core_arch::macros::deinterleaving_load!(i16, 8, 4, a) } #[doc = "Load multiple 4-element structures to four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_s32)"] @@ -24569,14 +24436,7 @@ pub unsafe fn vld4q_s16(a: *const i16) -> int16x8x4_t { #[stable(feature = "neon_intrinsics", since 
= "1.59.0")] #[cfg_attr(test, assert_instr(ld4))] pub unsafe fn vld4_s32(a: *const i32) -> int32x2x4_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld4.v2i32.p0" - )] - fn _vld4_s32(ptr: *const int32x2_t) -> int32x2x4_t; - } - _vld4_s32(a as _) + crate::core_arch::macros::deinterleaving_load!(i32, 2, 4, a) } #[doc = "Load multiple 4-element structures to four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_s32)"] @@ -24588,14 +24448,7 @@ pub unsafe fn vld4_s32(a: *const i32) -> int32x2x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(ld4))] pub unsafe fn vld4q_s32(a: *const i32) -> int32x4x4_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld4.v4i32.p0" - )] - fn _vld4q_s32(ptr: *const int32x4_t) -> int32x4x4_t; - } - _vld4q_s32(a as _) + crate::core_arch::macros::deinterleaving_load!(i32, 4, 4, a) } #[doc = "Load multiple 4-element structures to four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_f32)"] @@ -25456,14 +25309,7 @@ pub unsafe fn vld4_p64(a: *const p64) -> poly64x1x4_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] #[cfg_attr(test, assert_instr(nop))] pub unsafe fn vld4_s64(a: *const i64) -> int64x1x4_t { - unsafe extern "unadjusted" { - #[cfg_attr( - any(target_arch = "aarch64", target_arch = "arm64ec"), - link_name = "llvm.aarch64.neon.ld4.v1i64.p0" - )] - fn _vld4_s64(ptr: *const int64x1_t) -> int64x1x4_t; - } - _vld4_s64(a as _) + crate::ptr::read_unaligned(a.cast()) } #[doc = "Load multiple 4-element structures to four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_s64)"] diff --git 
a/crates/core_arch/src/macros.rs b/crates/core_arch/src/macros.rs index 353829633f..d40ce51c74 100644 --- a/crates/core_arch/src/macros.rs +++ b/crates/core_arch/src/macros.rs @@ -186,3 +186,72 @@ macro_rules! simd_masked_store { $crate::intrinsics::simd::simd_masked_store::<_, _, _, { $align }>($mask, $ptr, $default) }; } + +pub(crate) const fn deinterleave_mask<const LANES: usize, const N: usize, const K: usize>() +-> [u32; LANES] { + // Produces: [K, K+N, K+2N, ...] + let mut out = [0u32; LANES]; + let mut i = 0usize; + while i < LANES { + out[i] = (i * N + K) as u32; + i += 1; + } + out +} + +#[allow(unused)] +macro_rules! deinterleaving_load { + ($elem:ty, $lanes:literal, 2, $ptr:expr) => {{ + use $crate::core_arch::macros::deinterleave_mask; + use $crate::core_arch::simd::Simd; + use $crate::{mem::transmute, ptr}; + + type V = Simd<$elem, $lanes>; + type W = Simd<$elem, { $lanes * 2 }>; + + let w: W = ptr::read_unaligned($ptr as *const W); + + let v0: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 2, 0>()); + let v1: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 2, 1>()); + + transmute((v0, v1)) + }}; + + ($elem:ty, $lanes:literal, 3, $ptr:expr) => {{ + use $crate::core_arch::macros::deinterleave_mask; + use $crate::core_arch::simd::Simd; + use $crate::{mem::transmute, ptr}; + + type V = Simd<$elem, $lanes>; + type W = Simd<$elem, { $lanes * 3 }>; + + let w: W = ptr::read_unaligned($ptr as *const W); + + let v0: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 3, 0>()); + let v1: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 3, 1>()); + let v2: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 3, 2>()); + + transmute((v0, v1, v2)) + }}; + + ($elem:ty, $lanes:literal, 4, $ptr:expr) => {{ + use $crate::core_arch::macros::deinterleave_mask; + use $crate::core_arch::simd::Simd; + use $crate::{mem::transmute, ptr}; + + type V = Simd<$elem, $lanes>; + type W = Simd<$elem, { $lanes * 4 }>; + + let w: W = ptr::read_unaligned($ptr as *const W); + + let v0: V = simd_shuffle!(w, w,
deinterleave_mask::<$lanes, 4, 0>()); + let v1: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 4, 1>()); + let v2: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 4, 2>()); + let v3: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 4, 3>()); + + transmute((v0, v1, v2, v3)) + }}; +} + +#[allow(unused)] +pub(crate) use deinterleaving_load; diff --git a/crates/intrinsic-test/src/common/compare.rs b/crates/intrinsic-test/src/common/compare.rs index 5214349171..c22d7fd4ec 100644 --- a/crates/intrinsic-test/src/common/compare.rs +++ b/crates/intrinsic-test/src/common/compare.rs @@ -109,13 +109,26 @@ pub fn compare_outputs( } }) .inspect(|(intrinsic, diffs)| { - println!("Difference for intrinsic: {intrinsic}"); + use std::io::Write; + + let stdout = std::io::stdout(); + let mut out = stdout.lock(); + + writeln!(out, "Difference for intrinsic: {intrinsic}").unwrap(); diffs.into_iter().for_each(|diff| match diff { - diff::Result::Left(c) => println!("C: {c}"), - diff::Result::Right(rust) => println!("Rust: {rust}"), + diff::Result::Left(c) => { + writeln!(out, "C: {c}").unwrap(); + } + diff::Result::Right(rust) => { + writeln!(out, "Rust: {rust}").unwrap(); + } _ => (), }); - println!("****************************************************************"); + writeln!( + out, + "****************************************************************" + ) + .unwrap(); }) .count(); diff --git a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml index a10403de41..b81f04ebc0 100644 --- a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml @@ -3698,16 +3698,12 @@ intrinsics: types: - ["*const f64", float64x1x2_t, f64, float64x1_t] compose: - - LLVMLink: - name: "vld2.{neon_type[1]}" - arguments: - - "ptr: *const {neon_type[3]}" - links: - - link: "llvm.aarch64.neon.ld2.v{neon_type[1].lane}{type[2]}.p0" - arch: aarch64,arm64ec - FnCall: - - "_vld2{neon_type[1].nox}" - - - "a 
as _" + - 'crate::ptr::read_unaligned' + - - MethodCall: + - a + - cast + - [] - name: "vld2{neon_type[1].nox}" doc: Load multiple 2-element structures to two registers @@ -4057,14 +4053,12 @@ intrinsics: types: - ['*const f64', float64x1x3_t, '*const float64x1_t', f64] compose: - - LLVMLink: - name: 'vld3{neon_type[1].nox}' - arguments: - - 'ptr: {type[2]}' - links: - - link: 'llvm.aarch64.neon.ld3.v{neon_type[1].lane}{type[3]}.p0' - arch: aarch64,arm64ec - - FnCall: ['_vld3{neon_type[1].nox}', ['a as _']] + - FnCall: + - 'crate::ptr::read_unaligned' + - - MethodCall: + - a + - cast + - [] - name: "vld3{neon_type[1].nox}" doc: Load multiple 3-element structures to three registers @@ -4203,14 +4197,12 @@ intrinsics: types: - ['*const f64', float64x1x4_t, f64, '*const float64x1_t'] compose: - - LLVMLink: - name: 'vld4{neon_type[1].nox}' - arguments: - - 'ptr: {type[3]}' - links: - - link: 'llvm.aarch64.neon.ld4.v{neon_type[1].lane}{type[2]}.p0' - arch: aarch64,arm64ec - - FnCall: ['_vld4{neon_type[1].nox}', ['a as _']] + - FnCall: + - 'crate::ptr::read_unaligned' + - - MethodCall: + - a + - cast + - [] - name: "vld4{neon_type[1].nox}" doc: Load multiple 4-element structures to four registers diff --git a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml index 3f7adbc278..8e10fff984 100644 --- a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml @@ -3669,19 +3669,11 @@ intrinsics: safety: unsafe: [neon] types: - - ["*const f16", float16x4x3_t, f16] - - ["*const f16", float16x8x3_t, f16] + - ["*const f16", float16x4x3_t, f16, "4"] + - ["*const f16", float16x8x3_t, f16, "8"] compose: - - LLVMLink: - name: "vld3.{neon_type[1]}" - arguments: - - "ptr: {type[0]}" - links: - - link: "llvm.aarch64.neon.ld3.v{neon_type[1].lane}{type[2]}.p0" - arch: aarch64,arm64ec - - FnCall: - - "_vld3{neon_type[1].nox}" - - - "a as _" + - FnCall: 
["crate::core_arch::macros::deinterleaving_load!", [{ Type: "{type[2]}" }, "{type[3]}", "3", a], [], true] + - name: "vld3{neon_type[1].dup_nox}" doc: Load single 3-element structure and replicate to all lanes of two registers @@ -3875,23 +3867,17 @@ intrinsics: safety: unsafe: [neon] types: - - ['*const i8', int8x8x3_t, '*const int8x8_t', i8] - - ['*const i16', int16x4x3_t, '*const int16x4_t', i16] - - ['*const i32', int32x2x3_t, '*const int32x2_t', i32] - - ['*const i8', int8x16x3_t, '*const int8x16_t', i8] - - ['*const i16', int16x8x3_t, '*const int16x8_t', i16] - - ['*const i32', int32x4x3_t, '*const int32x4_t', i32] - - ['*const f32', float32x2x3_t, '*const float32x2_t', f32] - - ['*const f32', float32x4x3_t, '*const float32x4_t', f32] + - ['*const i8', int8x8x3_t, i8, "8"] + - ['*const i16', int16x4x3_t, i16, "4"] + - ['*const i32', int32x2x3_t, i32, "2"] + - ['*const i8', int8x16x3_t, i8, "16"] + - ['*const i16', int16x8x3_t, i16, "8"] + - ['*const i32', int32x4x3_t, i32, "4"] + - ['*const f32', float32x2x3_t, f32, "2"] + - ['*const f32', float32x4x3_t, f32, "4"] compose: - - LLVMLink: - name: 'vld3{neon_type[1].nox}' - arguments: - - 'ptr: {type[2]}' - links: - - link: 'llvm.aarch64.neon.ld3.v{neon_type[1].lane}{type[3]}.p0' - arch: aarch64,arm64ec - - FnCall: ['_vld3{neon_type[1].nox}', ['a as _']] + - FnCall: ["crate::core_arch::macros::deinterleaving_load!", [{ Type: "{type[2]}" }, "{type[3]}", "3", a], [], true] + - name: "vld3{neon_type[1].nox}" doc: Load multiple 3-element structures to three registers @@ -3906,14 +3892,12 @@ intrinsics: types: - ['*const i64', int64x1x3_t, '*const int64x1_t', i64] compose: - - LLVMLink: - name: "vld3{neon_type[1].nox}" - arguments: - - 'ptr: {type[2]}' - links: - - link: 'llvm.aarch64.neon.ld3.v{neon_type[1].lane}{type[3]}.p0' - arch: aarch64,arm64ec - - FnCall: ['_vld3{neon_type[1].nox}', ['a as _']] + - FnCall: + - 'crate::ptr::read_unaligned' + - - MethodCall: + - a + - cast + - [] - name: "vld3{neon_type[1].nox}" 
doc: Load multiple 3-element structures to three registers @@ -4373,23 +4357,16 @@ intrinsics: safety: unsafe: [neon] types: - - ['*const i8', int8x8x4_t, i8, '*const int8x8_t'] - - ['*const i32', int32x4x4_t, i32, '*const int32x4_t'] - - ['*const i16', int16x4x4_t, i16, '*const int16x4_t'] - - ['*const i32', int32x2x4_t, i32, '*const int32x2_t'] - - ['*const i8', int8x16x4_t, i8, '*const int8x16_t'] - - ['*const i16', int16x8x4_t, i16, '*const int16x8_t'] - - ['*const f32', float32x2x4_t, f32, '*const float32x2_t'] - - ['*const f32', float32x4x4_t, f32, '*const float32x4_t'] + - ['*const i8', int8x8x4_t, i8, "8"] + - ['*const i32', int32x4x4_t, i32, "4"] + - ['*const i16', int16x4x4_t, i16, "4"] + - ['*const i32', int32x2x4_t, i32, "2"] + - ['*const i8', int8x16x4_t, i8, "16"] + - ['*const i16', int16x8x4_t, i16, "8"] + - ['*const f32', float32x2x4_t, f32, "2"] + - ['*const f32', float32x4x4_t, f32, "4"] compose: - - LLVMLink: - name: 'vld4{neon_type[1].nox}' - arguments: - - 'ptr: {type[3]}' - links: - - link: 'llvm.aarch64.neon.ld4.v{neon_type[1].lane}{type[2]}.p0' - arch: aarch64,arm64ec - - FnCall: ['_vld4{neon_type[1].nox}', ['a as _']] + - FnCall: ["crate::core_arch::macros::deinterleaving_load!", [{ Type: "{type[2]}" }, "{type[3]}", "4", a], [], true] - name: "vld4{neon_type[1].nox}" doc: Load multiple 4-element structures to four registers @@ -4402,14 +4379,12 @@ intrinsics: types: - ['*const i64', int64x1x4_t, i64, '*const int64x1_t'] compose: - - LLVMLink: - name: 'vld4{neon_type[1].nox}' - arguments: - - 'ptr: {type[3]}' - links: - - link: 'llvm.aarch64.neon.ld4.v{neon_type[1].lane}{type[2]}.p0' - arch: aarch64,arm64ec - - FnCall: ['_vld4{neon_type[1].nox}', ['a as _']] + - FnCall: + - 'crate::ptr::read_unaligned' + - - MethodCall: + - a + - cast + - [] - name: "vld4{neon_type[1].lane_nox}" doc: Load multiple 4-element structures to four registers @@ -12434,19 +12409,10 @@ intrinsics: safety: unsafe: [neon] types: - - ["*const f16", float16x4x4_t, f16] 
- - ["*const f16", float16x8x4_t, f16] + - ["*const f16", float16x4x4_t, f16, "4"] + - ["*const f16", float16x8x4_t, f16, "8"] compose: - - LLVMLink: - name: "vld4.{neon_type[1]}" - arguments: - - "ptr: {type[0]}" - links: - - link: "llvm.aarch64.neon.ld4.v{neon_type[1].lane}{type[2]}.p0" - arch: aarch64,arm64ec - - FnCall: - - "_vld4{neon_type[1].nox}" - - - "a as _" + - FnCall: ["crate::core_arch::macros::deinterleaving_load!", [{ Type: "{type[2]}" }, "{type[3]}", "4", a], [], true] - name: "vld4{neon_type[1].dup_nox}" doc: Load single 4-element structure and replicate to all lanes of two registers