diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs
index 9a8a9ad59e..119f903de7 100644
--- a/crates/core_arch/src/aarch64/neon/generated.rs
+++ b/crates/core_arch/src/aarch64/neon/generated.rs
@@ -11652,14 +11652,7 @@ pub unsafe fn vld2q_dup_s64(a: *const i64) -> int64x2x2_t {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(nop))]
 pub unsafe fn vld2_f64(a: *const f64) -> float64x1x2_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.ld2.v1f64.p0"
-        )]
-        fn _vld2_f64(ptr: *const float64x1_t) -> float64x1x2_t;
-    }
-    _vld2_f64(a as _)
+    crate::ptr::read_unaligned(a.cast())
 }
 #[doc = "Load multiple 2-element structures to two registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld2_lane_f64)"]
@@ -12031,14 +12024,7 @@ pub unsafe fn vld3q_dup_s64(a: *const i64) -> int64x2x3_t {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(nop))]
 pub unsafe fn vld3_f64(a: *const f64) -> float64x1x3_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.ld3.v1f64.p0"
-        )]
-        fn _vld3_f64(ptr: *const float64x1_t) -> float64x1x3_t;
-    }
-    _vld3_f64(a as _)
+    crate::ptr::read_unaligned(a.cast())
 }
 #[doc = "Load multiple 3-element structures to three registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_lane_f64)"]
@@ -12442,14 +12428,7 @@ pub unsafe fn vld4q_dup_s64(a: *const i64) -> int64x2x4_t {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(nop))]
 pub unsafe fn vld4_f64(a: *const f64) -> float64x1x4_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.ld4.v1f64.p0"
-        )]
-        fn _vld4_f64(ptr: *const float64x1_t) -> float64x1x4_t;
-    }
-    _vld4_f64(a as _)
+    crate::ptr::read_unaligned(a.cast())
 }
 #[doc = "Load multiple 4-element structures to four registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_lane_f64)"]
diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs
index b6951907eb..45c83b880e 100644
--- a/crates/core_arch/src/arm_shared/neon/generated.rs
+++ b/crates/core_arch/src/arm_shared/neon/generated.rs
@@ -22036,14 +22036,7 @@ pub unsafe fn vld3q_f16(a: *const f16) -> float16x8x3_t {
 #[unstable(feature = "stdarch_neon_f16", issue = "136306")]
 #[cfg(not(target_arch = "arm64ec"))]
 pub unsafe fn vld3_f16(a: *const f16) -> float16x4x3_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.ld3.v4f16.p0"
-        )]
-        fn _vld3_f16(ptr: *const f16) -> float16x4x3_t;
-    }
-    _vld3_f16(a as _)
+    crate::core_arch::macros::deinterleaving_load!(f16, 4, 3, a)
 }
 #[doc = "Load single 3-element structure and replicate to all lanes of two registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_f16)"]
@@ -22060,14 +22053,7 @@ pub unsafe fn vld3_f16(a: *const f16) -> float16x4x3_t {
 #[unstable(feature = "stdarch_neon_f16", issue = "136306")]
 #[cfg(not(target_arch = "arm64ec"))]
 pub unsafe fn vld3q_f16(a: *const f16) -> float16x8x3_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.ld3.v8f16.p0"
-        )]
-        fn _vld3q_f16(ptr: *const f16) -> float16x8x3_t;
-    }
-    _vld3q_f16(a as _)
+    crate::core_arch::macros::deinterleaving_load!(f16, 8, 3, a)
 }
 #[doc = "Load multiple 3-element structures to three registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_f32)"]
@@ -22079,14 +22065,7 @@ pub unsafe fn vld3q_f16(a: *const f16) -> float16x8x3_t {
 #[cfg(not(target_arch = "arm"))]
 #[cfg_attr(test, assert_instr(ld3))]
 pub unsafe fn vld3_f32(a: *const f32) -> float32x2x3_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.ld3.v2f32.p0"
-        )]
-        fn _vld3_f32(ptr: *const float32x2_t) -> float32x2x3_t;
-    }
-    _vld3_f32(a as _)
+    crate::core_arch::macros::deinterleaving_load!(f32, 2, 3, a)
 }
 #[doc = "Load multiple 3-element structures to three registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_f32)"]
@@ -22098,14 +22077,7 @@ pub unsafe fn vld3_f32(a: *const f32) -> float32x2x3_t {
 #[cfg(not(target_arch = "arm"))]
 #[cfg_attr(test, assert_instr(ld3))]
 pub unsafe fn vld3q_f32(a: *const f32) -> float32x4x3_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.ld3.v4f32.p0"
-        )]
-        fn _vld3q_f32(ptr: *const float32x4_t) -> float32x4x3_t;
-    }
-    _vld3q_f32(a as _)
+    crate::core_arch::macros::deinterleaving_load!(f32, 4, 3, a)
 }
 #[doc = "Load multiple 3-element structures to three registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_s8)"]
@@ -22117,14 +22089,7 @@ pub unsafe fn vld3q_f32(a: *const f32) -> float32x4x3_t {
 #[cfg(not(target_arch = "arm"))]
 #[cfg_attr(test, assert_instr(ld3))]
 pub unsafe fn vld3_s8(a: *const i8) -> int8x8x3_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.ld3.v8i8.p0"
-        )]
-        fn _vld3_s8(ptr: *const int8x8_t) -> int8x8x3_t;
-    }
-    _vld3_s8(a as _)
+    crate::core_arch::macros::deinterleaving_load!(i8, 8, 3, a)
 }
 #[doc = "Load multiple 3-element structures to three registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_s8)"]
@@ -22136,14 +22101,7 @@ pub unsafe fn vld3_s8(a: *const i8) -> int8x8x3_t {
 #[cfg(not(target_arch = "arm"))]
 #[cfg_attr(test, assert_instr(ld3))]
 pub unsafe fn vld3q_s8(a: *const i8) -> int8x16x3_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.ld3.v16i8.p0"
-        )]
-        fn _vld3q_s8(ptr: *const int8x16_t) -> int8x16x3_t;
-    }
-    _vld3q_s8(a as _)
+    crate::core_arch::macros::deinterleaving_load!(i8, 16, 3, a)
 }
 #[doc = "Load multiple 3-element structures to three registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_s16)"]
@@ -22155,14 +22113,7 @@ pub unsafe fn vld3q_s8(a: *const i8) -> int8x16x3_t {
 #[cfg(not(target_arch = "arm"))]
 #[cfg_attr(test, assert_instr(ld3))]
 pub unsafe fn vld3_s16(a: *const i16) -> int16x4x3_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.ld3.v4i16.p0"
-        )]
-        fn _vld3_s16(ptr: *const int16x4_t) -> int16x4x3_t;
-    }
-    _vld3_s16(a as _)
+    crate::core_arch::macros::deinterleaving_load!(i16, 4, 3, a)
 }
 #[doc = "Load multiple 3-element structures to three registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_s16)"]
@@ -22174,14 +22125,7 @@ pub unsafe fn vld3_s16(a: *const i16) -> int16x4x3_t {
 #[cfg(not(target_arch = "arm"))]
 #[cfg_attr(test, assert_instr(ld3))]
 pub unsafe fn vld3q_s16(a: *const i16) -> int16x8x3_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.ld3.v8i16.p0"
-        )]
-        fn _vld3q_s16(ptr: *const int16x8_t) -> int16x8x3_t;
-    }
-    _vld3q_s16(a as _)
+    crate::core_arch::macros::deinterleaving_load!(i16, 8, 3, a)
 }
 #[doc = "Load multiple 3-element structures to three registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_s32)"]
@@ -22193,14 +22137,7 @@ pub unsafe fn vld3q_s16(a: *const i16) -> int16x8x3_t {
 #[cfg(not(target_arch = "arm"))]
 #[cfg_attr(test, assert_instr(ld3))]
 pub unsafe fn vld3_s32(a: *const i32) -> int32x2x3_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.ld3.v2i32.p0"
-        )]
-        fn _vld3_s32(ptr: *const int32x2_t) -> int32x2x3_t;
-    }
-    _vld3_s32(a as _)
+    crate::core_arch::macros::deinterleaving_load!(i32, 2, 3, a)
 }
 #[doc = "Load multiple 3-element structures to three registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_s32)"]
@@ -22212,14 +22149,7 @@ pub unsafe fn vld3_s32(a: *const i32) -> int32x2x3_t {
 #[cfg(not(target_arch = "arm"))]
 #[cfg_attr(test, assert_instr(ld3))]
 pub unsafe fn vld3q_s32(a: *const i32) -> int32x4x3_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.ld3.v4i32.p0"
-        )]
-        fn _vld3q_s32(ptr: *const int32x4_t) -> int32x4x3_t;
-    }
-    _vld3q_s32(a as _)
+    crate::core_arch::macros::deinterleaving_load!(i32, 4, 3, a)
 }
 #[doc = "Load multiple 3-element structures to three registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_f32)"]
@@ -23039,14 +22969,7 @@ pub unsafe fn vld3_p64(a: *const p64) -> poly64x1x3_t {
 #[cfg(not(target_arch = "arm"))]
 #[cfg_attr(test, assert_instr(nop))]
 pub unsafe fn vld3_s64(a: *const i64) -> int64x1x3_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.ld3.v1i64.p0"
-        )]
-        fn _vld3_s64(ptr: *const int64x1_t) -> int64x1x3_t;
-    }
-    _vld3_s64(a as _)
+    crate::ptr::read_unaligned(a.cast())
 }
 #[doc = "Load multiple 3-element structures to three registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3_s64)"]
@@ -24413,14 +24336,7 @@ pub unsafe fn vld4q_f16(a: *const f16) -> float16x8x4_t {
 #[unstable(feature = "stdarch_neon_f16", issue = "136306")]
 #[cfg(not(target_arch = "arm64ec"))]
 pub unsafe fn vld4_f16(a: *const f16) -> float16x4x4_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.ld4.v4f16.p0"
-        )]
-        fn _vld4_f16(ptr: *const f16) -> float16x4x4_t;
-    }
-    _vld4_f16(a as _)
+    crate::core_arch::macros::deinterleaving_load!(f16, 4, 4, a)
 }
 #[doc = "Load single 4-element structure and replicate to all lanes of two registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_f16)"]
@@ -24436,14 +24352,7 @@ pub unsafe fn vld4_f16(a: *const f16) -> float16x4x4_t {
 #[unstable(feature = "stdarch_neon_f16", issue = "136306")]
 #[cfg(not(target_arch = "arm64ec"))]
 pub unsafe fn vld4q_f16(a: *const f16) -> float16x8x4_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.ld4.v8f16.p0"
-        )]
-        fn _vld4q_f16(ptr: *const f16) -> float16x8x4_t;
-    }
-    _vld4q_f16(a as _)
+    crate::core_arch::macros::deinterleaving_load!(f16, 8, 4, a)
 }
 #[doc = "Load multiple 4-element structures to four registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_f32)"]
@@ -24455,14 +24364,7 @@ pub unsafe fn vld4q_f16(a: *const f16) -> float16x8x4_t {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(ld4))]
 pub unsafe fn vld4_f32(a: *const f32) -> float32x2x4_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.ld4.v2f32.p0"
-        )]
-        fn _vld4_f32(ptr: *const float32x2_t) -> float32x2x4_t;
-    }
-    _vld4_f32(a as _)
+    crate::core_arch::macros::deinterleaving_load!(f32, 2, 4, a)
 }
 #[doc = "Load multiple 4-element structures to four registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_f32)"]
@@ -24474,14 +24376,7 @@ pub unsafe fn vld4_f32(a: *const f32) -> float32x2x4_t {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(ld4))]
 pub unsafe fn vld4q_f32(a: *const f32) -> float32x4x4_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.ld4.v4f32.p0"
-        )]
-        fn _vld4q_f32(ptr: *const float32x4_t) -> float32x4x4_t;
-    }
-    _vld4q_f32(a as _)
+    crate::core_arch::macros::deinterleaving_load!(f32, 4, 4, a)
 }
 #[doc = "Load multiple 4-element structures to four registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_s8)"]
@@ -24493,14 +24388,7 @@ pub unsafe fn vld4q_f32(a: *const f32) -> float32x4x4_t {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(ld4))]
 pub unsafe fn vld4_s8(a: *const i8) -> int8x8x4_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.ld4.v8i8.p0"
-        )]
-        fn _vld4_s8(ptr: *const int8x8_t) -> int8x8x4_t;
-    }
-    _vld4_s8(a as _)
+    crate::core_arch::macros::deinterleaving_load!(i8, 8, 4, a)
 }
 #[doc = "Load multiple 4-element structures to four registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_s8)"]
@@ -24512,14 +24400,7 @@ pub unsafe fn vld4_s8(a: *const i8) -> int8x8x4_t {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(ld4))]
 pub unsafe fn vld4q_s8(a: *const i8) -> int8x16x4_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.ld4.v16i8.p0"
-        )]
-        fn _vld4q_s8(ptr: *const int8x16_t) -> int8x16x4_t;
-    }
-    _vld4q_s8(a as _)
+    crate::core_arch::macros::deinterleaving_load!(i8, 16, 4, a)
 }
 #[doc = "Load multiple 4-element structures to four registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_s16)"]
@@ -24531,14 +24412,7 @@ pub unsafe fn vld4q_s8(a: *const i8) -> int8x16x4_t {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(ld4))]
 pub unsafe fn vld4_s16(a: *const i16) -> int16x4x4_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.ld4.v4i16.p0"
-        )]
-        fn _vld4_s16(ptr: *const int16x4_t) -> int16x4x4_t;
-    }
-    _vld4_s16(a as _)
+    crate::core_arch::macros::deinterleaving_load!(i16, 4, 4, a)
 }
 #[doc = "Load multiple 4-element structures to four registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_s16)"]
@@ -24550,14 +24424,7 @@ pub unsafe fn vld4_s16(a: *const i16) -> int16x4x4_t {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(ld4))]
 pub unsafe fn vld4q_s16(a: *const i16) -> int16x8x4_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.ld4.v8i16.p0"
-        )]
-        fn _vld4q_s16(ptr: *const int16x8_t) -> int16x8x4_t;
-    }
-    _vld4q_s16(a as _)
+    crate::core_arch::macros::deinterleaving_load!(i16, 8, 4, a)
 }
 #[doc = "Load multiple 4-element structures to four registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_s32)"]
@@ -24569,14 +24436,7 @@ pub unsafe fn vld4q_s16(a: *const i16) -> int16x8x4_t {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(ld4))]
 pub unsafe fn vld4_s32(a: *const i32) -> int32x2x4_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.ld4.v2i32.p0"
-        )]
-        fn _vld4_s32(ptr: *const int32x2_t) -> int32x2x4_t;
-    }
-    _vld4_s32(a as _)
+    crate::core_arch::macros::deinterleaving_load!(i32, 2, 4, a)
 }
 #[doc = "Load multiple 4-element structures to four registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4q_s32)"]
@@ -24588,14 +24448,7 @@ pub unsafe fn vld4_s32(a: *const i32) -> int32x2x4_t {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(ld4))]
 pub unsafe fn vld4q_s32(a: *const i32) -> int32x4x4_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.ld4.v4i32.p0"
-        )]
-        fn _vld4q_s32(ptr: *const int32x4_t) -> int32x4x4_t;
-    }
-    _vld4q_s32(a as _)
+    crate::core_arch::macros::deinterleaving_load!(i32, 4, 4, a)
 }
 #[doc = "Load multiple 4-element structures to four registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_f32)"]
@@ -25456,14 +25309,7 @@ pub unsafe fn vld4_p64(a: *const p64) -> poly64x1x4_t {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(nop))]
 pub unsafe fn vld4_s64(a: *const i64) -> int64x1x4_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.ld4.v1i64.p0"
-        )]
-        fn _vld4_s64(ptr: *const int64x1_t) -> int64x1x4_t;
-    }
-    _vld4_s64(a as _)
+    crate::ptr::read_unaligned(a.cast())
 }
 #[doc = "Load multiple 4-element structures to four registers"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld4_s64)"]
diff --git a/crates/core_arch/src/macros.rs b/crates/core_arch/src/macros.rs
index 353829633f..d40ce51c74 100644
--- a/crates/core_arch/src/macros.rs
+++ b/crates/core_arch/src/macros.rs
@@ -186,3 +186,89 @@ macro_rules! simd_masked_store {
         $crate::intrinsics::simd::simd_masked_store::<_, _, _, { $align }>($mask, $ptr, $default)
     };
 }
+
+/// Returns the shuffle indices `[K, K + N, K + 2 * N, ...]`: the positions of
+/// member `K` in `LANES` consecutive interleaved `N`-element structures.
+pub(crate) const fn deinterleave_mask<const LANES: usize, const N: usize, const K: usize>()
+-> [u32; LANES] {
+    // Produces: [K, K+N, K+2N, ...]
+    let mut out = [0u32; LANES];
+    let mut i = 0usize;
+    while i < LANES {
+        out[i] = (i * N + K) as u32;
+        i += 1;
+    }
+    out
+}
+
+/// Loads `$lanes` interleaved 2-, 3- or 4-element structures of `$elem` from `$ptr`
+/// and de-interleaves them into one `$lanes`-wide vector per structure member.
+///
+/// The expansion reads through a raw pointer and transmutes the result, so it must
+/// sit in an unsafe context; `$ptr` must be valid for reading `$lanes * N` elements.
+#[allow(unused)]
+macro_rules! deinterleaving_load {
+    ($elem:ty, $lanes:literal, 2, $ptr:expr) => {{
+        use $crate::core_arch::macros::deinterleave_mask;
+        use $crate::core_arch::simd::Simd;
+        use $crate::{mem::transmute, ptr};
+
+        type V = Simd<$elem, $lanes>;
+        type W = Simd<$elem, { $lanes * 2 }>;
+
+        let w: W = ptr::read_unaligned($ptr as *const W);
+
+        let v0: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 2, 0>());
+        let v1: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 2, 1>());
+
+        transmute((v0, v1))
+    }};
+
+    ($elem:ty, $lanes:literal, 3, $ptr:expr) => {{
+        use $crate::core_arch::macros::deinterleave_mask;
+        use $crate::core_arch::simd::Simd;
+        use $crate::mem::{MaybeUninit, transmute};
+        use $crate::ptr;
+
+        type V = Simd<$elem, $lanes>;
+        type W = Simd<$elem, { $lanes * 3 }>;
+        type A = [$elem; $lanes * 3];
+
+        // `$lanes * 3` is never a power of two, and a `repr(simd)` type's size is
+        // rounded up to a power of two, so `W` is larger than the data behind `$ptr`.
+        // Reading a whole `W` would run past the end of the source; read the unpadded
+        // array instead and leave the padding of `W` uninitialized.
+        let mut buf = MaybeUninit::<W>::uninit();
+        buf.as_mut_ptr()
+            .cast::<A>()
+            .write(ptr::read_unaligned($ptr as *const A));
+        let w: W = buf.assume_init();
+
+        let v0: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 3, 0>());
+        let v1: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 3, 1>());
+        let v2: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 3, 2>());
+
+        transmute((v0, v1, v2))
+    }};
+
+    ($elem:ty, $lanes:literal, 4, $ptr:expr) => {{
+        use $crate::core_arch::macros::deinterleave_mask;
+        use $crate::core_arch::simd::Simd;
+        use $crate::{mem::transmute, ptr};
+
+        type V = Simd<$elem, $lanes>;
+        type W = Simd<$elem, { $lanes * 4 }>;
+
+        let w: W = ptr::read_unaligned($ptr as *const W);
+
+        let v0: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 4, 0>());
+        let v1: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 4, 1>());
+        let v2: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 4, 2>());
+        let v3: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 4, 3>());
+
+        transmute((v0, v1, v2, v3))
+    }};
+}
+
+#[allow(unused)]
+pub(crate) use deinterleaving_load;
diff --git a/crates/intrinsic-test/src/common/compare.rs b/crates/intrinsic-test/src/common/compare.rs
index 5214349171..c22d7fd4ec 100644
--- a/crates/intrinsic-test/src/common/compare.rs
+++ b/crates/intrinsic-test/src/common/compare.rs
@@ -109,13 +109,26 @@ pub fn compare_outputs(
             }
         })
         .inspect(|(intrinsic, diffs)| {
-            println!("Difference for intrinsic: {intrinsic}");
+            use std::io::Write;
+
+            let stdout = std::io::stdout();
+            let mut out = stdout.lock();
+
+            writeln!(out, "Difference for intrinsic: {intrinsic}").unwrap();
             diffs.into_iter().for_each(|diff| match diff {
-                diff::Result::Left(c) => println!("C: {c}"),
-                diff::Result::Right(rust) => println!("Rust: {rust}"),
+                diff::Result::Left(c) => {
+                    writeln!(out, "C: {c}").unwrap();
+                }
+                diff::Result::Right(rust) => {
+                    writeln!(out, "Rust: {rust}").unwrap();
+                }
                 _ => (),
             });
-            println!("****************************************************************");
+            writeln!(
+                out,
+                "****************************************************************"
+            )
+            .unwrap();
         })
         .count();
 
diff --git a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml
index a10403de41..b81f04ebc0 100644
--- a/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml
+++ b/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml
@@ -3698,16 +3698,12 @@ intrinsics:
     types:
       - ["*const f64", float64x1x2_t, f64, float64x1_t]
     compose:
-      - LLVMLink:
-          name: "vld2.{neon_type[1]}"
-          arguments:
-            - "ptr: *const {neon_type[3]}"
-          links:
-            - link: "llvm.aarch64.neon.ld2.v{neon_type[1].lane}{type[2]}.p0"
-              arch: aarch64,arm64ec
       - FnCall:
-          - "_vld2{neon_type[1].nox}"
-          - - "a as _"
+          - 'crate::ptr::read_unaligned'
+          - - MethodCall:
+                - a
+                - cast
+                - []
 
   - name: "vld2{neon_type[1].nox}"
     doc: Load multiple 2-element structures to two registers
@@ -4057,14 +4053,12 @@ intrinsics:
     types:
       - ['*const f64', float64x1x3_t, '*const float64x1_t', f64]
     compose:
-      - LLVMLink:
-          name: 'vld3{neon_type[1].nox}'
-          arguments:
-            - 'ptr: {type[2]}'
-          links:
-            - link: 'llvm.aarch64.neon.ld3.v{neon_type[1].lane}{type[3]}.p0'
-              arch: aarch64,arm64ec
-      - FnCall: ['_vld3{neon_type[1].nox}', ['a as _']]
+      - FnCall:
+          - 'crate::ptr::read_unaligned'
+          - - MethodCall:
+                - a
+                - cast
+                - []
 
   - name: "vld3{neon_type[1].nox}"
     doc: Load multiple 3-element structures to three registers
@@ -4203,14 +4197,12 @@ intrinsics:
     types:
       - ['*const f64', float64x1x4_t, f64, '*const float64x1_t']
     compose:
-      - LLVMLink:
-          name: 'vld4{neon_type[1].nox}'
-          arguments:
-            - 'ptr: {type[3]}'
-          links:
-            - link: 'llvm.aarch64.neon.ld4.v{neon_type[1].lane}{type[2]}.p0'
-              arch: aarch64,arm64ec
-      - FnCall: ['_vld4{neon_type[1].nox}', ['a as _']]
+      - FnCall:
+          - 'crate::ptr::read_unaligned'
+          - - MethodCall:
+                - a
+                - cast
+                - []
 
   - name: "vld4{neon_type[1].nox}"
     doc: Load multiple 4-element structures to four registers
diff --git a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml
index 3f7adbc278..8e10fff984 100644
--- a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml
+++ b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml
@@ -3669,19 +3669,11 @@ intrinsics:
     safety:
       unsafe: [neon]
     types:
-      - ["*const f16", float16x4x3_t, f16]
-      - ["*const f16", float16x8x3_t, f16]
+      - ["*const f16", float16x4x3_t, f16, "4"]
+      - ["*const f16", float16x8x3_t, f16, "8"]
     compose:
-      - LLVMLink:
-          name: "vld3.{neon_type[1]}"
-          arguments:
-            - "ptr: {type[0]}"
-          links:
-            - link: "llvm.aarch64.neon.ld3.v{neon_type[1].lane}{type[2]}.p0"
-              arch: aarch64,arm64ec
-      - FnCall:
-          - "_vld3{neon_type[1].nox}"
-          - - "a as _"
+      - FnCall: ["crate::core_arch::macros::deinterleaving_load!", [{ Type: "{type[2]}" }, "{type[3]}", "3", a], [], true]
+
 
   - name: "vld3{neon_type[1].dup_nox}"
     doc: Load single 3-element structure and replicate to all lanes of two registers
@@ -3875,23 +3867,17 @@ intrinsics:
     safety:
       unsafe: [neon]
     types:
-      - ['*const i8', int8x8x3_t, '*const int8x8_t', i8]
-      - ['*const i16', int16x4x3_t, '*const int16x4_t', i16]
-      - ['*const i32', int32x2x3_t, '*const int32x2_t', i32]
-      - ['*const i8', int8x16x3_t, '*const int8x16_t', i8]
-      - ['*const i16', int16x8x3_t, '*const int16x8_t', i16]
-      - ['*const i32', int32x4x3_t, '*const int32x4_t', i32]
-      - ['*const f32', float32x2x3_t, '*const float32x2_t', f32]
-      - ['*const f32', float32x4x3_t, '*const float32x4_t', f32]
+      - ['*const i8', int8x8x3_t, i8, "8"]
+      - ['*const i16', int16x4x3_t, i16, "4"]
+      - ['*const i32', int32x2x3_t, i32, "2"]
+      - ['*const i8', int8x16x3_t, i8, "16"]
+      - ['*const i16', int16x8x3_t, i16, "8"]
+      - ['*const i32', int32x4x3_t, i32, "4"]
+      - ['*const f32', float32x2x3_t, f32, "2"]
+      - ['*const f32', float32x4x3_t, f32, "4"]
     compose:
-      - LLVMLink:
-          name: 'vld3{neon_type[1].nox}'
-          arguments:
-            - 'ptr: {type[2]}'
-          links:
-            - link: 'llvm.aarch64.neon.ld3.v{neon_type[1].lane}{type[3]}.p0'
-              arch: aarch64,arm64ec
-      - FnCall: ['_vld3{neon_type[1].nox}', ['a as _']]
+      - FnCall: ["crate::core_arch::macros::deinterleaving_load!", [{ Type: "{type[2]}" }, "{type[3]}", "3", a], [], true]
+
 
   - name: "vld3{neon_type[1].nox}"
     doc: Load multiple 3-element structures to three registers
@@ -3906,14 +3892,12 @@ intrinsics:
     types:
       - ['*const i64', int64x1x3_t, '*const int64x1_t', i64]
     compose:
-      - LLVMLink:
-          name: "vld3{neon_type[1].nox}"
-          arguments:
-            - 'ptr: {type[2]}'
-          links:
-            - link: 'llvm.aarch64.neon.ld3.v{neon_type[1].lane}{type[3]}.p0'
-              arch: aarch64,arm64ec
-      - FnCall: ['_vld3{neon_type[1].nox}', ['a as _']]
+      - FnCall:
+          - 'crate::ptr::read_unaligned'
+          - - MethodCall:
+                - a
+                - cast
+                - []
 
   - name: "vld3{neon_type[1].nox}"
     doc: Load multiple 3-element structures to three registers
@@ -4373,23 +4357,16 @@ intrinsics:
     safety:
       unsafe: [neon]
     types:
-      - ['*const i8', int8x8x4_t, i8, '*const int8x8_t']
-      - ['*const i32', int32x4x4_t, i32, '*const int32x4_t']
-      - ['*const i16', int16x4x4_t, i16, '*const int16x4_t']
-      - ['*const i32', int32x2x4_t, i32, '*const int32x2_t']
-      - ['*const i8', int8x16x4_t, i8, '*const int8x16_t']
-      - ['*const i16', int16x8x4_t, i16, '*const int16x8_t']
-      - ['*const f32', float32x2x4_t, f32, '*const float32x2_t']
-      - ['*const f32', float32x4x4_t, f32, '*const float32x4_t']
+      - ['*const i8', int8x8x4_t, i8, "8"]
+      - ['*const i32', int32x4x4_t, i32, "4"]
+      - ['*const i16', int16x4x4_t, i16, "4"]
+      - ['*const i32', int32x2x4_t, i32, "2"]
+      - ['*const i8', int8x16x4_t, i8, "16"]
+      - ['*const i16', int16x8x4_t, i16, "8"]
+      - ['*const f32', float32x2x4_t, f32, "2"]
+      - ['*const f32', float32x4x4_t, f32, "4"]
     compose:
-      - LLVMLink:
-          name: 'vld4{neon_type[1].nox}'
-          arguments:
-            - 'ptr: {type[3]}'
-          links:
-            - link: 'llvm.aarch64.neon.ld4.v{neon_type[1].lane}{type[2]}.p0'
-              arch: aarch64,arm64ec
-      - FnCall: ['_vld4{neon_type[1].nox}', ['a as _']]
+      - FnCall: ["crate::core_arch::macros::deinterleaving_load!", [{ Type: "{type[2]}" }, "{type[3]}", "4", a], [], true]
 
   - name: "vld4{neon_type[1].nox}"
     doc: Load multiple 4-element structures to four registers
@@ -4402,14 +4379,12 @@ intrinsics:
     types:
       - ['*const i64', int64x1x4_t, i64, '*const int64x1_t']
     compose:
-      - LLVMLink:
-          name: 'vld4{neon_type[1].nox}'
-          arguments:
-            - 'ptr: {type[3]}'
-          links:
-            - link: 'llvm.aarch64.neon.ld4.v{neon_type[1].lane}{type[2]}.p0'
-              arch: aarch64,arm64ec
-      - FnCall: ['_vld4{neon_type[1].nox}', ['a as _']]
+      - FnCall:
+          - 'crate::ptr::read_unaligned'
+          - - MethodCall:
+                - a
+                - cast
+                - []
 
   - name: "vld4{neon_type[1].lane_nox}"
     doc: Load multiple 4-element structures to four registers
@@ -12434,19 +12409,10 @@ intrinsics:
     safety:
       unsafe: [neon]
     types:
-      - ["*const f16", float16x4x4_t, f16]
-      - ["*const f16", float16x8x4_t, f16]
+      - ["*const f16", float16x4x4_t, f16, "4"]
+      - ["*const f16", float16x8x4_t, f16, "8"]
     compose:
-      - LLVMLink:
-          name: "vld4.{neon_type[1]}"
-          arguments:
-            - "ptr: {type[0]}"
-          links:
-            - link: "llvm.aarch64.neon.ld4.v{neon_type[1].lane}{type[2]}.p0"
-              arch: aarch64,arm64ec
-      - FnCall:
-          - "_vld4{neon_type[1].nox}"
-          - - "a as _"
+      - FnCall: ["crate::core_arch::macros::deinterleaving_load!", [{ Type: "{type[2]}" }, "{type[3]}", "4", a], [], true]
 
   - name: "vld4{neon_type[1].dup_nox}"
     doc: Load single 4-element structure and replicate to all lanes of two registers