rust-lang · folkertdev · Feb 19, 2026 · Feb 19, 2026 · Feb 19, 2026 · Feb 19, 2026
diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs
@@ -2315,7 +2315,7 @@ pub const fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
     unsafe { transmute(simd_or(a.as_i32x8(), b.as_i32x8())) }
 }
 
-/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
+/// Converts packed signed 16-bit integers from `a` and `b` to packed 8-bit integers
 /// using signed saturation
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16)
@@ -2324,10 +2324,31 @@ pub const fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpacksswb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(packsswb(a.as_i16x16(), b.as_i16x16())) }
+    unsafe {
+        let max = simd_splat(i16::from(i8::MAX));
+        let min = simd_splat(i16::from(i8::MIN));
+
+        let clamped_a = simd_imax(simd_imin(a.as_i16x16(), max), min)
+            .as_m256i()
+            .as_i8x32();
+        let clamped_b = simd_imax(simd_imin(b.as_i16x16(), max), min)
+            .as_m256i()
+            .as_i8x32();
+
+        #[rustfmt::skip]
+        const IDXS: [u32; 32] = [
+            00, 02, 04, 06, 08, 10, 12, 14, // a-lo i16 to i8 conversions
+            32, 34, 36, 38, 40, 42, 44, 46, // b-lo
+            16, 18, 20, 22, 24, 26, 28, 30, // a-hi
+            48, 50, 52, 54, 56, 58, 60, 62, // b-hi
+        ];
+        let result: i8x32 = simd_shuffle!(clamped_a, clamped_b, IDXS);
+
+        result.as_m256i()
+    }
 }
 
-/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
+/// Converts packed signed 32-bit integers from `a` and `b` to packed 16-bit integers
 /// using signed saturation
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32)
@@ -2336,10 +2357,31 @@ pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpackssdw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(packssdw(a.as_i32x8(), b.as_i32x8())) }
+    unsafe {
+        let max = simd_splat(i32::from(i16::MAX));
+        let min = simd_splat(i32::from(i16::MIN));
+
+        let clamped_a = simd_imax(simd_imin(a.as_i32x8(), max), min)
+            .as_m256i()
+            .as_i16x16();
+        let clamped_b = simd_imax(simd_imin(b.as_i32x8(), max), min)
+            .as_m256i()
+            .as_i16x16();
+
+        #[rustfmt::skip]
+        const IDXS: [u32; 16] = [
+            00, 02, 04, 06, // a-lo i32 to i16 conversions
+            16, 18, 20, 22, // b-lo
+            08, 10, 12, 14, // a-hi
+            24, 26, 28, 30, // b-hi
+        ];
+        let result: i16x16 = simd_shuffle!(clamped_a, clamped_b, IDXS);
+
+        result.as_m256i()
+    }
 }
 
-/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
+/// Converts packed signed 16-bit integers from `a` and `b` to packed 8-bit integers
 /// using unsigned saturation
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16)
@@ -2348,10 +2390,31 @@ pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpackuswb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(packuswb(a.as_i16x16(), b.as_i16x16())) }
+    unsafe {
+        let max = simd_splat(i16::from(u8::MAX));
+        let min = simd_splat(i16::from(u8::MIN));
+
+        let clamped_a = simd_imax(simd_imin(a.as_i16x16(), max), min)
+            .as_m256i()
+            .as_i8x32();
+        let clamped_b = simd_imax(simd_imin(b.as_i16x16(), max), min)
+            .as_m256i()
+            .as_i8x32();
+
+        #[rustfmt::skip]
+        const IDXS: [u32; 32] = [
+            00, 02, 04, 06, 08, 10, 12, 14, // a-lo i16 to u8 conversions
+            32, 34, 36, 38, 40, 42, 44, 46, // b-lo
+            16, 18, 20, 22, 24, 26, 28, 30, // a-hi
+            48, 50, 52, 54, 56, 58, 60, 62, // b-hi
+        ];
+        let result: i8x32 = simd_shuffle!(clamped_a, clamped_b, IDXS);
+
+        result.as_m256i()
+    }
 }
 
-/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
+/// Converts packed signed 32-bit integers from `a` and `b` to packed 16-bit integers
 /// using unsigned saturation
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32)
@@ -2360,7 +2423,28 @@ pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpackusdw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(packusdw(a.as_i32x8(), b.as_i32x8())) }
+    unsafe {
+        let max = simd_splat(i32::from(u16::MAX));
+        let min = simd_splat(i32::from(u16::MIN));
+
+        let clamped_a = simd_imax(simd_imin(a.as_i32x8(), max), min)
+            .as_m256i()
+            .as_i16x16();
+        let clamped_b = simd_imax(simd_imin(b.as_i32x8(), max), min)
+            .as_m256i()
+            .as_i16x16();
+
+        #[rustfmt::skip]
+        const IDXS: [u32; 16] = [
+            00, 02, 04, 06, // a-lo i32 to u16 conversions
+            16, 18, 20, 22, // b-lo
+            08, 10, 12, 14, // a-hi
+            24, 26, 28, 30, // b-hi
+        ];
+        let result: i16x16 = simd_shuffle!(clamped_a, clamped_b, IDXS);
+
+        result.as_m256i()
+    }
 }
 
 /// Permutes packed 32-bit integers from `a` according to the content of `b`.
@@ -3827,14 +3911,6 @@ unsafe extern "C" {
     fn mpsadbw(a: u8x32, b: u8x32, imm8: i8) -> u16x16;
     #[link_name = "llvm.x86.avx2.pmul.hr.sw"]
     fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx2.packsswb"]
-    fn packsswb(a: i16x16, b: i16x16) -> i8x32;
-    #[link_name = "llvm.x86.avx2.packssdw"]
-    fn packssdw(a: i32x8, b: i32x8) -> i16x16;
-    #[link_name = "llvm.x86.avx2.packuswb"]
-    fn packuswb(a: i16x16, b: i16x16) -> u8x32;
-    #[link_name = "llvm.x86.avx2.packusdw"]
-    fn packusdw(a: i32x8, b: i32x8) -> u16x16;
     #[link_name = "llvm.x86.avx2.psad.bw"]
     fn psadbw(a: u8x32, b: u8x32) -> u64x4;
     #[link_name = "llvm.x86.avx2.psign.b"]

diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs
@@ -6524,7 +6524,32 @@ pub fn _mm_maskz_maddubs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpackssdw))]
 pub fn _mm512_packs_epi32(a: __m512i, b: __m512i) -> __m512i {
-    unsafe { transmute(vpackssdw(a.as_i32x16(), b.as_i32x16())) }
+    unsafe {
+        let max = simd_splat(i32::from(i16::MAX));
+        let min = simd_splat(i32::from(i16::MIN));
+
+        let clamped_a = simd_imax(simd_imin(a.as_i32x16(), max), min)
+            .as_m512i()
+            .as_i16x32();
+        let clamped_b = simd_imax(simd_imin(b.as_i32x16(), max), min)
+            .as_m512i()
+            .as_i16x32();
+
+        #[rustfmt::skip]
+        const IDXS: [u32; 32] = [
+            00, 02, 04, 06,
+            32, 34, 36, 38,
+            08, 10, 12, 14,
+            40, 42, 44, 46,
+            16, 18, 20, 22,
+            48, 50, 52, 54,
+            24, 26, 28, 30,
+            56, 58, 60, 62,
+        ];
+        let result: i16x32 = simd_shuffle!(clamped_a, clamped_b, IDXS);
+
+        result.as_m512i()
+    }
 }
 
 /// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -6619,7 +6644,32 @@ pub fn _mm_maskz_packs_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpacksswb))]
 pub fn _mm512_packs_epi16(a: __m512i, b: __m512i) -> __m512i {
-    unsafe { transmute(vpacksswb(a.as_i16x32(), b.as_i16x32())) }
+    unsafe {
+        let max = simd_splat(i16::from(i8::MAX));
+        let min = simd_splat(i16::from(i8::MIN));
+
+        let clamped_a = simd_imax(simd_imin(a.as_i16x32(), max), min)
+            .as_m512i()
+            .as_i8x64();
+        let clamped_b = simd_imax(simd_imin(b.as_i16x32(), max), min)
+            .as_m512i()
+            .as_i8x64();
+
+        #[rustfmt::skip]
+        const IDXS: [u32; 64] = [
+            000, 002, 004, 006, 008, 010, 012, 014,
+            064, 066, 068, 070, 072, 074, 076, 078,
+            016, 018, 020, 022, 024, 026, 028, 030,
+            080, 082, 084, 086, 088, 090, 092, 094,
+            032, 034, 036, 038, 040, 042, 044, 046,
+            096, 098, 100, 102, 104, 106, 108, 110,
+            048, 050, 052, 054, 056, 058, 060, 062,
+            112, 114, 116, 118, 120, 122, 124, 126,
+        ];
+        let result: i8x64 = simd_shuffle!(clamped_a, clamped_b, IDXS);
+
+        result.as_m512i()
+    }
 }
 
 /// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -6714,7 +6764,32 @@ pub fn _mm_maskz_packs_epi16(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpackusdw))]
 pub fn _mm512_packus_epi32(a: __m512i, b: __m512i) -> __m512i {
-    unsafe { transmute(vpackusdw(a.as_i32x16(), b.as_i32x16())) }
+    unsafe {
+        let max = simd_splat(i32::from(u16::MAX));
+        let min = simd_splat(i32::from(u16::MIN));
+
+        let clamped_a = simd_imax(simd_imin(a.as_i32x16(), max), min)
+            .as_m512i()
+            .as_i16x32();
+        let clamped_b = simd_imax(simd_imin(b.as_i32x16(), max), min)
+            .as_m512i()
+            .as_i16x32();
+
+        #[rustfmt::skip]
+        const IDXS: [u32; 32] = [
+            00, 02, 04, 06,
+            32, 34, 36, 38,
+            08, 10, 12, 14,
+            40, 42, 44, 46,
+            16, 18, 20, 22,
+            48, 50, 52, 54,
+            24, 26, 28, 30,
+            56, 58, 60, 62,
+        ];
+        let result: i16x32 = simd_shuffle!(clamped_a, clamped_b, IDXS);
+
+        result.as_m512i()
+    }
 }
 
 /// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -6809,7 +6884,32 @@ pub fn _mm_maskz_packus_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpackuswb))]
 pub fn _mm512_packus_epi16(a: __m512i, b: __m512i) -> __m512i {
-    unsafe { transmute(vpackuswb(a.as_i16x32(), b.as_i16x32())) }
+    unsafe {
+        let max = simd_splat(i16::from(u8::MAX));
+        let min = simd_splat(i16::from(u8::MIN));
+
+        let clamped_a = simd_imax(simd_imin(a.as_i16x32(), max), min)
+            .as_m512i()
+            .as_i8x64();
+        let clamped_b = simd_imax(simd_imin(b.as_i16x32(), max), min)
+            .as_m512i()
+            .as_i8x64();
+
+        #[rustfmt::skip]
+        const IDXS: [u32; 64] = [
+            000, 002, 004, 006, 008, 010, 012, 014,
+            064, 066, 068, 070, 072, 074, 076, 078,
+            016, 018, 020, 022, 024, 026, 028, 030,
+            080, 082, 084, 086, 088, 090, 092, 094,
+            032, 034, 036, 038, 040, 042, 044, 046,
+            096, 098, 100, 102, 104, 106, 108, 110,
+            048, 050, 052, 054, 056, 058, 060, 062,
+            112, 114, 116, 118, 120, 122, 124, 126,
+        ];
+        let result: i8x64 = simd_shuffle!(clamped_a, clamped_b, IDXS);
+
+        result.as_m512i()
+    }
 }
 
 /// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -12606,15 +12706,6 @@ unsafe extern "C" {
     #[link_name = "llvm.x86.avx512.pmaddubs.w.512"]
     fn vpmaddubsw(a: u8x64, b: i8x64) -> i16x32;
 
-    #[link_name = "llvm.x86.avx512.packssdw.512"]
-    fn vpackssdw(a: i32x16, b: i32x16) -> i16x32;
-    #[link_name = "llvm.x86.avx512.packsswb.512"]
-    fn vpacksswb(a: i16x32, b: i16x32) -> i8x64;
-    #[link_name = "llvm.x86.avx512.packusdw.512"]
-    fn vpackusdw(a: i32x16, b: i32x16) -> u16x32;
-    #[link_name = "llvm.x86.avx512.packuswb.512"]
-    fn vpackuswb(a: i16x32, b: i16x32) -> u8x64;
-
     #[link_name = "llvm.x86.avx512.psll.w.512"]
     fn vpsllw(a: i16x32, count: i16x8) -> i16x32;