diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs index 04a88e461f..ca4ca9a2de 100644 --- a/crates/core_arch/src/x86/avx2.rs +++ b/crates/core_arch/src/x86/avx2.rs @@ -2315,7 +2315,7 @@ pub const fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { unsafe { transmute(simd_or(a.as_i32x8(), b.as_i32x8())) } } -/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers +/// Converts packed signed 16-bit integers from `a` and `b` to packed 8-bit integers /// using signed saturation /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16) @@ -2324,10 +2324,31 @@ pub const fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpacksswb))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(packsswb(a.as_i16x16(), b.as_i16x16())) } + unsafe { + let max = simd_splat(i16::from(i8::MAX)); + let min = simd_splat(i16::from(i8::MIN)); + + let clamped_a = simd_imax(simd_imin(a.as_i16x16(), max), min) + .as_m256i() + .as_i8x32(); + let clamped_b = simd_imax(simd_imin(b.as_i16x16(), max), min) + .as_m256i() + .as_i8x32(); + + #[rustfmt::skip] + const IDXS: [u32; 32] = [ + 00, 02, 04, 06, 08, 10, 12, 14, // a-lo i16 to i8 conversions + 32, 34, 36, 38, 40, 42, 44, 46, // b-lo + 16, 18, 20, 22, 24, 26, 28, 30, // a-hi + 48, 50, 52, 54, 56, 58, 60, 62, // b-hi + ]; + let result: i8x32 = simd_shuffle!(clamped_a, clamped_b, IDXS); + + result.as_m256i() + } } -/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers +/// Converts packed signed 32-bit integers from `a` and `b` to packed 16-bit integers /// using signed saturation /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32) @@ -2336,10 +2357,31 @@ pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpackssdw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(packssdw(a.as_i32x8(), b.as_i32x8())) } + unsafe { + let max = simd_splat(i32::from(i16::MAX)); + let min = simd_splat(i32::from(i16::MIN)); + + let clamped_a = simd_imax(simd_imin(a.as_i32x8(), max), min) + .as_m256i() + .as_i16x16(); + let clamped_b = simd_imax(simd_imin(b.as_i32x8(), max), min) + .as_m256i() + .as_i16x16(); + + #[rustfmt::skip] + const IDXS: [u32; 16] = [ + 00, 02, 04, 06, // a-lo i32 to i16 conversions + 16, 18, 20, 22, // b-lo + 08, 10, 12, 14, // a-hi + 24, 26, 28, 30, // b-hi + ]; + let result: i16x16 = simd_shuffle!(clamped_a, clamped_b, IDXS); + + result.as_m256i() + } } -/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers +/// Converts packed signed 16-bit integers from `a` and `b` to packed 8-bit integers /// using unsigned saturation /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16) @@ -2348,10 +2390,31 @@ pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpackuswb))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(packuswb(a.as_i16x16(), b.as_i16x16())) } + unsafe { + let max = simd_splat(i16::from(u8::MAX)); + let min = simd_splat(i16::from(u8::MIN)); + + let clamped_a = simd_imax(simd_imin(a.as_i16x16(), max), min) + .as_m256i() + .as_i8x32(); + let clamped_b = simd_imax(simd_imin(b.as_i16x16(), max), min) + .as_m256i() + .as_i8x32(); + + #[rustfmt::skip] + const IDXS: [u32; 32] = [ + 00, 02, 04, 06, 08, 10, 12, 14, // a-lo i16 to u8 conversions + 32, 34, 36, 38, 40, 42, 44, 46, // b-lo + 16, 18, 20, 22, 24, 26, 28, 30, // a-hi + 48, 50, 52, 54, 56, 58, 60, 62, // b-hi + ]; + let result: i8x32 = simd_shuffle!(clamped_a, clamped_b, IDXS); + + result.as_m256i() + } } -/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers +/// Converts packed signed 32-bit integers from `a` and `b` to packed 16-bit integers /// using unsigned saturation /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32) @@ -2360,7 +2423,28 @@ pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpackusdw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(packusdw(a.as_i32x8(), b.as_i32x8())) } + unsafe { + let max = simd_splat(i32::from(u16::MAX)); + let min = simd_splat(i32::from(u16::MIN)); + + let clamped_a = simd_imax(simd_imin(a.as_i32x8(), max), min) + .as_m256i() + .as_i16x16(); + let clamped_b = simd_imax(simd_imin(b.as_i32x8(), max), min) + .as_m256i() + .as_i16x16(); + + #[rustfmt::skip] + const IDXS: [u32; 16] = [ + 00, 02, 04, 06, // a-lo i32 to u16 conversions + 16, 18, 20, 22, // b-lo + 08, 10, 12, 14, // a-hi + 24, 26, 28, 30, // b-hi + ]; + let result: i16x16 = simd_shuffle!(clamped_a, clamped_b, IDXS); + + result.as_m256i() + } } /// Permutes packed 32-bit integers from `a` according to the content of `b`. @@ -3827,14 +3911,6 @@ unsafe extern "C" { fn mpsadbw(a: u8x32, b: u8x32, imm8: i8) -> u16x16; #[link_name = "llvm.x86.avx2.pmul.hr.sw"] fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16; - #[link_name = "llvm.x86.avx2.packsswb"] - fn packsswb(a: i16x16, b: i16x16) -> i8x32; - #[link_name = "llvm.x86.avx2.packssdw"] - fn packssdw(a: i32x8, b: i32x8) -> i16x16; - #[link_name = "llvm.x86.avx2.packuswb"] - fn packuswb(a: i16x16, b: i16x16) -> u8x32; - #[link_name = "llvm.x86.avx2.packusdw"] - fn packusdw(a: i32x8, b: i32x8) -> u16x16; #[link_name = "llvm.x86.avx2.psad.bw"] fn psadbw(a: u8x32, b: u8x32) -> u64x4; #[link_name = "llvm.x86.avx2.psign.b"] diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 3ba171c0fa..78801e8902 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -6524,7 +6524,32 @@ pub fn _mm_maskz_maddubs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpackssdw))] pub fn _mm512_packs_epi32(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vpackssdw(a.as_i32x16(), b.as_i32x16())) } + unsafe { + let max = simd_splat(i32::from(i16::MAX)); + let min = simd_splat(i32::from(i16::MIN)); + + let clamped_a = simd_imax(simd_imin(a.as_i32x16(), max), min) + .as_m512i() + .as_i16x32(); + let clamped_b = simd_imax(simd_imin(b.as_i32x16(), max), min) + .as_m512i() + .as_i16x32(); + + #[rustfmt::skip] + const IDXS: [u32; 32] = [ + 00, 02, 04, 06, + 32, 34, 36, 38, + 08, 10, 12, 14, + 40, 42, 44, 46, + 16, 18, 20, 22, + 48, 50, 52, 54, + 24, 26, 28, 30, + 56, 58, 60, 62, + ]; + let result: i16x32 = simd_shuffle!(clamped_a, clamped_b, IDXS); + + result.as_m512i() + } } /// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -6619,7 +6644,32 @@ pub fn _mm_maskz_packs_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpacksswb))] pub fn _mm512_packs_epi16(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vpacksswb(a.as_i16x32(), b.as_i16x32())) } + unsafe { + let max = simd_splat(i16::from(i8::MAX)); + let min = simd_splat(i16::from(i8::MIN)); + + let clamped_a = simd_imax(simd_imin(a.as_i16x32(), max), min) + .as_m512i() + .as_i8x64(); + let clamped_b = simd_imax(simd_imin(b.as_i16x32(), max), min) + .as_m512i() + .as_i8x64(); + + #[rustfmt::skip] + const IDXS: [u32; 64] = [ + 000, 002, 004, 006, 008, 010, 012, 014, + 064, 066, 068, 070, 072, 074, 076, 078, + 016, 018, 020, 022, 024, 026, 028, 030, + 080, 082, 084, 086, 088, 090, 092, 094, + 032, 034, 036, 038, 040, 042, 044, 046, + 096, 098, 100, 102, 104, 106, 108, 110, + 048, 050, 052, 054, 056, 058, 060, 062, + 112, 114, 116, 118, 120, 122, 124, 126, + ]; + let result: i8x64 = simd_shuffle!(clamped_a, clamped_b, IDXS); + + result.as_m512i() + } } /// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -6714,7 +6764,32 @@ pub fn _mm_maskz_packs_epi16(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpackusdw))] pub fn _mm512_packus_epi32(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vpackusdw(a.as_i32x16(), b.as_i32x16())) } + unsafe { + let max = simd_splat(i32::from(u16::MAX)); + let min = simd_splat(i32::from(u16::MIN)); + + let clamped_a = simd_imax(simd_imin(a.as_i32x16(), max), min) + .as_m512i() + .as_i16x32(); + let clamped_b = simd_imax(simd_imin(b.as_i32x16(), max), min) + .as_m512i() + .as_i16x32(); + + #[rustfmt::skip] + const IDXS: [u32; 32] = [ + 00, 02, 04, 06, + 32, 34, 36, 38, + 08, 10, 12, 14, + 40, 42, 44, 46, + 16, 18, 20, 22, + 48, 50, 52, 54, + 24, 26, 28, 30, + 56, 58, 60, 62, + ]; + let result: i16x32 = simd_shuffle!(clamped_a, clamped_b, IDXS); + + result.as_m512i() + } } /// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -6809,7 +6884,32 @@ pub fn _mm_maskz_packus_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { #[stable(feature = "stdarch_x86_avx512", since = "1.89")] #[cfg_attr(test, assert_instr(vpackuswb))] pub fn _mm512_packus_epi16(a: __m512i, b: __m512i) -> __m512i { - unsafe { transmute(vpackuswb(a.as_i16x32(), b.as_i16x32())) } + unsafe { + let max = simd_splat(i16::from(u8::MAX)); + let min = simd_splat(i16::from(u8::MIN)); + + let clamped_a = simd_imax(simd_imin(a.as_i16x32(), max), min) + .as_m512i() + .as_i8x64(); + let clamped_b = simd_imax(simd_imin(b.as_i16x32(), max), min) + .as_m512i() + .as_i8x64(); + + #[rustfmt::skip] + const IDXS: [u32; 64] = [ + 000, 002, 004, 006, 008, 010, 012, 014, + 064, 066, 068, 070, 072, 074, 076, 078, + 016, 018, 020, 022, 024, 026, 028, 030, + 080, 082, 084, 086, 088, 090, 092, 094, + 032, 034, 036, 038, 040, 042, 044, 046, + 096, 098, 100, 102, 104, 106, 108, 110, + 048, 050, 052, 054, 056, 058, 060, 062, + 112, 114, 116, 118, 120, 122, 124, 126, + ]; + let result: i8x64 = simd_shuffle!(clamped_a, clamped_b, IDXS); + + result.as_m512i() + } } /// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -12606,15 +12706,6 @@ unsafe extern "C" { #[link_name = "llvm.x86.avx512.pmaddubs.w.512"] fn vpmaddubsw(a: u8x64, b: i8x64) -> i16x32; - #[link_name = "llvm.x86.avx512.packssdw.512"] - fn vpackssdw(a: i32x16, b: i32x16) -> i16x32; - #[link_name = "llvm.x86.avx512.packsswb.512"] - fn vpacksswb(a: i16x32, b: i16x32) -> i8x64; - #[link_name = "llvm.x86.avx512.packusdw.512"] - fn vpackusdw(a: i32x16, b: i32x16) -> u16x32; - #[link_name = "llvm.x86.avx512.packuswb.512"] - fn vpackuswb(a: i16x32, b: i16x32) -> u8x64; - #[link_name = "llvm.x86.avx512.psll.w.512"] fn vpsllw(a: i16x32, count: i16x8) -> i16x32; diff --git a/crates/core_arch/src/x86/sse2.rs b/crates/core_arch/src/x86/sse2.rs index f339a003df..fbf62c362f 100644 --- a/crates/core_arch/src/x86/sse2.rs +++ b/crates/core_arch/src/x86/sse2.rs @@ -1484,7 +1484,7 @@ pub const fn _mm_move_epi64(a: __m128i) -> __m128i { } } -/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers +/// Converts packed signed 16-bit integers from `a` and `b` to packed 8-bit integers /// using signed saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16) @@ -1493,10 +1493,27 @@ pub const fn _mm_move_epi64(a: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(packsswb))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(packsswb(a.as_i16x8(), b.as_i16x8())) } + unsafe { + let max = simd_splat(i16::from(i8::MAX)); + let min = simd_splat(i16::from(i8::MIN)); + + let clamped_a = simd_imax(simd_imin(a.as_i16x8(), max), min) + .as_m128i() + .as_i8x16(); + let clamped_b = simd_imax(simd_imin(b.as_i16x8(), max), min) + .as_m128i() + .as_i8x16(); + + // Shuffle the low i8 of each i16 from two concatenated vectors into + // the low bits of the result register. + const IDXS: [u32; 16] = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]; + let result: i8x16 = simd_shuffle!(clamped_a, clamped_b, IDXS); + + result.as_m128i() + } } -/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers +/// Converts packed signed 32-bit integers from `a` and `b` to packed 16-bit integers /// using signed saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32) @@ -1505,10 +1522,23 @@ pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(packssdw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(packssdw(a.as_i32x4(), b.as_i32x4())) } + unsafe { + let max = simd_splat(i32::from(i16::MAX)); + let min = simd_splat(i32::from(i16::MIN)); + + let clamped_a = simd_imax(simd_imin(a.as_i32x4(), max), min); + let clamped_b = simd_imax(simd_imin(b.as_i32x4(), max), min); + + let clamped_a: i16x4 = simd_cast(clamped_a); + let clamped_b: i16x4 = simd_cast(clamped_b); + + let a: i64 = transmute(clamped_a); + let b: i64 = transmute(clamped_b); + i64x2::new(a, b).as_m128i() + } } -/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers +/// Converts packed signed 16-bit integers from `a` and `b` to packed 8-bit integers /// using unsigned saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16) @@ -1517,7 +1547,26 @@ pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(packuswb))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(packuswb(a.as_i16x8(), b.as_i16x8())) } + unsafe { + let max = simd_splat(i16::from(u8::MAX)); + let min = simd_splat(i16::from(u8::MIN)); + + let clamped_a = simd_imax(simd_imin(a.as_i16x8(), max), min) + .as_m128i() + .as_i8x16(); + let clamped_b = simd_imax(simd_imin(b.as_i16x8(), max), min) + .as_m128i() + .as_i8x16(); + + // Shuffle the low bytes of each i16 from two concatenated vectors into + // the low bits of the result register. + // Without `simd_shuffle`, this intrinsic will cause the AVX-512BW + // `_mm_mask_packus_epi16` and `_mm_maskz_packus_epi16` tests to fail. + const IDXS: [u32; 16] = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]; + let result: i8x16 = simd_shuffle!(clamped_a, clamped_b, IDXS); + + result.as_m128i() + } } /// Returns the `imm8` element of `a`. @@ -3217,12 +3266,6 @@ unsafe extern "C" { fn cvtps2dq(a: __m128) -> i32x4; #[link_name = "llvm.x86.sse2.maskmov.dqu"] fn maskmovdqu(a: i8x16, mask: i8x16, mem_addr: *mut i8); - #[link_name = "llvm.x86.sse2.packsswb.128"] - fn packsswb(a: i16x8, b: i16x8) -> i8x16; - #[link_name = "llvm.x86.sse2.packssdw.128"] - fn packssdw(a: i32x4, b: i32x4) -> i16x8; - #[link_name = "llvm.x86.sse2.packuswb.128"] - fn packuswb(a: i16x8, b: i16x8) -> u8x16; #[link_name = "llvm.x86.sse2.max.sd"] fn maxsd(a: __m128d, b: __m128d) -> __m128d; #[link_name = "llvm.x86.sse2.max.pd"] diff --git a/crates/core_arch/src/x86/sse41.rs b/crates/core_arch/src/x86/sse41.rs index 7ad4306f36..8036f24e24 100644 --- a/crates/core_arch/src/x86/sse41.rs +++ b/crates/core_arch/src/x86/sse41.rs @@ -418,7 +418,7 @@ pub const fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i { unsafe { simd_imin(a.as_u32x4(), b.as_u32x4()).as_m128i() } } -/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers +/// Converts packed signed 32-bit integers from `a` and `b` to packed 16-bit integers /// using unsigned saturation /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32) @@ -427,7 +427,24 @@ pub const fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(packusdw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(packusdw(a.as_i32x4(), b.as_i32x4())) } + unsafe { + let max = simd_splat(i32::from(u16::MAX)); + let min = simd_splat(i32::from(u16::MIN)); + + let clamped_a = simd_imax(simd_imin(a.as_i32x4(), max), min) + .as_m128i() + .as_i16x8(); + let clamped_b = simd_imax(simd_imin(b.as_i32x4(), max), min) + .as_m128i() + .as_i16x8(); + + // Shuffle the low u16 of each i32 from two concatenated vectors into + // the low bits of the result register. + const IDXS: [u32; 8] = [0, 2, 4, 6, 8, 10, 12, 14]; + let result: i16x8 = simd_shuffle!(clamped_a, clamped_b, IDXS); + + result.as_m128i() + } } /// Compares packed 64-bit integers in `a` and `b` for equality @@ -1166,8 +1183,6 @@ pub unsafe fn _mm_stream_load_si128(mem_addr: *const __m128i) -> __m128i { unsafe extern "C" { #[link_name = "llvm.x86.sse41.insertps"] fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128; - #[link_name = "llvm.x86.sse41.packusdw"] - fn packusdw(a: i32x4, b: i32x4) -> u16x8; #[link_name = "llvm.x86.sse41.dppd"] fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d; #[link_name = "llvm.x86.sse41.dpps"]