Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 92 additions & 16 deletions crates/core_arch/src/x86/avx2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2315,7 +2315,7 @@ pub const fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
unsafe { transmute(simd_or(a.as_i32x8(), b.as_i32x8())) }
}

/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
/// Converts packed signed 16-bit integers from `a` and `b` to packed 8-bit integers
/// using signed saturation
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16)
Expand All @@ -2324,10 +2324,31 @@ pub const fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpacksswb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
unsafe { transmute(packsswb(a.as_i16x16(), b.as_i16x16())) }
unsafe {
let max = simd_splat(i16::from(i8::MAX));
let min = simd_splat(i16::from(i8::MIN));

let clamped_a = simd_imax(simd_imin(a.as_i16x16(), max), min)
.as_m256i()
.as_i8x32();
let clamped_b = simd_imax(simd_imin(b.as_i16x16(), max), min)
.as_m256i()
.as_i8x32();

#[rustfmt::skip]
const IDXS: [u32; 32] = [
00, 02, 04, 06, 08, 10, 12, 14, // a-lo i16 to i8 conversions
32, 34, 36, 38, 40, 42, 44, 46, // b-lo
16, 18, 20, 22, 24, 26, 28, 30, // a-hi
48, 50, 52, 54, 56, 58, 60, 62, // b-hi
];
let result: i8x32 = simd_shuffle!(clamped_a, clamped_b, IDXS);

result.as_m256i()
}
}

/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// Converts packed signed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using signed saturation
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32)
Expand All @@ -2336,10 +2357,31 @@ pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpackssdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
unsafe { transmute(packssdw(a.as_i32x8(), b.as_i32x8())) }
unsafe {
let max = simd_splat(i32::from(i16::MAX));
let min = simd_splat(i32::from(i16::MIN));

let clamped_a = simd_imax(simd_imin(a.as_i32x8(), max), min)
.as_m256i()
.as_i16x16();
let clamped_b = simd_imax(simd_imin(b.as_i32x8(), max), min)
.as_m256i()
.as_i16x16();

#[rustfmt::skip]
const IDXS: [u32; 16] = [
00, 02, 04, 06, // a-lo i32 to i16 conversions
16, 18, 20, 22, // b-lo
08, 10, 12, 14, // a-hi
24, 26, 28, 30, // b-hi
];
let result: i16x16 = simd_shuffle!(clamped_a, clamped_b, IDXS);

result.as_m256i()
}
}

/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
/// Converts packed signed 16-bit integers from `a` and `b` to packed 8-bit integers
/// using unsigned saturation
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16)
Expand All @@ -2348,10 +2390,31 @@ pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpackuswb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
unsafe { transmute(packuswb(a.as_i16x16(), b.as_i16x16())) }
unsafe {
let max = simd_splat(i16::from(u8::MAX));
let min = simd_splat(i16::from(u8::MIN));

let clamped_a = simd_imax(simd_imin(a.as_i16x16(), max), min)
.as_m256i()
.as_i8x32();
let clamped_b = simd_imax(simd_imin(b.as_i16x16(), max), min)
.as_m256i()
.as_i8x32();

#[rustfmt::skip]
const IDXS: [u32; 32] = [
00, 02, 04, 06, 08, 10, 12, 14, // a-lo i16 to u8 conversions
32, 34, 36, 38, 40, 42, 44, 46, // b-lo
16, 18, 20, 22, 24, 26, 28, 30, // a-hi
48, 50, 52, 54, 56, 58, 60, 62, // b-hi
];
let result: i8x32 = simd_shuffle!(clamped_a, clamped_b, IDXS);

result.as_m256i()
}
}

/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// Converts packed signed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using unsigned saturation
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32)
Expand All @@ -2360,7 +2423,28 @@ pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vpackusdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i {
unsafe { transmute(packusdw(a.as_i32x8(), b.as_i32x8())) }
unsafe {
let max = simd_splat(i32::from(u16::MAX));
let min = simd_splat(i32::from(u16::MIN));

let clamped_a = simd_imax(simd_imin(a.as_i32x8(), max), min)
.as_m256i()
.as_i16x16();
let clamped_b = simd_imax(simd_imin(b.as_i32x8(), max), min)
.as_m256i()
.as_i16x16();

#[rustfmt::skip]
const IDXS: [u32; 16] = [
00, 02, 04, 06, // a-lo i32 to u16 conversions
16, 18, 20, 22, // b-lo
08, 10, 12, 14, // a-hi
24, 26, 28, 30, // b-hi
];
let result: i16x16 = simd_shuffle!(clamped_a, clamped_b, IDXS);

result.as_m256i()
}
}

/// Permutes packed 32-bit integers from `a` according to the content of `b`.
Expand Down Expand Up @@ -3827,14 +3911,6 @@ unsafe extern "C" {
fn mpsadbw(a: u8x32, b: u8x32, imm8: i8) -> u16x16;
#[link_name = "llvm.x86.avx2.pmul.hr.sw"]
fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
#[link_name = "llvm.x86.avx2.packsswb"]
fn packsswb(a: i16x16, b: i16x16) -> i8x32;
#[link_name = "llvm.x86.avx2.packssdw"]
fn packssdw(a: i32x8, b: i32x8) -> i16x16;
#[link_name = "llvm.x86.avx2.packuswb"]
fn packuswb(a: i16x16, b: i16x16) -> u8x32;
#[link_name = "llvm.x86.avx2.packusdw"]
fn packusdw(a: i32x8, b: i32x8) -> u16x16;
#[link_name = "llvm.x86.avx2.psad.bw"]
fn psadbw(a: u8x32, b: u8x32) -> u64x4;
#[link_name = "llvm.x86.avx2.psign.b"]
Expand Down
117 changes: 104 additions & 13 deletions crates/core_arch/src/x86/avx512bw.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6524,7 +6524,32 @@ pub fn _mm_maskz_maddubs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpackssdw))]
pub fn _mm512_packs_epi32(a: __m512i, b: __m512i) -> __m512i {
unsafe { transmute(vpackssdw(a.as_i32x16(), b.as_i32x16())) }
unsafe {
let max = simd_splat(i32::from(i16::MAX));
let min = simd_splat(i32::from(i16::MIN));

let clamped_a = simd_imax(simd_imin(a.as_i32x16(), max), min)
.as_m512i()
.as_i16x32();
let clamped_b = simd_imax(simd_imin(b.as_i32x16(), max), min)
.as_m512i()
.as_i16x32();

#[rustfmt::skip]
const IDXS: [u32; 32] = [
00, 02, 04, 06,
32, 34, 36, 38,
08, 10, 12, 14,
40, 42, 44, 46,
16, 18, 20, 22,
48, 50, 52, 54,
24, 26, 28, 30,
56, 58, 60, 62,
];
let result: i16x32 = simd_shuffle!(clamped_a, clamped_b, IDXS);

result.as_m512i()
}
}

/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Expand Down Expand Up @@ -6619,7 +6644,32 @@ pub fn _mm_maskz_packs_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpacksswb))]
pub fn _mm512_packs_epi16(a: __m512i, b: __m512i) -> __m512i {
unsafe { transmute(vpacksswb(a.as_i16x32(), b.as_i16x32())) }
unsafe {
let max = simd_splat(i16::from(i8::MAX));
let min = simd_splat(i16::from(i8::MIN));

let clamped_a = simd_imax(simd_imin(a.as_i16x32(), max), min)
.as_m512i()
.as_i8x64();
let clamped_b = simd_imax(simd_imin(b.as_i16x32(), max), min)
.as_m512i()
.as_i8x64();

#[rustfmt::skip]
const IDXS: [u32; 64] = [
000, 002, 004, 006, 008, 010, 012, 014,
064, 066, 068, 070, 072, 074, 076, 078,
016, 018, 020, 022, 024, 026, 028, 030,
080, 082, 084, 086, 088, 090, 092, 094,
032, 034, 036, 038, 040, 042, 044, 046,
096, 098, 100, 102, 104, 106, 108, 110,
048, 050, 052, 054, 056, 058, 060, 062,
112, 114, 116, 118, 120, 122, 124, 126,
];
let result: i8x64 = simd_shuffle!(clamped_a, clamped_b, IDXS);

result.as_m512i()
}
}

/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Expand Down Expand Up @@ -6714,7 +6764,32 @@ pub fn _mm_maskz_packs_epi16(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpackusdw))]
pub fn _mm512_packus_epi32(a: __m512i, b: __m512i) -> __m512i {
unsafe { transmute(vpackusdw(a.as_i32x16(), b.as_i32x16())) }
unsafe {
let max = simd_splat(i32::from(u16::MAX));
let min = simd_splat(i32::from(u16::MIN));

let clamped_a = simd_imax(simd_imin(a.as_i32x16(), max), min)
.as_m512i()
.as_i16x32();
let clamped_b = simd_imax(simd_imin(b.as_i32x16(), max), min)
.as_m512i()
.as_i16x32();

#[rustfmt::skip]
const IDXS: [u32; 32] = [
00, 02, 04, 06,
32, 34, 36, 38,
08, 10, 12, 14,
40, 42, 44, 46,
16, 18, 20, 22,
48, 50, 52, 54,
24, 26, 28, 30,
56, 58, 60, 62,
];
let result: i16x32 = simd_shuffle!(clamped_a, clamped_b, IDXS);

result.as_m512i()
}
}

/// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Expand Down Expand Up @@ -6809,7 +6884,32 @@ pub fn _mm_maskz_packus_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpackuswb))]
pub fn _mm512_packus_epi16(a: __m512i, b: __m512i) -> __m512i {
unsafe { transmute(vpackuswb(a.as_i16x32(), b.as_i16x32())) }
unsafe {
let max = simd_splat(i16::from(u8::MAX));
let min = simd_splat(i16::from(u8::MIN));

let clamped_a = simd_imax(simd_imin(a.as_i16x32(), max), min)
.as_m512i()
.as_i8x64();
let clamped_b = simd_imax(simd_imin(b.as_i16x32(), max), min)
.as_m512i()
.as_i8x64();

#[rustfmt::skip]
const IDXS: [u32; 64] = [
000, 002, 004, 006, 008, 010, 012, 014,
064, 066, 068, 070, 072, 074, 076, 078,
016, 018, 020, 022, 024, 026, 028, 030,
080, 082, 084, 086, 088, 090, 092, 094,
032, 034, 036, 038, 040, 042, 044, 046,
096, 098, 100, 102, 104, 106, 108, 110,
048, 050, 052, 054, 056, 058, 060, 062,
112, 114, 116, 118, 120, 122, 124, 126,
];
let result: i8x64 = simd_shuffle!(clamped_a, clamped_b, IDXS);

result.as_m512i()
}
}

/// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Expand Down Expand Up @@ -12606,15 +12706,6 @@ unsafe extern "C" {
#[link_name = "llvm.x86.avx512.pmaddubs.w.512"]
fn vpmaddubsw(a: u8x64, b: i8x64) -> i16x32;

#[link_name = "llvm.x86.avx512.packssdw.512"]
fn vpackssdw(a: i32x16, b: i32x16) -> i16x32;
#[link_name = "llvm.x86.avx512.packsswb.512"]
fn vpacksswb(a: i16x32, b: i16x32) -> i8x64;
#[link_name = "llvm.x86.avx512.packusdw.512"]
fn vpackusdw(a: i32x16, b: i32x16) -> u16x32;
#[link_name = "llvm.x86.avx512.packuswb.512"]
fn vpackuswb(a: i16x32, b: i16x32) -> u8x64;

#[link_name = "llvm.x86.avx512.psll.w.512"]
fn vpsllw(a: i16x32, count: i16x8) -> i16x32;

Expand Down
Loading