From ea0b7d1e1a02cf23f9350050507c1420224c148b Mon Sep 17 00:00:00 2001 From: Atul Bhosale Date: Thu, 19 Dec 2019 23:14:36 +0530 Subject: [PATCH] Format code using 'cargo fmt' --- benches/destride.rs | 56 +- benches/intrin.rs | 70 +- benches/usage.rs | 210 +- examples/main.rs | 45 +- src/arch/unknown/intrin/abs.rs | 264 +- src/arch/unknown/intrin/cmp.rs | 2 +- src/arch/unknown/intrin/destride.rs | 8 +- src/arch/unknown/intrin/downcast.rs | 388 +- src/arch/unknown/intrin/endian.rs | 107 +- src/arch/unknown/intrin/eq.rs | 4 +- src/arch/unknown/intrin/hadd.rs | 82 +- src/arch/unknown/intrin/hsub.rs | 82 +- src/arch/unknown/intrin/merge.rs | 183 +- src/arch/unknown/intrin/mod.rs | 6 +- src/arch/unknown/intrin/recip.rs | 2 +- src/arch/unknown/intrin/round.rs | 2 +- src/arch/unknown/intrin/rsqrt.rs | 2 +- src/arch/unknown/intrin/saturating_add.rs | 2 +- src/arch/unknown/intrin/saturating_hadd.rs | 72 +- src/arch/unknown/intrin/saturating_hsub.rs | 72 +- src/arch/unknown/intrin/saturating_sub.rs | 2 +- src/arch/unknown/intrin/sqrt.rs | 2 +- src/arch/unknown/intrin/sum.rs | 4 +- src/arch/unknown/intrin/transmute.rs | 2 +- src/arch/unknown/intrin/upcast.rs | 609 +-- src/arch/unknown/mod.rs | 2 +- src/arch/unknown/vec_patterns.rs | 751 ++-- src/arch/unknown/vecs.rs | 132 +- src/arch/x86/intrin/abs.rs | 276 +- src/arch/x86/intrin/cmp.rs | 6 +- src/arch/x86/intrin/destride.rs | 88 +- src/arch/x86/intrin/downcast.rs | 396 +- src/arch/x86/intrin/endian.rs | 384 +- src/arch/x86/intrin/eq.rs | 8 +- src/arch/x86/intrin/hadd.rs | 585 ++- src/arch/x86/intrin/hsub.rs | 585 ++- src/arch/x86/intrin/merge.rs | 790 +++- src/arch/x86/intrin/mod.rs | 6 +- src/arch/x86/intrin/popcnt.rs | 72 +- src/arch/x86/intrin/recip.rs | 7 +- src/arch/x86/intrin/round.rs | 8 +- src/arch/x86/intrin/rsqrt.rs | 6 +- src/arch/x86/intrin/saturating_add.rs | 6 +- src/arch/x86/intrin/saturating_hadd.rs | 508 ++- src/arch/x86/intrin/saturating_hsub.rs | 508 ++- src/arch/x86/intrin/saturating_sub.rs | 6 +- src/arch/x86/intrin/sqrt.rs | 6 +- src/arch/x86/intrin/sum.rs | 150 +- src/arch/x86/intrin/transmute.rs | 8 +- src/arch/x86/intrin/upcast.rs | 769 ++-- src/arch/x86/mod.rs | 2 +- src/arch/x86/vec_patterns.rs | 3903 ++++++++++++++------ src/arch/x86/vecs.rs | 57 +- src/debug.rs | 44 +- src/into_iters.rs | 24 +- src/intrin/destride.rs | 2 +- src/intrin/endian.rs | 2 +- src/intrin/eq.rs | 42 +- src/intrin/hadd.rs | 50 +- src/intrin/hsub.rs | 50 +- src/intrin/macros.rs | 1 - src/intrin/mod.rs | 26 +- src/intrin/popcnt.rs | 2 +- src/intrin/saturating_hadd.rs | 80 +- src/intrin/saturating_hsub.rs | 85 +- src/intrin/sum.rs | 22 +- src/intrin/transmute.rs | 4 +- src/intrin/upcast.rs | 35 +- src/iters.rs | 306 +- src/lib.rs | 21 +- src/prelude.rs | 12 +- src/stride.rs | 224 +- src/stride_zip.rs | 58 +- src/vec_patterns.rs | 6 +- src/vecs.rs | 17 +- src/zip.rs | 203 +- tests/iters.rs | 8 +- tests/kernel.rs | 38 +- tests/zip.rs | 65 +- 79 files changed, 10071 insertions(+), 3659 deletions(-) diff --git a/benches/destride.rs b/benches/destride.rs index ddae096..2d0b44b 100644 --- a/benches/destride.rs +++ b/benches/destride.rs @@ -1,12 +1,14 @@ #![feature(stdsimd, test)] -#[cfg(test)] extern crate test; -#[macro_use] extern crate faster; +#[cfg(test)] +extern crate test; +#[macro_use] +extern crate faster; #[cfg(test)] mod destride { use faster::prelude::*; - use test::{Bencher, black_box}; + use test::{black_box, Bencher}; #[bench] #[cfg(feature = "std")] @@ -79,8 +81,12 @@ mod destride { fn destride_four_naiive(b: &mut Bencher) { let a = [0u8; 4096]; b.iter(|| { - (&a[..]).stride_four(tuplify!(4, u8s(0))).zip() - .simd_do_each(|x| { black_box(x); }); + (&a[..]) + .stride_four(tuplify!(4, u8s(0))) + .zip() + .simd_do_each(|x| { + black_box(x); + }); }) } @@ -89,8 +95,12 @@ mod destride { fn destride_two_naiive(b: &mut Bencher) { let a = [0u8; 4096]; b.iter(|| { - (&a[..]).stride_two(tuplify!(2, u8s(0))).zip() - .simd_do_each(|x| { black_box(x); }); + (&a[..]) + .stride_two(tuplify!(2, u8s(0))) + .zip() + .simd_do_each(|x| { + black_box(x); + }); }) } @@ -99,8 +109,12 @@ mod destride { fn destride_four_naiive_16(b: &mut Bencher) { let a = [0u16; 4096]; b.iter(|| { - (&a[..]).stride_four(tuplify!(4, u16s(0))).zip() - .simd_do_each(|x| { black_box(x); }); + (&a[..]) + .stride_four(tuplify!(4, u16s(0))) + .zip() + .simd_do_each(|x| { + black_box(x); + }); }) } @@ -109,8 +123,12 @@ mod destride { fn destride_two_naiive_16(b: &mut Bencher) { let a = [0u16; 4096]; b.iter(|| { - (&a[..]).stride_two(tuplify!(2, u16s(0))).zip() - .simd_do_each(|x| { black_box(x); }); + (&a[..]) + .stride_two(tuplify!(2, u16s(0))) + .zip() + .simd_do_each(|x| { + black_box(x); + }); }) } @@ -119,8 +137,12 @@ mod destride { fn destride_four_naiive_32(b: &mut Bencher) { let a = [0u32; 4096]; b.iter(|| { - (&a[..]).stride_four(tuplify!(4, u32s(0))).zip() - .simd_do_each(|x| { black_box(x); }); + (&a[..]) + .stride_four(tuplify!(4, u32s(0))) + .zip() + .simd_do_each(|x| { + black_box(x); + }); }) } @@ -129,8 +151,12 @@ mod destride { fn destride_two_naiive_32(b: &mut Bencher) { let a = [0u32; 4096]; b.iter(|| { - (&a[..]).stride_two(tuplify!(2, u32s(0))).zip() - .simd_do_each(|x| { black_box(x); }); + (&a[..]) + .stride_two(tuplify!(2, u32s(0))) + .zip() + .simd_do_each(|x| { + black_box(x); + }); }) } } diff --git a/benches/intrin.rs b/benches/intrin.rs index 5bbca64..00ba032 100644 --- a/benches/intrin.rs +++ b/benches/intrin.rs @@ -1,7 +1,8 @@ #![feature(test, stdsimd)] -#[cfg(test)] extern crate test; extern crate faster; +#[cfg(test)] +extern crate test; const ARRAY_F32: &[f32] = &[-123.456f32; 1024]; @@ -10,19 +11,29 @@ macro_rules! bench_intrin_1 { #[bench] #[cfg(feature = "std")] fn $scalar_name(b: &mut Bencher) { - b.iter(|| { black_box( - crate::ARRAY_F32.iter().map(|v| { $scalar_fn(*v) }).collect::>() - )}) + b.iter(|| { + black_box( + crate::ARRAY_F32 + .iter() + .map(|v| $scalar_fn(*v)) + .collect::>(), + ) + }) } #[bench] #[cfg(feature = "std")] fn $simd_name(b: &mut Bencher) { - b.iter(|| { black_box( - crate::ARRAY_F32.simd_iter(f32s(0.0)).simd_map(|v| { $simd_fn(v) }).scalar_collect() - )}); + b.iter(|| { + black_box( + crate::ARRAY_F32 + .simd_iter(f32s(0.0)) + .simd_map(|v| $simd_fn(v)) + .scalar_collect(), + ) + }); } - } + }; } macro_rules! bench_intrin_2 { @@ -30,34 +41,51 @@ macro_rules! bench_intrin_2 { #[bench] #[cfg(feature = "std")] fn $scalar_name(b: &mut Bencher) { - b.iter(|| { black_box( - crate::ARRAY_F32.iter().map(|v| { v.$scalar_fn(*v) }).collect::>() - )}) + b.iter(|| { + black_box( + crate::ARRAY_F32 + .iter() + .map(|v| v.$scalar_fn(*v)) + .collect::>(), + ) + }) } #[bench] #[cfg(feature = "std")] fn $simd_name(b: &mut Bencher) { - b.iter(|| { black_box( - crate::ARRAY_F32.simd_iter(f32s(0.0)).simd_map(|v| {v.$simd_fn(v) }).scalar_collect() - )}); + b.iter(|| { + black_box( + crate::ARRAY_F32 + .simd_iter(f32s(0.0)) + .simd_map(|v| v.$simd_fn(v)) + .scalar_collect(), + ) + }); } - } + }; } - #[cfg(test)] mod intrin { use faster::prelude::*; - use test::{Bencher, black_box}; + use test::{black_box, Bencher}; bench_intrin_1!(abs_simd, |x: f32s| x.abs(), abs_scala, |x: f32| x.abs()); bench_intrin_1!(ceil_simd, |x: f32s| x.ceil(), ceil_scala, |x: f32| x.ceil()); - bench_intrin_1!(floor_simd, |x: f32s| x.floor(), floor_scala, |x: f32| x.floor()); + bench_intrin_1!(floor_simd, |x: f32s| x.floor(), floor_scala, |x: f32| x + .floor()); bench_intrin_2!(min_simd, min, min_scala, min); bench_intrin_2!(max_simd, max, max_scala, max); - bench_intrin_1!(recip_simd, |x: f32s| x.recip(), recip_scala, |x: f32| 1.0f32 / x); - bench_intrin_1!(round_simd, |x: f32s| x.round(), round_scala, |x: f32| x.round()); + bench_intrin_1!( + recip_simd, + |x: f32s| x.recip(), + recip_scala, + |x: f32| 1.0f32 / x + ); + bench_intrin_1!(round_simd, |x: f32s| x.round(), round_scala, |x: f32| x + .round()); bench_intrin_1!(sqrt_simd, |x: f32s| x.sqrt(), sqrt_scala, |x: f32| x.sqrt()); - bench_intrin_1!(trunc_simd, |x: f32s| x.trunc(), trunc_scala, |x: f32| x.trunc()); + bench_intrin_1!(trunc_simd, |x: f32s| x.trunc(), trunc_scala, |x: f32| x + .trunc()); } diff --git a/benches/usage.rs b/benches/usage.rs index 026c78e..c335569 100644 --- a/benches/usage.rs +++ b/benches/usage.rs @@ -1,29 +1,32 @@ #![feature(stdsimd, test)] -#[cfg(test)] extern crate test; -#[macro_use] extern crate faster; +#[cfg(test)] +extern crate test; +#[macro_use] +extern crate faster; #[cfg(test)] mod usage { use faster::prelude::*; - use test::{Bencher, black_box}; + use test::{black_box, Bencher}; #[bench] #[cfg(feature = "std")] fn nop_simd(b: &mut Bencher) { b.iter(|| { black_box( - [0u8; 1024].simd_iter(u8s(0)).simd_map(|v| v).scalar_collect()) + [0u8; 1024] + .simd_iter(u8s(0)) + .simd_map(|v| v) + .scalar_collect(), + ) }); } #[bench] #[cfg(feature = "std")] fn nop_scalar(b: &mut Bencher) { - b.iter(|| { - black_box( - [0u8; 1024].iter().map(|e| *e).collect::>()) - }); + b.iter(|| black_box([0u8; 1024].iter().map(|e| *e).collect::>())); } #[bench] @@ -31,9 +34,13 @@ mod usage { fn map_simd(b: &mut Bencher) { b.iter(|| { black_box( - [-123.456f32; 1024].simd_iter(f32s(0.0)).simd_map(|v| { - f32s(9.0) * v.abs().sqrt().rsqrt().ceil().sqrt() - f32s(4.0) - f32s(2.0) - }).scalar_collect()) + [-123.456f32; 1024] + .simd_iter(f32s(0.0)) + .simd_map(|v| { + f32s(9.0) * v.abs().sqrt().rsqrt().ceil().sqrt() - f32s(4.0) - f32s(2.0) + }) + .scalar_collect(), + ) }) } @@ -72,11 +79,14 @@ mod usage { fn high_latency(b: &mut Bencher) { let mut out = [0f32; 1024]; b.iter(|| { - [-123.456f32; 1024].simd_iter(f32s(0.0)).simd_map(|v| { - let (a, b) = (v * f32s(1.20)).upcast(); - (a.sqrt() * f64s(3.141592653589793)).saturating_downcast( - b.sqrt() * f64s(3.141592653589793)) - }).scalar_fill(&mut out); + [-123.456f32; 1024] + .simd_iter(f32s(0.0)) + .simd_map(|v| { + let (a, b) = (v * f32s(1.20)).upcast(); + (a.sqrt() * f64s(3.141592653589793)) + .saturating_downcast(b.sqrt() * f64s(3.141592653589793)) + }) + .scalar_fill(&mut out); }); } @@ -86,8 +96,8 @@ mod usage { b.iter(|| { for (i, v) in [-123.456f32; 1024].simd_iter(f32s(0.0)).enumerate() { let (a, b) = (v * f32s(1.20)).upcast(); - let ans = (a.sqrt() * f64s(3.141592653589793)).saturating_downcast( - b.sqrt() * f64s(3.141592653589793)); + let ans = (a.sqrt() * f64s(3.141592653589793)) + .saturating_downcast(b.sqrt() * f64s(3.141592653589793)); ans.store(&mut out, i * ans.width()); } }) @@ -97,7 +107,11 @@ mod usage { fn high_latency_unrolled(b: &mut Bencher) { let mut out = [0f32; 1024]; b.iter(|| { - for (i, v) in [-123.456f32; 1024].simd_iter(f32s(0.0)).unroll(8).enumerate() { + for (i, v) in [-123.456f32; 1024] + .simd_iter(f32s(0.0)) + .unroll(8) + .enumerate() + { macro_rules! compute { ($($idx:expr),*) => { $( @@ -144,9 +158,13 @@ mod usage { let mut into = [0f32; 1024]; b.iter(|| { black_box( - [-123.456f32; 1024].simd_iter(f32s(0.0)).simd_map(|v| { - f32s(9.0) * v.abs().sqrt().rsqrt().ceil().sqrt() - f32s(4.0) - f32s(2.0) - }).scalar_fill(&mut into)); + [-123.456f32; 1024] + .simd_iter(f32s(0.0)) + .simd_map(|v| { + f32s(9.0) * v.abs().sqrt().rsqrt().ceil().sqrt() - f32s(4.0) - f32s(2.0) + }) + .scalar_fill(&mut into), + ); }) } @@ -155,9 +173,13 @@ mod usage { fn map_uneven_simd(b: &mut Bencher) { b.iter(|| { black_box( - [-123.456f32; 1025].simd_iter(f32s(0.0)).simd_map(|v| { - f32s(9.0) * v.abs().sqrt().rsqrt().ceil().sqrt() - f32s(4.0) - f32s(2.0) - }).scalar_collect()) + [-123.456f32; 1025] + .simd_iter(f32s(0.0)) + .simd_map(|v| { + f32s(9.0) * v.abs().sqrt().rsqrt().ceil().sqrt() - f32s(4.0) - f32s(2.0) + }) + .scalar_collect(), + ) }) } @@ -166,10 +188,11 @@ mod usage { fn map_scalar(b: &mut Bencher) { b.iter(|| { black_box( - [-123.456f32; 1024].iter() - .map(|v| { 9.0 * v.abs().sqrt().sqrt().recip().ceil().sqrt() - - 4.0 - 2.0 }) - .collect::>()) + [-123.456f32; 1024] + .iter() + .map(|v| 9.0 * v.abs().sqrt().sqrt().recip().ceil().sqrt() - 4.0 - 2.0) + .collect::>(), + ) }); } @@ -177,8 +200,13 @@ mod usage { fn reduce_simd(b: &mut Bencher) { b.iter(|| { black_box( - [-123.456f32; 1024].simd_iter(f32s(0.0)) - .simd_reduce(f32s(0.0), |a, v| a + f32s(9.0) * v.abs().sqrt().rsqrt().ceil().sqrt()).sum()) + [-123.456f32; 1024] + .simd_iter(f32s(0.0)) + .simd_reduce(f32s(0.0), |a, v| { + a + f32s(9.0) * v.abs().sqrt().rsqrt().ceil().sqrt() + }) + .sum(), + ) }) } @@ -186,17 +214,22 @@ mod usage { fn reduce_uneven_simd(b: &mut Bencher) { b.iter(|| { black_box( - [-123.456f32; 1025].simd_iter(f32s(0.0)) - .simd_reduce(f32s(0.0), |a, v| a + f32s(9.0) * v.abs().sqrt().rsqrt().ceil().sqrt()).sum()) + [-123.456f32; 1025] + .simd_iter(f32s(0.0)) + .simd_reduce(f32s(0.0), |a, v| { + a + f32s(9.0) * v.abs().sqrt().rsqrt().ceil().sqrt() + }) + .sum(), + ) }) } #[bench] fn reduce_scalar(b: &mut Bencher) { b.iter(|| { - black_box( - [-123.456f32; 1024].iter() - .fold(0.0, |a, v| a + 9.0 * v.abs().sqrt().sqrt().recip().ceil().sqrt())) + black_box([-123.456f32; 1024].iter().fold(0.0, |a, v| { + a + 9.0 * v.abs().sqrt().sqrt().recip().ceil().sqrt() + })) }) } @@ -206,11 +239,17 @@ mod usage { // TODO: Why is this so slow? Cache locality? b.iter(|| { black_box( - (&[-123.456f32; 1026][..]).stride_nine(tuplify!(9, f32s(0.0))).zip() + (&[-123.456f32; 1026][..]) + .stride_nine(tuplify!(9, f32s(0.0))) + .zip() .simd_map(|(a, b, c, d, e, f, g, h, i)| { - (a * e * i) + (b * f * g) + (c * d * h) - (c * e * g) - (b * d * i) - (a * f * h) + (a * e * i) + (b * f * g) + (c * d * h) + - (c * e * g) + - (b * d * i) + - (a * f * h) }) - .scalar_collect()) + .scalar_collect(), + ) }) } @@ -219,9 +258,16 @@ mod usage { fn determinant3_scalar(b: &mut Bencher) { b.iter(|| { black_box( - [-123.456f32; 1026].chunks(9).map(|m| { - (m[0] * m[4] * m[8]) + (m[1] * m[5] * m[6]) + (m[2] * m[3] * m[7]) - (m[2] * m[4] * m[6]) - (m[1] * m[3] * m[8]) - (m[0] * m[5] * m[7]) - }).collect::>()) + [-123.456f32; 1026] + .chunks(9) + .map(|m| { + (m[0] * m[4] * m[8]) + (m[1] * m[5] * m[6]) + (m[2] * m[3] * m[7]) + - (m[2] * m[4] * m[6]) + - (m[1] * m[3] * m[8]) + - (m[0] * m[5] * m[7]) + }) + .collect::>(), + ) }) } @@ -231,10 +277,12 @@ mod usage { // TODO: Why is this so slow? Cache locality? b.iter(|| { black_box( - (&[-123.456f32; 1024][..]).stride_four(tuplify!(4, f32s(0.0))).zip() - .simd_map(|(a, b, c, d)| { - a * d - b * c - }).scalar_collect()) + (&[-123.456f32; 1024][..]) + .stride_four(tuplify!(4, f32s(0.0))) + .zip() + .simd_map(|(a, b, c, d)| a * d - b * c) + .scalar_collect(), + ) }) } @@ -243,9 +291,11 @@ mod usage { fn determinant2_scalar(b: &mut Bencher) { b.iter(|| { black_box( - [-123.456f32; 1024].chunks(4).map(|m| { - m[0] * m[3] - m[1] * m[2] - }).collect::>()) + [-123.456f32; 1024] + .chunks(4) + .map(|m| m[0] * m[3] - m[1] * m[2]) + .collect::>(), + ) }) } @@ -254,12 +304,16 @@ mod usage { fn zip_simd(b: &mut Bencher) { b.iter(|| { black_box( - (&[-123i32; 1024][..]).stride_two(tuplify!(2, i32s(0))).zip() + (&[-123i32; 1024][..]) + .stride_two(tuplify!(2, i32s(0))) + .zip() .simd_map(|(a, b)| { let (aa, ab): (i64s, i64s) = a.upcast(); let (ba, bb): (i64s, i64s) = b.upcast(); (aa.abs() + ba.abs()).saturating_downcast(ab.abs() + bb.abs()) - }).scalar_collect()) + }) + .scalar_collect(), + ) }) } @@ -268,9 +322,11 @@ mod usage { fn zip_scalar(b: &mut Bencher) { b.iter(|| { black_box( - [-123i32; 1024].chunks(2).map(|a| { - ((a[0] as f64).abs() + (a[1] as f64).abs()) as f32 - }).collect::>()) + [-123i32; 1024] + .chunks(2) + .map(|a| ((a[0] as f64).abs() + (a[1] as f64).abs()) as f32) + .collect::>(), + ) }) } @@ -279,9 +335,12 @@ mod usage { fn zip_nop_simd(b: &mut Bencher) { b.iter(|| { black_box( - (&[-123.456f32; 1024][..]).stride_two(tuplify!(2, f32s(0.0))).zip() + (&[-123.456f32; 1024][..]) + .stride_two(tuplify!(2, f32s(0.0))) + .zip() .simd_map(|(a, b)| a + b) - .scalar_collect()) + .scalar_collect(), + ) }) } @@ -290,9 +349,11 @@ mod usage { fn zip_nop_scalar(b: &mut Bencher) { b.iter(|| { black_box( - [-123.456f32; 1024].chunks(2).map(|a| { - a[0] + a[1] - }).collect::>()) + [-123.456f32; 1024] + .chunks(2) + .map(|a| a[0] + a[1]) + .collect::>(), + ) }) } @@ -303,7 +364,8 @@ mod usage { let mut i = 0; [123u8; 1024].simd_iter(u8s(0)).simd_do_each(|v| { let (a, b): (u16s, u16s) = v.upcast(); - let third = ((a + u16s(55)) / u16s(64) + u16s(143)).saturating_downcast((b + u16s(55)) / u16s(64) + u16s(143)); + let third = ((a + u16s(55)) / u16s(64) + u16s(143)) + .saturating_downcast((b + u16s(55)) / u16s(64) + u16s(143)); let fourth = ((v + u8s(55)) & u8s(0x3f)) + u8s(128); // Make some room for interleaving @@ -320,10 +382,22 @@ mod usage { // Interleave a constant 0xf09f with the third and fourth bytes, // and store into out buffer - u32s(0xf09f0000).merge_interleaved(tfa).be_u8s().store(&mut out, i); - u32s(0xf09f0000).merge_interleaved(tfb).be_u8s().store(&mut out, i + v.width()); - u32s(0xf09f0000).merge_interleaved(tfc).be_u8s().store(&mut out, i + v.width() * 2); - u32s(0xf09f0000).merge_interleaved(tfd).be_u8s().store(&mut out, i + v.width() * 3); + u32s(0xf09f0000) + .merge_interleaved(tfa) + .be_u8s() + .store(&mut out, i); + u32s(0xf09f0000) + .merge_interleaved(tfb) + .be_u8s() + .store(&mut out, i + v.width()); + u32s(0xf09f0000) + .merge_interleaved(tfc) + .be_u8s() + .store(&mut out, i + v.width() * 2); + u32s(0xf09f0000) + .merge_interleaved(tfd) + .be_u8s() + .store(&mut out, i + v.width() * 3); i += v.width() * 4; }); out @@ -349,7 +423,9 @@ mod usage { fn stride_zip_naive(b: &mut Bencher) { let a = [0u8; 4096]; b.iter(|| { - (&a[..]).stride_two(tuplify!(2, u8s(0))).zip() + (&a[..]) + .stride_two(tuplify!(2, u8s(0))) + .zip() .simd_map(|(a, b)| a + b) .scalar_collect() }) @@ -360,7 +436,9 @@ mod usage { fn stride_zip(b: &mut Bencher) { let a = [0u8; 4096]; b.iter(|| { - (&a[..]).simd_iter(u8s(0)).stride_zip() + (&a[..]) + .simd_iter(u8s(0)) + .stride_zip() .simd_map(|(a, b)| a + b) .scalar_collect() }) diff --git a/examples/main.rs b/examples/main.rs index e2536f7..380d238 100644 --- a/examples/main.rs +++ b/examples/main.rs @@ -11,34 +11,51 @@ use faster::*; #[cfg(feature = "std")] fn main() { - let lots_of_84s = (&[-10i8; 33][..]).simd_iter(i8s(0)) + let lots_of_84s = (&[-10i8; 33][..]) + .simd_iter(i8s(0)) .simd_map(|v| i8s(9) * v.abs().be_i8s() - i8s(4) - i8s(2)) .simd_map(|v| v) .scalar_collect(); - let lots_of_3s = (&[-123.456f32; 128][..]).simd_iter(f32s(0.0)) - .simd_map(|v| { f32s(9.0) * v.abs().sqrt().rsqrt().ceil().sqrt() - - f32s(4.0) - f32s(2.0) }) + let lots_of_3s = (&[-123.456f32; 128][..]) + .simd_iter(f32s(0.0)) + .simd_map(|v| f32s(9.0) * v.abs().sqrt().rsqrt().ceil().sqrt() - f32s(4.0) - f32s(2.0)) .scalar_collect(); - let lots_of_3s_sc = (&[-123.456f32; 128][..]).iter() - .map(|v| { 9.0 * v.abs().sqrt().sqrt().recip().ceil().sqrt() - - 4.0 - 2.0 }) + let lots_of_3s_sc = (&[-123.456f32; 128][..]) + .iter() + .map(|v| 9.0 * v.abs().sqrt().sqrt().recip().ceil().sqrt() - 4.0 - 2.0) .collect::>(); let mut some_u8s = [0u8; 100]; - let filled_u8s = (&[5u8; 100][..]).simd_iter(u8s(0)) + let filled_u8s = (&[5u8; 100][..]) + .simd_iter(u8s(0)) .simd_map(|vector| vector * u8s(2)) .scalar_fill(&mut some_u8s); - let reduced = (&[-1.0f32; 128][..]).simd_iter(f32s(0.0)) - .simd_reduce(f32s(0.0), |a, v| a + v.abs().sqrt().sqrt().floor()).sum(); - - let strided = (0..20u32).collect::>().as_slice() - .stride_two(tuplify!(2, u32s(99))).zip().simd_map(|(a, b)| a + b) + let reduced = (&[-1.0f32; 128][..]) + .simd_iter(f32s(0.0)) + .simd_reduce(f32s(0.0), |a, v| a + v.abs().sqrt().sqrt().floor()) + .sum(); + + let strided = (0..20u32) + .collect::>() + .as_slice() + .stride_two(tuplify!(2, u32s(99))) + .zip() + .simd_map(|(a, b)| a + b) .scalar_collect(); - println!("{:?}\n{:?}\n{:?}\n{:?}\n{:?}\n{:?}\n{:?}\n", lots_of_84s, lots_of_3s, lots_of_3s_sc, filled_u8s, filled_u8s.len(), reduced, strided); + println!( + "{:?}\n{:?}\n{:?}\n{:?}\n{:?}\n{:?}\n{:?}\n", + lots_of_84s, + lots_of_3s, + lots_of_3s_sc, + filled_u8s, + filled_u8s.len(), + reduced, + strided + ); } #[cfg(not(feature = "std"))] diff --git a/src/arch/unknown/intrin/abs.rs b/src/arch/unknown/intrin/abs.rs index 6632940..8f28412 100644 --- a/src/arch/unknown/intrin/abs.rs +++ b/src/arch/unknown/intrin/abs.rs @@ -5,29 +5,30 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::intrin::abs::Abs; use crate::arch::current::vecs::*; use crate::core::mem::transmute; +use crate::intrin::abs::Abs; impl Abs for f32x4 { type Out = f32x4; #[inline(always)] fn abs(&self) -> Self::Out { - Self::Out::new(self.extract(0).abs(), - self.extract(1).abs(), - self.extract(2).abs(), - self.extract(3).abs()) + Self::Out::new( + self.extract(0).abs(), + self.extract(1).abs(), + self.extract(2).abs(), + self.extract(3).abs(), + ) } } impl Abs for f64x2 { type Out = f64x2; - + #[inline(always)] fn abs(&self) -> Self::Out { - Self::Out::new(self.extract(0).abs(), - self.extract(1).abs()) + Self::Out::new(self.extract(0).abs(), self.extract(1).abs()) } } @@ -36,14 +37,16 @@ impl Abs for f32x8 { #[inline(always)] fn abs(&self) -> Self::Out { - Self::Out::new(self.extract(0).abs(), - self.extract(1).abs(), - self.extract(2).abs(), - self.extract(3).abs(), - self.extract(4).abs(), - self.extract(5).abs(), - self.extract(6).abs(), - self.extract(7).abs()) + Self::Out::new( + self.extract(0).abs(), + self.extract(1).abs(), + self.extract(2).abs(), + self.extract(3).abs(), + self.extract(4).abs(), + self.extract(5).abs(), + self.extract(6).abs(), + self.extract(7).abs(), + ) } } @@ -52,10 +55,12 @@ impl Abs for f64x4 { #[inline(always)] fn abs(&self) -> Self::Out { - Self::Out::new(self.extract(0).abs(), - self.extract(1).abs(), - self.extract(2).abs(), - self.extract(3).abs()) + Self::Out::new( + self.extract(0).abs(), + self.extract(1).abs(), + self.extract(2).abs(), + self.extract(3).abs(), + ) } } @@ -64,22 +69,24 @@ impl Abs for i8x16 { #[inline(always)] fn abs(&self) -> Self::Out { - Self::Out::new(unsafe { transmute::(self.extract(0).overflowing_abs().0) }, - unsafe { transmute::(self.extract(1).overflowing_abs().0) }, - unsafe { transmute::(self.extract(2).overflowing_abs().0) }, - unsafe { transmute::(self.extract(3).overflowing_abs().0) }, - unsafe { transmute::(self.extract(4).overflowing_abs().0) }, - unsafe { transmute::(self.extract(5).overflowing_abs().0) }, - unsafe { transmute::(self.extract(6).overflowing_abs().0) }, - unsafe { transmute::(self.extract(7).overflowing_abs().0) }, - unsafe { transmute::(self.extract(8).overflowing_abs().0) }, - unsafe { transmute::(self.extract(9).overflowing_abs().0) }, - unsafe { transmute::(self.extract(10).overflowing_abs().0) }, - unsafe { transmute::(self.extract(11).overflowing_abs().0) }, - unsafe { transmute::(self.extract(12).overflowing_abs().0) }, - unsafe { transmute::(self.extract(13).overflowing_abs().0) }, - unsafe { transmute::(self.extract(14).overflowing_abs().0) }, - unsafe { transmute::(self.extract(15).overflowing_abs().0) }) + Self::Out::new( + unsafe { transmute::(self.extract(0).overflowing_abs().0) }, + unsafe { transmute::(self.extract(1).overflowing_abs().0) }, + unsafe { transmute::(self.extract(2).overflowing_abs().0) }, + unsafe { transmute::(self.extract(3).overflowing_abs().0) }, + unsafe { transmute::(self.extract(4).overflowing_abs().0) }, + unsafe { transmute::(self.extract(5).overflowing_abs().0) }, + unsafe { transmute::(self.extract(6).overflowing_abs().0) }, + unsafe { transmute::(self.extract(7).overflowing_abs().0) }, + unsafe { transmute::(self.extract(8).overflowing_abs().0) }, + unsafe { transmute::(self.extract(9).overflowing_abs().0) }, + unsafe { transmute::(self.extract(10).overflowing_abs().0) }, + unsafe { transmute::(self.extract(11).overflowing_abs().0) }, + unsafe { transmute::(self.extract(12).overflowing_abs().0) }, + unsafe { transmute::(self.extract(13).overflowing_abs().0) }, + unsafe { transmute::(self.extract(14).overflowing_abs().0) }, + unsafe { transmute::(self.extract(15).overflowing_abs().0) }, + ) } } @@ -88,26 +95,30 @@ impl Abs for i16x8 { #[inline(always)] fn abs(&self) -> Self::Out { - Self::Out::new(unsafe { transmute::(self.extract(0).overflowing_abs().0) }, - unsafe { transmute::(self.extract(1).overflowing_abs().0) }, - unsafe { transmute::(self.extract(2).overflowing_abs().0) }, - unsafe { transmute::(self.extract(3).overflowing_abs().0) }, - unsafe { transmute::(self.extract(4).overflowing_abs().0) }, - unsafe { transmute::(self.extract(5).overflowing_abs().0) }, - unsafe { transmute::(self.extract(6).overflowing_abs().0) }, - unsafe { transmute::(self.extract(7).overflowing_abs().0) }) + Self::Out::new( + unsafe { transmute::(self.extract(0).overflowing_abs().0) }, + unsafe { transmute::(self.extract(1).overflowing_abs().0) }, + unsafe { transmute::(self.extract(2).overflowing_abs().0) }, + unsafe { transmute::(self.extract(3).overflowing_abs().0) }, + unsafe { transmute::(self.extract(4).overflowing_abs().0) }, + unsafe { transmute::(self.extract(5).overflowing_abs().0) }, + unsafe { transmute::(self.extract(6).overflowing_abs().0) }, + unsafe { transmute::(self.extract(7).overflowing_abs().0) }, + ) } } impl Abs for i32x4 { type Out = u32x4; - + #[inline(always)] fn abs(&self) -> Self::Out { - Self::Out::new(unsafe { transmute::(self.extract(0).overflowing_abs().0) }, - unsafe { transmute::(self.extract(1).overflowing_abs().0) }, - unsafe { transmute::(self.extract(2).overflowing_abs().0) }, - unsafe { transmute::(self.extract(3).overflowing_abs().0) }) + Self::Out::new( + unsafe { transmute::(self.extract(0).overflowing_abs().0) }, + unsafe { transmute::(self.extract(1).overflowing_abs().0) }, + unsafe { transmute::(self.extract(2).overflowing_abs().0) }, + unsafe { transmute::(self.extract(3).overflowing_abs().0) }, + ) } } @@ -116,62 +127,66 @@ impl Abs for i8x32 { #[inline(always)] fn abs(&self) -> Self::Out { - Self::Out::new(unsafe { transmute::(self.extract(0).overflowing_abs().0) }, - unsafe { transmute::(self.extract(1).overflowing_abs().0) }, - unsafe { transmute::(self.extract(2).overflowing_abs().0) }, - unsafe { transmute::(self.extract(3).overflowing_abs().0) }, - unsafe { transmute::(self.extract(4).overflowing_abs().0) }, - unsafe { transmute::(self.extract(5).overflowing_abs().0) }, - unsafe { transmute::(self.extract(6).overflowing_abs().0) }, - unsafe { transmute::(self.extract(7).overflowing_abs().0) }, - unsafe { transmute::(self.extract(8).overflowing_abs().0) }, - unsafe { transmute::(self.extract(9).overflowing_abs().0) }, - unsafe { transmute::(self.extract(10).overflowing_abs().0) }, - unsafe { transmute::(self.extract(11).overflowing_abs().0) }, - unsafe { transmute::(self.extract(12).overflowing_abs().0) }, - unsafe { transmute::(self.extract(13).overflowing_abs().0) }, - unsafe { transmute::(self.extract(14).overflowing_abs().0) }, - unsafe { transmute::(self.extract(15).overflowing_abs().0) }, - unsafe { transmute::(self.extract(16).overflowing_abs().0) }, - unsafe { transmute::(self.extract(17).overflowing_abs().0) }, - unsafe { transmute::(self.extract(18).overflowing_abs().0) }, - unsafe { transmute::(self.extract(19).overflowing_abs().0) }, - unsafe { transmute::(self.extract(20).overflowing_abs().0) }, - unsafe { transmute::(self.extract(21).overflowing_abs().0) }, - unsafe { transmute::(self.extract(22).overflowing_abs().0) }, - unsafe { transmute::(self.extract(23).overflowing_abs().0) }, - unsafe { transmute::(self.extract(24).overflowing_abs().0) }, - unsafe { transmute::(self.extract(25).overflowing_abs().0) }, - unsafe { transmute::(self.extract(26).overflowing_abs().0) }, - unsafe { transmute::(self.extract(27).overflowing_abs().0) }, - unsafe { transmute::(self.extract(28).overflowing_abs().0) }, - unsafe { transmute::(self.extract(29).overflowing_abs().0) }, - unsafe { transmute::(self.extract(30).overflowing_abs().0) }, - unsafe { transmute::(self.extract(31).overflowing_abs().0) }) + Self::Out::new( + unsafe { transmute::(self.extract(0).overflowing_abs().0) }, + unsafe { transmute::(self.extract(1).overflowing_abs().0) }, + unsafe { transmute::(self.extract(2).overflowing_abs().0) }, + unsafe { transmute::(self.extract(3).overflowing_abs().0) }, + unsafe { transmute::(self.extract(4).overflowing_abs().0) }, + unsafe { transmute::(self.extract(5).overflowing_abs().0) }, + unsafe { transmute::(self.extract(6).overflowing_abs().0) }, + unsafe { transmute::(self.extract(7).overflowing_abs().0) }, + unsafe { transmute::(self.extract(8).overflowing_abs().0) }, + unsafe { transmute::(self.extract(9).overflowing_abs().0) }, + unsafe { transmute::(self.extract(10).overflowing_abs().0) }, + unsafe { transmute::(self.extract(11).overflowing_abs().0) }, + unsafe { transmute::(self.extract(12).overflowing_abs().0) }, + unsafe { transmute::(self.extract(13).overflowing_abs().0) }, + unsafe { transmute::(self.extract(14).overflowing_abs().0) }, + unsafe { transmute::(self.extract(15).overflowing_abs().0) }, + unsafe { transmute::(self.extract(16).overflowing_abs().0) }, + unsafe { transmute::(self.extract(17).overflowing_abs().0) }, + unsafe { transmute::(self.extract(18).overflowing_abs().0) }, + unsafe { transmute::(self.extract(19).overflowing_abs().0) }, + unsafe { transmute::(self.extract(20).overflowing_abs().0) }, + unsafe { transmute::(self.extract(21).overflowing_abs().0) }, + unsafe { transmute::(self.extract(22).overflowing_abs().0) }, + unsafe { transmute::(self.extract(23).overflowing_abs().0) }, + unsafe { transmute::(self.extract(24).overflowing_abs().0) }, + unsafe { transmute::(self.extract(25).overflowing_abs().0) }, + unsafe { transmute::(self.extract(26).overflowing_abs().0) }, + unsafe { transmute::(self.extract(27).overflowing_abs().0) }, + unsafe { transmute::(self.extract(28).overflowing_abs().0) }, + unsafe { transmute::(self.extract(29).overflowing_abs().0) }, + unsafe { transmute::(self.extract(30).overflowing_abs().0) }, + unsafe { transmute::(self.extract(31).overflowing_abs().0) }, + ) } } impl Abs for i16x16 { type Out = u16x16; - + #[inline(always)] fn abs(&self) -> Self::Out { - Self::Out::new(unsafe { transmute::(self.extract(0).overflowing_abs().0) }, - unsafe { transmute::(self.extract(1).overflowing_abs().0) }, - unsafe { transmute::(self.extract(2).overflowing_abs().0) }, - unsafe { transmute::(self.extract(3).overflowing_abs().0) }, - unsafe { transmute::(self.extract(4).overflowing_abs().0) }, - unsafe { transmute::(self.extract(5).overflowing_abs().0) }, - unsafe { transmute::(self.extract(6).overflowing_abs().0) }, - unsafe { transmute::(self.extract(7).overflowing_abs().0) }, - unsafe { transmute::(self.extract(8).overflowing_abs().0) }, - unsafe { transmute::(self.extract(9).overflowing_abs().0) }, - unsafe { transmute::(self.extract(10).overflowing_abs().0) }, - unsafe { transmute::(self.extract(11).overflowing_abs().0) }, - unsafe { transmute::(self.extract(12).overflowing_abs().0) }, - unsafe { transmute::(self.extract(13).overflowing_abs().0) }, - unsafe { transmute::(self.extract(14).overflowing_abs().0) }, - unsafe { transmute::(self.extract(15).overflowing_abs().0) }) + Self::Out::new( + unsafe { transmute::(self.extract(0).overflowing_abs().0) }, + unsafe { transmute::(self.extract(1).overflowing_abs().0) }, + unsafe { transmute::(self.extract(2).overflowing_abs().0) }, + unsafe { transmute::(self.extract(3).overflowing_abs().0) }, + unsafe { transmute::(self.extract(4).overflowing_abs().0) }, + unsafe { transmute::(self.extract(5).overflowing_abs().0) }, + unsafe { transmute::(self.extract(6).overflowing_abs().0) }, + unsafe { transmute::(self.extract(7).overflowing_abs().0) }, + unsafe { transmute::(self.extract(8).overflowing_abs().0) }, + unsafe { transmute::(self.extract(9).overflowing_abs().0) }, + unsafe { transmute::(self.extract(10).overflowing_abs().0) }, + unsafe { transmute::(self.extract(11).overflowing_abs().0) }, + unsafe { transmute::(self.extract(12).overflowing_abs().0) }, + unsafe { transmute::(self.extract(13).overflowing_abs().0) }, + unsafe { transmute::(self.extract(14).overflowing_abs().0) }, + unsafe { transmute::(self.extract(15).overflowing_abs().0) }, + ) } } @@ -180,14 +195,16 @@ impl Abs for i32x8 { #[inline(always)] fn abs(&self) -> Self::Out { - Self::Out::new(unsafe { transmute::(self.extract(0).overflowing_abs().0) }, - unsafe { transmute::(self.extract(1).overflowing_abs().0) }, - unsafe { transmute::(self.extract(2).overflowing_abs().0) }, - unsafe { transmute::(self.extract(3).overflowing_abs().0) }, - unsafe { transmute::(self.extract(4).overflowing_abs().0) }, - unsafe { transmute::(self.extract(5).overflowing_abs().0) }, - unsafe { transmute::(self.extract(6).overflowing_abs().0) }, - unsafe { transmute::(self.extract(7).overflowing_abs().0) }) + Self::Out::new( + unsafe { transmute::(self.extract(0).overflowing_abs().0) }, + unsafe { transmute::(self.extract(1).overflowing_abs().0) }, + unsafe { transmute::(self.extract(2).overflowing_abs().0) }, + unsafe { transmute::(self.extract(3).overflowing_abs().0) }, + unsafe { transmute::(self.extract(4).overflowing_abs().0) }, + unsafe { transmute::(self.extract(5).overflowing_abs().0) }, + unsafe { transmute::(self.extract(6).overflowing_abs().0) }, + unsafe { transmute::(self.extract(7).overflowing_abs().0) }, + ) } } @@ -196,8 +213,10 @@ impl Abs for i64x2 { #[inline(always)] fn abs(&self) -> Self::Out { - Self::Out::new(unsafe { transmute::(self.extract(0).overflowing_abs().0) }, - unsafe { transmute::(self.extract(1).overflowing_abs().0) }) + Self::Out::new( + unsafe { transmute::(self.extract(0).overflowing_abs().0) }, + unsafe { transmute::(self.extract(1).overflowing_abs().0) }, + ) } } @@ -206,10 +225,12 @@ impl Abs for i64x4 { #[inline(always)] fn abs(&self) -> Self::Out { - Self::Out::new(unsafe { transmute::(self.extract(0).overflowing_abs().0) }, - unsafe { transmute::(self.extract(1).overflowing_abs().0) }, - unsafe { transmute::(self.extract(2).overflowing_abs().0) }, - unsafe { transmute::(self.extract(3).overflowing_abs().0) }) + Self::Out::new( + unsafe { transmute::(self.extract(0).overflowing_abs().0) }, + unsafe { transmute::(self.extract(1).overflowing_abs().0) }, + unsafe { transmute::(self.extract(2).overflowing_abs().0) }, + unsafe { transmute::(self.extract(3).overflowing_abs().0) }, + ) } } @@ -218,14 +239,15 @@ impl Abs for i64x8 { #[inline(always)] fn abs(&self) -> Self::Out { - Self::Out::new(unsafe { transmute::(self.extract(0).overflowing_abs().0) }, - unsafe { transmute::(self.extract(1).overflowing_abs().0) }, - unsafe { transmute::(self.extract(2).overflowing_abs().0) }, - unsafe { transmute::(self.extract(3).overflowing_abs().0) }, - unsafe { transmute::(self.extract(4).overflowing_abs().0) }, - unsafe { transmute::(self.extract(5).overflowing_abs().0) }, - unsafe { transmute::(self.extract(6).overflowing_abs().0) }, - unsafe { transmute::(self.extract(7).overflowing_abs().0) }) + Self::Out::new( + unsafe { transmute::(self.extract(0).overflowing_abs().0) }, + unsafe { transmute::(self.extract(1).overflowing_abs().0) }, + unsafe { transmute::(self.extract(2).overflowing_abs().0) }, + unsafe { transmute::(self.extract(3).overflowing_abs().0) }, + unsafe { transmute::(self.extract(4).overflowing_abs().0) }, + unsafe { transmute::(self.extract(5).overflowing_abs().0) }, + unsafe { transmute::(self.extract(6).overflowing_abs().0) }, + unsafe { transmute::(self.extract(7).overflowing_abs().0) }, + ) } } - diff --git a/src/arch/unknown/intrin/cmp.rs b/src/arch/unknown/intrin/cmp.rs index c30c2cc..d2acf1d 100644 --- a/src/arch/unknown/intrin/cmp.rs +++ b/src/arch/unknown/intrin/cmp.rs @@ -6,8 +6,8 @@ // file, You can obtain one at http://mozilla.org/MPL/2.0/. use crate::arch::current::vecs::*; -use crate::vecs::*; use crate::intrin::cmp::*; +use crate::vecs::*; rust_fallback_impl_binary! { impl Cmp for u8x16 where "__undefined" { diff --git a/src/arch/unknown/intrin/destride.rs b/src/arch/unknown/intrin/destride.rs index 5408c44..fc748b4 100644 --- a/src/arch/unknown/intrin/destride.rs +++ b/src/arch/unknown/intrin/destride.rs @@ -23,7 +23,9 @@ impl Destride for u8x16 { impl Destride for u8x32 { #[inline(always)] fn destride_two(self, other: Self) -> (Self, Self) { - destride_two_polyfill!(self, other, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30) + destride_two_polyfill!( + self, other, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + ) } #[inline(always)] @@ -47,7 +49,9 @@ impl Destride for i8x16 { impl Destride for i8x32 { #[inline(always)] fn destride_two(self, other: Self) -> (Self, Self) { - destride_two_polyfill!(self, other, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30) + destride_two_polyfill!( + self, other, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + ) } #[inline(always)] diff --git a/src/arch/unknown/intrin/downcast.rs b/src/arch/unknown/intrin/downcast.rs index e510b9d..b28ae56 100644 --- a/src/arch/unknown/intrin/downcast.rs +++ b/src/arch/unknown/intrin/downcast.rs @@ -11,263 +11,291 @@ use crate::intrin::downcast::*; impl Downcast for i32x4 { #[inline(always)] fn saturating_downcast(self, other: Self) -> i16x8 { - i16x8::new(self.extract(0).min(0x00007FFF).max(-0x00008000) as i16, - self.extract(1).min(0x00007FFF).max(-0x00008000) as i16, - self.extract(2).min(0x00007FFF).max(-0x00008000) as i16, - self.extract(3).min(0x00007FFF).max(-0x00008000) as i16, - other.extract(0).min(0x00007FFF).max(-0x00008000) as i16, - other.extract(1).min(0x00007FFF).max(-0x00008000) as i16, - other.extract(2).min(0x00007FFF).max(-0x00008000) as i16, - other.extract(3).min(0x00007FFF).max(-0x00008000) as i16) + i16x8::new( + self.extract(0).min(0x00007FFF).max(-0x00008000) as i16, + self.extract(1).min(0x00007FFF).max(-0x00008000) as i16, + self.extract(2).min(0x00007FFF).max(-0x00008000) as i16, + self.extract(3).min(0x00007FFF).max(-0x00008000) as i16, + other.extract(0).min(0x00007FFF).max(-0x00008000) as i16, + other.extract(1).min(0x00007FFF).max(-0x00008000) as i16, + other.extract(2).min(0x00007FFF).max(-0x00008000) as i16, + other.extract(3).min(0x00007FFF).max(-0x00008000) as i16, + ) } } impl Downcast for i64x2 { #[inline(always)] fn saturating_downcast(self, other: Self) -> i32x4 { - i32x4::new(self.extract(0).min(0x7FFFFFFF).max(-0x80000000) as i32, - self.extract(1).min(0x7FFFFFFF).max(-0x80000000) as i32, - other.extract(0).min(0x7FFFFFFF).max(-0x80000000) as i32, - other.extract(1).min(0x7FFFFFFF).max(-0x80000000) as i32) + i32x4::new( + self.extract(0).min(0x7FFFFFFF).max(-0x80000000) as i32, + self.extract(1).min(0x7FFFFFFF).max(-0x80000000) as i32, + other.extract(0).min(0x7FFFFFFF).max(-0x80000000) as i32, + other.extract(1).min(0x7FFFFFFF).max(-0x80000000) as i32, + ) } } impl Downcast for u64x2 { #[inline(always)] fn saturating_downcast(self, other: Self) -> u32x4 { - u32x4::new(self.extract(0).min(0xFFFFFFFF) as u32, - self.extract(1).min(0x7FFFFFFF) as u32, - other.extract(0).min(0x7FFFFFFF) as u32, - other.extract(1).min(0x7FFFFFFF) as u32) + u32x4::new( + self.extract(0).min(0xFFFFFFFF) as u32, + self.extract(1).min(0x7FFFFFFF) as u32, + other.extract(0).min(0x7FFFFFFF) as u32, + other.extract(1).min(0x7FFFFFFF) as u32, + ) } } impl Downcast for f64x2 { #[inline(always)] fn saturating_downcast(self, other: Self) -> f32x4 { - f32x4::new(self.extract(0) as f32, - self.extract(1) as f32, - other.extract(0) as f32, - other.extract(1) as f32) + f32x4::new( + self.extract(0) as f32, + self.extract(1) as f32, + other.extract(0) as f32, + other.extract(1) as f32, + ) } } impl Downcast for i16x8 { #[inline(always)] fn saturating_downcast(self, other: Self) -> i8x16 { - i8x16::new(self.extract(0).min(0x007F).max(-0x0080) as i8, - self.extract(1).min(0x007F).max(-0x0080) as i8, - self.extract(2).min(0x007F).max(-0x0080) as i8, - self.extract(3).min(0x007F).max(-0x0080) as i8, - self.extract(4).min(0x007F).max(-0x0080) as i8, - self.extract(5).min(0x007F).max(-0x0080) as i8, - self.extract(6).min(0x007F).max(-0x0080) as i8, - self.extract(7).min(0x007F).max(-0x0080) as i8, - other.extract(0).min(0x007F).max(-0x0080) as i8, - other.extract(1).min(0x007F).max(-0x0080) as i8, - other.extract(2).min(0x007F).max(-0x0080) as i8, - other.extract(3).min(0x007F).max(-0x0080) as i8, - other.extract(4).min(0x007F).max(-0x0080) as i8, - other.extract(5).min(0x007F).max(-0x0080) as i8, - other.extract(6).min(0x007F).max(-0x0080) as i8, - other.extract(7).min(0x007F).max(-0x0080) as i8) + i8x16::new( + self.extract(0).min(0x007F).max(-0x0080) as i8, + self.extract(1).min(0x007F).max(-0x0080) as i8, + self.extract(2).min(0x007F).max(-0x0080) as i8, + self.extract(3).min(0x007F).max(-0x0080) as i8, + self.extract(4).min(0x007F).max(-0x0080) as i8, + self.extract(5).min(0x007F).max(-0x0080) as i8, + self.extract(6).min(0x007F).max(-0x0080) as i8, + self.extract(7).min(0x007F).max(-0x0080) as i8, + other.extract(0).min(0x007F).max(-0x0080) as i8, + other.extract(1).min(0x007F).max(-0x0080) as i8, + other.extract(2).min(0x007F).max(-0x0080) as i8, + other.extract(3).min(0x007F).max(-0x0080) as i8, + other.extract(4).min(0x007F).max(-0x0080) as i8, + other.extract(5).min(0x007F).max(-0x0080) as i8, + other.extract(6).min(0x007F).max(-0x0080) as i8, + other.extract(7).min(0x007F).max(-0x0080) as i8, + ) } } impl Downcast for u32x4 { #[inline(always)] fn saturating_downcast(self, other: Self) -> u16x8 { - u16x8::new(self.extract(0).min(0x0000FFFF) as u16, - self.extract(1).min(0x0000FFFF) as u16, - self.extract(2).min(0x0000FFFF) as u16, - self.extract(3).min(0x0000FFFF) as u16, - other.extract(0).min(0x0000FFFF) as u16, - other.extract(1).min(0x0000FFFF) as u16, - other.extract(2).min(0x0000FFFF) as u16, - other.extract(3).min(0x0000FFFF) as u16) + u16x8::new( + self.extract(0).min(0x0000FFFF) as u16, + self.extract(1).min(0x0000FFFF) as u16, + self.extract(2).min(0x0000FFFF) as u16, + self.extract(3).min(0x0000FFFF) as u16, + other.extract(0).min(0x0000FFFF) as u16, + other.extract(1).min(0x0000FFFF) as u16, + other.extract(2).min(0x0000FFFF) as u16, + other.extract(3).min(0x0000FFFF) as u16, + ) } } impl Downcast for u16x8 { #[inline(always)] fn saturating_downcast(self, other: Self) -> u8x16 { - u8x16::new(self.extract(0).min(0x00FF) as u8, - self.extract(1).min(0x00FF) as u8, - self.extract(2).min(0x00FF) as u8, - self.extract(3).min(0x00FF) as u8, - self.extract(4).min(0x00FF) as u8, - self.extract(5).min(0x00FF) as u8, - self.extract(6).min(0x00FF) as u8, - self.extract(7).min(0x00FF) as u8, - other.extract(0).min(0x00FF) as u8, - other.extract(1).min(0x00FF) as u8, - other.extract(2).min(0x00FF) as u8, - other.extract(3).min(0x00FF) as u8, - other.extract(4).min(0x00FF) as u8, - other.extract(5).min(0x00FF) as u8, - other.extract(6).min(0x00FF) as u8, - other.extract(7).min(0x00FF) as u8) + u8x16::new( + self.extract(0).min(0x00FF) as u8, + self.extract(1).min(0x00FF) as u8, + self.extract(2).min(0x00FF) as u8, + self.extract(3).min(0x00FF) as u8, + self.extract(4).min(0x00FF) as u8, + self.extract(5).min(0x00FF) as u8, + self.extract(6).min(0x00FF) as u8, + self.extract(7).min(0x00FF) as u8, + other.extract(0).min(0x00FF) as u8, + other.extract(1).min(0x00FF) as u8, + other.extract(2).min(0x00FF) as u8, + other.extract(3).min(0x00FF) as u8, + other.extract(4).min(0x00FF) as u8, + other.extract(5).min(0x00FF) as u8, + other.extract(6).min(0x00FF) as u8, + other.extract(7).min(0x00FF) as u8, + ) } } impl Downcast for i32x8 { #[inline(always)] fn saturating_downcast(self, other: Self) -> i16x16 { - i16x16::new(self.extract(0).min(0x00007FFF).max(-0x00008000) as i16, - self.extract(1).min(0x00007FFF).max(-0x00008000) as i16, - self.extract(2).min(0x00007FFF).max(-0x00008000) as i16, - self.extract(3).min(0x00007FFF).max(-0x00008000) as i16, - self.extract(4).min(0x00007FFF).max(-0x00008000) as i16, - self.extract(5).min(0x00007FFF).max(-0x00008000) as i16, - self.extract(6).min(0x00007FFF).max(-0x00008000) as i16, - self.extract(7).min(0x00007FFF).max(-0x00008000) as i16, - other.extract(0).min(0x00007FFF).max(-0x00008000) as i16, - other.extract(1).min(0x00007FFF).max(-0x00008000) as i16, - other.extract(2).min(0x00007FFF).max(-0x00008000) as i16, - other.extract(3).min(0x00007FFF).max(-0x00008000) as i16, - other.extract(4).min(0x00007FFF).max(-0x00008000) as i16, - other.extract(5).min(0x00007FFF).max(-0x00008000) as i16, - other.extract(6).min(0x00007FFF).max(-0x00008000) as i16, - other.extract(7).min(0x00007FFF).max(-0x00008000) as i16) + i16x16::new( + self.extract(0).min(0x00007FFF).max(-0x00008000) as i16, + self.extract(1).min(0x00007FFF).max(-0x00008000) as i16, + self.extract(2).min(0x00007FFF).max(-0x00008000) as i16, + self.extract(3).min(0x00007FFF).max(-0x00008000) as i16, + self.extract(4).min(0x00007FFF).max(-0x00008000) as i16, + self.extract(5).min(0x00007FFF).max(-0x00008000) as i16, + self.extract(6).min(0x00007FFF).max(-0x00008000) as i16, + self.extract(7).min(0x00007FFF).max(-0x00008000) as i16, + other.extract(0).min(0x00007FFF).max(-0x00008000) as i16, + other.extract(1).min(0x00007FFF).max(-0x00008000) as i16, + other.extract(2).min(0x00007FFF).max(-0x00008000) as i16, + other.extract(3).min(0x00007FFF).max(-0x00008000) as i16, + other.extract(4).min(0x00007FFF).max(-0x00008000) as i16, + other.extract(5).min(0x00007FFF).max(-0x00008000) as i16, + other.extract(6).min(0x00007FFF).max(-0x00008000) as i16, + other.extract(7).min(0x00007FFF).max(-0x00008000) as i16, + ) } } impl Downcast for i16x16 { #[inline(always)] fn saturating_downcast(self, other: Self) -> i8x32 { - i8x32::new(self.extract(0).min(0x007F).max(-0x0080) as i8, - self.extract(1).min(0x007F).max(-0x0080) as i8, - self.extract(2).min(0x007F).max(-0x0080) as i8, - self.extract(3).min(0x007F).max(-0x0080) as i8, - self.extract(4).min(0x007F).max(-0x0080) as i8, - self.extract(5).min(0x007F).max(-0x0080) as i8, - self.extract(6).min(0x007F).max(-0x0080) as i8, - self.extract(7).min(0x007F).max(-0x0080) as i8, - self.extract(8).min(0x007F).max(-0x0080) as i8, - self.extract(9).min(0x007F).max(-0x0080) as i8, - self.extract(10).min(0x007F).max(-0x0080) as i8, - self.extract(11).min(0x007F).max(-0x0080) as i8, - self.extract(12).min(0x007F).max(-0x0080) as i8, - self.extract(13).min(0x007F).max(-0x0080) as i8, - self.extract(14).min(0x007F).max(-0x0080) as i8, - self.extract(15).min(0x007F).max(-0x0080) as i8, - other.extract(0).min(0x007F).max(-0x0080) as i8, - other.extract(1).min(0x007F).max(-0x0080) as i8, - other.extract(2).min(0x007F).max(-0x0080) as i8, - other.extract(3).min(0x007F).max(-0x0080) as i8, - other.extract(4).min(0x007F).max(-0x0080) as i8, - other.extract(5).min(0x007F).max(-0x0080) as i8, - other.extract(6).min(0x007F).max(-0x0080) as i8, - other.extract(7).min(0x007F).max(-0x0080) as i8, - other.extract(8).min(0x007F).max(-0x0080) as i8, - other.extract(9).min(0x007F).max(-0x0080) as i8, - other.extract(10).min(0x007F).max(-0x0080) as i8, - other.extract(11).min(0x007F).max(-0x0080) as i8, - other.extract(12).min(0x007F).max(-0x0080) as i8, - other.extract(13).min(0x007F).max(-0x0080) as i8, - other.extract(14).min(0x007F).max(-0x0080) as i8, - other.extract(15).min(0x007F).max(-0x0080) as i8) + i8x32::new( + self.extract(0).min(0x007F).max(-0x0080) as i8, + self.extract(1).min(0x007F).max(-0x0080) as i8, + self.extract(2).min(0x007F).max(-0x0080) as i8, + self.extract(3).min(0x007F).max(-0x0080) as i8, + self.extract(4).min(0x007F).max(-0x0080) as i8, + self.extract(5).min(0x007F).max(-0x0080) as i8, + self.extract(6).min(0x007F).max(-0x0080) as i8, + self.extract(7).min(0x007F).max(-0x0080) as i8, + self.extract(8).min(0x007F).max(-0x0080) as i8, + self.extract(9).min(0x007F).max(-0x0080) as i8, + self.extract(10).min(0x007F).max(-0x0080) as i8, + self.extract(11).min(0x007F).max(-0x0080) as i8, + self.extract(12).min(0x007F).max(-0x0080) as i8, + self.extract(13).min(0x007F).max(-0x0080) as i8, + self.extract(14).min(0x007F).max(-0x0080) as i8, + self.extract(15).min(0x007F).max(-0x0080) as i8, + other.extract(0).min(0x007F).max(-0x0080) as i8, + other.extract(1).min(0x007F).max(-0x0080) as i8, + other.extract(2).min(0x007F).max(-0x0080) as i8, + other.extract(3).min(0x007F).max(-0x0080) as i8, + other.extract(4).min(0x007F).max(-0x0080) as i8, + other.extract(5).min(0x007F).max(-0x0080) as i8, + other.extract(6).min(0x007F).max(-0x0080) as i8, + other.extract(7).min(0x007F).max(-0x0080) as i8, + other.extract(8).min(0x007F).max(-0x0080) as i8, + other.extract(9).min(0x007F).max(-0x0080) as i8, + other.extract(10).min(0x007F).max(-0x0080) as i8, + other.extract(11).min(0x007F).max(-0x0080) as i8, + other.extract(12).min(0x007F).max(-0x0080) as i8, + other.extract(13).min(0x007F).max(-0x0080) as i8, + other.extract(14).min(0x007F).max(-0x0080) as i8, + other.extract(15).min(0x007F).max(-0x0080) as i8, + ) } } impl Downcast for u32x8 { #[inline(always)] fn saturating_downcast(self, other: Self) -> u16x16 { - u16x16::new(self.extract(0).min(0x0000FFFF) as u16, - self.extract(1).min(0x0000FFFF) as u16, - self.extract(2).min(0x0000FFFF) as u16, - self.extract(3).min(0x0000FFFF) as u16, - self.extract(4).min(0x0000FFFF) as u16, - self.extract(5).min(0x0000FFFF) as u16, - self.extract(6).min(0x0000FFFF) as u16, - self.extract(7).min(0x0000FFFF) as u16, - other.extract(0).min(0x0000FFFF) as u16, - other.extract(1).min(0x0000FFFF) as u16, - other.extract(2).min(0x0000FFFF) as u16, - other.extract(3).min(0x0000FFFF) as u16, - other.extract(4).min(0x0000FFFF) as u16, - other.extract(5).min(0x0000FFFF) as u16, - other.extract(6).min(0x0000FFFF) as u16, - other.extract(7).min(0x0000FFFF) as u16) + u16x16::new( + self.extract(0).min(0x0000FFFF) as u16, + self.extract(1).min(0x0000FFFF) as u16, + self.extract(2).min(0x0000FFFF) as u16, + self.extract(3).min(0x0000FFFF) as u16, + self.extract(4).min(0x0000FFFF) as u16, + self.extract(5).min(0x0000FFFF) as u16, + self.extract(6).min(0x0000FFFF) as u16, + self.extract(7).min(0x0000FFFF) as u16, + other.extract(0).min(0x0000FFFF) as u16, + other.extract(1).min(0x0000FFFF) as u16, + other.extract(2).min(0x0000FFFF) as u16, + other.extract(3).min(0x0000FFFF) as u16, + other.extract(4).min(0x0000FFFF) as u16, + other.extract(5).min(0x0000FFFF) as u16, + other.extract(6).min(0x0000FFFF) as u16, + other.extract(7).min(0x0000FFFF) as u16, + ) } } impl Downcast for u16x16 { #[inline(always)] fn saturating_downcast(self, other: Self) -> u8x32 { - u8x32::new(self.extract(0).min(0x00FF) as u8, - self.extract(1).min(0x00FF) as u8, - self.extract(2).min(0x00FF) as u8, - self.extract(3).min(0x00FF) as u8, - self.extract(4).min(0x00FF) as u8, - self.extract(5).min(0x00FF) as u8, - self.extract(6).min(0x00FF) as u8, - self.extract(7).min(0x00FF) as u8, - self.extract(8).min(0x00FF) as u8, - self.extract(9).min(0x00FF) as u8, - self.extract(10).min(0x00FF) as u8, - self.extract(11).min(0x00FF) as u8, - self.extract(12).min(0x00FF) as u8, - self.extract(13).min(0x00FF) as u8, - self.extract(14).min(0x00FF) as u8, - self.extract(15).min(0x00FF) as u8, - other.extract(0).min(0x00FF) as u8, - other.extract(1).min(0x00FF) as u8, - other.extract(2).min(0x00FF) as u8, - other.extract(3).min(0x00FF) as u8, - other.extract(4).min(0x00FF) as u8, - other.extract(5).min(0x00FF) as u8, - other.extract(6).min(0x00FF) as u8, - other.extract(7).min(0x00FF) as u8, - other.extract(8).min(0x00FF) as u8, - other.extract(9).min(0x00FF) as u8, - other.extract(10).min(0x00FF) as u8, - other.extract(11).min(0x00FF) as u8, - other.extract(12).min(0x00FF) as u8, - other.extract(13).min(0x00FF) as u8, - other.extract(14).min(0x00FF) as u8, - other.extract(15).min(0x00FF) as u8) + u8x32::new( + self.extract(0).min(0x00FF) as u8, + self.extract(1).min(0x00FF) as u8, + self.extract(2).min(0x00FF) as u8, + self.extract(3).min(0x00FF) as u8, + self.extract(4).min(0x00FF) as u8, + self.extract(5).min(0x00FF) as u8, + self.extract(6).min(0x00FF) as u8, + self.extract(7).min(0x00FF) as u8, + self.extract(8).min(0x00FF) as u8, + self.extract(9).min(0x00FF) as u8, + self.extract(10).min(0x00FF) as u8, + self.extract(11).min(0x00FF) as u8, + self.extract(12).min(0x00FF) as u8, + self.extract(13).min(0x00FF) as u8, + self.extract(14).min(0x00FF) as u8, + self.extract(15).min(0x00FF) as u8, + other.extract(0).min(0x00FF) as u8, + other.extract(1).min(0x00FF) as u8, + other.extract(2).min(0x00FF) as u8, + other.extract(3).min(0x00FF) as u8, + other.extract(4).min(0x00FF) as u8, + other.extract(5).min(0x00FF) as u8, + other.extract(6).min(0x00FF) as u8, + other.extract(7).min(0x00FF) as u8, + other.extract(8).min(0x00FF) as u8, + other.extract(9).min(0x00FF) as u8, + other.extract(10).min(0x00FF) as u8, + other.extract(11).min(0x00FF) as u8, + other.extract(12).min(0x00FF) as u8, + other.extract(13).min(0x00FF) as u8, + other.extract(14).min(0x00FF) as u8, + other.extract(15).min(0x00FF) as u8, + ) } } impl Downcast for i64x4 { #[inline(always)] fn saturating_downcast(self, other: Self) -> i32x8 { - i32x8::new(self.extract(0).min(0x7FFFFFFF).max(-0x80000000) as i32, - self.extract(1).min(0x7FFFFFFF).max(-0x80000000) as i32, - self.extract(2).min(0x7FFFFFFF).max(-0x80000000) as i32, - self.extract(3).min(0x7FFFFFFF).max(-0x80000000) as i32, - other.extract(0).min(0x7FFFFFFF).max(-0x80000000) as i32, - other.extract(1).min(0x7FFFFFFF).max(-0x80000000) as i32, - other.extract(2).min(0x7FFFFFFF).max(-0x80000000) as i32, - other.extract(3).min(0x7FFFFFFF).max(-0x80000000) as i32) + i32x8::new( + self.extract(0).min(0x7FFFFFFF).max(-0x80000000) as i32, + self.extract(1).min(0x7FFFFFFF).max(-0x80000000) as i32, + self.extract(2).min(0x7FFFFFFF).max(-0x80000000) as i32, + self.extract(3).min(0x7FFFFFFF).max(-0x80000000) as i32, + other.extract(0).min(0x7FFFFFFF).max(-0x80000000) as i32, + other.extract(1).min(0x7FFFFFFF).max(-0x80000000) as i32, + other.extract(2).min(0x7FFFFFFF).max(-0x80000000) as i32, + other.extract(3).min(0x7FFFFFFF).max(-0x80000000) as i32, + ) } } impl Downcast for u64x4 { #[inline(always)] fn saturating_downcast(self, other: Self) -> u32x8 { - u32x8::new(self.extract(0).min(0xFFFFFFFF) as u32, - self.extract(1).min(0x7FFFFFFF) as u32, - self.extract(2).min(0xFFFFFFFF) as u32, - self.extract(3).min(0x7FFFFFFF) as u32, - other.extract(0).min(0x7FFFFFFF) as u32, - other.extract(1).min(0x7FFFFFFF) as u32, - other.extract(2).min(0x7FFFFFFF) as u32, - other.extract(3).min(0x7FFFFFFF) as u32) + u32x8::new( + self.extract(0).min(0xFFFFFFFF) as u32, + self.extract(1).min(0x7FFFFFFF) as u32, + self.extract(2).min(0xFFFFFFFF) as u32, + self.extract(3).min(0x7FFFFFFF) as u32, + other.extract(0).min(0x7FFFFFFF) as u32, + other.extract(1).min(0x7FFFFFFF) as u32, + other.extract(2).min(0x7FFFFFFF) as u32, + other.extract(3).min(0x7FFFFFFF) as u32, + ) } } impl Downcast for f64x4 { #[inline(always)] fn saturating_downcast(self, other: Self) -> f32x8 { - f32x8::new(self.extract(0) as f32, - self.extract(1) as f32, - self.extract(2) as f32, - self.extract(3) as f32, - other.extract(0) as f32, - other.extract(1) as f32, - other.extract(2) as f32, - other.extract(3) as f32) + f32x8::new( + self.extract(0) as f32, + self.extract(1) as f32, + self.extract(2) as f32, + self.extract(3) as f32, + other.extract(0) as f32, + other.extract(1) as f32, + other.extract(2) as f32, + other.extract(3) as f32, + ) } } diff --git a/src/arch/unknown/intrin/endian.rs b/src/arch/unknown/intrin/endian.rs index 7960f65..310284d 100644 --- a/src/arch/unknown/intrin/endian.rs +++ b/src/arch/unknown/intrin/endian.rs @@ -6,40 +6,91 @@ // file, You can obtain one at http://mozilla.org/MPL/2.0/. use crate::arch::current::vecs::*; -use crate::vecs::*; use crate::intrin::endian::*; +use crate::vecs::*; -impl_packed_swap_bytes!(u8x16, u8x16, "__undefined", __undefined, - (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), - (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)); -impl_packed_swap_bytes!(i8x16, u8x16, "__undefined", __undefined, - (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), - (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)); -impl_packed_swap_bytes!(u16x8, u8x16, "__undefined", __undefined, - (1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14), - (0, 1, 2, 3, 4, 5, 6, 7)); -impl_packed_swap_bytes!(i16x8, u8x16, "__undefined", __undefined, - (1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14), - (0, 1, 2, 3, 4, 5, 6, 7)); -impl_packed_swap_bytes!(u32x4, u8x16, "__undefined", __undefined, - (3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12), - (0, 1, 2, 3)); -impl_packed_swap_bytes!(i32x4, u8x16, "__undefined", __undefined, - (3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12), - (0, 1, 2, 3)); -impl_packed_swap_bytes!(u64x2, u8x16, "__undefined", __undefined, - (7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8), - (0, 1)); -impl_packed_swap_bytes!(i64x2, u8x16, "__undefined", __undefined, - (7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8), - (0, 1)); +impl_packed_swap_bytes!( + u8x16, + u8x16, + "__undefined", + __undefined, + (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), + (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) +); +impl_packed_swap_bytes!( + i8x16, + u8x16, + "__undefined", + __undefined, + (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), + (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) +); +impl_packed_swap_bytes!( + u16x8, + u8x16, + "__undefined", + __undefined, + (1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14), + (0, 1, 2, 3, 4, 5, 6, 7) +); +impl_packed_swap_bytes!( + i16x8, + u8x16, + "__undefined", + __undefined, + (1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14), + (0, 1, 2, 3, 4, 5, 6, 7) +); +impl_packed_swap_bytes!( + u32x4, + u8x16, + "__undefined", + __undefined, + (3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12), + (0, 1, 2, 3) +); +impl_packed_swap_bytes!( + i32x4, + u8x16, + "__undefined", + __undefined, + (3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12), + (0, 1, 2, 3) +); +impl_packed_swap_bytes!( + u64x2, + u8x16, + "__undefined", + __undefined, + (7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8), + (0, 1) +); +impl_packed_swap_bytes!( + i64x2, + u8x16, + "__undefined", + __undefined, + (7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8), + (0, 1) +); mod tests { #![allow(unused_imports)] - use crate::prelude::*; use crate::arch::current::vecs::*; + use crate::prelude::*; - test_packed_swap_bytes!((u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, u64x2, i64x2), - (swap_bytes_u8x16, swap_bytes_i8x16, swap_bytes_u16x8, swap_bytes_i16x8, swap_bytes_u32x4, swap_bytes_i32x4, swap_bytes_u64x2, swap_bytes_i64x2)); + test_packed_swap_bytes!( + (u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, u64x2, i64x2), + ( + swap_bytes_u8x16, + swap_bytes_i8x16, + swap_bytes_u16x8, + swap_bytes_i16x8, + swap_bytes_u32x4, + swap_bytes_i32x4, + swap_bytes_u64x2, + swap_bytes_i64x2 + ) + ); } diff --git a/src/arch/unknown/intrin/eq.rs b/src/arch/unknown/intrin/eq.rs index c9deb49..cb808b9 100644 --- a/src/arch/unknown/intrin/eq.rs +++ b/src/arch/unknown/intrin/eq.rs @@ -5,8 +5,8 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::intrin::eq::*; use crate::arch::current::vecs::*; +use crate::intrin::eq::*; use crate::vecs::*; rust_fallback_eq! { @@ -71,8 +71,8 @@ rust_fallback_eq! { mod tests { #![allow(unused_imports)] - use crate::prelude::*; use crate::arch::current::vecs::*; + use crate::prelude::*; // test_packed_eq!(u8x64, u8, u8x64, u8, test_eq_u8x64); // test_packed_eq!(u8x32, u8, u8x32, u8, test_eq_u8x32); diff --git a/src/arch/unknown/intrin/hadd.rs b/src/arch/unknown/intrin/hadd.rs index 79fc424..fe40cdc 100644 --- a/src/arch/unknown/intrin/hadd.rs +++ b/src/arch/unknown/intrin/hadd.rs @@ -5,18 +5,76 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::intrin::hadd::*; -use crate::core::ops::Add; use crate::arch::current::vecs::*; +use crate::core::ops::Add; +use crate::intrin::hadd::*; use crate::vecs::*; -impl HAdd for u64x2 { hop!(hadd, Add::add, 0, 1); } -impl HAdd for u32x4 { hop!(hadd, Add::add, 0, 1, 2, 3); } -impl HAdd for u16x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } -impl HAdd for u8x16 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -impl HAdd for i64x2 { hop!(hadd, Add::add, 0, 1); } -impl HAdd for i32x4 { hop!(hadd, Add::add, 0, 1, 2, 3); } -impl HAdd for i16x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } -impl HAdd for i8x16 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -impl HAdd for f64x2 { hop!(hadd, Add::add, 0, 1); } -impl HAdd for f32x4 { hop!(hadd, Add::add, 0, 1, 2, 3); } +impl HAdd for u64x2 { + hop!(hadd, Add::add, 0, 1); +} +impl HAdd for u32x4 { + hop!(hadd, Add::add, 0, 1, 2, 3); +} +impl HAdd for u16x8 { + hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); +} +impl HAdd for u8x16 { + hop!( + hadd, + Add::add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} +impl HAdd for i64x2 { + hop!(hadd, Add::add, 0, 1); +} +impl HAdd for i32x4 { + hop!(hadd, Add::add, 0, 1, 2, 3); +} +impl HAdd for i16x8 { + hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); +} +impl HAdd for i8x16 { + hop!( + hadd, + Add::add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} +impl HAdd for f64x2 { + hop!(hadd, Add::add, 0, 1); +} +impl HAdd for f32x4 { + hop!(hadd, Add::add, 0, 1, 2, 3); +} diff --git a/src/arch/unknown/intrin/hsub.rs b/src/arch/unknown/intrin/hsub.rs index 420c812..bedb09c 100644 --- a/src/arch/unknown/intrin/hsub.rs +++ b/src/arch/unknown/intrin/hsub.rs @@ -6,17 +6,75 @@ // file, You can obtain one at http://mozilla.org/MPL/2.0/. use crate::arch::current::vecs::*; -use crate::vecs::*; -use crate::intrin::hsub::*; use crate::core::ops::Sub; +use crate::intrin::hsub::*; +use crate::vecs::*; -impl HSub for u64x2 { hop!(hsub, Sub::sub, 0, 1); } -impl HSub for u32x4 { hop!(hsub, Sub::sub, 0, 1, 2, 3); } -impl HSub for u16x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } -impl HSub for u8x16 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -impl HSub for i64x2 { hop!(hsub, Sub::sub, 0, 1); } -impl HSub for i32x4 { hop!(hsub, Sub::sub, 0, 1, 2, 3); } -impl HSub for i16x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } -impl HSub for i8x16 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -impl HSub for f64x2 { hop!(hsub, Sub::sub, 0, 1); } -impl HSub for f32x4 { hop!(hsub, Sub::sub, 0, 1, 2, 3); } +impl HSub for u64x2 { + hop!(hsub, Sub::sub, 0, 1); +} +impl HSub for u32x4 { + hop!(hsub, Sub::sub, 0, 1, 2, 3); +} +impl HSub for u16x8 { + hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); +} +impl HSub for u8x16 { + hop!( + hsub, + Sub::sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} +impl HSub for i64x2 { + hop!(hsub, Sub::sub, 0, 1); +} +impl HSub for i32x4 { + hop!(hsub, Sub::sub, 0, 1, 2, 3); +} +impl HSub for i16x8 { + hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); +} +impl HSub for i8x16 { + hop!( + hsub, + Sub::sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} +impl HSub for f64x2 { + hop!(hsub, Sub::sub, 0, 1); +} +impl HSub for f32x4 { + hop!(hsub, Sub::sub, 0, 1, 2, 3); +} diff --git a/src/arch/unknown/intrin/merge.rs b/src/arch/unknown/intrin/merge.rs index ffdd166..da1ca27 100644 --- a/src/arch/unknown/intrin/merge.rs +++ b/src/arch/unknown/intrin/merge.rs @@ -6,30 +6,187 @@ // file, You can obtain one at http://mozilla.org/MPL/2.0/. use crate::arch::current::vecs::*; -use crate::vecs::*; use crate::intrin::merge::*; +use crate::vecs::*; // Will produce fallback implementations only, so we get away with __undefined. -impl_packed_merge!(u8x16, u8x16, u8, __undefined, "__undefined", (0, 1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14, 15), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); -impl_packed_merge!(u16x8, u16x8, u16, __undefined, "__undefined", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); -impl_packed_merge!(u32x4, u32x4, u32, __undefined, "__undefined", (0, 1), (2, 3), 0, 1, 2, 3); -impl_packed_merge!(u64x2, u64x2, u64, __undefined, "__undefined", (0), (1), 0, 1); -impl_packed_merge!(i8x16, u8x16, u8, __undefined, "__undefined", (0, 1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14, 15), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); -impl_packed_merge!(i16x8, u16x8, u16, __undefined, "__undefined", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); -impl_packed_merge!(i32x4, u32x4, u32, __undefined, "__undefined", (0, 1), (2, 3), 0, 1, 2, 3); -impl_packed_merge!(i64x2, u64x2, u64, __undefined, "__undefined", (0), (1), 0, 1); -impl_packed_merge!(f32x4, u32x4, u32, __undefined, "__undefined", (0, 1), (2, 3), 0, 1, 2, 3); -impl_packed_merge!(f64x2, u64x2, u64, __undefined, "__undefined", (0), (1), 0, 1); +impl_packed_merge!( + u8x16, + u8x16, + u8, + __undefined, + "__undefined", + (0, 1, 2, 3, 4, 5, 6, 7), + (8, 9, 10, 11, 12, 13, 14, 15), + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 +); +impl_packed_merge!( + u16x8, + u16x8, + u16, + __undefined, + "__undefined", + (0, 1, 2, 3), + (4, 5, 6, 7), + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7 +); +impl_packed_merge!( + u32x4, + u32x4, + u32, + __undefined, + "__undefined", + (0, 1), + (2, 3), + 0, + 1, + 2, + 3 +); +impl_packed_merge!( + u64x2, + u64x2, + u64, + __undefined, + "__undefined", + (0), + (1), + 0, + 1 +); +impl_packed_merge!( + i8x16, + u8x16, + u8, + __undefined, + "__undefined", + (0, 1, 2, 3, 4, 5, 6, 7), + (8, 9, 10, 11, 12, 13, 14, 15), + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 +); +impl_packed_merge!( + i16x8, + u16x8, + u16, + __undefined, + "__undefined", + (0, 1, 2, 3), + (4, 5, 6, 7), + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7 +); +impl_packed_merge!( + i32x4, + u32x4, + u32, + __undefined, + "__undefined", + (0, 1), + (2, 3), + 0, + 1, + 2, + 3 +); +impl_packed_merge!( + i64x2, + u64x2, + u64, + __undefined, + "__undefined", + (0), + (1), + 0, + 1 +); +impl_packed_merge!( + f32x4, + u32x4, + u32, + __undefined, + "__undefined", + (0, 1), + (2, 3), + 0, + 1, + 2, + 3 +); +impl_packed_merge!( + f64x2, + u64x2, + u64, + __undefined, + "__undefined", + (0), + (1), + 0, + 1 +); mod tests { #![allow(unused_imports)] - use crate::prelude::*; use crate::arch::current::vecs::*; + use crate::prelude::*; // TODO: Which ones do we really need? test_packed_merge!( (u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, f32x4, u64x2, i64x2, f64x2), - (merge_u8x16, merge_i8x16, merge_u16x8, merge_i16x8, merge_u32x4, merge_i32x4, merge_f32x4, merge_u64x2, merge_i64x2, merge_f64x2) + ( + merge_u8x16, + merge_i8x16, + merge_u16x8, + merge_i16x8, + merge_u32x4, + merge_i32x4, + merge_f32x4, + merge_u64x2, + merge_i64x2, + merge_f64x2 + ) ); } diff --git a/src/arch/unknown/intrin/mod.rs b/src/arch/unknown/intrin/mod.rs index 8998477..254d78e 100644 --- a/src/arch/unknown/intrin/mod.rs +++ b/src/arch/unknown/intrin/mod.rs @@ -19,10 +19,10 @@ mod round; mod rsqrt; mod saturating_add; mod saturating_hadd; -mod saturating_sub; mod saturating_hsub; -mod sum; +mod saturating_sub; mod sqrt; +mod sum; mod transmute; mod upcast; @@ -43,8 +43,8 @@ pub mod prelude { pub use super::saturating_hadd::*; pub use super::saturating_hsub::*; pub use super::saturating_sub::*; - pub use super::sum::*; pub use super::sqrt::*; + pub use super::sum::*; pub use super::transmute::*; pub use super::upcast::*; } diff --git a/src/arch/unknown/intrin/recip.rs b/src/arch/unknown/intrin/recip.rs index fdf61fb..0a338cd 100644 --- a/src/arch/unknown/intrin/recip.rs +++ b/src/arch/unknown/intrin/recip.rs @@ -6,8 +6,8 @@ // file, You can obtain one at http://mozilla.org/MPL/2.0/. use crate::arch::current::vecs::*; -use crate::vecs::*; use crate::intrin::recip::Recip; +use crate::vecs::*; rust_fallback_impl! { impl Recip for f32x4 where "__undefined" { diff --git a/src/arch/unknown/intrin/round.rs b/src/arch/unknown/intrin/round.rs index a5158ae..1e4ba88 100644 --- a/src/arch/unknown/intrin/round.rs +++ b/src/arch/unknown/intrin/round.rs @@ -5,8 +5,8 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::intrin::round::Round; use crate::arch::current::vecs::*; +use crate::intrin::round::Round; use crate::vecs::*; rust_fallback_impl! { diff --git a/src/arch/unknown/intrin/rsqrt.rs b/src/arch/unknown/intrin/rsqrt.rs index d089a4b..a74cbff 100644 --- a/src/arch/unknown/intrin/rsqrt.rs +++ b/src/arch/unknown/intrin/rsqrt.rs @@ -5,8 +5,8 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::intrin::rsqrt::*; use crate::arch::current::vecs::*; +use crate::intrin::rsqrt::*; use crate::vecs::*; // TODO: Guards and non-simd diff --git a/src/arch/unknown/intrin/saturating_add.rs b/src/arch/unknown/intrin/saturating_add.rs index 530ec70..c8b3ae7 100644 --- a/src/arch/unknown/intrin/saturating_add.rs +++ b/src/arch/unknown/intrin/saturating_add.rs @@ -6,8 +6,8 @@ // file, You can obtain one at http://mozilla.org/MPL/2.0/. use crate::arch::current::vecs::*; -use crate::vecs::*; use crate::intrin::saturating_add::*; +use crate::vecs::*; rust_fallback_impl_binary! { impl SaturatingAdd for u8x16 where "__undefined" { diff --git a/src/arch/unknown/intrin/saturating_hadd.rs b/src/arch/unknown/intrin/saturating_hadd.rs index 898774c..58bacc8 100644 --- a/src/arch/unknown/intrin/saturating_hadd.rs +++ b/src/arch/unknown/intrin/saturating_hadd.rs @@ -6,14 +6,68 @@ // file, You can obtain one at http://mozilla.org/MPL/2.0/. use crate::arch::current::vecs::*; -use crate::vecs::*; use crate::intrin::saturating_hadd::*; +use crate::vecs::*; -impl SaturatingHAdd for u64x2 { hop!(saturating_hadd, u64::saturating_add, 0, 1); } -impl SaturatingHAdd for u32x4 { hop!(saturating_hadd, u32::saturating_add, 0, 1, 2, 3); } -impl SaturatingHAdd for u16x8 { hop!(saturating_hadd, u16::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); } -impl SaturatingHAdd for u8x16 { hop!(saturating_hadd, u8::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -impl SaturatingHAdd for i64x2 { hop!(saturating_hadd, i64::saturating_add, 0, 1); } -impl SaturatingHAdd for i32x4 { hop!(saturating_hadd, i32::saturating_add, 0, 1, 2, 3); } -impl SaturatingHAdd for i16x8 { hop!(saturating_hadd, i16::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); } -impl SaturatingHAdd for i8x16 { hop!(saturating_hadd, i8::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } +impl SaturatingHAdd for u64x2 { + hop!(saturating_hadd, u64::saturating_add, 0, 1); +} +impl SaturatingHAdd for u32x4 { + hop!(saturating_hadd, u32::saturating_add, 0, 1, 2, 3); +} +impl SaturatingHAdd for u16x8 { + hop!(saturating_hadd, u16::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); +} +impl SaturatingHAdd for u8x16 { + hop!( + saturating_hadd, + u8::saturating_add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} +impl SaturatingHAdd for i64x2 { + hop!(saturating_hadd, i64::saturating_add, 0, 1); +} +impl SaturatingHAdd for i32x4 { + hop!(saturating_hadd, i32::saturating_add, 0, 1, 2, 3); +} +impl SaturatingHAdd for i16x8 { + hop!(saturating_hadd, i16::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); +} +impl SaturatingHAdd for i8x16 { + hop!( + saturating_hadd, + i8::saturating_add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} diff --git a/src/arch/unknown/intrin/saturating_hsub.rs b/src/arch/unknown/intrin/saturating_hsub.rs index 354e8ef..b514a07 100644 --- a/src/arch/unknown/intrin/saturating_hsub.rs +++ b/src/arch/unknown/intrin/saturating_hsub.rs @@ -6,14 +6,68 @@ // file, You can obtain one at http://mozilla.org/MPL/2.0/. use crate::arch::current::vecs::*; -use crate::vecs::*; use crate::intrin::saturating_hsub::*; +use crate::vecs::*; -impl SaturatingHSub for u64x2 { hop!(saturating_hsub, u64::saturating_sub, 0, 1); } -impl SaturatingHSub for u32x4 { hop!(saturating_hsub, u32::saturating_sub, 0, 1, 2, 3); } -impl SaturatingHSub for u16x8 { hop!(saturating_hsub, u16::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); } -impl SaturatingHSub for u8x16 { hop!(saturating_hsub, u8::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -impl SaturatingHSub for i64x2 { hop!(saturating_hsub, i64::saturating_sub, 0, 1); } -impl SaturatingHSub for i32x4 { hop!(saturating_hsub, i32::saturating_sub, 0, 1, 2, 3); } -impl SaturatingHSub for i16x8 { hop!(saturating_hsub, i16::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); } -impl SaturatingHSub for i8x16 { hop!(saturating_hsub, i8::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } +impl SaturatingHSub for u64x2 { + hop!(saturating_hsub, u64::saturating_sub, 0, 1); +} +impl SaturatingHSub for u32x4 { + hop!(saturating_hsub, u32::saturating_sub, 0, 1, 2, 3); +} +impl SaturatingHSub for u16x8 { + hop!(saturating_hsub, u16::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); +} +impl SaturatingHSub for u8x16 { + hop!( + saturating_hsub, + u8::saturating_sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} +impl SaturatingHSub for i64x2 { + hop!(saturating_hsub, i64::saturating_sub, 0, 1); +} +impl SaturatingHSub for i32x4 { + hop!(saturating_hsub, i32::saturating_sub, 0, 1, 2, 3); +} +impl SaturatingHSub for i16x8 { + hop!(saturating_hsub, i16::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); +} +impl SaturatingHSub for i8x16 { + hop!( + saturating_hsub, + i8::saturating_sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} diff --git a/src/arch/unknown/intrin/saturating_sub.rs b/src/arch/unknown/intrin/saturating_sub.rs index 49b594d..2884938 100644 --- a/src/arch/unknown/intrin/saturating_sub.rs +++ b/src/arch/unknown/intrin/saturating_sub.rs @@ -5,8 +5,8 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::intrin::saturating_sub::*; use crate::arch::current::vecs::*; +use crate::intrin::saturating_sub::*; use crate::vecs::*; rust_fallback_impl_binary! { diff --git a/src/arch/unknown/intrin/sqrt.rs b/src/arch/unknown/intrin/sqrt.rs index dac3454..bc3fd9e 100644 --- a/src/arch/unknown/intrin/sqrt.rs +++ b/src/arch/unknown/intrin/sqrt.rs @@ -5,8 +5,8 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::intrin::sqrt::*; use crate::arch::current::vecs::*; +use crate::intrin::sqrt::*; use crate::vecs::*; rust_fallback_impl! { diff --git a/src/arch/unknown/intrin/sum.rs b/src/arch/unknown/intrin/sum.rs index a08077b..bbeb9fb 100644 --- a/src/arch/unknown/intrin/sum.rs +++ b/src/arch/unknown/intrin/sum.rs @@ -6,8 +6,8 @@ // file, You can obtain one at http://mozilla.org/MPL/2.0/. use crate::arch::current::vecs::*; +use crate::intrin::sum::{Sum, UpcastSum}; use crate::vecs::*; -use crate::intrin::sum::{Sum,UpcastSum}; impl_packed_sum!(u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, u64x2, i64x2, f32x4, f64x2); impl_packed_upcast_sum!(u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, u64x2, i64x2, f32x4, f64x2); @@ -15,8 +15,8 @@ impl_packed_upcast_sum!(u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, u64x2, i64x2, mod tests { #![allow(unused_imports)] - use crate::prelude::*; use crate::arch::current::vecs::*; + use crate::prelude::*; test_packed_sum_int!(u8x16, u8, test_packed_sum_u8x16); test_packed_sum_int!(i8x16, i8, test_packed_sum_i8x16); diff --git a/src/arch/unknown/intrin/transmute.rs b/src/arch/unknown/intrin/transmute.rs index 93f2510..aa6fc7d 100644 --- a/src/arch/unknown/intrin/transmute.rs +++ b/src/arch/unknown/intrin/transmute.rs @@ -5,9 +5,9 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::intrin::transmute::*; use crate::arch::current::vecs::*; use crate::core::mem::transmute; +use crate::intrin::transmute::*; impl_packed_transmute!(u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, f32x4, u64x2, i64x2, f64x2, ... diff --git a/src/arch/unknown/intrin/upcast.rs b/src/arch/unknown/intrin/upcast.rs index 4771473..caba048 100644 --- a/src/arch/unknown/intrin/upcast.rs +++ b/src/arch/unknown/intrin/upcast.rs @@ -11,376 +11,471 @@ use crate::intrin::upcast::*; impl Upcast for u8x16 { #[inline(always)] fn upcast(self) -> (u16x8, u16x8) { - (u16x8::new(self.extract(0) as u16, - self.extract(1) as u16, - self.extract(2) as u16, - self.extract(3) as u16, - self.extract(4) as u16, - self.extract(5) as u16, - self.extract(6) as u16, - self.extract(7) as u16), - u16x8::new(self.extract(8) as u16, - self.extract(9) as u16, - self.extract(10) as u16, - self.extract(11) as u16, - self.extract(12) as u16, - self.extract(13) as u16, - self.extract(14) as u16, - self.extract(15) as u16)) + ( + u16x8::new( + self.extract(0) as u16, + self.extract(1) as u16, + self.extract(2) as u16, + self.extract(3) as u16, + self.extract(4) as u16, + self.extract(5) as u16, + self.extract(6) as u16, + self.extract(7) as u16, + ), + u16x8::new( + self.extract(8) as u16, + self.extract(9) as u16, + self.extract(10) as u16, + self.extract(11) as u16, + self.extract(12) as u16, + self.extract(13) as u16, + self.extract(14) as u16, + self.extract(15) as u16, + ), + ) } } impl Upcast for i8x16 { #[inline(always)] fn upcast(self) -> (i16x8, i16x8) { - (i16x8::new(self.extract(0) as i16, - self.extract(1) as i16, - self.extract(2) as i16, - self.extract(3) as i16, - self.extract(4) as i16, - self.extract(5) as i16, - self.extract(6) as i16, - self.extract(7) as i16), - i16x8::new(self.extract(8) as i16, - self.extract(9) as i16, - self.extract(10) as i16, - self.extract(11) as i16, - self.extract(12) as i16, - self.extract(13) as i16, - self.extract(14) as i16, - self.extract(15) as i16)) + ( + i16x8::new( + self.extract(0) as i16, + self.extract(1) as i16, + self.extract(2) as i16, + self.extract(3) as i16, + self.extract(4) as i16, + self.extract(5) as i16, + self.extract(6) as i16, + self.extract(7) as i16, + ), + i16x8::new( + self.extract(8) as i16, + self.extract(9) as i16, + self.extract(10) as i16, + self.extract(11) as i16, + self.extract(12) as i16, + self.extract(13) as i16, + self.extract(14) as i16, + self.extract(15) as i16, + ), + ) } } impl Upcast for u16x8 { #[inline(always)] fn upcast(self) -> (u32x4, u32x4) { - (u32x4::new(self.extract(0) as u32, - self.extract(1) as u32, - self.extract(2) as u32, - self.extract(3) as u32), - u32x4::new(self.extract(4) as u32, - self.extract(5) as u32, - self.extract(6) as u32, - self.extract(7) as u32)) + ( + u32x4::new( + self.extract(0) as u32, + self.extract(1) as u32, + self.extract(2) as u32, + self.extract(3) as u32, + ), + u32x4::new( + self.extract(4) as u32, + self.extract(5) as u32, + self.extract(6) as u32, + self.extract(7) as u32, + ), + ) } } impl Upcast for i16x8 { #[inline(always)] fn upcast(self) -> (i32x4, i32x4) { - (i32x4::new(self.extract(0) as i32, - self.extract(1) as i32, - self.extract(2) as i32, - self.extract(3) as i32), - i32x4::new(self.extract(4) as i32, - self.extract(5) as i32, - self.extract(6) as i32, - self.extract(7) as i32)) + ( + i32x4::new( + self.extract(0) as i32, + self.extract(1) as i32, + self.extract(2) as i32, + self.extract(3) as i32, + ), + i32x4::new( + self.extract(4) as i32, + self.extract(5) as i32, + self.extract(6) as i32, + self.extract(7) as i32, + ), + ) } } impl Upcast for u8x32 { #[inline(always)] fn upcast(self) -> (u16x16, u16x16) { - (u16x16::new(self.extract(0) as u16, - self.extract(1) as u16, - self.extract(2) as u16, - self.extract(3) as u16, - self.extract(4) as u16, - self.extract(5) as u16, - self.extract(6) as u16, - self.extract(7) as u16, - self.extract(8) as u16, - self.extract(9) as u16, - self.extract(10) as u16, - self.extract(11) as u16, - self.extract(12) as u16, - self.extract(13) as u16, - self.extract(14) as u16, - self.extract(15) as u16), - u16x16::new(self.extract(16) as u16, - self.extract(17) as u16, - self.extract(18) as u16, - self.extract(19) as u16, - self.extract(20) as u16, - self.extract(21) as u16, - self.extract(22) as u16, - self.extract(23) as u16, - self.extract(24) as u16, - self.extract(25) as u16, - self.extract(26) as u16, - self.extract(27) as u16, - self.extract(28) as u16, - self.extract(29) as u16, - self.extract(30) as u16, - self.extract(31) as u16)) + ( + u16x16::new( + self.extract(0) as u16, + self.extract(1) as u16, + self.extract(2) as u16, + self.extract(3) as u16, + self.extract(4) as u16, + self.extract(5) as u16, + self.extract(6) as u16, + self.extract(7) as u16, + self.extract(8) as u16, + self.extract(9) as u16, + self.extract(10) as u16, + self.extract(11) as u16, + self.extract(12) as u16, + self.extract(13) as u16, + self.extract(14) as u16, + self.extract(15) as u16, + ), + u16x16::new( + self.extract(16) as u16, + self.extract(17) as u16, + self.extract(18) as u16, + self.extract(19) as u16, + self.extract(20) as u16, + self.extract(21) as u16, + self.extract(22) as u16, + self.extract(23) as u16, + self.extract(24) as u16, + self.extract(25) as u16, + self.extract(26) as u16, + self.extract(27) as u16, + self.extract(28) as u16, + self.extract(29) as u16, + self.extract(30) as u16, + self.extract(31) as u16, + ), + ) } } impl Upcast for i8x32 { #[inline(always)] fn upcast(self) -> (i16x16, i16x16) { - (i16x16::new(self.extract(0) as i16, - self.extract(1) as i16, - self.extract(2) as i16, - self.extract(3) as i16, - self.extract(4) as i16, - self.extract(5) as i16, - self.extract(6) as i16, - self.extract(7) as i16, - self.extract(8) as i16, - self.extract(9) as i16, - self.extract(10) as i16, - self.extract(11) as i16, - self.extract(12) as i16, - self.extract(13) as i16, - self.extract(14) as i16, - self.extract(15) as i16), - i16x16::new(self.extract(16) as i16, - self.extract(17) as i16, - self.extract(18) as i16, - self.extract(19) as i16, - self.extract(20) as i16, - self.extract(21) as i16, - self.extract(22) as i16, - self.extract(23) as i16, - self.extract(24) as i16, - self.extract(25) as i16, - self.extract(26) as i16, - self.extract(27) as i16, - self.extract(28) as i16, - self.extract(29) as i16, - self.extract(30) as i16, - self.extract(31) as i16)) + ( + i16x16::new( + self.extract(0) as i16, + self.extract(1) as i16, + self.extract(2) as i16, + self.extract(3) as i16, + self.extract(4) as i16, + self.extract(5) as i16, + self.extract(6) as i16, + self.extract(7) as i16, + self.extract(8) as i16, + self.extract(9) as i16, + self.extract(10) as i16, + self.extract(11) as i16, + self.extract(12) as i16, + self.extract(13) as i16, + self.extract(14) as i16, + self.extract(15) as i16, + ), + i16x16::new( + self.extract(16) as i16, + self.extract(17) as i16, + self.extract(18) as i16, + self.extract(19) as i16, + self.extract(20) as i16, + self.extract(21) as i16, + self.extract(22) as i16, + self.extract(23) as i16, + self.extract(24) as i16, + self.extract(25) as i16, + self.extract(26) as i16, + self.extract(27) as i16, + self.extract(28) as i16, + self.extract(29) as i16, + self.extract(30) as i16, + self.extract(31) as i16, + ), + ) } } impl Upcast for u16x16 { #[inline(always)] fn upcast(self) -> (u32x8, u32x8) { - (u32x8::new(self.extract(0) as u32, - self.extract(1) as u32, - self.extract(2) as u32, - self.extract(3) as u32, - self.extract(4) as u32, - self.extract(5) as u32, - self.extract(6) as u32, - self.extract(7) as u32), - u32x8::new(self.extract(8) as u32, - self.extract(9) as u32, - self.extract(10) as u32, - self.extract(11) as u32, - self.extract(12) as u32, - self.extract(13) as u32, - self.extract(14) as u32, - self.extract(15) as u32)) - + ( + u32x8::new( + self.extract(0) as u32, + self.extract(1) as u32, + self.extract(2) as u32, + self.extract(3) as u32, + self.extract(4) as u32, + self.extract(5) as u32, + self.extract(6) as u32, + self.extract(7) as u32, + ), + u32x8::new( + self.extract(8) as u32, + self.extract(9) as u32, + self.extract(10) as u32, + self.extract(11) as u32, + self.extract(12) as u32, + self.extract(13) as u32, + self.extract(14) as u32, + self.extract(15) as u32, + ), + ) } } impl Upcast for i16x16 { #[inline(always)] fn upcast(self) -> (i32x8, i32x8) { - (i32x8::new(self.extract(0) as i32, - self.extract(1) as i32, - self.extract(2) as i32, - self.extract(3) as i32, - self.extract(4) as i32, - self.extract(5) as i32, - self.extract(6) as i32, - self.extract(7) as i32), - i32x8::new(self.extract(8) as i32, - self.extract(9) as i32, - self.extract(10) as i32, - self.extract(11) as i32, - self.extract(12) as i32, - self.extract(13) as i32, - self.extract(14) as i32, - self.extract(15) as i32)) + ( + i32x8::new( + self.extract(0) as i32, + self.extract(1) as i32, + self.extract(2) as i32, + self.extract(3) as i32, + self.extract(4) as i32, + self.extract(5) as i32, + self.extract(6) as i32, + self.extract(7) as i32, + ), + i32x8::new( + self.extract(8) as i32, + self.extract(9) as i32, + self.extract(10) as i32, + self.extract(11) as i32, + self.extract(12) as i32, + self.extract(13) as i32, + self.extract(14) as i32, + self.extract(15) as i32, + ), + ) } } impl Upcast for f32x4 { #[inline(always)] fn upcast(self) -> (f64x2, f64x2) { - (f64x2::new(self.extract(0) as f64, - self.extract(1) as f64), - f64x2::new(self.extract(2) as f64, - self.extract(3) as f64)) + ( + f64x2::new(self.extract(0) as f64, self.extract(1) as f64), + f64x2::new(self.extract(2) as f64, self.extract(3) as f64), + ) } } impl Upcast for i32x4 { #[inline(always)] fn upcast(self) -> (f64x2, f64x2) { - (f64x2::new(self.extract(0) as f64, - self.extract(1) as f64), - f64x2::new(self.extract(2) as f64, - self.extract(3) as f64)) + ( + f64x2::new(self.extract(0) as f64, self.extract(1) as f64), + f64x2::new(self.extract(2) as f64, self.extract(3) as f64), + ) } } impl Upcast for i32x4 { #[inline(always)] fn upcast(self) -> (i64x2, i64x2) { - (i64x2::new(self.extract(0) as i64, - self.extract(1) as i64), - i64x2::new(self.extract(2) as i64, - self.extract(3) as i64)) + ( + i64x2::new(self.extract(0) as i64, self.extract(1) as i64), + i64x2::new(self.extract(2) as i64, self.extract(3) as i64), + ) } } impl Upcast for u32x4 { #[inline(always)] fn upcast(self) -> (u64x2, u64x2) { - (u64x2::new(self.extract(0) as u64, - self.extract(1) as u64), - u64x2::new(self.extract(2) as u64, - self.extract(3) as u64)) + ( + u64x2::new(self.extract(0) as u64, self.extract(1) as u64), + u64x2::new(self.extract(2) as u64, self.extract(3) as u64), + ) } } impl Upcast for f32x8 { #[inline(always)] fn upcast(self) -> (f64x4, f64x4) { - (f64x4::new(self.extract(0) as f64, - self.extract(1) as f64, - self.extract(2) as f64, - self.extract(3) as f64), - f64x4::new(self.extract(4) as f64, - self.extract(5) as f64, - self.extract(6) as f64, - self.extract(7) as f64)) + ( + f64x4::new( + self.extract(0) as f64, + self.extract(1) as f64, + self.extract(2) as f64, + self.extract(3) as f64, + ), + f64x4::new( + self.extract(4) as f64, + self.extract(5) as f64, + self.extract(6) as f64, + self.extract(7) as f64, + ), + ) } } impl Upcast for i32x8 { #[inline(always)] fn upcast(self) -> (f64x4, f64x4) { - (f64x4::new(self.extract(0) as f64, - self.extract(1) as f64, - self.extract(2) as f64, - self.extract(3) as f64), - f64x4::new(self.extract(4) as f64, - self.extract(5) as f64, - self.extract(6) as f64, - self.extract(7) as f64)) + ( + f64x4::new( + self.extract(0) as f64, + self.extract(1) as f64, + self.extract(2) as f64, + self.extract(3) as f64, + ), + f64x4::new( + self.extract(4) as f64, + self.extract(5) as f64, + self.extract(6) as f64, + self.extract(7) as f64, + ), + ) } } impl Upcast for i32x8 { #[inline(always)] fn upcast(self) -> (i64x4, i64x4) { - (i64x4::new(self.extract(0) as i64, - self.extract(1) as i64, - self.extract(2) as i64, - self.extract(3) as i64), - i64x4::new(self.extract(4) as i64, - self.extract(5) as i64, - self.extract(6) as i64, - self.extract(7) as i64)) + ( + i64x4::new( + self.extract(0) as i64, + self.extract(1) as i64, + self.extract(2) as i64, + self.extract(3) as i64, + ), + i64x4::new( + self.extract(4) as i64, + self.extract(5) as i64, + self.extract(6) as i64, + self.extract(7) as i64, + ), + ) } } impl Upcast for u32x8 { #[inline(always)] fn upcast(self) -> (u64x4, u64x4) { - (u64x4::new(self.extract(0) as u64, - self.extract(1) as u64, - self.extract(2) as u64, - self.extract(3) as u64), - u64x4::new(self.extract(4) as u64, - self.extract(5) as u64, - self.extract(6) as u64, - self.extract(7) as u64)) + ( + u64x4::new( + self.extract(0) as u64, + self.extract(1) as u64, + self.extract(2) as u64, + self.extract(3) as u64, + ), + u64x4::new( + self.extract(4) as u64, + self.extract(5) as u64, + self.extract(6) as u64, + self.extract(7) as u64, + ), + ) } } impl Upcast for f32x16 { #[inline(always)] fn upcast(self) -> (f64x8, f64x8) { - (f64x8::new(self.extract(0) as f64, - self.extract(1) as f64, - self.extract(2) as f64, - self.extract(3) as f64, - self.extract(4) as f64, - self.extract(5) as f64, - self.extract(6) as f64, - self.extract(7) as f64), - f64x8::new(self.extract(8) as f64, - self.extract(9) as f64, - self.extract(10) as f64, - self.extract(11) as f64, - self.extract(12) as f64, - self.extract(13) as f64, - self.extract(14) as f64, - self.extract(15) as f64)) + ( + f64x8::new( + self.extract(0) as f64, + self.extract(1) as f64, + self.extract(2) as f64, + self.extract(3) as f64, + self.extract(4) as f64, + self.extract(5) as f64, + self.extract(6) as f64, + self.extract(7) as f64, + ), + f64x8::new( + self.extract(8) as f64, + self.extract(9) as f64, + self.extract(10) as f64, + self.extract(11) as f64, + self.extract(12) as f64, + self.extract(13) as f64, + self.extract(14) as f64, + self.extract(15) as f64, + ), + ) } } impl Upcast for i32x16 { #[inline(always)] fn upcast(self) -> (f64x8, f64x8) { - (f64x8::new(self.extract(0) as f64, - self.extract(1) as f64, - self.extract(2) as f64, - self.extract(3) as f64, - self.extract(4) as f64, - self.extract(5) as f64, - self.extract(6) as f64, - self.extract(7) as f64), - f64x8::new(self.extract(8) as f64, - self.extract(9) as f64, - self.extract(10) as f64, - self.extract(11) as f64, - self.extract(12) as f64, - self.extract(13) as f64, - self.extract(14) as f64, - self.extract(15) as f64)) + ( + f64x8::new( + self.extract(0) as f64, + self.extract(1) as f64, + self.extract(2) as f64, + self.extract(3) as f64, + self.extract(4) as f64, + self.extract(5) as f64, + self.extract(6) as f64, + self.extract(7) as f64, + ), + f64x8::new( + self.extract(8) as f64, + self.extract(9) as f64, + self.extract(10) as f64, + self.extract(11) as f64, + self.extract(12) as f64, + self.extract(13) as f64, + self.extract(14) as f64, + self.extract(15) as f64, + ), + ) } } impl Upcast for i32x16 { #[inline(always)] fn upcast(self) -> (i64x8, i64x8) { - (i64x8::new(self.extract(0) as i64, - self.extract(1) as i64, - self.extract(2) as i64, - self.extract(3) as i64, - self.extract(4) as i64, - self.extract(5) as i64, - self.extract(6) as i64, - self.extract(7) as i64), - i64x8::new(self.extract(8) as i64, - self.extract(9) as i64, - self.extract(10) as i64, - self.extract(11) as i64, - self.extract(12) as i64, - self.extract(13) as i64, - self.extract(14) as i64, - self.extract(15) as i64)) + ( + i64x8::new( + self.extract(0) as i64, + self.extract(1) as i64, + self.extract(2) as i64, + self.extract(3) as i64, + self.extract(4) as i64, + self.extract(5) as i64, + self.extract(6) as i64, + self.extract(7) as i64, + ), + i64x8::new( + self.extract(8) as i64, + self.extract(9) as i64, + self.extract(10) as i64, + self.extract(11) as i64, + self.extract(12) as i64, + self.extract(13) as i64, + self.extract(14) as i64, + self.extract(15) as i64, + ), + ) } } impl Upcast for u32x16 { #[inline(always)] fn upcast(self) -> (u64x8, u64x8) { - (u64x8::new(self.extract(0) as u64, - self.extract(1) as u64, - self.extract(2) as u64, - self.extract(3) as u64, - self.extract(4) as u64, - self.extract(5) as u64, - self.extract(6) as u64, - self.extract(7) as u64), - u64x8::new(self.extract(8) as u64, - self.extract(9) as u64, - self.extract(10) as u64, - self.extract(11) as u64, - self.extract(12) as u64, - self.extract(13) as u64, - self.extract(14) as u64, - self.extract(15) as u64)) + ( + u64x8::new( + self.extract(0) as u64, + self.extract(1) as u64, + self.extract(2) as u64, + self.extract(3) as u64, + self.extract(4) as u64, + self.extract(5) as u64, + self.extract(6) as u64, + self.extract(7) as u64, + ), + u64x8::new( + self.extract(8) as u64, + self.extract(9) as u64, + self.extract(10) as u64, + self.extract(11) as u64, + self.extract(12) as u64, + self.extract(13) as u64, + self.extract(14) as u64, + self.extract(15) as u64, + ), + ) } } diff --git a/src/arch/unknown/mod.rs b/src/arch/unknown/mod.rs index a2be36e..9bc51e7 100644 --- a/src/arch/unknown/mod.rs +++ b/src/arch/unknown/mod.rs @@ -1,3 +1,3 @@ pub mod intrin; -pub mod vecs; pub mod vec_patterns; +pub mod vecs; diff --git a/src/arch/unknown/vec_patterns.rs b/src/arch/unknown/vec_patterns.rs index a4ce16f..91ff65b 100644 --- a/src/arch/unknown/vec_patterns.rs +++ b/src/arch/unknown/vec_patterns.rs @@ -15,153 +15,239 @@ use crate::arch::current::vecs::*; use crate::core::mem::transmute; use crate::vecs::*; - - -const PART_MASK: [u8; 128] = [0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF]; +const PART_MASK: [u8; 128] = [ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, +]; impl Pattern for u8x16 { #[inline(always)] fn halfs(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo) + Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + ) } #[inline(always)] fn interleave(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo) + Self::new( + hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, + ) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "__undefined")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(__undefined( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "__undefined")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(__undefined(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "__undefined"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { assert!(off <= Self::WIDTH); fallback!(); match off { - 0 => Self::new(lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 1 => Self::new(hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 2 => Self::new(hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 3 => Self::new(hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 4 => Self::new(hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 5 => Self::new(hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 6 => Self::new(hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 7 => Self::new(hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 8 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo), - 9 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo), - 10 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo), - 11 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo), - 12 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo), - 13 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo), - 14 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo), - 15 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo), - 16 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi), - _ => unreachable!() + 0 => Self::new( + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 1 => Self::new( + hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 2 => Self::new( + hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 3 => Self::new( + hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 4 => Self::new( + hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 5 => Self::new( + hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 6 => Self::new( + hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 7 => Self::new( + hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 8 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 9 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, + ), + 10 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, + ), + 11 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, + ), + 12 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, + ), + 13 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, + ), + 14 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, + ), + 15 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, + ), + 16 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + ), + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFu8) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFu8) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x00u8) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x00u8) }) + } } impl Pattern for i8x16 { #[inline(always)] fn halfs(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo) + Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + ) } #[inline(always)] fn interleave(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo) + Self::new( + hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, + ) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "__undefined")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(__undefined( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "__undefined")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(__undefined(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "__undefined"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { assert!(off <= Self::WIDTH); fallback!(); match off { - 0 => Self::new(lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 1 => Self::new(hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 2 => Self::new(hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 3 => Self::new(hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 4 => Self::new(hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 5 => Self::new(hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 6 => Self::new(hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 7 => Self::new(hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 8 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo), - 9 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo), - 10 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo), - 11 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo), - 12 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo), - 13 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo), - 14 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo), - 15 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo), - 16 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi), - _ => unreachable!() + 0 => Self::new( + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 1 => Self::new( + hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 2 => Self::new( + hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 3 => Self::new( + hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 4 => Self::new( + hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 5 => Self::new( + hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 6 => Self::new( + hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 7 => Self::new( + hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 8 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 9 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, + ), + 10 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, + ), + 11 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, + ), + 12 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, + ), + 13 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, + ), + 14 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, + ), + 15 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, + ), + 16 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + ), + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFu8) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFu8) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x00u8) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x00u8) }) + } } impl Pattern for u16x8 { @@ -175,20 +261,29 @@ impl Pattern for u16x8 { Self::new(hi, lo, hi, lo, hi, lo, hi, lo) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "__undefined")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(__undefined( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "__undefined")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(__undefined(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "__undefined"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { @@ -204,21 +299,21 @@ impl Pattern for u16x8 { 6 => Self::new(hi, hi, hi, hi, hi, hi, lo, lo), 7 => Self::new(hi, hi, hi, hi, hi, hi, hi, lo), 8 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi), - _ => unreachable!() + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFu16) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFu16) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x0000u16) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x0000u16) }) + } } impl Pattern for i16x8 { @@ -232,20 +327,29 @@ impl Pattern for i16x8 { Self::new(hi, lo, hi, lo, hi, lo, hi, lo) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "__undefined")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(__undefined( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "__undefined")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(__undefined(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "__undefined"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { @@ -261,21 +365,21 @@ impl Pattern for i16x8 { 6 => Self::new(hi, hi, hi, hi, hi, hi, lo, lo), 7 => Self::new(hi, hi, hi, hi, hi, hi, hi, lo), 8 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi), - _ => unreachable!() + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFu16) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFu16) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x0000u16) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x0000u16) }) + } } impl Pattern for u32x4 { @@ -289,20 +393,29 @@ impl Pattern for u32x4 { Self::new(hi, lo, hi, lo) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "__undefined")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(__undefined( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "__undefined")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(__undefined(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "__undefined"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { @@ -314,21 +427,21 @@ impl Pattern for u32x4 { 2 => Self::new(hi, hi, lo, lo), 3 => Self::new(hi, hi, hi, lo), 4 => Self::new(hi, hi, hi, hi), - _ => unreachable!() + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFFFFFu32) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFFFFFu32) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x00000000u32) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x00000000u32) }) + } } impl Pattern for i32x4 { @@ -342,20 +455,29 @@ impl Pattern for i32x4 { Self::new(hi, lo, hi, lo) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "__undefined")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(__undefined( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "__undefined")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(__undefined(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "__undefined"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { @@ -367,21 +489,21 @@ impl Pattern for i32x4 { 2 => Self::new(hi, hi, lo, lo), 3 => Self::new(hi, hi, hi, lo), 4 => Self::new(hi, hi, hi, hi), - _ => unreachable!() + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFFFFFu32) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFFFFFu32) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x00000000u32) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x00000000u32) }) + } } impl Pattern for f32x4 { @@ -395,20 +517,29 @@ impl Pattern for f32x4 { Self::new(hi, lo, hi, lo) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "__undefined")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(__undefined( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "__undefined")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(__undefined(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "__undefined"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { @@ -420,21 +551,21 @@ impl Pattern for f32x4 { 2 => Self::new(hi, hi, lo, lo), 3 => Self::new(hi, hi, hi, lo), 4 => Self::new(hi, hi, hi, hi), - _ => unreachable!() + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFFFFFu32) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFFFFFu32) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x00000000u32) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x00000000u32) }) + } } impl Pattern for u64x2 { @@ -448,20 +579,29 @@ impl Pattern for u64x2 { Self::new(hi, lo) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "__undefined")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(__undefined( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "__undefined")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(__undefined(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "__undefined"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { @@ -471,21 +611,21 @@ impl Pattern for u64x2 { 0 => Self::new(lo, lo), 1 => Self::new(hi, lo), 2 => Self::new(hi, hi), - _ => unreachable!() + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFFFFFFFFFFFFFu64) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFFFFFFFFFFFFFu64) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x0000000000000000u64) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x0000000000000000u64) }) + } } impl Pattern for i64x2 { @@ -499,20 +639,29 @@ impl Pattern for i64x2 { Self::new(hi, lo) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "__undefined")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(__undefined( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "__undefined")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(__undefined(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "__undefined"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { @@ -522,21 +671,21 @@ impl Pattern for i64x2 { 0 => Self::new(lo, lo), 1 => Self::new(hi, lo), 2 => Self::new(hi, hi), - _ => unreachable!() + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFFFFFFFFFFFFFu64) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFFFFFFFFFFFFFu64) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x0000000000000000u64) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x0000000000000000u64) }) + } } impl Pattern for f64x2 { @@ -550,20 +699,29 @@ impl Pattern for f64x2 { Self::new(hi, lo) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "__undefined")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(__undefined( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "__undefined")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(__undefined(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "__undefined"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { @@ -573,20 +731,19 @@ impl Pattern for f64x2 { 0 => Self::new(lo, lo), 1 => Self::new(hi, lo), 2 => Self::new(hi, hi), - _ => unreachable!() + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFFFFFFFFFFFFFu64) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFFFFFFFFFFFFFu64) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x0000000000000000u64) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x0000000000000000u64) }) + } } - diff --git a/src/arch/unknown/vecs.rs b/src/arch/unknown/vecs.rs index d24df90..b2e571f 100644 --- a/src/arch/unknown/vecs.rs +++ b/src/arch/unknown/vecs.rs @@ -9,10 +9,10 @@ //! Vector types which aren't interpreted as SIMD vectors, for systems which //! don't have SIMD support. -use crate::core::ops::*; +use crate::core::fmt::*; use crate::core::mem::*; +use crate::core::ops::*; use crate::core::ptr::*; -use crate::core::fmt::*; use crate::vecs::*; macro_rules! impl_packed_type { @@ -224,7 +224,7 @@ macro_rules! impl_cast { ret } } - } + }; } // "undefined" is just a string that should not match any target-feature. @@ -250,25 +250,125 @@ impl_packed_type!(i64, i64s, i64x4, 4, [x0, x1, x2, x3]); impl_packed_type!(i64, i64s, i64x8, 8, [x0, x1, x2, x3, x4, x5, x6, x7]); impl_packed_type!(f32, f32s, f32x4, 4, [x0, x1, x2, x3]); impl_packed_type!(f32, f32s, f32x8, 8, [x0, x1, x2, x3, x4, x5, x6, x7]); -impl_packed_type!(f32, f32s, f32x16, 16, [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15]); +impl_packed_type!( + f32, + f32s, + f32x16, + 16, + [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15] +); impl_packed_type!(u32, u32s, u32x4, 4, [x0, x1, x2, x3]); impl_packed_type!(u32, u32s, u32x8, 8, [x0, x1, x2, x3, x4, x5, x6, x7]); -impl_packed_type!(u32, u32s, u32x16, 16, [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15]); +impl_packed_type!( + u32, + u32s, + u32x16, + 16, + [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15] +); impl_packed_type!(i32, i32s, i32x4, 4, [x0, x1, x2, x3]); impl_packed_type!(i32, i32s, i32x8, 8, [x0, x1, x2, x3, x4, x5, x6, x7]); -impl_packed_type!(i32, i32s, i32x16, 16, [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15]); +impl_packed_type!( + i32, + i32s, + i32x16, + 16, + [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15] +); impl_packed_type!(u16, u16s, u16x8, 8, [x0, x1, x2, x3, x4, x5, x6, x7]); -impl_packed_type!(u16, u16s, u16x16, 16, [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15]); -impl_packed_type!(u16, u16s, u16x32, 32, [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31]); +impl_packed_type!( + u16, + u16s, + u16x16, + 16, + [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15] +); +impl_packed_type!( + u16, + u16s, + u16x32, + 32, + [ + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, + x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31 + ] +); impl_packed_type!(i16, i16s, i16x8, 8, [x0, x1, x2, x3, x4, x5, x6, x7]); -impl_packed_type!(i16, i16s, i16x16, 16, [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15]); -impl_packed_type!(i16, i16s, i16x32, 32, [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31]); -impl_packed_type!(u8, u8s, u8x16, 16, [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15]); -impl_packed_type!(u8, u8s, u8x32, 32, [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31]); -impl_packed_type!(u8, u8s, u8x64, 64, [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31, x32, x33, x34, x35, x36, x37, x38, x39, x40, x41, x42, x43, x44, x45, x46, x47, x48, x49, x50, x51, x52, x53, x54, x55, x56, x57, x58, x59, x60, x61, x62, x63]); -impl_packed_type!(i8, i8s, i8x16, 16, [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15]); -impl_packed_type!(i8, i8s, i8x32, 32, [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31]); -impl_packed_type!(i8, i8s, i8x64, 64, [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31, x32, x33, x34, x35, x36, x37, x38, x39, x40, x41, x42, x43, x44, x45, x46, x47, x48, x49, x50, x51, x52, x53, x54, x55, x56, x57, x58, x59, x60, x61, x62, x63]); +impl_packed_type!( + i16, + i16s, + i16x16, + 16, + [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15] +); +impl_packed_type!( + i16, + i16s, + i16x32, + 32, + [ + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, + x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31 + ] +); +impl_packed_type!( + u8, + u8s, + u8x16, + 16, + [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15] +); +impl_packed_type!( + u8, + u8s, + u8x32, + 32, + [ + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, + x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31 + ] +); +impl_packed_type!( + u8, + u8s, + u8x64, + 64, + [ + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, + x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31, x32, x33, x34, x35, x36, x37, + x38, x39, x40, x41, x42, x43, x44, x45, x46, x47, x48, x49, x50, x51, x52, x53, x54, x55, + x56, x57, x58, x59, x60, x61, x62, x63 + ] +); +impl_packed_type!( + i8, + i8s, + i8x16, + 16, + [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15] +); +impl_packed_type!( + i8, + i8s, + i8x32, + 32, + [ + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, + x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31 + ] +); +impl_packed_type!( + i8, + i8s, + i8x64, + 64, + [ + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, + x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31, x32, x33, x34, x35, x36, x37, + x38, x39, x40, x41, x42, x43, x44, x45, x46, x47, x48, x49, x50, x51, x52, x53, x54, x55, + x56, x57, x58, x59, x60, x61, x62, x63 + ] +); impl_from!(u64x2, i64x2, u32x4, i32x4, u16x8, i16x8, u8x16, i8x16); impl_from!(i64x2, u64x2, u32x4, i32x4, u16x8, i16x8, u8x16, i8x16); diff --git a/src/arch/x86/intrin/abs.rs b/src/arch/x86/intrin/abs.rs index a4dc623..4e44597 100644 --- a/src/arch/x86/intrin/abs.rs +++ b/src/arch/x86/intrin/abs.rs @@ -7,11 +7,11 @@ use crate::intrin::abs::Abs; -use crate::vektor::x86_64::*; -use crate::vektor::x86::*; -use crate::vektor::x86::*; use crate::arch::current::vecs::*; use crate::core::mem::transmute; +use crate::vektor::x86::*; +use crate::vektor::x86::*; +use crate::vektor::x86_64::*; impl Abs for f32x4 { type Out = f32x4; @@ -27,10 +27,12 @@ impl Abs for f32x4 { #[cfg(not(target_feature = "sse"))] fn abs(&self) -> Self::Out { fallback!(); - Self::Out::new(self.extract(0).abs(), - self.extract(1).abs(), - self.extract(2).abs(), - self.extract(3).abs()) + Self::Out::new( + self.extract(0).abs(), + self.extract(1).abs(), + self.extract(2).abs(), + self.extract(3).abs(), + ) } } @@ -41,15 +43,19 @@ impl Abs for f64x2 { #[cfg(target_feature = "sse2")] fn abs(&self) -> Self::Out { optimized!(); - unsafe { _mm_and_pd(*self, Self::splat(transmute::(0x7FFFFFFFFFFFFFFF))) } + unsafe { + _mm_and_pd( + *self, + Self::splat(transmute::(0x7FFFFFFFFFFFFFFF)), + ) + } } #[inline(always)] #[cfg(not(target_feature = "sse2"))] fn abs(&self) -> Self::Out { fallback!(); - Self::Out::new(self.extract(0).abs(), - self.extract(1).abs()) + Self::Out::new(self.extract(0).abs(), self.extract(1).abs()) } } @@ -67,14 +73,16 @@ impl Abs for f32x8 { #[cfg(not(target_feature = "avx"))] fn abs(&self) -> Self::Out { fallback!(); - Self::Out::new(self.extract(0).abs(), - self.extract(1).abs(), - self.extract(2).abs(), - self.extract(3).abs(), - self.extract(4).abs(), - self.extract(5).abs(), - self.extract(6).abs(), - self.extract(7).abs()) + Self::Out::new( + self.extract(0).abs(), + self.extract(1).abs(), + self.extract(2).abs(), + self.extract(3).abs(), + self.extract(4).abs(), + self.extract(5).abs(), + self.extract(6).abs(), + self.extract(7).abs(), + ) } } @@ -85,17 +93,24 @@ impl Abs for f64x4 { #[cfg(target_feature = "avx")] fn abs(&self) -> Self::Out { optimized!(); - unsafe { _mm256_and_pd(*self, Self::splat(transmute::(0x7FFFFFFFFFFFFFFF))) } + unsafe { + _mm256_and_pd( + *self, + Self::splat(transmute::(0x7FFFFFFFFFFFFFFF)), + ) + } } #[inline(always)] #[cfg(not(target_feature = "avx"))] fn abs(&self) -> Self::Out { fallback!(); - Self::Out::new(self.extract(0).abs(), - self.extract(1).abs(), - self.extract(2).abs(), - self.extract(3).abs()) + Self::Out::new( + self.extract(0).abs(), + self.extract(1).abs(), + self.extract(2).abs(), + self.extract(3).abs(), + ) } } @@ -113,22 +128,24 @@ impl Abs for i8x16 { #[cfg(not(target_feature = "ssse3"))] fn abs(&self) -> Self::Out { fallback!(); - Self::Out::new(unsafe { transmute::(self.extract(0).overflowing_abs().0) }, - unsafe { transmute::(self.extract(1).overflowing_abs().0) }, - unsafe { transmute::(self.extract(2).overflowing_abs().0) }, - unsafe { transmute::(self.extract(3).overflowing_abs().0) }, - unsafe { transmute::(self.extract(4).overflowing_abs().0) }, - unsafe { transmute::(self.extract(5).overflowing_abs().0) }, - unsafe { transmute::(self.extract(6).overflowing_abs().0) }, - unsafe { transmute::(self.extract(7).overflowing_abs().0) }, - unsafe { transmute::(self.extract(8).overflowing_abs().0) }, - unsafe { transmute::(self.extract(9).overflowing_abs().0) }, - unsafe { transmute::(self.extract(10).overflowing_abs().0) }, - unsafe { transmute::(self.extract(11).overflowing_abs().0) }, - unsafe { transmute::(self.extract(12).overflowing_abs().0) }, - unsafe { transmute::(self.extract(13).overflowing_abs().0) }, - unsafe { transmute::(self.extract(14).overflowing_abs().0) }, - unsafe { transmute::(self.extract(15).overflowing_abs().0) }) + Self::Out::new( + unsafe { transmute::(self.extract(0).overflowing_abs().0) }, + unsafe { transmute::(self.extract(1).overflowing_abs().0) }, + unsafe { transmute::(self.extract(2).overflowing_abs().0) }, + unsafe { transmute::(self.extract(3).overflowing_abs().0) }, + unsafe { transmute::(self.extract(4).overflowing_abs().0) }, + unsafe { transmute::(self.extract(5).overflowing_abs().0) }, + unsafe { transmute::(self.extract(6).overflowing_abs().0) }, + unsafe { transmute::(self.extract(7).overflowing_abs().0) }, + unsafe { transmute::(self.extract(8).overflowing_abs().0) }, + unsafe { transmute::(self.extract(9).overflowing_abs().0) }, + unsafe { transmute::(self.extract(10).overflowing_abs().0) }, + unsafe { transmute::(self.extract(11).overflowing_abs().0) }, + unsafe { transmute::(self.extract(12).overflowing_abs().0) }, + unsafe { transmute::(self.extract(13).overflowing_abs().0) }, + unsafe { transmute::(self.extract(14).overflowing_abs().0) }, + unsafe { transmute::(self.extract(15).overflowing_abs().0) }, + ) } } @@ -146,14 +163,16 @@ impl Abs for i16x8 { #[cfg(not(target_feature = "ssse3"))] fn abs(&self) -> Self::Out { fallback!(); - Self::Out::new(unsafe { transmute::(self.extract(0).overflowing_abs().0) }, - unsafe { transmute::(self.extract(1).overflowing_abs().0) }, - unsafe { transmute::(self.extract(2).overflowing_abs().0) }, - unsafe { transmute::(self.extract(3).overflowing_abs().0) }, - unsafe { transmute::(self.extract(4).overflowing_abs().0) }, - unsafe { transmute::(self.extract(5).overflowing_abs().0) }, - unsafe { transmute::(self.extract(6).overflowing_abs().0) }, - unsafe { transmute::(self.extract(7).overflowing_abs().0) }) + Self::Out::new( + unsafe { transmute::(self.extract(0).overflowing_abs().0) }, + unsafe { transmute::(self.extract(1).overflowing_abs().0) }, + unsafe { transmute::(self.extract(2).overflowing_abs().0) }, + unsafe { transmute::(self.extract(3).overflowing_abs().0) }, + unsafe { transmute::(self.extract(4).overflowing_abs().0) }, + unsafe { transmute::(self.extract(5).overflowing_abs().0) }, + unsafe { transmute::(self.extract(6).overflowing_abs().0) }, + unsafe { transmute::(self.extract(7).overflowing_abs().0) }, + ) } } @@ -171,10 +190,12 @@ impl Abs for i32x4 { #[cfg(not(target_feature = "ssse3"))] fn abs(&self) -> Self::Out { fallback!(); - Self::Out::new(unsafe { transmute::(self.extract(0).overflowing_abs().0) }, - unsafe { transmute::(self.extract(1).overflowing_abs().0) }, - unsafe { transmute::(self.extract(2).overflowing_abs().0) }, - unsafe { transmute::(self.extract(3).overflowing_abs().0) }) + Self::Out::new( + unsafe { transmute::(self.extract(0).overflowing_abs().0) }, + unsafe { transmute::(self.extract(1).overflowing_abs().0) }, + unsafe { transmute::(self.extract(2).overflowing_abs().0) }, + unsafe { transmute::(self.extract(3).overflowing_abs().0) }, + ) } } @@ -192,38 +213,40 @@ impl Abs for i8x32 { #[cfg(not(target_feature = "avx2"))] fn abs(&self) -> Self::Out { fallback!(); - Self::Out::new(unsafe { transmute::(self.extract(0).overflowing_abs().0) }, - unsafe { transmute::(self.extract(1).overflowing_abs().0) }, - unsafe { transmute::(self.extract(2).overflowing_abs().0) }, - unsafe { transmute::(self.extract(3).overflowing_abs().0) }, - unsafe { transmute::(self.extract(4).overflowing_abs().0) }, - unsafe { transmute::(self.extract(5).overflowing_abs().0) }, - unsafe { transmute::(self.extract(6).overflowing_abs().0) }, - unsafe { transmute::(self.extract(7).overflowing_abs().0) }, - unsafe { transmute::(self.extract(8).overflowing_abs().0) }, - unsafe { transmute::(self.extract(9).overflowing_abs().0) }, - unsafe { transmute::(self.extract(10).overflowing_abs().0) }, - unsafe { transmute::(self.extract(11).overflowing_abs().0) }, - unsafe { transmute::(self.extract(12).overflowing_abs().0) }, - unsafe { transmute::(self.extract(13).overflowing_abs().0) }, - unsafe { transmute::(self.extract(14).overflowing_abs().0) }, - unsafe { transmute::(self.extract(15).overflowing_abs().0) }, - unsafe { transmute::(self.extract(16).overflowing_abs().0) }, - unsafe { transmute::(self.extract(17).overflowing_abs().0) }, - unsafe { transmute::(self.extract(18).overflowing_abs().0) }, - unsafe { transmute::(self.extract(19).overflowing_abs().0) }, - unsafe { transmute::(self.extract(20).overflowing_abs().0) }, - unsafe { transmute::(self.extract(21).overflowing_abs().0) }, - unsafe { transmute::(self.extract(22).overflowing_abs().0) }, - unsafe { transmute::(self.extract(23).overflowing_abs().0) }, - unsafe { transmute::(self.extract(24).overflowing_abs().0) }, - unsafe { transmute::(self.extract(25).overflowing_abs().0) }, - unsafe { transmute::(self.extract(26).overflowing_abs().0) }, - unsafe { transmute::(self.extract(27).overflowing_abs().0) }, - unsafe { transmute::(self.extract(28).overflowing_abs().0) }, - unsafe { transmute::(self.extract(29).overflowing_abs().0) }, - unsafe { transmute::(self.extract(30).overflowing_abs().0) }, - unsafe { transmute::(self.extract(31).overflowing_abs().0) }) + Self::Out::new( + unsafe { transmute::(self.extract(0).overflowing_abs().0) }, + unsafe { transmute::(self.extract(1).overflowing_abs().0) }, + unsafe { transmute::(self.extract(2).overflowing_abs().0) }, + unsafe { transmute::(self.extract(3).overflowing_abs().0) }, + unsafe { transmute::(self.extract(4).overflowing_abs().0) }, + unsafe { transmute::(self.extract(5).overflowing_abs().0) }, + unsafe { transmute::(self.extract(6).overflowing_abs().0) }, + unsafe { transmute::(self.extract(7).overflowing_abs().0) }, + unsafe { transmute::(self.extract(8).overflowing_abs().0) }, + unsafe { transmute::(self.extract(9).overflowing_abs().0) }, + unsafe { transmute::(self.extract(10).overflowing_abs().0) }, + unsafe { transmute::(self.extract(11).overflowing_abs().0) }, + unsafe { transmute::(self.extract(12).overflowing_abs().0) }, + unsafe { transmute::(self.extract(13).overflowing_abs().0) }, + unsafe { transmute::(self.extract(14).overflowing_abs().0) }, + unsafe { transmute::(self.extract(15).overflowing_abs().0) }, + unsafe { transmute::(self.extract(16).overflowing_abs().0) }, + unsafe { transmute::(self.extract(17).overflowing_abs().0) }, + unsafe { transmute::(self.extract(18).overflowing_abs().0) }, + unsafe { transmute::(self.extract(19).overflowing_abs().0) }, + unsafe { transmute::(self.extract(20).overflowing_abs().0) }, + unsafe { transmute::(self.extract(21).overflowing_abs().0) }, + unsafe { transmute::(self.extract(22).overflowing_abs().0) }, + unsafe { transmute::(self.extract(23).overflowing_abs().0) }, + unsafe { transmute::(self.extract(24).overflowing_abs().0) }, + unsafe { transmute::(self.extract(25).overflowing_abs().0) }, + unsafe { transmute::(self.extract(26).overflowing_abs().0) }, + unsafe { transmute::(self.extract(27).overflowing_abs().0) }, + unsafe { transmute::(self.extract(28).overflowing_abs().0) }, + unsafe { transmute::(self.extract(29).overflowing_abs().0) }, + unsafe { transmute::(self.extract(30).overflowing_abs().0) }, + unsafe { transmute::(self.extract(31).overflowing_abs().0) }, + ) } } @@ -240,22 +263,24 @@ impl Abs for i16x16 { #[cfg(not(target_feature = "avx2"))] fn abs(&self) -> Self::Out { fallback!(); - Self::Out::new(unsafe { transmute::(self.extract(0).overflowing_abs().0) }, - unsafe { transmute::(self.extract(1).overflowing_abs().0) }, - unsafe { transmute::(self.extract(2).overflowing_abs().0) }, - unsafe { transmute::(self.extract(3).overflowing_abs().0) }, - unsafe { transmute::(self.extract(4).overflowing_abs().0) }, - unsafe { transmute::(self.extract(5).overflowing_abs().0) }, - unsafe { transmute::(self.extract(6).overflowing_abs().0) }, - unsafe { transmute::(self.extract(7).overflowing_abs().0) }, - unsafe { transmute::(self.extract(8).overflowing_abs().0) }, - unsafe { transmute::(self.extract(9).overflowing_abs().0) }, - unsafe { transmute::(self.extract(10).overflowing_abs().0) }, - unsafe { transmute::(self.extract(11).overflowing_abs().0) }, - unsafe { transmute::(self.extract(12).overflowing_abs().0) }, - unsafe { transmute::(self.extract(13).overflowing_abs().0) }, - unsafe { transmute::(self.extract(14).overflowing_abs().0) }, - unsafe { transmute::(self.extract(15).overflowing_abs().0) }) + Self::Out::new( + unsafe { transmute::(self.extract(0).overflowing_abs().0) }, + unsafe { transmute::(self.extract(1).overflowing_abs().0) }, + unsafe { transmute::(self.extract(2).overflowing_abs().0) }, + unsafe { transmute::(self.extract(3).overflowing_abs().0) }, + unsafe { transmute::(self.extract(4).overflowing_abs().0) }, + unsafe { transmute::(self.extract(5).overflowing_abs().0) }, + unsafe { transmute::(self.extract(6).overflowing_abs().0) }, + unsafe { transmute::(self.extract(7).overflowing_abs().0) }, + unsafe { transmute::(self.extract(8).overflowing_abs().0) }, + unsafe { transmute::(self.extract(9).overflowing_abs().0) }, + unsafe { transmute::(self.extract(10).overflowing_abs().0) }, + unsafe { transmute::(self.extract(11).overflowing_abs().0) }, + unsafe { transmute::(self.extract(12).overflowing_abs().0) }, + unsafe { transmute::(self.extract(13).overflowing_abs().0) }, + unsafe { transmute::(self.extract(14).overflowing_abs().0) }, + unsafe { transmute::(self.extract(15).overflowing_abs().0) }, + ) } } @@ -272,14 +297,16 @@ impl Abs for i32x8 { #[cfg(not(target_feature = "avx2"))] fn abs(&self) -> Self::Out { fallback!(); - Self::Out::new(unsafe { transmute::(self.extract(0).overflowing_abs().0) }, - unsafe { transmute::(self.extract(1).overflowing_abs().0) }, - unsafe { transmute::(self.extract(2).overflowing_abs().0) }, - unsafe { transmute::(self.extract(3).overflowing_abs().0) }, - unsafe { transmute::(self.extract(4).overflowing_abs().0) }, - unsafe { transmute::(self.extract(5).overflowing_abs().0) }, - unsafe { transmute::(self.extract(6).overflowing_abs().0) }, - unsafe { transmute::(self.extract(7).overflowing_abs().0) }) + Self::Out::new( + unsafe { transmute::(self.extract(0).overflowing_abs().0) }, + unsafe { transmute::(self.extract(1).overflowing_abs().0) }, + unsafe { transmute::(self.extract(2).overflowing_abs().0) }, + unsafe { transmute::(self.extract(3).overflowing_abs().0) }, + unsafe { transmute::(self.extract(4).overflowing_abs().0) }, + unsafe { transmute::(self.extract(5).overflowing_abs().0) }, + unsafe { transmute::(self.extract(6).overflowing_abs().0) }, + unsafe { transmute::(self.extract(7).overflowing_abs().0) }, + ) } } @@ -289,8 +316,10 @@ impl Abs for i64x2 { #[inline(always)] fn abs(&self) -> Self::Out { fallback!(); - Self::Out::new(unsafe { transmute::(self.extract(0).overflowing_abs().0) }, - unsafe { transmute::(self.extract(1).overflowing_abs().0) }) + Self::Out::new( + unsafe { transmute::(self.extract(0).overflowing_abs().0) }, + unsafe { transmute::(self.extract(1).overflowing_abs().0) }, + ) } } @@ -300,10 +329,12 @@ impl Abs for i64x4 { #[inline(always)] fn abs(&self) -> Self::Out { fallback!(); - Self::Out::new(unsafe { transmute::(self.extract(0).overflowing_abs().0) }, - unsafe { transmute::(self.extract(1).overflowing_abs().0) }, - unsafe { transmute::(self.extract(2).overflowing_abs().0) }, - unsafe { transmute::(self.extract(3).overflowing_abs().0) }) + Self::Out::new( + unsafe { transmute::(self.extract(0).overflowing_abs().0) }, + unsafe { transmute::(self.extract(1).overflowing_abs().0) }, + unsafe { transmute::(self.extract(2).overflowing_abs().0) }, + unsafe { transmute::(self.extract(3).overflowing_abs().0) }, + ) } } @@ -313,14 +344,15 @@ impl Abs for i64x8 { #[inline(always)] fn abs(&self) -> Self::Out { fallback!(); - Self::Out::new(unsafe { transmute::(self.extract(0).overflowing_abs().0) }, - unsafe { transmute::(self.extract(1).overflowing_abs().0) }, - unsafe { transmute::(self.extract(2).overflowing_abs().0) }, - unsafe { transmute::(self.extract(3).overflowing_abs().0) }, - unsafe { transmute::(self.extract(4).overflowing_abs().0) }, - unsafe { transmute::(self.extract(5).overflowing_abs().0) }, - unsafe { transmute::(self.extract(6).overflowing_abs().0) }, - unsafe { transmute::(self.extract(7).overflowing_abs().0) }) + Self::Out::new( + unsafe { transmute::(self.extract(0).overflowing_abs().0) }, + unsafe { transmute::(self.extract(1).overflowing_abs().0) }, + unsafe { transmute::(self.extract(2).overflowing_abs().0) }, + unsafe { transmute::(self.extract(3).overflowing_abs().0) }, + unsafe { transmute::(self.extract(4).overflowing_abs().0) }, + unsafe { transmute::(self.extract(5).overflowing_abs().0) }, + unsafe { transmute::(self.extract(6).overflowing_abs().0) }, + unsafe { transmute::(self.extract(7).overflowing_abs().0) }, + ) } } - diff --git a/src/arch/x86/intrin/cmp.rs b/src/arch/x86/intrin/cmp.rs index 7b97db8..b596401 100644 --- a/src/arch/x86/intrin/cmp.rs +++ b/src/arch/x86/intrin/cmp.rs @@ -5,11 +5,11 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::vektor::x86_64::*; -use crate::vektor::x86::*; use crate::arch::current::vecs::*; -use crate::vecs::*; use crate::intrin::cmp::*; +use crate::vecs::*; +use crate::vektor::x86::*; +use crate::vektor::x86_64::*; rust_fallback_impl_binary! { impl Cmp for u8x16 where "sse2" { diff --git a/src/arch/x86/intrin/destride.rs b/src/arch/x86/intrin/destride.rs index 8715278..b58dd2d 100644 --- a/src/arch/x86/intrin/destride.rs +++ b/src/arch/x86/intrin/destride.rs @@ -1,11 +1,11 @@ use crate::arch::current::vecs::*; -use crate::vecs::*; -use crate::vektor::x86_64::*; -use crate::vektor::x86::*; +use crate::core::mem::transmute; +use crate::intrin::destride::*; use crate::intrin::merge::*; use crate::intrin::transmute::*; -use crate::intrin::destride::*; -use crate::core::mem::transmute; +use crate::vecs::*; +use crate::vektor::x86::*; +use crate::vektor::x86_64::*; impl Destride for u8x16 { #[inline(always)] @@ -13,10 +13,19 @@ impl Destride for u8x16 { fn destride_two(self, other: Self) -> (Self, Self) { optimized!(); unsafe { - let a = _mm_shuffle_epi8(self.be_i8s(), Self::new(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15).be_i8s()); - let b = _mm_shuffle_epi8(other.be_i8s(), Self::new(1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14).be_i8s()); + let a = _mm_shuffle_epi8( + self.be_i8s(), + Self::new(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15).be_i8s(), + ); + let b = _mm_shuffle_epi8( + other.be_i8s(), + Self::new(1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14).be_i8s(), + ); // Backwards merge of a and b (keeps elements at the same indices) - let c = _mm_shuffle_epi8(b.merge_halves(a), Self::new(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7).be_i8s()); + let c = _mm_shuffle_epi8( + b.merge_halves(a), + Self::new(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7).be_i8s(), + ); (a.merge_halves(b).be_u8s(), c.be_u8s()) } } @@ -42,8 +51,22 @@ impl Destride for u8x32 { optimized!(); unsafe { // In-lane destrided vectors - let a = _mm256_shuffle_epi8(self.be_i8s(), Self::new(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15).be_i8s()); - let b = _mm256_shuffle_epi8(other.be_i8s(), Self::new(1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14).be_i8s()); + let a = _mm256_shuffle_epi8( + self.be_i8s(), + Self::new( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, + 14, 1, 3, 5, 7, 9, 11, 13, 15, + ) + .be_i8s(), + ); + let b = _mm256_shuffle_epi8( + other.be_i8s(), + Self::new( + 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, + 15, 0, 2, 4, 6, 8, 10, 12, 14, + ) + .be_i8s(), + ); // Cross-lane destrided vectors let aa = _mm256_permute4x64_epi64(a.be_i64s(), 0xD8).be_u8s(); let bb = _mm256_permute4x64_epi64(b.be_i64s(), 0xD8).be_u8s(); @@ -57,7 +80,9 @@ impl Destride for u8x32 { #[cfg(not(target_feature = "avx2"))] fn destride_two(self, other: Self) -> (Self, Self) { fallback!(); - destride_two_polyfill!(self, other, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30) + destride_two_polyfill!( + self, other, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + ) } #[inline(always)] @@ -73,10 +98,25 @@ impl Destride for i8x16 { fn destride_two(self, other: Self) -> (Self, Self) { optimized!(); unsafe { - let a = _mm_shuffle_epi8(transmute(self), transmute(Self::new(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15))); - let b = _mm_shuffle_epi8(transmute(other), transmute(Self::new(1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14))); + let a = _mm_shuffle_epi8( + transmute(self), + transmute(Self::new( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + )), + ); + let b = _mm_shuffle_epi8( + transmute(other), + transmute(Self::new( + 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, + )), + ); // Backwards merge of a and b (keeps elements at the same indices) - let c = _mm_shuffle_epi8(b.merge_halves(a), transmute(Self::new(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7))); + let c = _mm_shuffle_epi8( + b.merge_halves(a), + transmute(Self::new( + 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, + )), + ); (a.be_i8s().merge_halves(b.be_i8s()), c.be_i8s()) } } @@ -102,8 +142,20 @@ impl Destride for i8x32 { optimized!(); unsafe { // In-lane destrided vectors - let a = _mm256_shuffle_epi8(transmute(self), transmute(Self::new(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15))); - let b = _mm256_shuffle_epi8(transmute(other), transmute(Self::new(1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14))); + let a = _mm256_shuffle_epi8( + transmute(self), + transmute(Self::new( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, + 14, 1, 3, 5, 7, 9, 11, 13, 15, + )), + ); + let b = _mm256_shuffle_epi8( + transmute(other), + transmute(Self::new( + 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, + 15, 0, 2, 4, 6, 8, 10, 12, 14, + )), + ); // Cross-lane destrided vectors let aa = _mm256_permute4x64_epi64(a.be_i64s(), 0xD8).be_i8s(); let bb = _mm256_permute4x64_epi64(b.be_i64s(), 0xD8).be_i8s(); @@ -117,7 +169,9 @@ impl Destride for i8x32 { #[cfg(not(target_feature = "avx2"))] fn destride_two(self, other: Self) -> (Self, Self) { fallback!(); - destride_two_polyfill!(self, other, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30) + destride_two_polyfill!( + self, other, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + ) } #[inline(always)] diff --git a/src/arch/x86/intrin/downcast.rs b/src/arch/x86/intrin/downcast.rs index 55c4686..48a0a10 100644 --- a/src/arch/x86/intrin/downcast.rs +++ b/src/arch/x86/intrin/downcast.rs @@ -6,12 +6,12 @@ // file, You can obtain one at http://mozilla.org/MPL/2.0/. use crate::arch::current::vecs::*; -use crate::vecs::*; -use crate::vektor::x86_64::*; -use crate::vektor::x86::*; +use crate::core::mem::transmute; use crate::intrin::downcast::*; use crate::intrin::transmute::*; -use crate::core::mem::transmute; +use crate::vecs::*; +use crate::vektor::x86::*; +use crate::vektor::x86_64::*; impl Downcast for i32x4 { #[inline(always)] @@ -25,14 +25,16 @@ impl Downcast for i32x4 { #[cfg(not(target_feature = "sse2"))] fn saturating_downcast(self, other: Self) -> i16x8 { fallback!(); - i16x8::new(self.extract(0).min(0x00007FFF).max(-0x00008000) as i16, - self.extract(1).min(0x00007FFF).max(-0x00008000) as i16, - self.extract(2).min(0x00007FFF).max(-0x00008000) as i16, - self.extract(3).min(0x00007FFF).max(-0x00008000) as i16, - other.extract(0).min(0x00007FFF).max(-0x00008000) as i16, - other.extract(1).min(0x00007FFF).max(-0x00008000) as i16, - other.extract(2).min(0x00007FFF).max(-0x00008000) as i16, - other.extract(3).min(0x00007FFF).max(-0x00008000) as i16) + i16x8::new( + self.extract(0).min(0x00007FFF).max(-0x00008000) as i16, + self.extract(1).min(0x00007FFF).max(-0x00008000) as i16, + self.extract(2).min(0x00007FFF).max(-0x00008000) as i16, + self.extract(3).min(0x00007FFF).max(-0x00008000) as i16, + other.extract(0).min(0x00007FFF).max(-0x00008000) as i16, + other.extract(1).min(0x00007FFF).max(-0x00008000) as i16, + other.extract(2).min(0x00007FFF).max(-0x00008000) as i16, + other.extract(3).min(0x00007FFF).max(-0x00008000) as i16, + ) } } @@ -40,10 +42,12 @@ impl Downcast for i64x2 { #[inline(always)] fn saturating_downcast(self, other: Self) -> i32x4 { fallback!(); - i32x4::new(self.extract(0).min(0x7FFFFFFF).max(-0x80000000) as i32, - self.extract(1).min(0x7FFFFFFF).max(-0x80000000) as i32, - other.extract(0).min(0x7FFFFFFF).max(-0x80000000) as i32, - other.extract(1).min(0x7FFFFFFF).max(-0x80000000) as i32) + i32x4::new( + self.extract(0).min(0x7FFFFFFF).max(-0x80000000) as i32, + self.extract(1).min(0x7FFFFFFF).max(-0x80000000) as i32, + other.extract(0).min(0x7FFFFFFF).max(-0x80000000) as i32, + other.extract(1).min(0x7FFFFFFF).max(-0x80000000) as i32, + ) } } @@ -51,10 +55,12 @@ impl Downcast for u64x2 { #[inline(always)] fn saturating_downcast(self, other: Self) -> u32x4 { fallback!(); - u32x4::new(self.extract(0).min(0xFFFFFFFF) as u32, - self.extract(1).min(0x7FFFFFFF) as u32, - other.extract(0).min(0x7FFFFFFF) as u32, - other.extract(1).min(0x7FFFFFFF) as u32) + u32x4::new( + self.extract(0).min(0xFFFFFFFF) as u32, + self.extract(1).min(0x7FFFFFFF) as u32, + other.extract(0).min(0x7FFFFFFF) as u32, + other.extract(1).min(0x7FFFFFFF) as u32, + ) } } @@ -62,10 +68,12 @@ impl Downcast for f64x2 { #[inline(always)] fn saturating_downcast(self, other: Self) -> f32x4 { fallback!(); - f32x4::new(self.extract(0) as f32, - self.extract(1) as f32, - other.extract(0) as f32, - other.extract(1) as f32) + f32x4::new( + self.extract(0) as f32, + self.extract(1) as f32, + other.extract(0) as f32, + other.extract(1) as f32, + ) } } @@ -81,22 +89,24 @@ impl Downcast for i16x8 { #[cfg(not(target_feature = "sse2"))] fn saturating_downcast(self, other: Self) -> i8x16 { fallback!(); - i8x16::new(self.extract(0).min(0x007F).max(-0x0080) as i8, - self.extract(1).min(0x007F).max(-0x0080) as i8, - self.extract(2).min(0x007F).max(-0x0080) as i8, - self.extract(3).min(0x007F).max(-0x0080) as i8, - self.extract(4).min(0x007F).max(-0x0080) as i8, - self.extract(5).min(0x007F).max(-0x0080) as i8, - self.extract(6).min(0x007F).max(-0x0080) as i8, - self.extract(7).min(0x007F).max(-0x0080) as i8, - other.extract(0).min(0x007F).max(-0x0080) as i8, - other.extract(1).min(0x007F).max(-0x0080) as i8, - other.extract(2).min(0x007F).max(-0x0080) as i8, - other.extract(3).min(0x007F).max(-0x0080) as i8, - other.extract(4).min(0x007F).max(-0x0080) as i8, - other.extract(5).min(0x007F).max(-0x0080) as i8, - other.extract(6).min(0x007F).max(-0x0080) as i8, - other.extract(7).min(0x007F).max(-0x0080) as i8) + i8x16::new( + self.extract(0).min(0x007F).max(-0x0080) as i8, + self.extract(1).min(0x007F).max(-0x0080) as i8, + self.extract(2).min(0x007F).max(-0x0080) as i8, + self.extract(3).min(0x007F).max(-0x0080) as i8, + self.extract(4).min(0x007F).max(-0x0080) as i8, + self.extract(5).min(0x007F).max(-0x0080) as i8, + self.extract(6).min(0x007F).max(-0x0080) as i8, + self.extract(7).min(0x007F).max(-0x0080) as i8, + other.extract(0).min(0x007F).max(-0x0080) as i8, + other.extract(1).min(0x007F).max(-0x0080) as i8, + other.extract(2).min(0x007F).max(-0x0080) as i8, + other.extract(3).min(0x007F).max(-0x0080) as i8, + other.extract(4).min(0x007F).max(-0x0080) as i8, + other.extract(5).min(0x007F).max(-0x0080) as i8, + other.extract(6).min(0x007F).max(-0x0080) as i8, + other.extract(7).min(0x007F).max(-0x0080) as i8, + ) } } @@ -112,14 +122,16 @@ impl Downcast for u32x4 { #[cfg(not(target_feature = "sse4.1"))] fn saturating_downcast(self, other: Self) -> u16x8 { fallback!(); - u16x8::new(self.extract(0).min(0x0000FFFF) as u16, - self.extract(1).min(0x0000FFFF) as u16, - self.extract(2).min(0x0000FFFF) as u16, - self.extract(3).min(0x0000FFFF) as u16, - other.extract(0).min(0x0000FFFF) as u16, - other.extract(1).min(0x0000FFFF) as u16, - other.extract(2).min(0x0000FFFF) as u16, - other.extract(3).min(0x0000FFFF) as u16) + u16x8::new( + self.extract(0).min(0x0000FFFF) as u16, + self.extract(1).min(0x0000FFFF) as u16, + self.extract(2).min(0x0000FFFF) as u16, + self.extract(3).min(0x0000FFFF) as u16, + other.extract(0).min(0x0000FFFF) as u16, + other.extract(1).min(0x0000FFFF) as u16, + other.extract(2).min(0x0000FFFF) as u16, + other.extract(3).min(0x0000FFFF) as u16, + ) } } @@ -135,22 +147,24 @@ impl Downcast for u16x8 { #[cfg(not(target_feature = "sse2"))] fn saturating_downcast(self, other: Self) -> u8x16 { fallback!(); - u8x16::new(self.extract(0).min(0x00FF) as u8, - self.extract(1).min(0x00FF) as u8, - self.extract(2).min(0x00FF) as u8, - self.extract(3).min(0x00FF) as u8, - self.extract(4).min(0x00FF) as u8, - self.extract(5).min(0x00FF) as u8, - self.extract(6).min(0x00FF) as u8, - self.extract(7).min(0x00FF) as u8, - other.extract(0).min(0x00FF) as u8, - other.extract(1).min(0x00FF) as u8, - other.extract(2).min(0x00FF) as u8, - other.extract(3).min(0x00FF) as u8, - other.extract(4).min(0x00FF) as u8, - other.extract(5).min(0x00FF) as u8, - other.extract(6).min(0x00FF) as u8, - other.extract(7).min(0x00FF) as u8) + u8x16::new( + self.extract(0).min(0x00FF) as u8, + self.extract(1).min(0x00FF) as u8, + self.extract(2).min(0x00FF) as u8, + self.extract(3).min(0x00FF) as u8, + self.extract(4).min(0x00FF) as u8, + self.extract(5).min(0x00FF) as u8, + self.extract(6).min(0x00FF) as u8, + self.extract(7).min(0x00FF) as u8, + other.extract(0).min(0x00FF) as u8, + other.extract(1).min(0x00FF) as u8, + other.extract(2).min(0x00FF) as u8, + other.extract(3).min(0x00FF) as u8, + other.extract(4).min(0x00FF) as u8, + other.extract(5).min(0x00FF) as u8, + other.extract(6).min(0x00FF) as u8, + other.extract(7).min(0x00FF) as u8, + ) } } @@ -166,22 +180,24 @@ impl Downcast for i32x8 { #[cfg(not(target_feature = "avx2"))] fn saturating_downcast(self, other: Self) -> i16x16 { fallback!(); - i16x16::new(self.extract(0).min(0x00007FFF).max(-0x00008000) as i16, - self.extract(1).min(0x00007FFF).max(-0x00008000) as i16, - self.extract(2).min(0x00007FFF).max(-0x00008000) as i16, - self.extract(3).min(0x00007FFF).max(-0x00008000) as i16, - self.extract(4).min(0x00007FFF).max(-0x00008000) as i16, - self.extract(5).min(0x00007FFF).max(-0x00008000) as i16, - self.extract(6).min(0x00007FFF).max(-0x00008000) as i16, - self.extract(7).min(0x00007FFF).max(-0x00008000) as i16, - other.extract(0).min(0x00007FFF).max(-0x00008000) as i16, - other.extract(1).min(0x00007FFF).max(-0x00008000) as i16, - other.extract(2).min(0x00007FFF).max(-0x00008000) as i16, - other.extract(3).min(0x00007FFF).max(-0x00008000) as i16, - other.extract(4).min(0x00007FFF).max(-0x00008000) as i16, - other.extract(5).min(0x00007FFF).max(-0x00008000) as i16, - other.extract(6).min(0x00007FFF).max(-0x00008000) as i16, - other.extract(7).min(0x00007FFF).max(-0x00008000) as i16) + i16x16::new( + self.extract(0).min(0x00007FFF).max(-0x00008000) as i16, + self.extract(1).min(0x00007FFF).max(-0x00008000) as i16, + self.extract(2).min(0x00007FFF).max(-0x00008000) as i16, + self.extract(3).min(0x00007FFF).max(-0x00008000) as i16, + self.extract(4).min(0x00007FFF).max(-0x00008000) as i16, + self.extract(5).min(0x00007FFF).max(-0x00008000) as i16, + self.extract(6).min(0x00007FFF).max(-0x00008000) as i16, + self.extract(7).min(0x00007FFF).max(-0x00008000) as i16, + other.extract(0).min(0x00007FFF).max(-0x00008000) as i16, + other.extract(1).min(0x00007FFF).max(-0x00008000) as i16, + other.extract(2).min(0x00007FFF).max(-0x00008000) as i16, + other.extract(3).min(0x00007FFF).max(-0x00008000) as i16, + other.extract(4).min(0x00007FFF).max(-0x00008000) as i16, + other.extract(5).min(0x00007FFF).max(-0x00008000) as i16, + other.extract(6).min(0x00007FFF).max(-0x00008000) as i16, + other.extract(7).min(0x00007FFF).max(-0x00008000) as i16, + ) } } @@ -197,38 +213,40 @@ impl Downcast for i16x16 { #[cfg(not(target_feature = "avx2"))] fn saturating_downcast(self, other: Self) -> i8x32 { fallback!(); - i8x32::new(self.extract(0).min(0x007F).max(-0x0080) as i8, - self.extract(1).min(0x007F).max(-0x0080) as i8, - self.extract(2).min(0x007F).max(-0x0080) as i8, - self.extract(3).min(0x007F).max(-0x0080) as i8, - self.extract(4).min(0x007F).max(-0x0080) as i8, - self.extract(5).min(0x007F).max(-0x0080) as i8, - self.extract(6).min(0x007F).max(-0x0080) as i8, - self.extract(7).min(0x007F).max(-0x0080) as i8, - self.extract(8).min(0x007F).max(-0x0080) as i8, - self.extract(9).min(0x007F).max(-0x0080) as i8, - self.extract(10).min(0x007F).max(-0x0080) as i8, - self.extract(11).min(0x007F).max(-0x0080) as i8, - self.extract(12).min(0x007F).max(-0x0080) as i8, - self.extract(13).min(0x007F).max(-0x0080) as i8, - self.extract(14).min(0x007F).max(-0x0080) as i8, - self.extract(15).min(0x007F).max(-0x0080) as i8, - other.extract(0).min(0x007F).max(-0x0080) as i8, - other.extract(1).min(0x007F).max(-0x0080) as i8, - other.extract(2).min(0x007F).max(-0x0080) as i8, - other.extract(3).min(0x007F).max(-0x0080) as i8, - other.extract(4).min(0x007F).max(-0x0080) as i8, - other.extract(5).min(0x007F).max(-0x0080) as i8, - other.extract(6).min(0x007F).max(-0x0080) as i8, - other.extract(7).min(0x007F).max(-0x0080) as i8, - other.extract(8).min(0x007F).max(-0x0080) as i8, - other.extract(9).min(0x007F).max(-0x0080) as i8, - other.extract(10).min(0x007F).max(-0x0080) as i8, - other.extract(11).min(0x007F).max(-0x0080) as i8, - other.extract(12).min(0x007F).max(-0x0080) as i8, - other.extract(13).min(0x007F).max(-0x0080) as i8, - other.extract(14).min(0x007F).max(-0x0080) as i8, - other.extract(15).min(0x007F).max(-0x0080) as i8) + i8x32::new( + self.extract(0).min(0x007F).max(-0x0080) as i8, + self.extract(1).min(0x007F).max(-0x0080) as i8, + self.extract(2).min(0x007F).max(-0x0080) as i8, + self.extract(3).min(0x007F).max(-0x0080) as i8, + self.extract(4).min(0x007F).max(-0x0080) as i8, + self.extract(5).min(0x007F).max(-0x0080) as i8, + self.extract(6).min(0x007F).max(-0x0080) as i8, + self.extract(7).min(0x007F).max(-0x0080) as i8, + self.extract(8).min(0x007F).max(-0x0080) as i8, + self.extract(9).min(0x007F).max(-0x0080) as i8, + self.extract(10).min(0x007F).max(-0x0080) as i8, + self.extract(11).min(0x007F).max(-0x0080) as i8, + self.extract(12).min(0x007F).max(-0x0080) as i8, + self.extract(13).min(0x007F).max(-0x0080) as i8, + self.extract(14).min(0x007F).max(-0x0080) as i8, + self.extract(15).min(0x007F).max(-0x0080) as i8, + other.extract(0).min(0x007F).max(-0x0080) as i8, + other.extract(1).min(0x007F).max(-0x0080) as i8, + other.extract(2).min(0x007F).max(-0x0080) as i8, + other.extract(3).min(0x007F).max(-0x0080) as i8, + other.extract(4).min(0x007F).max(-0x0080) as i8, + other.extract(5).min(0x007F).max(-0x0080) as i8, + other.extract(6).min(0x007F).max(-0x0080) as i8, + other.extract(7).min(0x007F).max(-0x0080) as i8, + other.extract(8).min(0x007F).max(-0x0080) as i8, + other.extract(9).min(0x007F).max(-0x0080) as i8, + other.extract(10).min(0x007F).max(-0x0080) as i8, + other.extract(11).min(0x007F).max(-0x0080) as i8, + other.extract(12).min(0x007F).max(-0x0080) as i8, + other.extract(13).min(0x007F).max(-0x0080) as i8, + other.extract(14).min(0x007F).max(-0x0080) as i8, + other.extract(15).min(0x007F).max(-0x0080) as i8, + ) } } @@ -244,22 +262,24 @@ impl Downcast for u32x8 { #[cfg(not(target_feature = "avx2"))] fn saturating_downcast(self, other: Self) -> u16x16 { fallback!(); - u16x16::new(self.extract(0).min(0x0000FFFF) as u16, - self.extract(1).min(0x0000FFFF) as u16, - self.extract(2).min(0x0000FFFF) as u16, - self.extract(3).min(0x0000FFFF) as u16, - self.extract(4).min(0x0000FFFF) as u16, - self.extract(5).min(0x0000FFFF) as u16, - self.extract(6).min(0x0000FFFF) as u16, - self.extract(7).min(0x0000FFFF) as u16, - other.extract(0).min(0x0000FFFF) as u16, - other.extract(1).min(0x0000FFFF) as u16, - other.extract(2).min(0x0000FFFF) as u16, - other.extract(3).min(0x0000FFFF) as u16, - other.extract(4).min(0x0000FFFF) as u16, - other.extract(5).min(0x0000FFFF) as u16, - other.extract(6).min(0x0000FFFF) as u16, - other.extract(7).min(0x0000FFFF) as u16) + u16x16::new( + self.extract(0).min(0x0000FFFF) as u16, + self.extract(1).min(0x0000FFFF) as u16, + self.extract(2).min(0x0000FFFF) as u16, + self.extract(3).min(0x0000FFFF) as u16, + self.extract(4).min(0x0000FFFF) as u16, + self.extract(5).min(0x0000FFFF) as u16, + self.extract(6).min(0x0000FFFF) as u16, + self.extract(7).min(0x0000FFFF) as u16, + other.extract(0).min(0x0000FFFF) as u16, + other.extract(1).min(0x0000FFFF) as u16, + other.extract(2).min(0x0000FFFF) as u16, + other.extract(3).min(0x0000FFFF) as u16, + other.extract(4).min(0x0000FFFF) as u16, + other.extract(5).min(0x0000FFFF) as u16, + other.extract(6).min(0x0000FFFF) as u16, + other.extract(7).min(0x0000FFFF) as u16, + ) } } @@ -275,38 +295,40 @@ impl Downcast for u16x16 { #[cfg(not(target_feature = "avx2"))] fn saturating_downcast(self, other: Self) -> u8x32 { fallback!(); - u8x32::new(self.extract(0).min(0x00FF) as u8, - self.extract(1).min(0x00FF) as u8, - self.extract(2).min(0x00FF) as u8, - self.extract(3).min(0x00FF) as u8, - self.extract(4).min(0x00FF) as u8, - self.extract(5).min(0x00FF) as u8, - self.extract(6).min(0x00FF) as u8, - self.extract(7).min(0x00FF) as u8, - self.extract(8).min(0x00FF) as u8, - self.extract(9).min(0x00FF) as u8, - self.extract(10).min(0x00FF) as u8, - self.extract(11).min(0x00FF) as u8, - self.extract(12).min(0x00FF) as u8, - self.extract(13).min(0x00FF) as u8, - self.extract(14).min(0x00FF) as u8, - self.extract(15).min(0x00FF) as u8, - other.extract(0).min(0x00FF) as u8, - other.extract(1).min(0x00FF) as u8, - other.extract(2).min(0x00FF) as u8, - other.extract(3).min(0x00FF) as u8, - other.extract(4).min(0x00FF) as u8, - other.extract(5).min(0x00FF) as u8, - other.extract(6).min(0x00FF) as u8, - other.extract(7).min(0x00FF) as u8, - other.extract(8).min(0x00FF) as u8, - other.extract(9).min(0x00FF) as u8, - other.extract(10).min(0x00FF) as u8, - other.extract(11).min(0x00FF) as u8, - other.extract(12).min(0x00FF) as u8, - other.extract(13).min(0x00FF) as u8, - other.extract(14).min(0x00FF) as u8, - other.extract(15).min(0x00FF) as u8) + u8x32::new( + self.extract(0).min(0x00FF) as u8, + self.extract(1).min(0x00FF) as u8, + self.extract(2).min(0x00FF) as u8, + self.extract(3).min(0x00FF) as u8, + self.extract(4).min(0x00FF) as u8, + self.extract(5).min(0x00FF) as u8, + self.extract(6).min(0x00FF) as u8, + self.extract(7).min(0x00FF) as u8, + self.extract(8).min(0x00FF) as u8, + self.extract(9).min(0x00FF) as u8, + self.extract(10).min(0x00FF) as u8, + self.extract(11).min(0x00FF) as u8, + self.extract(12).min(0x00FF) as u8, + self.extract(13).min(0x00FF) as u8, + self.extract(14).min(0x00FF) as u8, + self.extract(15).min(0x00FF) as u8, + other.extract(0).min(0x00FF) as u8, + other.extract(1).min(0x00FF) as u8, + other.extract(2).min(0x00FF) as u8, + other.extract(3).min(0x00FF) as u8, + other.extract(4).min(0x00FF) as u8, + other.extract(5).min(0x00FF) as u8, + other.extract(6).min(0x00FF) as u8, + other.extract(7).min(0x00FF) as u8, + other.extract(8).min(0x00FF) as u8, + other.extract(9).min(0x00FF) as u8, + other.extract(10).min(0x00FF) as u8, + other.extract(11).min(0x00FF) as u8, + other.extract(12).min(0x00FF) as u8, + other.extract(13).min(0x00FF) as u8, + other.extract(14).min(0x00FF) as u8, + other.extract(15).min(0x00FF) as u8, + ) } } @@ -314,14 +336,16 @@ impl Downcast for i64x4 { #[inline(always)] fn saturating_downcast(self, other: Self) -> i32x8 { fallback!(); - i32x8::new(self.extract(0).min(0x7FFFFFFF).max(-0x80000000) as i32, - self.extract(1).min(0x7FFFFFFF).max(-0x80000000) as i32, - self.extract(2).min(0x7FFFFFFF).max(-0x80000000) as i32, - self.extract(3).min(0x7FFFFFFF).max(-0x80000000) as i32, - other.extract(0).min(0x7FFFFFFF).max(-0x80000000) as i32, - other.extract(1).min(0x7FFFFFFF).max(-0x80000000) as i32, - other.extract(2).min(0x7FFFFFFF).max(-0x80000000) as i32, - other.extract(3).min(0x7FFFFFFF).max(-0x80000000) as i32) + i32x8::new( + self.extract(0).min(0x7FFFFFFF).max(-0x80000000) as i32, + self.extract(1).min(0x7FFFFFFF).max(-0x80000000) as i32, + self.extract(2).min(0x7FFFFFFF).max(-0x80000000) as i32, + self.extract(3).min(0x7FFFFFFF).max(-0x80000000) as i32, + other.extract(0).min(0x7FFFFFFF).max(-0x80000000) as i32, + other.extract(1).min(0x7FFFFFFF).max(-0x80000000) as i32, + other.extract(2).min(0x7FFFFFFF).max(-0x80000000) as i32, + other.extract(3).min(0x7FFFFFFF).max(-0x80000000) as i32, + ) } } @@ -329,14 +353,16 @@ impl Downcast for u64x4 { #[inline(always)] fn saturating_downcast(self, other: Self) -> u32x8 { fallback!(); - u32x8::new(self.extract(0).min(0xFFFFFFFF) as u32, - self.extract(1).min(0x7FFFFFFF) as u32, - self.extract(2).min(0xFFFFFFFF) as u32, - self.extract(3).min(0x7FFFFFFF) as u32, - other.extract(0).min(0x7FFFFFFF) as u32, - other.extract(1).min(0x7FFFFFFF) as u32, - other.extract(2).min(0x7FFFFFFF) as u32, - other.extract(3).min(0x7FFFFFFF) as u32) + u32x8::new( + self.extract(0).min(0xFFFFFFFF) as u32, + self.extract(1).min(0x7FFFFFFF) as u32, + self.extract(2).min(0xFFFFFFFF) as u32, + self.extract(3).min(0x7FFFFFFF) as u32, + other.extract(0).min(0x7FFFFFFF) as u32, + other.extract(1).min(0x7FFFFFFF) as u32, + other.extract(2).min(0x7FFFFFFF) as u32, + other.extract(3).min(0x7FFFFFFF) as u32, + ) } } @@ -344,13 +370,15 @@ impl Downcast for f64x4 { #[inline(always)] fn saturating_downcast(self, other: Self) -> f32x8 { fallback!(); - f32x8::new(self.extract(0) as f32, - self.extract(1) as f32, - self.extract(2) as f32, - self.extract(3) as f32, - other.extract(0) as f32, - other.extract(1) as f32, - other.extract(2) as f32, - other.extract(3) as f32) + f32x8::new( + self.extract(0) as f32, + self.extract(1) as f32, + self.extract(2) as f32, + self.extract(3) as f32, + other.extract(0) as f32, + other.extract(1) as f32, + other.extract(2) as f32, + other.extract(3) as f32, + ) } } diff --git a/src/arch/x86/intrin/endian.rs b/src/arch/x86/intrin/endian.rs index 3eae3b5..4d22703 100644 --- a/src/arch/x86/intrin/endian.rs +++ b/src/arch/x86/intrin/endian.rs @@ -1,88 +1,314 @@ use crate::arch::current::vecs::*; -use crate::vecs::*; -use crate::vektor::x86_64::*; -use crate::vektor::x86::*; +use crate::core::mem::transmute; use crate::intrin::endian::*; use crate::intrin::transmute::*; -use crate::core::mem::transmute; +use crate::vecs::*; +use crate::vektor::x86::*; +use crate::vektor::x86_64::*; -impl_packed_swap_bytes!(u8x64, u8x64, "avx512-butnotyet", _mm512_permutexvar_epi8, - (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63), - (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63)); -impl_packed_swap_bytes!(u8x32, u8x32, "avx2", _mm256_shuffle_epi8, - (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), - (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31)); -impl_packed_swap_bytes!(u8x16, u8x16, "ssse3", _mm_shuffle_epi8, - (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), - (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)); -impl_packed_swap_bytes!(i8x64, u8x64, "avx512-butnotyet", _mm512_permutexvar_epi8, - (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63), - (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63)); -impl_packed_swap_bytes!(i8x32, u8x32, "avx2", _mm256_shuffle_epi8, - (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), - (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31)); -impl_packed_swap_bytes!(i8x16, u8x16, "ssse3", _mm_shuffle_epi8, - (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), - (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)); -impl_packed_swap_bytes!(u16x32, u8x64, "avx512-butnotyet", _mm512_permutexvar_epi8, - (1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30, 33, 32, 35, 34, 37, 36, 39, 38, 41, 40, 43, 42, 45, 44, 47, 46, 49, 48, 51, 50, 53, 52, 55, 54, 57, 56, 59, 58, 61, 60, 63, 62), - (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31)); -impl_packed_swap_bytes!(u16x16, u8x32, "avx2", _mm256_shuffle_epi8, - (1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30), - (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)); -impl_packed_swap_bytes!(u16x8, u8x16, "ssse3", _mm_shuffle_epi8, - (1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14), - (0, 1, 2, 3, 4, 5, 6, 7)); -impl_packed_swap_bytes!(i16x32, u8x64, "avx512-butnotyet", _mm512_permutexvar_epi8, - (1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30, 33, 32, 35, 34, 37, 36, 39, 38, 41, 40, 43, 42, 45, 44, 47, 46, 49, 48, 51, 50, 53, 52, 55, 54, 57, 56, 59, 58, 61, 60, 63, 62), - (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31)); -impl_packed_swap_bytes!(i16x16, u8x32, "avx2", _mm256_shuffle_epi8, - (1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30), - (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)); -impl_packed_swap_bytes!(i16x8, u8x16, "ssse3", _mm_shuffle_epi8, - (1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14), - (0, 1, 2, 3, 4, 5, 6, 7)); -impl_packed_swap_bytes!(u32x16, u8x64, "avx512-butnotyet", _mm512_permutexvar_epi8, - (3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28, 35, 34, 33, 32, 39, 38, 37, 36, 43, 42, 41, 40, 47, 46, 45, 44, 51, 50, 49, 48, 55, 54, 53, 52, 59, 58, 57, 56, 63, 62, 61, 60), - (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)); -impl_packed_swap_bytes!(u32x8, u8x32, "avx2", _mm256_shuffle_epi8, - (3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28), - (0, 1, 2, 3, 4, 5, 6, 7)); -impl_packed_swap_bytes!(u32x4, u8x16, "ssse3", _mm_shuffle_epi8, - (3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12), - (0, 1, 2, 3)); -impl_packed_swap_bytes!(i32x16, u8x64, "avx512-butnotyet", _mm512_permutexvar_epi8, - (3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28, 35, 34, 33, 32, 39, 38, 37, 36, 43, 42, 41, 40, 47, 46, 45, 44, 51, 50, 49, 48, 55, 54, 53, 52, 59, 58, 57, 56, 63, 62, 61, 60), - (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)); -impl_packed_swap_bytes!(i32x8, u8x32, "avx2", _mm256_shuffle_epi8, - (3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28), - (0, 1, 2, 3, 4, 5, 6, 7)); -impl_packed_swap_bytes!(i32x4, u8x16, "ssse3", _mm_shuffle_epi8, - (3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12), - (0, 1, 2, 3)); -impl_packed_swap_bytes!(u64x8, u8x64, "avx512-butnotyet", _mm512_permutexvar_epi8, - (7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24, 39, 38, 37, 36, 35, 34, 33, 32, 47, 46, 45, 44, 43, 42, 41, 40, 55, 54, 53, 52, 51, 50, 49, 48, 63, 62, 61, 60, 59, 58, 57, 56), - (0, 1, 2, 3, 4, 5, 6, 7)); -impl_packed_swap_bytes!(u64x4, u8x32, "avx2", _mm256_shuffle_epi8, - (7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24), - (0, 1, 2, 3)); -impl_packed_swap_bytes!(u64x2, u8x16, "ssse3", _mm_shuffle_epi8, - (7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8), - (0, 1)); -impl_packed_swap_bytes!(i64x8, u8x64, "avx512-butnotyet", _mm512_permutexvar_epi8, - (7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24, 39, 38, 37, 36, 35, 34, 33, 32, 47, 46, 45, 44, 43, 42, 41, 40, 55, 54, 53, 52, 51, 50, 49, 48, 63, 62, 61, 60, 59, 58, 57, 56), - (0, 1, 2, 3, 4, 5, 6, 7)); -impl_packed_swap_bytes!(i64x4, u8x32, "avx2", _mm256_shuffle_epi8, - (7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24), - (0, 1, 2, 3)); -impl_packed_swap_bytes!(i64x2, u8x16, "ssse3", _mm_shuffle_epi8, - (7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8), - (0, 1)); +impl_packed_swap_bytes!( + u8x64, + u8x64, + "avx512-butnotyet", + _mm512_permutexvar_epi8, + ( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 + ), + ( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 + ) +); +impl_packed_swap_bytes!( + u8x32, + u8x32, + "avx2", + _mm256_shuffle_epi8, + ( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31 + ), + ( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31 + ) +); +impl_packed_swap_bytes!( + u8x16, + u8x16, + "ssse3", + _mm_shuffle_epi8, + (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), + (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) +); +impl_packed_swap_bytes!( + i8x64, + u8x64, + "avx512-butnotyet", + _mm512_permutexvar_epi8, + ( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 + ), + ( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 + ) +); +impl_packed_swap_bytes!( + i8x32, + u8x32, + "avx2", + _mm256_shuffle_epi8, + ( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31 + ), + ( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31 + ) +); +impl_packed_swap_bytes!( + i8x16, + u8x16, + "ssse3", + _mm_shuffle_epi8, + (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), + (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) +); +impl_packed_swap_bytes!( + u16x32, + u8x64, + "avx512-butnotyet", + _mm512_permutexvar_epi8, + ( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, + 24, 27, 26, 29, 28, 31, 30, 33, 32, 35, 34, 37, 36, 39, 38, 41, 40, 43, 42, 45, 44, 47, 46, + 49, 48, 51, 50, 53, 52, 55, 54, 57, 56, 59, 58, 61, 60, 63, 62 + ), + ( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31 + ) +); +impl_packed_swap_bytes!( + u16x16, + u8x32, + "avx2", + _mm256_shuffle_epi8, + ( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, + 24, 27, 26, 29, 28, 31, 30 + ), + (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) +); +impl_packed_swap_bytes!( + u16x8, + u8x16, + "ssse3", + _mm_shuffle_epi8, + (1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14), + (0, 1, 2, 3, 4, 5, 6, 7) +); +impl_packed_swap_bytes!( + i16x32, + u8x64, + "avx512-butnotyet", + _mm512_permutexvar_epi8, + ( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, + 24, 27, 26, 29, 28, 31, 30, 33, 32, 35, 34, 37, 36, 39, 38, 41, 40, 43, 42, 45, 44, 47, 46, + 49, 48, 51, 50, 53, 52, 55, 54, 57, 56, 59, 58, 61, 60, 63, 62 + ), + ( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31 + ) +); +impl_packed_swap_bytes!( + i16x16, + u8x32, + "avx2", + _mm256_shuffle_epi8, + ( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, + 24, 27, 26, 29, 28, 31, 30 + ), + (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) +); +impl_packed_swap_bytes!( + i16x8, + u8x16, + "ssse3", + _mm_shuffle_epi8, + (1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14), + (0, 1, 2, 3, 4, 5, 6, 7) +); +impl_packed_swap_bytes!( + u32x16, + u8x64, + "avx512-butnotyet", + _mm512_permutexvar_epi8, + ( + 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 27, + 26, 25, 24, 31, 30, 29, 28, 35, 34, 33, 32, 39, 38, 37, 36, 43, 42, 41, 40, 47, 46, 45, 44, + 51, 50, 49, 48, 55, 54, 53, 52, 59, 58, 57, 56, 63, 62, 61, 60 + ), + (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) +); +impl_packed_swap_bytes!( + u32x8, + u8x32, + "avx2", + _mm256_shuffle_epi8, + ( + 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 27, + 26, 25, 24, 31, 30, 29, 28 + ), + (0, 1, 2, 3, 4, 5, 6, 7) +); +impl_packed_swap_bytes!( + u32x4, + u8x16, + "ssse3", + _mm_shuffle_epi8, + (3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12), + (0, 1, 2, 3) +); +impl_packed_swap_bytes!( + i32x16, + u8x64, + "avx512-butnotyet", + _mm512_permutexvar_epi8, + ( + 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 27, + 26, 25, 24, 31, 30, 29, 28, 35, 34, 33, 32, 39, 38, 37, 36, 43, 42, 41, 40, 47, 46, 45, 44, + 51, 50, 49, 48, 55, 54, 53, 52, 59, 58, 57, 56, 63, 62, 61, 60 + ), + (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) +); +impl_packed_swap_bytes!( + i32x8, + u8x32, + "avx2", + _mm256_shuffle_epi8, + ( + 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 27, + 26, 25, 24, 31, 30, 29, 28 + ), + (0, 1, 2, 3, 4, 5, 6, 7) +); +impl_packed_swap_bytes!( + i32x4, + u8x16, + "ssse3", + _mm_shuffle_epi8, + (3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12), + (0, 1, 2, 3) +); +impl_packed_swap_bytes!( + u64x8, + u8x64, + "avx512-butnotyet", + _mm512_permutexvar_epi8, + ( + 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, 17, 16, 31, + 30, 29, 28, 27, 26, 25, 24, 39, 38, 37, 36, 35, 34, 33, 32, 47, 46, 45, 44, 43, 42, 41, 40, + 55, 54, 53, 52, 51, 50, 49, 48, 63, 62, 61, 60, 59, 58, 57, 56 + ), + (0, 1, 2, 3, 4, 5, 6, 7) +); +impl_packed_swap_bytes!( + u64x4, + u8x32, + "avx2", + _mm256_shuffle_epi8, + ( + 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, 17, 16, 31, + 30, 29, 28, 27, 26, 25, 24 + ), + (0, 1, 2, 3) +); +impl_packed_swap_bytes!( + u64x2, + u8x16, + "ssse3", + _mm_shuffle_epi8, + (7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8), + (0, 1) +); +impl_packed_swap_bytes!( + i64x8, + u8x64, + "avx512-butnotyet", + _mm512_permutexvar_epi8, + ( + 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, 17, 16, 31, + 30, 29, 28, 27, 26, 25, 24, 39, 38, 37, 36, 35, 34, 33, 32, 47, 46, 45, 44, 43, 42, 41, 40, + 55, 54, 53, 52, 51, 50, 49, 48, 63, 62, 61, 60, 59, 58, 57, 56 + ), + (0, 1, 2, 3, 4, 5, 6, 7) +); +impl_packed_swap_bytes!( + i64x4, + u8x32, + "avx2", + _mm256_shuffle_epi8, + ( + 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, 17, 16, 31, + 30, 29, 28, 27, 26, 25, 24 + ), + (0, 1, 2, 3) +); +impl_packed_swap_bytes!( + i64x2, + u8x16, + "ssse3", + _mm_shuffle_epi8, + (7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8), + (0, 1) +); mod tests { - use crate::prelude::*; use crate::arch::current::vecs::*; + use crate::prelude::*; - test_packed_swap_bytes!((u8x64, u8x32, u8x16, i8x64, i8x32, i8x16, u16x32, u16x16, u16x8, i16x32, i16x16, i16x8, u32x16, u32x8, u32x4, i32x16, i32x8, i32x4, u64x8, u64x4, u64x2, i64x8, i64x4, i64x2), - (swap_bytes_u8x64, swap_bytes_u8x32, swap_bytes_u8x16, swap_bytes_i8x64, swap_bytes_i8x32, swap_bytes_i8x16, swap_bytes_u16x32, swap_bytes_u16x16, swap_bytes_u16x8, swap_bytes_i16x32, swap_bytes_i16x16, swap_bytes_i16x8, swap_bytes_u32x16, swap_bytes_u32x8, swap_bytes_u32x4, swap_bytes_i32x16, swap_bytes_i32x8, swap_bytes_i32x4, swap_bytes_u64x8, swap_bytes_u64x4, swap_bytes_u64x2, swap_bytes_i64x8, swap_bytes_i64x4, swap_bytes_i64x2)); + test_packed_swap_bytes!( + ( + u8x64, u8x32, u8x16, i8x64, i8x32, i8x16, u16x32, u16x16, u16x8, i16x32, i16x16, i16x8, + u32x16, u32x8, u32x4, i32x16, i32x8, i32x4, u64x8, u64x4, u64x2, i64x8, i64x4, i64x2 + ), + ( + swap_bytes_u8x64, + swap_bytes_u8x32, + swap_bytes_u8x16, + swap_bytes_i8x64, + swap_bytes_i8x32, + swap_bytes_i8x16, + swap_bytes_u16x32, + swap_bytes_u16x16, + swap_bytes_u16x8, + swap_bytes_i16x32, + swap_bytes_i16x16, + swap_bytes_i16x8, + swap_bytes_u32x16, + swap_bytes_u32x8, + swap_bytes_u32x4, + swap_bytes_i32x16, + swap_bytes_i32x8, + swap_bytes_i32x4, + swap_bytes_u64x8, + swap_bytes_u64x4, + swap_bytes_u64x2, + swap_bytes_i64x8, + swap_bytes_i64x4, + swap_bytes_i64x2 + ) + ); } diff --git a/src/arch/x86/intrin/eq.rs b/src/arch/x86/intrin/eq.rs index b6f70d5..1fd1599 100644 --- a/src/arch/x86/intrin/eq.rs +++ b/src/arch/x86/intrin/eq.rs @@ -5,12 +5,12 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::vektor::x86_64::*; -use crate::vektor::x86::*; +use crate::arch::current::vecs::*; use crate::core::ops::BitXor; use crate::intrin::eq::*; -use crate::arch::current::vecs::*; use crate::vecs::*; +use crate::vektor::x86::*; +use crate::vektor::x86_64::*; rust_fallback_eq! { impl Eq for u8x16 where "sse2" { @@ -135,8 +135,8 @@ rust_fallback_eq! { } mod tests { - use crate::prelude::*; use crate::arch::current::vecs::*; + use crate::prelude::*; // test_packed_eq!(u8x64, u8, u8x64, u8, test_eq_u8x64); test_packed_eq!(u8x32, u8, u8x32, u8, test_eq_u8x32); diff --git a/src/arch/x86/intrin/hadd.rs b/src/arch/x86/intrin/hadd.rs index ac8fb96..c75b5a7 100644 --- a/src/arch/x86/intrin/hadd.rs +++ b/src/arch/x86/intrin/hadd.rs @@ -5,21 +5,25 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::vektor::x86_64::*; -use crate::vektor::x86::*; -use crate::intrin::transmute::*; -use crate::intrin::hadd::*; -use crate::core::ops::Add; use crate::arch::current::vecs::*; +use crate::core::ops::Add; +use crate::intrin::hadd::*; +use crate::intrin::transmute::*; use crate::vecs::*; +use crate::vektor::x86::*; +use crate::vektor::x86_64::*; #[cfg(target_feature = "sse3")] impl HAdd for f32x4 { #[inline(always)] fn hadd(&self, other: Self) -> Self { optimized!(); - unsafe { _mm_hadd_ps(_mm_shuffle_ps(*self, other, 0b01000100), - _mm_shuffle_ps(*self, other, 0b11101110)) } + unsafe { + _mm_hadd_ps( + _mm_shuffle_ps(*self, other, 0b01000100), + _mm_shuffle_ps(*self, other, 0b11101110), + ) + } } } @@ -38,8 +42,12 @@ impl HAdd for f32x8 { #[inline(always)] fn hadd(&self, other: Self) -> Self { optimized!(); - unsafe { _mm256_hadd_ps(_mm256_unpacklo_epi64(self.be_i64s(), other.be_i64s()).be_f32s_unchecked(), - _mm256_unpackhi_epi64(self.be_i64s(), other.be_i64s()).be_f32s_unchecked()) } + unsafe { + _mm256_hadd_ps( + _mm256_unpacklo_epi64(self.be_i64s(), other.be_i64s()).be_f32s_unchecked(), + _mm256_unpackhi_epi64(self.be_i64s(), other.be_i64s()).be_f32s_unchecked(), + ) + } } } @@ -57,8 +65,12 @@ impl HAdd for i16x8 { #[inline(always)] fn hadd(&self, other: Self) -> Self { optimized!(); - unsafe { _mm_hadd_epi16(_mm_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), - _mm_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s()) } + unsafe { + _mm_hadd_epi16( + _mm_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), + _mm_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), + ) + } } } @@ -67,8 +79,12 @@ impl HAdd for i32x4 { #[inline(always)] fn hadd(&self, other: Self) -> Self { optimized!(); - unsafe { _mm_hadd_epi32(_mm_unpacklo_epi64(self.be_i64s(), other.be_i64s()).be_i32s(), - _mm_unpackhi_epi64(self.be_i64s(), other.be_i64s()).be_i32s()) } + unsafe { + _mm_hadd_epi32( + _mm_unpacklo_epi64(self.be_i64s(), other.be_i64s()).be_i32s(), + _mm_unpackhi_epi64(self.be_i64s(), other.be_i64s()).be_i32s(), + ) + } } } @@ -77,8 +93,12 @@ impl HAdd for i16x16 { #[inline(always)] fn hadd(&self, other: Self) -> Self { optimized!(); - unsafe { _mm256_hadd_epi16(_mm256_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), - _mm256_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s()) } + unsafe { + _mm256_hadd_epi16( + _mm256_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), + _mm256_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), + ) + } } } @@ -87,46 +107,517 @@ impl HAdd for i32x8 { #[inline(always)] fn hadd(&self, other: Self) -> Self { optimized!(); - unsafe { _mm256_hadd_epi32(_mm256_unpacklo_epi64(self.be_i64s(), other.be_i64s()).be_i32s(), - _mm256_unpackhi_epi64(self.be_i64s(), other.be_i64s()).be_i32s()) } + unsafe { + _mm256_hadd_epi32( + _mm256_unpacklo_epi64(self.be_i64s(), other.be_i64s()).be_i32s(), + _mm256_unpackhi_epi64(self.be_i64s(), other.be_i64s()).be_i32s(), + ) + } } } -impl HAdd for u64x2 { hop!(hadd, Add::add, 0, 1); } -impl HAdd for u64x4 { hop!(hadd, Add::add, 0, 1, 2, 3); } -impl HAdd for u64x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } -impl HAdd for u32x4 { hop!(hadd, Add::add, 0, 1, 2, 3); } -impl HAdd for u32x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } -impl HAdd for u32x16 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -impl HAdd for u16x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } -impl HAdd for u16x16 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -impl HAdd for u16x32 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } -impl HAdd for u8x16 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -impl HAdd for u8x32 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } -impl HAdd for u8x64 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); } -impl HAdd for i64x2 { hop!(hadd, Add::add, 0, 1); } -impl HAdd for i64x4 { hop!(hadd, Add::add, 0, 1, 2, 3); } -impl HAdd for i64x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } +impl HAdd for u64x2 { + hop!(hadd, Add::add, 0, 1); +} +impl HAdd for u64x4 { + hop!(hadd, Add::add, 0, 1, 2, 3); +} +impl HAdd for u64x8 { + hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); +} +impl HAdd for u32x4 { + hop!(hadd, Add::add, 0, 1, 2, 3); +} +impl HAdd for u32x8 { + hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); +} +impl HAdd for u32x16 { + hop!( + hadd, + Add::add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} +impl HAdd for u16x8 { + hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); +} +impl HAdd for u16x16 { + hop!( + hadd, + Add::add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} +impl HAdd for u16x32 { + hop!( + hadd, + Add::add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31 + ); +} +impl HAdd for u8x16 { + hop!( + hadd, + Add::add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} +impl HAdd for u8x32 { + hop!( + hadd, + Add::add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31 + ); +} +impl HAdd for u8x64 { + hop!( + hadd, + Add::add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63 + ); +} +impl HAdd for i64x2 { + hop!(hadd, Add::add, 0, 1); +} +impl HAdd for i64x4 { + hop!(hadd, Add::add, 0, 1, 2, 3); +} +impl HAdd for i64x8 { + hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); +} #[cfg(not(target_feature = "ssse3"))] -impl HAdd for i32x4 { hop!(hadd, Add::add, 0, 1, 2, 3); } +impl HAdd for i32x4 { + hop!(hadd, Add::add, 0, 1, 2, 3); +} #[cfg(not(target_feature = "avx2"))] -impl HAdd for i32x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } -impl HAdd for i32x16 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } +impl HAdd for i32x8 { + hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); +} +impl HAdd for i32x16 { + hop!( + hadd, + Add::add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} #[cfg(not(target_feature = "ssse3"))] -impl HAdd for i16x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } +impl HAdd for i16x8 { + hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); +} #[cfg(not(target_feature = "avx2"))] -impl HAdd for i16x16 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -impl HAdd for i16x32 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } -impl HAdd for i8x16 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -impl HAdd for i8x32 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } -impl HAdd for i8x64 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); } +impl HAdd for i16x16 { + hop!( + hadd, + Add::add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} +impl HAdd for i16x32 { + hop!( + hadd, + Add::add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31 + ); +} +impl HAdd for i8x16 { + hop!( + hadd, + Add::add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} +impl HAdd for i8x32 { + hop!( + hadd, + Add::add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31 + ); +} +impl HAdd for i8x64 { + hop!( + hadd, + Add::add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63 + ); +} #[cfg(not(target_feature = "sse3"))] -impl HAdd for f64x2 { hop!(hadd, Add::add, 0, 1); } +impl HAdd for f64x2 { + hop!(hadd, Add::add, 0, 1); +} #[cfg(not(target_feature = "avx"))] -impl HAdd for f64x4 { hop!(hadd, Add::add, 0, 1, 2, 3); } -impl HAdd for f64x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } +impl HAdd for f64x4 { + hop!(hadd, Add::add, 0, 1, 2, 3); +} +impl HAdd for f64x8 { + hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); +} #[cfg(not(target_feature = "sse3"))] -impl HAdd for f32x4 { hop!(hadd, Add::add, 0, 1, 2, 3); } +impl HAdd for f32x4 { + hop!(hadd, Add::add, 0, 1, 2, 3); +} #[cfg(not(target_feature = "avx2"))] -impl HAdd for f32x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } -impl HAdd for f32x16 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } +impl HAdd for f32x8 { + hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); +} +impl HAdd for f32x16 { + hop!( + hadd, + Add::add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} diff --git a/src/arch/x86/intrin/hsub.rs b/src/arch/x86/intrin/hsub.rs index 0fa38e7..db8deac 100644 --- a/src/arch/x86/intrin/hsub.rs +++ b/src/arch/x86/intrin/hsub.rs @@ -5,21 +5,25 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::vektor::x86_64::*; -use crate::vektor::x86::*; use crate::arch::current::vecs::*; -use crate::vecs::*; -use crate::intrin::transmute::*; -use crate::intrin::hsub::*; use crate::core::ops::Sub; +use crate::intrin::hsub::*; +use crate::intrin::transmute::*; +use crate::vecs::*; +use crate::vektor::x86::*; +use crate::vektor::x86_64::*; #[cfg(target_feature = "sse3")] impl HSub for f32x4 { #[inline(always)] fn hsub(&self, other: Self) -> Self { optimized!(); - unsafe { _mm_hsub_ps(_mm_shuffle_ps(*self, other, 0b01000100), - _mm_shuffle_ps(*self, other, 0b11101110)) } + unsafe { + _mm_hsub_ps( + _mm_shuffle_ps(*self, other, 0b01000100), + _mm_shuffle_ps(*self, other, 0b11101110), + ) + } } } @@ -37,8 +41,12 @@ impl HSub for f32x8 { #[inline(always)] fn hsub(&self, other: Self) -> Self { optimized!(); - unsafe { _mm256_hsub_ps(_mm256_unpacklo_epi64(self.be_i64s(), other.be_i64s()).be_f32s_unchecked(), - _mm256_unpackhi_epi64(self.be_i64s(), other.be_i64s()).be_f32s_unchecked()) } + unsafe { + _mm256_hsub_ps( + _mm256_unpacklo_epi64(self.be_i64s(), other.be_i64s()).be_f32s_unchecked(), + _mm256_unpackhi_epi64(self.be_i64s(), other.be_i64s()).be_f32s_unchecked(), + ) + } } } @@ -56,8 +64,12 @@ impl HSub for i16x8 { #[inline(always)] fn hsub(&self, other: Self) -> Self { optimized!(); - unsafe { _mm_hsub_epi16(_mm_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), - _mm_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s()) } + unsafe { + _mm_hsub_epi16( + _mm_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), + _mm_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), + ) + } } } @@ -66,8 +78,12 @@ impl HSub for i32x4 { #[inline(always)] fn hsub(&self, other: Self) -> Self { optimized!(); - unsafe { _mm_hsub_epi32(_mm_unpacklo_epi64(self.be_i64s(), other.be_i64s()).be_i32s(), - _mm_unpackhi_epi64(self.be_i64s(), other.be_i64s()).be_i32s()) } + unsafe { + _mm_hsub_epi32( + _mm_unpacklo_epi64(self.be_i64s(), other.be_i64s()).be_i32s(), + _mm_unpackhi_epi64(self.be_i64s(), other.be_i64s()).be_i32s(), + ) + } } } @@ -76,8 +92,12 @@ impl HSub for i16x16 { #[inline(always)] fn hsub(&self, other: Self) -> Self { optimized!(); - unsafe { _mm256_hsub_epi16(_mm256_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), - _mm256_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s()) } + unsafe { + _mm256_hsub_epi16( + _mm256_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), + _mm256_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), + ) + } } } @@ -86,46 +106,517 @@ impl HSub for i32x8 { #[inline(always)] fn hsub(&self, other: Self) -> Self { optimized!(); - unsafe { _mm256_hsub_epi32(_mm256_unpacklo_epi64(self.be_i64s(), other.be_i64s()).be_i32s(), - _mm256_unpackhi_epi64(self.be_i64s(), other.be_i64s()).be_i32s()) } + unsafe { + _mm256_hsub_epi32( + _mm256_unpacklo_epi64(self.be_i64s(), other.be_i64s()).be_i32s(), + _mm256_unpackhi_epi64(self.be_i64s(), other.be_i64s()).be_i32s(), + ) + } } } -impl HSub for u64x2 { hop!(hsub, Sub::sub, 0, 1); } -impl HSub for u64x4 { hop!(hsub, Sub::sub, 0, 1, 2, 3); } -impl HSub for u64x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } -impl HSub for u32x4 { hop!(hsub, Sub::sub, 0, 1, 2, 3); } -impl HSub for u32x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } -impl HSub for u32x16 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -impl HSub for u16x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } -impl HSub for u16x16 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -impl HSub for u16x32 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } -impl HSub for u8x16 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -impl HSub for u8x32 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } -impl HSub for u8x64 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); } -impl HSub for i64x2 { hop!(hsub, Sub::sub, 0, 1); } -impl HSub for i64x4 { hop!(hsub, Sub::sub, 0, 1, 2, 3); } -impl HSub for i64x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } +impl HSub for u64x2 { + hop!(hsub, Sub::sub, 0, 1); +} +impl HSub for u64x4 { + hop!(hsub, Sub::sub, 0, 1, 2, 3); +} +impl HSub for u64x8 { + hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); +} +impl HSub for u32x4 { + hop!(hsub, Sub::sub, 0, 1, 2, 3); +} +impl HSub for u32x8 { + hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); +} +impl HSub for u32x16 { + hop!( + hsub, + Sub::sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} +impl HSub for u16x8 { + hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); +} +impl HSub for u16x16 { + hop!( + hsub, + Sub::sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} +impl HSub for u16x32 { + hop!( + hsub, + Sub::sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31 + ); +} +impl HSub for u8x16 { + hop!( + hsub, + Sub::sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} +impl HSub for u8x32 { + hop!( + hsub, + Sub::sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31 + ); +} +impl HSub for u8x64 { + hop!( + hsub, + Sub::sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63 + ); +} +impl HSub for i64x2 { + hop!(hsub, Sub::sub, 0, 1); +} +impl HSub for i64x4 { + hop!(hsub, Sub::sub, 0, 1, 2, 3); +} +impl HSub for i64x8 { + hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); +} #[cfg(not(target_feature = "ssse3"))] -impl HSub for i32x4 { hop!(hsub, Sub::sub, 0, 1, 2, 3); } +impl HSub for i32x4 { + hop!(hsub, Sub::sub, 0, 1, 2, 3); +} #[cfg(not(target_feature = "avx2"))] -impl HSub for i32x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } -impl HSub for i32x16 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } +impl HSub for i32x8 { + hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); +} +impl HSub for i32x16 { + hop!( + hsub, + Sub::sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} #[cfg(not(target_feature = "ssse3"))] -impl HSub for i16x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } +impl HSub for i16x8 { + hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); +} #[cfg(not(target_feature = "avx2"))] -impl HSub for i16x16 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -impl HSub for i16x32 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } -impl HSub for i8x16 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -impl HSub for i8x32 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } -impl HSub for i8x64 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); } +impl HSub for i16x16 { + hop!( + hsub, + Sub::sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} +impl HSub for i16x32 { + hop!( + hsub, + Sub::sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31 + ); +} +impl HSub for i8x16 { + hop!( + hsub, + Sub::sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} +impl HSub for i8x32 { + hop!( + hsub, + Sub::sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31 + ); +} +impl HSub for i8x64 { + hop!( + hsub, + Sub::sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63 + ); +} #[cfg(not(target_feature = "sse3"))] -impl HSub for f64x2 { hop!(hsub, Sub::sub, 0, 1); } +impl HSub for f64x2 { + hop!(hsub, Sub::sub, 0, 1); +} #[cfg(not(target_feature = "avx"))] -impl HSub for f64x4 { hop!(hsub, Sub::sub, 0, 1, 2, 3); } -impl HSub for f64x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } +impl HSub for f64x4 { + hop!(hsub, Sub::sub, 0, 1, 2, 3); +} +impl HSub for f64x8 { + hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); +} #[cfg(not(target_feature = "sse3"))] -impl HSub for f32x4 { hop!(hsub, Sub::sub, 0, 1, 2, 3); } +impl HSub for f32x4 { + hop!(hsub, Sub::sub, 0, 1, 2, 3); +} #[cfg(not(target_feature = "avx2"))] -impl HSub for f32x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } -impl HSub for f32x16 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } +impl HSub for f32x8 { + hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); +} +impl HSub for f32x16 { + hop!( + hsub, + Sub::sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} diff --git a/src/arch/x86/intrin/merge.rs b/src/arch/x86/intrin/merge.rs index c51d98f..bea05ed 100644 --- a/src/arch/x86/intrin/merge.rs +++ b/src/arch/x86/intrin/merge.rs @@ -1,59 +1,779 @@ use crate::arch::current::vecs::*; -use crate::vecs::*; +use crate::core::mem::transmute; +use crate::intrin::merge::*; +use crate::intrin::transmute::*; use crate::vec_patterns::*; -use crate::vektor::x86_64::*; +use crate::vecs::*; use crate::vektor::x86::*; -use crate::intrin::transmute::*; -use crate::intrin::merge::*; -use crate::core::mem::transmute; +use crate::vektor::x86_64::*; // TODO: The AVX-512 version of this macro doesn't work; impl when stdsimd gets // around to it (and when I have some hardware to test it on). -impl_packed_merge!(u8x16, u8x16, u8, _mm_blendv_epi8, "sse4.1", (0, 1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14, 15), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); -impl_packed_merge!(u8x32, u8x32, u8, _mm256_blendv_epi8, "avx2", (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), (16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); -impl_packed_merge!(u8x64, u8x64, u8, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), (32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); +impl_packed_merge!( + u8x16, + u8x16, + u8, + _mm_blendv_epi8, + "sse4.1", + (0, 1, 2, 3, 4, 5, 6, 7), + (8, 9, 10, 11, 12, 13, 14, 15), + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 +); +impl_packed_merge!( + u8x32, + u8x32, + u8, + _mm256_blendv_epi8, + "avx2", + (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), + (16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31 +); +impl_packed_merge!( + u8x64, + u8x64, + u8, + _mm512_mask_mov_epi8, + "avx512-butnotyet", + ( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31 + ), + ( + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63 + ), + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63 +); -impl_packed_merge!(u16x8, u16x8, u16, _mm_blendv_epi8, "sse4.1", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); -impl_packed_merge!(u16x16, u16x16, u16, _mm256_blendv_epi8, "avx2", (0, 1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14, 15), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); -impl_packed_merge!(u16x32, u16x32, u16, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), (16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); +impl_packed_merge!( + u16x8, + u16x8, + u16, + _mm_blendv_epi8, + "sse4.1", + (0, 1, 2, 3), + (4, 5, 6, 7), + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7 +); +impl_packed_merge!( + u16x16, + u16x16, + u16, + _mm256_blendv_epi8, + "avx2", + (0, 1, 2, 3, 4, 5, 6, 7), + (8, 9, 10, 11, 12, 13, 14, 15), + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 +); +impl_packed_merge!( + u16x32, + u16x32, + u16, + _mm512_mask_mov_epi8, + "avx512-butnotyet", + (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), + (16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31 +); -impl_packed_merge!(u32x4, u32x4, u32, _mm_blendv_epi8, "sse4.1", (0, 1), (2, 3), 0, 1, 2, 3); -impl_packed_merge!(u32x8, u32x8, u32, _mm256_blendv_epi8, "avx2", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); -impl_packed_merge!(u32x16, u32x16, u32, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14, 15), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); +impl_packed_merge!( + u32x4, + u32x4, + u32, + _mm_blendv_epi8, + "sse4.1", + (0, 1), + (2, 3), + 0, + 1, + 2, + 3 +); +impl_packed_merge!( + u32x8, + u32x8, + u32, + _mm256_blendv_epi8, + "avx2", + (0, 1, 2, 3), + (4, 5, 6, 7), + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7 +); +impl_packed_merge!( + u32x16, + u32x16, + u32, + _mm512_mask_mov_epi8, + "avx512-butnotyet", + (0, 1, 2, 3, 4, 5, 6, 7), + (8, 9, 10, 11, 12, 13, 14, 15), + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 +); impl_packed_merge!(u64x2, u64x2, u64, _mm_blendv_epi8, "sse4.1", (0), (1), 0, 1); -impl_packed_merge!(u64x4, u64x4, u64, _mm256_blendv_epi8, "avx2", (0, 1), (2, 3), 0, 1, 2, 3); -impl_packed_merge!(u64x8, u64x8, u64, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); +impl_packed_merge!( + u64x4, + u64x4, + u64, + _mm256_blendv_epi8, + "avx2", + (0, 1), + (2, 3), + 0, + 1, + 2, + 3 +); +impl_packed_merge!( + u64x8, + u64x8, + u64, + _mm512_mask_mov_epi8, + "avx512-butnotyet", + (0, 1, 2, 3), + (4, 5, 6, 7), + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7 +); -impl_packed_merge!(i8x16, u8x16, u8, _mm_blendv_epi8, "sse4.1", (0, 1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14, 15), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); -impl_packed_merge!(i8x32, u8x32, u8, _mm256_blendv_epi8, "avx2", (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), (16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); -impl_packed_merge!(i8x64, u8x64, u8, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), (32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); +impl_packed_merge!( + i8x16, + u8x16, + u8, + _mm_blendv_epi8, + "sse4.1", + (0, 1, 2, 3, 4, 5, 6, 7), + (8, 9, 10, 11, 12, 13, 14, 15), + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 +); +impl_packed_merge!( + i8x32, + u8x32, + u8, + _mm256_blendv_epi8, + "avx2", + (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), + (16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31 +); +impl_packed_merge!( + i8x64, + u8x64, + u8, + _mm512_mask_mov_epi8, + "avx512-butnotyet", + ( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31 + ), + ( + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63 + ), + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63 +); -impl_packed_merge!(i16x8, u16x8, u16, _mm_blendv_epi8, "sse4.1", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); -impl_packed_merge!(i16x16, u16x16, u16, _mm256_blendv_epi8, "avx2", (0, 1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14, 15), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); -impl_packed_merge!(i16x32, u16x32, u16, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), (16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); +impl_packed_merge!( + i16x8, + u16x8, + u16, + _mm_blendv_epi8, + "sse4.1", + (0, 1, 2, 3), + (4, 5, 6, 7), + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7 +); +impl_packed_merge!( + i16x16, + u16x16, + u16, + _mm256_blendv_epi8, + "avx2", + (0, 1, 2, 3, 4, 5, 6, 7), + (8, 9, 10, 11, 12, 13, 14, 15), + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 +); +impl_packed_merge!( + i16x32, + u16x32, + u16, + _mm512_mask_mov_epi8, + "avx512-butnotyet", + (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), + (16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31 +); -impl_packed_merge!(i32x4, u32x4, u32, _mm_blendv_epi8, "sse4.1", (0, 1), (2, 3), 0, 1, 2, 3); -impl_packed_merge!(i32x8, u32x8, u32, _mm256_blendv_epi8, "avx2", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); -impl_packed_merge!(i32x16, u32x16, u32, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14, 15), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); +impl_packed_merge!( + i32x4, + u32x4, + u32, + _mm_blendv_epi8, + "sse4.1", + (0, 1), + (2, 3), + 0, + 1, + 2, + 3 +); +impl_packed_merge!( + i32x8, + u32x8, + u32, + _mm256_blendv_epi8, + "avx2", + (0, 1, 2, 3), + (4, 5, 6, 7), + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7 +); +impl_packed_merge!( + i32x16, + u32x16, + u32, + _mm512_mask_mov_epi8, + "avx512-butnotyet", + (0, 1, 2, 3, 4, 5, 6, 7), + (8, 9, 10, 11, 12, 13, 14, 15), + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 +); impl_packed_merge!(i64x2, u64x2, u64, _mm_blendv_epi8, "sse4.1", (0), (1), 0, 1); -impl_packed_merge!(i64x4, u64x4, u64, _mm256_blendv_epi8, "avx2", (0, 1), (2, 3), 0, 1, 2, 3); -impl_packed_merge!(i64x8, u64x8, u64, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); +impl_packed_merge!( + i64x4, + u64x4, + u64, + _mm256_blendv_epi8, + "avx2", + (0, 1), + (2, 3), + 0, + 1, + 2, + 3 +); +impl_packed_merge!( + i64x8, + u64x8, + u64, + _mm512_mask_mov_epi8, + "avx512-butnotyet", + (0, 1, 2, 3), + (4, 5, 6, 7), + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7 +); -impl_packed_merge!(f32x4, u32x4, u32, _mm_blendv_epi8, "sse4.1", (0, 1), (2, 3), 0, 1, 2, 3); -impl_packed_merge!(f32x8, u32x8, u32, _mm256_blendv_epi8, "avx2", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); -impl_packed_merge!(f32x16, u32x16, u32, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14, 15), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); +impl_packed_merge!( + f32x4, + u32x4, + u32, + _mm_blendv_epi8, + "sse4.1", + (0, 1), + (2, 3), + 0, + 1, + 2, + 3 +); +impl_packed_merge!( + f32x8, + u32x8, + u32, + _mm256_blendv_epi8, + "avx2", + (0, 1, 2, 3), + (4, 5, 6, 7), + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7 +); +impl_packed_merge!( + f32x16, + u32x16, + u32, + _mm512_mask_mov_epi8, + "avx512-butnotyet", + (0, 1, 2, 3, 4, 5, 6, 7), + (8, 9, 10, 11, 12, 13, 14, 15), + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 +); impl_packed_merge!(f64x2, u64x2, u64, _mm_blendv_epi8, "sse4.1", (0), (1), 0, 1); -impl_packed_merge!(f64x4, u64x4, u64, _mm256_blendv_epi8, "avx2", (0, 1), (2, 3), 0, 1, 2, 3); -impl_packed_merge!(f64x8, u64x8, u64, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); +impl_packed_merge!( + f64x4, + u64x4, + u64, + _mm256_blendv_epi8, + "avx2", + (0, 1), + (2, 3), + 0, + 1, + 2, + 3 +); +impl_packed_merge!( + f64x8, + u64x8, + u64, + _mm512_mask_mov_epi8, + "avx512-butnotyet", + (0, 1, 2, 3), + (4, 5, 6, 7), + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7 +); mod tests { - use crate::prelude::*; use crate::arch::current::vecs::*; + use crate::prelude::*; test_packed_merge!( - (u8x64, u8x32, u8x16, i8x64, i8x32, i8x16, u16x32, u16x16, u16x8, i16x32, i16x16, i16x8, u32x16, u32x8, u32x4, i32x16, i32x8, i32x4, f32x16, f32x8, f32x4, u64x8, u64x4, u64x2, i64x8, i64x4, i64x2, f64x8, f64x4, f64x2), - (merge_u8x64, merge_u8x32, merge_u8x16, merge_i8x64, merge_i8x32, merge_i8x16, merge_u16x32, merge_u16x16, merge_u16x8, merge_i16x32, merge_i16x16, merge_i16x8, merge_u32x16, merge_u32x8, merge_u32x4, merge_i32x16, merge_i32x8, merge_i32x4, merge_f32x16, merge_f32x8, merge_f32x4, merge_u64x8, merge_u64x4, merge_u64x2, merge_i64x8, merge_i64x4, merge_i64x2, merge_f64x8, merge_f64x4, merge_f64x2)); + ( + u8x64, u8x32, u8x16, i8x64, i8x32, i8x16, u16x32, u16x16, u16x8, i16x32, i16x16, i16x8, + u32x16, u32x8, u32x4, i32x16, i32x8, i32x4, f32x16, f32x8, f32x4, u64x8, u64x4, u64x2, + i64x8, i64x4, i64x2, f64x8, f64x4, f64x2 + ), + ( + merge_u8x64, + merge_u8x32, + merge_u8x16, + merge_i8x64, + merge_i8x32, + merge_i8x16, + merge_u16x32, + merge_u16x16, + merge_u16x8, + merge_i16x32, + merge_i16x16, + merge_i16x8, + merge_u32x16, + merge_u32x8, + merge_u32x4, + merge_i32x16, + merge_i32x8, + merge_i32x4, + merge_f32x16, + merge_f32x8, + merge_f32x4, + merge_u64x8, + merge_u64x4, + merge_u64x2, + merge_i64x8, + merge_i64x4, + merge_i64x2, + merge_f64x8, + merge_f64x4, + merge_f64x2 + ) + ); } diff --git a/src/arch/x86/intrin/mod.rs b/src/arch/x86/intrin/mod.rs index bd87c04..8b1ac7e 100644 --- a/src/arch/x86/intrin/mod.rs +++ b/src/arch/x86/intrin/mod.rs @@ -20,13 +20,13 @@ mod merge; mod popcnt; mod recip; mod round; -mod sum; mod rsqrt; mod saturating_add; mod saturating_hadd; -mod saturating_sub; mod saturating_hsub; +mod saturating_sub; mod sqrt; +mod sum; mod transmute; mod upcast; @@ -47,11 +47,11 @@ pub mod prelude { pub use super::recip::*; pub use super::round::*; pub use super::rsqrt::*; - pub use super::sum::*; pub use super::saturating_add::*; pub use super::saturating_hadd::*; pub use super::saturating_hsub::*; pub use super::saturating_sub::*; + pub use super::sum::*; pub use super::transmute::*; pub use super::upcast::*; } diff --git a/src/arch/x86/intrin/popcnt.rs b/src/arch/x86/intrin/popcnt.rs index 37665fa..5d0a433 100644 --- a/src/arch/x86/intrin/popcnt.rs +++ b/src/arch/x86/intrin/popcnt.rs @@ -5,16 +5,16 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::vektor::x86_64::*; -use crate::vektor::x86::*; -use crate::intrin::sum::*; -use crate::intrin::transmute::*; -use crate::intrin::popcnt::*; use crate::arch::current::intrin::upcast::*; -use crate::intrin::sum::UpcastSum; use crate::arch::current::vecs::*; +use crate::intrin::popcnt::*; +use crate::intrin::sum::UpcastSum; +use crate::intrin::sum::*; +use crate::intrin::transmute::*; use crate::intrin::upcast::*; use crate::vecs::*; +use crate::vektor::x86::*; +use crate::vektor::x86_64::*; #[inline(always)] #[cfg(target_feature = "ssse3")] @@ -27,7 +27,7 @@ unsafe fn popcnt128(v: u8x16) -> usize { let hi: u8x16 = v.be_u8s() >> 4; (_mm_shuffle_epi8(lookup, hi.be_i8s()).be_u8s() + _mm_shuffle_epi8(lookup, lo.be_i8s()).be_u8s()) - .sum_upcast() as usize + .sum_upcast() as usize } #[inline(always)] @@ -35,7 +35,8 @@ unsafe fn popcnt128(v: u8x16) -> usize { #[allow(unused_unsafe)] unsafe fn popcnt128(v: u8x16) -> usize { fallback!(); - v.be_u64s(). scalar_reduce(0, |acc, s| acc + (s.count_ones() as usize)) + v.be_u64s() + .scalar_reduce(0, |acc, s| acc + (s.count_ones() as usize)) } #[inline(always)] @@ -44,13 +45,15 @@ unsafe fn popcnt256(v: u8x32) -> usize { // AVX2 popcnt algorithm by Wojciech Muła, Nathan Kurz, and Daniel Lemire // https://arxiv.org/abs/1611.07612 optimized!(); - let lookup = i8x32::new(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); + let lookup = i8x32::new( + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, + 3, 4, + ); let lo = v.be_u8s() & 0x0f; let hi: u8x32 = v.be_u8s() >> 4; (_mm256_shuffle_epi8(lookup, hi.be_i8s()).be_u8s() + _mm256_shuffle_epi8(lookup, lo.be_i8s()).be_u8s()) - .sum_upcast() as usize + .sum_upcast() as usize } #[inline(always)] @@ -58,14 +61,16 @@ unsafe fn popcnt256(v: u8x32) -> usize { #[allow(unused_unsafe)] unsafe fn popcnt256(v: u8x32) -> usize { fallback!(); - v.be_u64s().scalar_reduce(0, |acc, s| acc + (s.count_ones() as usize)) + v.be_u64s() + .scalar_reduce(0, |acc, s| acc + (s.count_ones() as usize)) } #[inline(always)] // #[cfg(not(target_feature = "avx512"))] unsafe fn popcnt512(v: u8x64) -> usize { fallback!(); - v.be_u64s().scalar_reduce(0, |acc, s| acc + (s.count_ones() as usize)) + v.be_u64s() + .scalar_reduce(0, |acc, s| acc + (s.count_ones() as usize)) } impl_popcnt!(u8x64, popcnt512, u8x32, popcnt256, u8x16, popcnt128); @@ -79,10 +84,43 @@ impl_popcnt!(i64x8, popcnt512, i64x4, popcnt256, i64x2, popcnt128); #[cfg(test)] mod tests { - use crate::prelude::*; use crate::arch::current::vecs::*; + use crate::prelude::*; - test_popcnt!((u8, u8, u8, i8, i8, i8, u16, u16, u16, i16, i16, i16, u32, u32, u32, i32, i32, i32, u64, u64, u64, i64, i64, i64), - (u8x64, u8x32, u8x16, i8x64, i8x32, i8x16, u16x32, u16x16, u16x8, i16x32, i16x16, i16x8, u32x16, u32x8, u32x4, i32x16, i32x8, i32x4, u64x8, u64x4, u64x2, i64x8, i64x4, i64x2), - (popcnt_u8x64, popcnt_u8x32, popcnt_u8x16, popcnt_i8x64, popcnt_i8x32, popcnt_i8x16, popcnt_u16x32, popcnt_u16x16, popcnt_u16x8, popcnt_i16x32, popcnt_i16x16, popcnt_i16x8, popcnt_u32x16, popcnt_u32x8, popcnt_u32x4, popcnt_i32x16, popcnt_i32x8, popcnt_i32x4, popcnt_u64x8, popcnt_u64x4, popcnt_u64x2, popcnt_i64x8, popcnt_i64x4, popcnt_i64x2)); + test_popcnt!( + ( + u8, u8, u8, i8, i8, i8, u16, u16, u16, i16, i16, i16, u32, u32, u32, i32, i32, i32, + u64, u64, u64, i64, i64, i64 + ), + ( + u8x64, u8x32, u8x16, i8x64, i8x32, i8x16, u16x32, u16x16, u16x8, i16x32, i16x16, i16x8, + u32x16, u32x8, u32x4, i32x16, i32x8, i32x4, u64x8, u64x4, u64x2, i64x8, i64x4, i64x2 + ), + ( + popcnt_u8x64, + popcnt_u8x32, + popcnt_u8x16, + popcnt_i8x64, + popcnt_i8x32, + popcnt_i8x16, + popcnt_u16x32, + popcnt_u16x16, + popcnt_u16x8, + popcnt_i16x32, + popcnt_i16x16, + popcnt_i16x8, + popcnt_u32x16, + popcnt_u32x8, + popcnt_u32x4, + popcnt_i32x16, + popcnt_i32x8, + popcnt_i32x4, + popcnt_u64x8, + popcnt_u64x4, + popcnt_u64x2, + popcnt_i64x8, + popcnt_i64x4, + popcnt_i64x2 + ) + ); } diff --git a/src/arch/x86/intrin/recip.rs b/src/arch/x86/intrin/recip.rs index 5d5b75a..722679a 100644 --- a/src/arch/x86/intrin/recip.rs +++ b/src/arch/x86/intrin/recip.rs @@ -5,11 +5,11 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::vektor::x86_64::*; -use crate::vektor::x86::*; use crate::arch::current::vecs::*; -use crate::vecs::*; use crate::intrin::recip::Recip; +use crate::vecs::*; +use crate::vektor::x86::*; +use crate::vektor::x86_64::*; rust_fallback_impl! { impl Recip for f32x8 where "avx" { @@ -22,4 +22,3 @@ rust_fallback_impl! { recip => _mm_rcp_ps(), [0, 1, 2, 3]; } } - diff --git a/src/arch/x86/intrin/round.rs b/src/arch/x86/intrin/round.rs index 7cff34a..19b2550 100644 --- a/src/arch/x86/intrin/round.rs +++ b/src/arch/x86/intrin/round.rs @@ -5,12 +5,12 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::vektor::x86_64::*; -use crate::vektor::x86::*; -use crate::intrin::round::Round; -use crate::core::arch::x86_64::{_MM_FROUND_TO_NEAREST_INT, _MM_FROUND_TRUNC}; use crate::arch::current::vecs::*; +use crate::core::arch::x86_64::{_MM_FROUND_TO_NEAREST_INT, _MM_FROUND_TRUNC}; +use crate::intrin::round::Round; use crate::vecs::*; +use crate::vektor::x86::*; +use crate::vektor::x86_64::*; rust_fallback_impl! { impl Round for f32x4 where "sse4.1" { diff --git a/src/arch/x86/intrin/rsqrt.rs b/src/arch/x86/intrin/rsqrt.rs index a967620..a9c5d21 100644 --- a/src/arch/x86/intrin/rsqrt.rs +++ b/src/arch/x86/intrin/rsqrt.rs @@ -5,11 +5,11 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::vektor::x86_64::*; -use crate::vektor::x86::*; -use crate::intrin::rsqrt::*; use crate::arch::current::vecs::*; +use crate::intrin::rsqrt::*; use crate::vecs::*; +use crate::vektor::x86::*; +use crate::vektor::x86_64::*; // TODO: Guards and non-simd diff --git a/src/arch/x86/intrin/saturating_add.rs b/src/arch/x86/intrin/saturating_add.rs index a106c36..51d532a 100644 --- a/src/arch/x86/intrin/saturating_add.rs +++ b/src/arch/x86/intrin/saturating_add.rs @@ -5,11 +5,11 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::vektor::x86_64::*; -use crate::vektor::x86::*; use crate::arch::current::vecs::*; -use crate::vecs::*; use crate::intrin::saturating_add::*; +use crate::vecs::*; +use crate::vektor::x86::*; +use crate::vektor::x86_64::*; rust_fallback_impl_binary! { impl SaturatingAdd for u8x16 where "sse2" { diff --git a/src/arch/x86/intrin/saturating_hadd.rs b/src/arch/x86/intrin/saturating_hadd.rs index f863e4f..96167de 100644 --- a/src/arch/x86/intrin/saturating_hadd.rs +++ b/src/arch/x86/intrin/saturating_hadd.rs @@ -5,20 +5,24 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::vektor::x86_64::*; -use crate::vektor::x86::*; use crate::arch::current::vecs::*; -use crate::vecs::*; -use crate::intrin::transmute::*; use crate::intrin::saturating_hadd::*; +use crate::intrin::transmute::*; +use crate::vecs::*; +use crate::vektor::x86::*; +use crate::vektor::x86_64::*; #[cfg(target_feature = "ssse3")] impl SaturatingHAdd for i16x8 { #[inline(always)] fn saturating_hadd(&self, other: Self) -> Self { optimized!(); - unsafe { _mm_hadds_epi16(_mm_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), - _mm_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s()) } + unsafe { + _mm_hadds_epi16( + _mm_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), + _mm_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), + ) + } } } @@ -27,34 +31,474 @@ impl SaturatingHAdd for i16x16 { #[inline(always)] fn saturating_hadd(&self, other: Self) -> Self { optimized!(); - unsafe { _mm256_hadds_epi16(_mm256_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), - _mm256_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s()) } + unsafe { + _mm256_hadds_epi16( + _mm256_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), + _mm256_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), + ) + } } } -impl SaturatingHAdd for u64x2 { hop!(saturating_hadd, u64::saturating_add, 0, 1); } -impl SaturatingHAdd for u64x4 { hop!(saturating_hadd, u64::saturating_add, 0, 1, 2, 3); } -impl SaturatingHAdd for u64x8 { hop!(saturating_hadd, u64::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); } -impl SaturatingHAdd for u32x4 { hop!(saturating_hadd, u32::saturating_add, 0, 1, 2, 3); } -impl SaturatingHAdd for u32x8 { hop!(saturating_hadd, u32::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); } -impl SaturatingHAdd for u32x16 { hop!(saturating_hadd, u32::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -impl SaturatingHAdd for u16x8 { hop!(saturating_hadd, u16::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); } -impl SaturatingHAdd for u16x16 { hop!(saturating_hadd, u16::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -impl SaturatingHAdd for u16x32 { hop!(saturating_hadd, u16::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } -impl SaturatingHAdd for u8x16 { hop!(saturating_hadd, u8::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -impl SaturatingHAdd for u8x32 { hop!(saturating_hadd, u8::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } -impl SaturatingHAdd for u8x64 { hop!(saturating_hadd, u8::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); } -impl SaturatingHAdd for i64x2 { hop!(saturating_hadd, i64::saturating_add, 0, 1); } -impl SaturatingHAdd for i64x4 { hop!(saturating_hadd, i64::saturating_add, 0, 1, 2, 3); } -impl SaturatingHAdd for i64x8 { hop!(saturating_hadd, i64::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); } -impl SaturatingHAdd for i32x4 { hop!(saturating_hadd, i32::saturating_add, 0, 1, 2, 3); } -impl SaturatingHAdd for i32x8 { hop!(saturating_hadd, i32::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); } -impl SaturatingHAdd for i32x16 { hop!(saturating_hadd, i32::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } +impl SaturatingHAdd for u64x2 { + hop!(saturating_hadd, u64::saturating_add, 0, 1); +} +impl SaturatingHAdd for u64x4 { + hop!(saturating_hadd, u64::saturating_add, 0, 1, 2, 3); +} +impl SaturatingHAdd for u64x8 { + hop!(saturating_hadd, u64::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); +} +impl SaturatingHAdd for u32x4 { + hop!(saturating_hadd, u32::saturating_add, 0, 1, 2, 3); +} +impl SaturatingHAdd for u32x8 { + hop!(saturating_hadd, u32::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); +} +impl SaturatingHAdd for u32x16 { + hop!( + saturating_hadd, + u32::saturating_add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} +impl SaturatingHAdd for u16x8 { + hop!(saturating_hadd, u16::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); +} +impl SaturatingHAdd for u16x16 { + hop!( + saturating_hadd, + u16::saturating_add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} +impl SaturatingHAdd for u16x32 { + hop!( + saturating_hadd, + u16::saturating_add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31 + ); +} +impl SaturatingHAdd for u8x16 { + hop!( + saturating_hadd, + u8::saturating_add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} +impl SaturatingHAdd for u8x32 { + hop!( + saturating_hadd, + u8::saturating_add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31 + ); +} +impl SaturatingHAdd for u8x64 { + hop!( + saturating_hadd, + u8::saturating_add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63 + ); +} +impl SaturatingHAdd for i64x2 { + hop!(saturating_hadd, i64::saturating_add, 0, 1); +} +impl SaturatingHAdd for i64x4 { + hop!(saturating_hadd, i64::saturating_add, 0, 1, 2, 3); +} +impl SaturatingHAdd for i64x8 { + hop!(saturating_hadd, i64::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); +} +impl SaturatingHAdd for i32x4 { + hop!(saturating_hadd, i32::saturating_add, 0, 1, 2, 3); +} +impl SaturatingHAdd for i32x8 { + hop!(saturating_hadd, i32::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); +} +impl SaturatingHAdd for i32x16 { + hop!( + saturating_hadd, + i32::saturating_add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} #[cfg(not(target_feature = "ssse3"))] -impl SaturatingHAdd for i16x8 { hop!(saturating_hadd, i16::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); } +impl SaturatingHAdd for i16x8 { + hop!(saturating_hadd, i16::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); +} #[cfg(not(target_feature = "avx2"))] -impl SaturatingHAdd for i16x16 { hop!(saturating_hadd, i16::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -impl SaturatingHAdd for i16x32 { hop!(saturating_hadd, i16::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } -impl SaturatingHAdd for i8x16 { hop!(saturating_hadd, i8::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -impl SaturatingHAdd for i8x32 { hop!(saturating_hadd, i8::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } -impl SaturatingHAdd for i8x64 { hop!(saturating_hadd, i8::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); } +impl SaturatingHAdd for i16x16 { + hop!( + saturating_hadd, + i16::saturating_add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} +impl SaturatingHAdd for i16x32 { + hop!( + saturating_hadd, + i16::saturating_add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31 + ); +} +impl SaturatingHAdd for i8x16 { + hop!( + saturating_hadd, + i8::saturating_add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} +impl SaturatingHAdd for i8x32 { + hop!( + saturating_hadd, + i8::saturating_add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31 + ); +} +impl SaturatingHAdd for i8x64 { + hop!( + saturating_hadd, + i8::saturating_add, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63 + ); +} diff --git a/src/arch/x86/intrin/saturating_hsub.rs b/src/arch/x86/intrin/saturating_hsub.rs index 7ccf40a..a8f6a6e 100644 --- a/src/arch/x86/intrin/saturating_hsub.rs +++ b/src/arch/x86/intrin/saturating_hsub.rs @@ -5,20 +5,24 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::vektor::x86_64::*; -use crate::vektor::x86::*; use crate::arch::current::vecs::*; -use crate::vecs::*; -use crate::intrin::transmute::*; use crate::intrin::saturating_hsub::*; +use crate::intrin::transmute::*; +use crate::vecs::*; +use crate::vektor::x86::*; +use crate::vektor::x86_64::*; #[cfg(target_feature = "ssse3")] impl SaturatingHSub for i16x8 { #[inline(always)] fn saturating_hsub(&self, other: Self) -> Self { optimized!(); - unsafe { _mm_hsubs_epi16(_mm_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), - _mm_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s()) } + unsafe { + _mm_hsubs_epi16( + _mm_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), + _mm_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), + ) + } } } @@ -27,34 +31,474 @@ impl SaturatingHSub for i16x16 { #[inline(always)] fn saturating_hsub(&self, other: Self) -> Self { optimized!(); - unsafe { _mm256_hsubs_epi16(_mm256_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), - _mm256_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s()) } + unsafe { + _mm256_hsubs_epi16( + _mm256_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), + _mm256_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), + ) + } } } -impl SaturatingHSub for u64x2 { hop!(saturating_hsub, u64::saturating_sub, 0, 1); } -impl SaturatingHSub for u64x4 { hop!(saturating_hsub, u64::saturating_sub, 0, 1, 2, 3); } -impl SaturatingHSub for u64x8 { hop!(saturating_hsub, u64::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); } -impl SaturatingHSub for u32x4 { hop!(saturating_hsub, u32::saturating_sub, 0, 1, 2, 3); } -impl SaturatingHSub for u32x8 { hop!(saturating_hsub, u32::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); } -impl SaturatingHSub for u32x16 { hop!(saturating_hsub, u32::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -impl SaturatingHSub for u16x8 { hop!(saturating_hsub, u16::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); } -impl SaturatingHSub for u16x16 { hop!(saturating_hsub, u16::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -impl SaturatingHSub for u16x32 { hop!(saturating_hsub, u16::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } -impl SaturatingHSub for u8x16 { hop!(saturating_hsub, u8::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -impl SaturatingHSub for u8x32 { hop!(saturating_hsub, u8::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } -impl SaturatingHSub for u8x64 { hop!(saturating_hsub, u8::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); } -impl SaturatingHSub for i64x2 { hop!(saturating_hsub, i64::saturating_sub, 0, 1); } -impl SaturatingHSub for i64x4 { hop!(saturating_hsub, i64::saturating_sub, 0, 1, 2, 3); } -impl SaturatingHSub for i64x8 { hop!(saturating_hsub, i64::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); } -impl SaturatingHSub for i32x4 { hop!(saturating_hsub, i32::saturating_sub, 0, 1, 2, 3); } -impl SaturatingHSub for i32x8 { hop!(saturating_hsub, i32::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); } -impl SaturatingHSub for i32x16 { hop!(saturating_hsub, i32::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } +impl SaturatingHSub for u64x2 { + hop!(saturating_hsub, u64::saturating_sub, 0, 1); +} +impl SaturatingHSub for u64x4 { + hop!(saturating_hsub, u64::saturating_sub, 0, 1, 2, 3); +} +impl SaturatingHSub for u64x8 { + hop!(saturating_hsub, u64::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); +} +impl SaturatingHSub for u32x4 { + hop!(saturating_hsub, u32::saturating_sub, 0, 1, 2, 3); +} +impl SaturatingHSub for u32x8 { + hop!(saturating_hsub, u32::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); +} +impl SaturatingHSub for u32x16 { + hop!( + saturating_hsub, + u32::saturating_sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} +impl SaturatingHSub for u16x8 { + hop!(saturating_hsub, u16::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); +} +impl SaturatingHSub for u16x16 { + hop!( + saturating_hsub, + u16::saturating_sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} +impl SaturatingHSub for u16x32 { + hop!( + saturating_hsub, + u16::saturating_sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31 + ); +} +impl SaturatingHSub for u8x16 { + hop!( + saturating_hsub, + u8::saturating_sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} +impl SaturatingHSub for u8x32 { + hop!( + saturating_hsub, + u8::saturating_sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31 + ); +} +impl SaturatingHSub for u8x64 { + hop!( + saturating_hsub, + u8::saturating_sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63 + ); +} +impl SaturatingHSub for i64x2 { + hop!(saturating_hsub, i64::saturating_sub, 0, 1); +} +impl SaturatingHSub for i64x4 { + hop!(saturating_hsub, i64::saturating_sub, 0, 1, 2, 3); +} +impl SaturatingHSub for i64x8 { + hop!(saturating_hsub, i64::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); +} +impl SaturatingHSub for i32x4 { + hop!(saturating_hsub, i32::saturating_sub, 0, 1, 2, 3); +} +impl SaturatingHSub for i32x8 { + hop!(saturating_hsub, i32::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); +} +impl SaturatingHSub for i32x16 { + hop!( + saturating_hsub, + i32::saturating_sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} #[cfg(not(target_feature = "ssse3"))] -impl SaturatingHSub for i16x8 { hop!(saturating_hsub, i16::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); } +impl SaturatingHSub for i16x8 { + hop!(saturating_hsub, i16::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); +} #[cfg(not(target_feature = "avx2"))] -impl SaturatingHSub for i16x16 { hop!(saturating_hsub, i16::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -impl SaturatingHSub for i16x32 { hop!(saturating_hsub, i16::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } -impl SaturatingHSub for i8x16 { hop!(saturating_hsub, i8::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -impl SaturatingHSub for i8x32 { hop!(saturating_hsub, i8::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } -impl SaturatingHSub for i8x64 { hop!(saturating_hsub, i8::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); } +impl SaturatingHSub for i16x16 { + hop!( + saturating_hsub, + i16::saturating_sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} +impl SaturatingHSub for i16x32 { + hop!( + saturating_hsub, + i16::saturating_sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31 + ); +} +impl SaturatingHSub for i8x16 { + hop!( + saturating_hsub, + i8::saturating_sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15 + ); +} +impl SaturatingHSub for i8x32 { + hop!( + saturating_hsub, + i8::saturating_sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31 + ); +} +impl SaturatingHSub for i8x64 { + hop!( + saturating_hsub, + i8::saturating_sub, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63 + ); +} diff --git a/src/arch/x86/intrin/saturating_sub.rs b/src/arch/x86/intrin/saturating_sub.rs index 1e41ac8..9a7479b 100644 --- a/src/arch/x86/intrin/saturating_sub.rs +++ b/src/arch/x86/intrin/saturating_sub.rs @@ -5,11 +5,11 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::vektor::x86_64::*; -use crate::vektor::x86::*; -use crate::intrin::saturating_sub::*; use crate::arch::current::vecs::*; +use crate::intrin::saturating_sub::*; use crate::vecs::*; +use crate::vektor::x86::*; +use crate::vektor::x86_64::*; rust_fallback_impl_binary! { impl SaturatingSub for u8x16 where "sse2" { diff --git a/src/arch/x86/intrin/sqrt.rs b/src/arch/x86/intrin/sqrt.rs index 2f54666..d68342e 100644 --- a/src/arch/x86/intrin/sqrt.rs +++ b/src/arch/x86/intrin/sqrt.rs @@ -5,11 +5,11 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use vektor::x86_64::*; -use vektor::x86::*; -use crate::intrin::sqrt::*; use crate::arch::current::vecs::*; +use crate::intrin::sqrt::*; use crate::vecs::*; +use vektor::x86::*; +use vektor::x86_64::*; rust_fallback_impl! { impl Sqrt for f32x8 where "avx" { diff --git a/src/arch/x86/intrin/sum.rs b/src/arch/x86/intrin/sum.rs index 3b408de..eb525d5 100644 --- a/src/arch/x86/intrin/sum.rs +++ b/src/arch/x86/intrin/sum.rs @@ -1,13 +1,13 @@ use crate::arch::current::vecs::*; -use crate::vecs::*; -use crate::vektor::x86_64::*; -use crate::vektor::x86::*; use crate::core::ops::Add; -use crate::intrin::upcast::Upcast; -use crate::intrin::cmp::Cmp; use crate::intrin::abs::Abs; -use crate::intrin::sum::{Sum,UpcastSum}; +use crate::intrin::cmp::Cmp; +use crate::intrin::sum::{Sum, UpcastSum}; use crate::intrin::transmute::Transmute; +use crate::intrin::upcast::Upcast; +use crate::vecs::*; +use crate::vektor::x86::*; +use crate::vektor::x86_64::*; // TODO: Specialization // impl Sum for T where T : , T::Scalar : Add, T::Scalar : From { @@ -30,10 +30,16 @@ impl Sum for i8x16 { #[inline(always)] fn sum(&self) -> Self::Scalar { optimized!(); - let pos = unsafe { _mm_sad_epu8(self.max(Self::splat(0)).be_u8s(), u8x16::splat(0)).be_u16s() }; - let neg = unsafe { _mm_sad_epu8(self.min(Self::splat(0)).abs().be_u8s(), u8x16::splat(0)).be_u16s() }; - pos.extract(0).overflowing_sub(neg.extract(0)).0 - .overflowing_add(pos.extract(4).overflowing_sub(neg.extract(4)).0).0 as i8 + let pos = + unsafe { _mm_sad_epu8(self.max(Self::splat(0)).be_u8s(), u8x16::splat(0)).be_u16s() }; + let neg = unsafe { + _mm_sad_epu8(self.min(Self::splat(0)).abs().be_u8s(), u8x16::splat(0)).be_u16s() + }; + pos.extract(0) + .overflowing_sub(neg.extract(0)) + .0 + .overflowing_add(pos.extract(4).overflowing_sub(neg.extract(4)).0) + .0 as i8 } } @@ -42,10 +48,16 @@ impl UpcastSum for i8x16 { #[inline(always)] fn sum_upcast(&self) -> i64 { optimized!(); - let pos = unsafe { _mm_sad_epu8(self.max(Self::splat(0)).be_u8s(), u8x16::splat(0)).be_u16s() }; - let neg = unsafe { _mm_sad_epu8(self.min(Self::splat(0)).abs().be_u8s(), u8x16::splat(0)).be_u16s() }; - pos.extract(0).overflowing_sub(neg.extract(0)).0 - .overflowing_add(pos.extract(4).overflowing_sub(neg.extract(4)).0).0 as i8 as i64 + let pos = + unsafe { _mm_sad_epu8(self.max(Self::splat(0)).be_u8s(), u8x16::splat(0)).be_u16s() }; + let neg = unsafe { + _mm_sad_epu8(self.min(Self::splat(0)).abs().be_u8s(), u8x16::splat(0)).be_u16s() + }; + pos.extract(0) + .overflowing_sub(neg.extract(0)) + .0 + .overflowing_add(pos.extract(4).overflowing_sub(neg.extract(4)).0) + .0 as i8 as i64 } } @@ -54,12 +66,21 @@ impl Sum for i8x32 { #[inline(always)] fn sum(&self) -> Self::Scalar { optimized!(); - let pos = unsafe { _mm256_sad_epu8(self.max(Self::splat(0)).be_u8s(), u8x32::splat(0)).be_u16s() }; - let neg = unsafe { _mm256_sad_epu8(self.min(Self::splat(0)).abs().be_u8s(), u8x32::splat(0)).be_u16s() }; - pos.extract(0).overflowing_sub(neg.extract(0)).0 - .overflowing_add(pos.extract(4).overflowing_sub(neg.extract(4)).0).0 - .overflowing_add(pos.extract(8).overflowing_sub(neg.extract(8)).0).0 - .overflowing_add(pos.extract(12).overflowing_sub(neg.extract(12)).0).0 as i8 + let pos = unsafe { + _mm256_sad_epu8(self.max(Self::splat(0)).be_u8s(), u8x32::splat(0)).be_u16s() + }; + let neg = unsafe { + _mm256_sad_epu8(self.min(Self::splat(0)).abs().be_u8s(), u8x32::splat(0)).be_u16s() + }; + pos.extract(0) + .overflowing_sub(neg.extract(0)) + .0 + .overflowing_add(pos.extract(4).overflowing_sub(neg.extract(4)).0) + .0 + .overflowing_add(pos.extract(8).overflowing_sub(neg.extract(8)).0) + .0 + .overflowing_add(pos.extract(12).overflowing_sub(neg.extract(12)).0) + .0 as i8 } } @@ -68,12 +89,21 @@ impl UpcastSum for i8x32 { #[inline(always)] fn sum_upcast(&self) -> i64 { optimized!(); - let pos = unsafe { _mm256_sad_epu8(self.max(Self::splat(0)).be_u8s(), u8x32::splat(0)).be_u16s() }; - let neg = unsafe { _mm256_sad_epu8(self.min(Self::splat(0)).abs().be_u8s(), u8x32::splat(0)).be_u16s() }; - pos.extract(0).overflowing_sub(neg.extract(0)).0 - .overflowing_add(pos.extract(4).overflowing_sub(neg.extract(4)).0).0 - .overflowing_add(pos.extract(8).overflowing_sub(neg.extract(8)).0).0 - .overflowing_add(pos.extract(12).overflowing_sub(neg.extract(12)).0).0 as i8 as i64 + let pos = unsafe { + _mm256_sad_epu8(self.max(Self::splat(0)).be_u8s(), u8x32::splat(0)).be_u16s() + }; + let neg = unsafe { + _mm256_sad_epu8(self.min(Self::splat(0)).abs().be_u8s(), u8x32::splat(0)).be_u16s() + }; + pos.extract(0) + .overflowing_sub(neg.extract(0)) + .0 + .overflowing_add(pos.extract(4).overflowing_sub(neg.extract(4)).0) + .0 + .overflowing_add(pos.extract(8).overflowing_sub(neg.extract(8)).0) + .0 + .overflowing_add(pos.extract(12).overflowing_sub(neg.extract(12)).0) + .0 as i8 as i64 } } @@ -122,10 +152,11 @@ impl Sum for i16x8 { #[inline(always)] fn sum(&self) -> Self::Scalar { optimized!(); - let x = unsafe { + let x = unsafe { _mm_hadd_epi16( - _mm_hadd_epi16( - _mm_hadd_epi16(*self, Self::splat(0)), Self::splat(0)), Self::splat(0)) + _mm_hadd_epi16(_mm_hadd_epi16(*self, Self::splat(0)), Self::splat(0)), + Self::splat(0), + ) }; x.extract(0) } @@ -136,10 +167,11 @@ impl Sum for i16x16 { #[inline(always)] fn sum(&self) -> Self::Scalar { optimized!(); - let x = unsafe { + let x = unsafe { _mm256_hadd_epi16( - _mm256_hadd_epi16( - _mm256_hadd_epi16(*self, Self::splat(0)), Self::splat(0)), Self::splat(0)) + _mm256_hadd_epi16(_mm256_hadd_epi16(*self, Self::splat(0)), Self::splat(0)), + Self::splat(0), + ) }; x.extract(0) + x.extract(8) } @@ -152,10 +184,14 @@ impl UpcastSum for i16x16 { optimized!(); unsafe { let (a, b) = self.upcast(); - let x = _mm256_hadd_epi32( - _mm256_hadd_epi32(a.be_i32s(), i32x8::splat(0)), i32x8::splat(0)); - let y = _mm256_hadd_epi32( - _mm256_hadd_epi32(b.be_i32s(), i32x8::splat(0)), i32x8::splat(0)); + let x = _mm256_hadd_epi32( + _mm256_hadd_epi32(a.be_i32s(), i32x8::splat(0)), + i32x8::splat(0), + ); + let y = _mm256_hadd_epi32( + _mm256_hadd_epi32(b.be_i32s(), i32x8::splat(0)), + i32x8::splat(0), + ); (x.extract(0) + x.extract(4) + y.extract(0) + y.extract(4)) as i64 } } @@ -168,10 +204,14 @@ impl Sum for u16x16 { optimized!(); unsafe { let (a, b) = self.upcast(); - let x = _mm256_hadd_epi32( - _mm256_hadd_epi32(a.be_i32s(), i32x8::splat(0)), i32x8::splat(0)); - let y = _mm256_hadd_epi32( - _mm256_hadd_epi32(b.be_i32s(), i32x8::splat(0)), i32x8::splat(0)); + let x = _mm256_hadd_epi32( + _mm256_hadd_epi32(a.be_i32s(), i32x8::splat(0)), + i32x8::splat(0), + ); + let y = _mm256_hadd_epi32( + _mm256_hadd_epi32(b.be_i32s(), i32x8::splat(0)), + i32x8::splat(0), + ); (x.extract(0) + x.extract(4) + y.extract(0) + y.extract(4)) as u16 } } @@ -184,10 +224,14 @@ impl UpcastSum for u16x16 { optimized!(); unsafe { let (a, b) = self.upcast(); - let x = _mm256_hadd_epi32( - _mm256_hadd_epi32(a.be_i32s(), i32x8::splat(0)), i32x8::splat(0)); - let y = _mm256_hadd_epi32( - _mm256_hadd_epi32(b.be_i32s(), i32x8::splat(0)), i32x8::splat(0)); + let x = _mm256_hadd_epi32( + _mm256_hadd_epi32(a.be_i32s(), i32x8::splat(0)), + i32x8::splat(0), + ); + let y = _mm256_hadd_epi32( + _mm256_hadd_epi32(b.be_i32s(), i32x8::splat(0)), + i32x8::splat(0), + ); (x.extract(0) + x.extract(4) + y.extract(0) + y.extract(4)) as i64 } } @@ -198,10 +242,8 @@ impl Sum for i32x8 { #[inline(always)] fn sum(&self) -> Self::Scalar { optimized!(); - let x = unsafe { - _mm256_hadd_epi32( - _mm256_hadd_epi32(*self, Self::splat(0)), Self::splat(0)) - }; + let x = + unsafe { _mm256_hadd_epi32(_mm256_hadd_epi32(*self, Self::splat(0)), Self::splat(0)) }; x.extract(0) + x.extract(4) } } @@ -256,8 +298,14 @@ impl Sum for f64x4 { } } -impl_packed_sum!(u8x64, i8x64, u16x32, u16x8, i16x32, u32x16, u32x8, u32x4, i32x16, i32x4, f32x16, u64x8, u64x4, u64x2, i64x8, i64x4, i64x2, f64x8); -impl_packed_upcast_sum!(u8x64, i8x64, u16x32, u16x8, i16x32, i16x8, u32x16, u32x8, u32x4, i32x16, i32x8, i32x4, f32x16, f32x8, f32x4, u64x8, u64x4, u64x2, i64x8, i64x4, i64x2, f64x8, f64x4, f64x2); +impl_packed_sum!( + u8x64, i8x64, u16x32, u16x8, i16x32, u32x16, u32x8, u32x4, i32x16, i32x4, f32x16, u64x8, u64x4, + u64x2, i64x8, i64x4, i64x2, f64x8 +); +impl_packed_upcast_sum!( + u8x64, i8x64, u16x32, u16x8, i16x32, i16x8, u32x16, u32x8, u32x4, i32x16, i32x8, i32x4, f32x16, + f32x8, f32x4, u64x8, u64x4, u64x2, i64x8, i64x4, i64x2, f64x8, f64x4, f64x2 +); #[cfg(not(target_feature = "avx2"))] impl_packed_sum!(i8x32, u8x32, i16x16, u16x16, i32x8); @@ -281,8 +329,8 @@ impl_packed_sum!(i16x8); impl_packed_upcast_sum!(); mod tests { - use crate::prelude::*; use crate::arch::current::vecs::*; + use crate::prelude::*; test_packed_sum_int!(u8x64, u8, test_packed_sum_u8x64); test_packed_sum_int!(u8x32, u8, test_packed_sum_u8x32); diff --git a/src/arch/x86/intrin/transmute.rs b/src/arch/x86/intrin/transmute.rs index ad95b42..c6bbc5f 100644 --- a/src/arch/x86/intrin/transmute.rs +++ b/src/arch/x86/intrin/transmute.rs @@ -5,12 +5,12 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::vektor::x86_64::*; -use crate::vektor::x86::*; -use crate::intrin::transmute::*; use crate::arch::current::vecs::*; -use crate::vecs::*; use crate::core::mem::transmute; +use crate::intrin::transmute::*; +use crate::vecs::*; +use crate::vektor::x86::*; +use crate::vektor::x86_64::*; impl_packed_transmute!(u8x32, i8x32, u16x16, i16x16, u32x8, i32x8, f32x8, u64x4, i64x4, f64x4, ... diff --git a/src/arch/x86/intrin/upcast.rs b/src/arch/x86/intrin/upcast.rs index 0430c6d..3acf30f 100644 --- a/src/arch/x86/intrin/upcast.rs +++ b/src/arch/x86/intrin/upcast.rs @@ -6,12 +6,12 @@ // file, You can obtain one at http://mozilla.org/MPL/2.0/. use crate::arch::current::vecs::*; -use crate::vecs::*; +use crate::core::mem::transmute; use crate::intrin::transmute::*; use crate::intrin::upcast::*; -use crate::vektor::x86_64::*; +use crate::vecs::*; use crate::vektor::x86::*; -use crate::core::mem::transmute; +use crate::vektor::x86_64::*; impl Upcast for u8x16 { #[inline(always)] @@ -20,8 +20,10 @@ impl Upcast for u8x16 { // Shuffle the vector as i32s for better perf optimized!(); unsafe { - (_mm_cvtepu8_epi16(self).be_u16s(), - _mm_cvtepu8_epi16(_mm_shuffle_epi32(self.be_i32s(), 0x0E).be_u8s()).be_u16s()) + ( + _mm_cvtepu8_epi16(self).be_u16s(), + _mm_cvtepu8_epi16(_mm_shuffle_epi32(self.be_i32s(), 0x0E).be_u8s()).be_u16s(), + ) } } @@ -29,22 +31,28 @@ impl Upcast for u8x16 { #[cfg(not(target_feature = "sse4.1"))] fn upcast(self) -> (u16x8, u16x8) { fallback!(); - (u16x8::new(self.extract(0) as u16, - self.extract(1) as u16, - self.extract(2) as u16, - self.extract(3) as u16, - self.extract(4) as u16, - self.extract(5) as u16, - self.extract(6) as u16, - self.extract(7) as u16), - u16x8::new(self.extract(8) as u16, - self.extract(9) as u16, - self.extract(10) as u16, - self.extract(11) as u16, - self.extract(12) as u16, - self.extract(13) as u16, - self.extract(14) as u16, - self.extract(15) as u16)) + ( + u16x8::new( + self.extract(0) as u16, + self.extract(1) as u16, + self.extract(2) as u16, + self.extract(3) as u16, + self.extract(4) as u16, + self.extract(5) as u16, + self.extract(6) as u16, + self.extract(7) as u16, + ), + u16x8::new( + self.extract(8) as u16, + self.extract(9) as u16, + self.extract(10) as u16, + self.extract(11) as u16, + self.extract(12) as u16, + self.extract(13) as u16, + self.extract(14) as u16, + self.extract(15) as u16, + ), + ) } } @@ -55,8 +63,10 @@ impl Upcast for i8x16 { // Shuffle the vector as i32s for better perf optimized!(); unsafe { - (_mm_cvtepi8_epi16(self), - _mm_cvtepi8_epi16(_mm_shuffle_epi32(self.be_i32s(), 0x0E).be_i8s())) + ( + _mm_cvtepi8_epi16(self), + _mm_cvtepi8_epi16(_mm_shuffle_epi32(self.be_i32s(), 0x0E).be_i8s()), + ) } } @@ -64,22 +74,28 @@ impl Upcast for i8x16 { #[cfg(not(target_feature = "sse4.1"))] fn upcast(self) -> (i16x8, i16x8) { fallback!(); - (i16x8::new(self.extract(0) as i16, - self.extract(1) as i16, - self.extract(2) as i16, - self.extract(3) as i16, - self.extract(4) as i16, - self.extract(5) as i16, - self.extract(6) as i16, - self.extract(7) as i16), - i16x8::new(self.extract(8) as i16, - self.extract(9) as i16, - self.extract(10) as i16, - self.extract(11) as i16, - self.extract(12) as i16, - self.extract(13) as i16, - self.extract(14) as i16, - self.extract(15) as i16)) + ( + i16x8::new( + self.extract(0) as i16, + self.extract(1) as i16, + self.extract(2) as i16, + self.extract(3) as i16, + self.extract(4) as i16, + self.extract(5) as i16, + self.extract(6) as i16, + self.extract(7) as i16, + ), + i16x8::new( + self.extract(8) as i16, + self.extract(9) as i16, + self.extract(10) as i16, + self.extract(11) as i16, + self.extract(12) as i16, + self.extract(13) as i16, + self.extract(14) as i16, + self.extract(15) as i16, + ), + ) } } @@ -89,8 +105,10 @@ impl Upcast for u16x8 { fn upcast(self) -> (u32x4, u32x4) { optimized!(); unsafe { - (_mm_cvtepu16_epi32(self).be_u32s(), - _mm_cvtepu16_epi32(_mm_shuffle_epi32(self.be_i32s(), 0x0E).be_u16s()).be_u32s()) + ( + _mm_cvtepu16_epi32(self).be_u32s(), + _mm_cvtepu16_epi32(_mm_shuffle_epi32(self.be_i32s(), 0x0E).be_u16s()).be_u32s(), + ) } } @@ -98,14 +116,20 @@ impl Upcast for u16x8 { #[cfg(not(target_feature = "sse4.1"))] fn upcast(self) -> (u32x4, u32x4) { fallback!(); - (u32x4::new(self.extract(0) as u32, - self.extract(1) as u32, - self.extract(2) as u32, - self.extract(3) as u32), - u32x4::new(self.extract(4) as u32, - self.extract(5) as u32, - self.extract(6) as u32, - self.extract(7) as u32)) + ( + u32x4::new( + self.extract(0) as u32, + self.extract(1) as u32, + self.extract(2) as u32, + self.extract(3) as u32, + ), + u32x4::new( + self.extract(4) as u32, + self.extract(5) as u32, + self.extract(6) as u32, + self.extract(7) as u32, + ), + ) } } @@ -115,8 +139,10 @@ impl Upcast for i16x8 { fn upcast(self) -> (i32x4, i32x4) { optimized!(); unsafe { - (_mm_cvtepi16_epi32(self), - _mm_cvtepi16_epi32(_mm_shuffle_epi32(self.be_i32s(), 0x0E).be_i16s())) + ( + _mm_cvtepi16_epi32(self), + _mm_cvtepi16_epi32(_mm_shuffle_epi32(self.be_i32s(), 0x0E).be_i16s()), + ) } } @@ -124,14 +150,20 @@ impl Upcast for i16x8 { #[cfg(not(target_feature = "sse4.1"))] fn upcast(self) -> (i32x4, i32x4) { fallback!(); - (i32x4::new(self.extract(0) as i32, - self.extract(1) as i32, - self.extract(2) as i32, - self.extract(3) as i32), - i32x4::new(self.extract(4) as i32, - self.extract(5) as i32, - self.extract(6) as i32, - self.extract(7) as i32)) + ( + i32x4::new( + self.extract(0) as i32, + self.extract(1) as i32, + self.extract(2) as i32, + self.extract(3) as i32, + ), + i32x4::new( + self.extract(4) as i32, + self.extract(5) as i32, + self.extract(6) as i32, + self.extract(7) as i32, + ), + ) } } @@ -141,8 +173,13 @@ impl Upcast for u8x32 { fn upcast(self) -> (u16x16, u16x16) { optimized!(); unsafe { - (_mm256_cvtepu8_epi16(transmute(_mm256_castsi256_si128(transmute(self)))).be_u16s(), - _mm256_cvtepu8_epi16(transmute(_mm256_castsi256_si128(transmute(_mm256_permute4x64_epi64(self.be_i64s(), 0x0E).be_u16s())))).be_u16s()) + ( + _mm256_cvtepu8_epi16(transmute(_mm256_castsi256_si128(transmute(self)))).be_u16s(), + _mm256_cvtepu8_epi16(transmute(_mm256_castsi256_si128(transmute( + _mm256_permute4x64_epi64(self.be_i64s(), 0x0E).be_u16s(), + )))) + .be_u16s(), + ) } } @@ -150,38 +187,44 @@ impl Upcast for u8x32 { #[cfg(not(target_feature = "avx2"))] fn upcast(self) -> (u16x16, u16x16) { fallback!(); - (u16x16::new(self.extract(0) as u16, - self.extract(1) as u16, - self.extract(2) as u16, - self.extract(3) as u16, - self.extract(4) as u16, - self.extract(5) as u16, - self.extract(6) as u16, - self.extract(7) as u16, - self.extract(8) as u16, - self.extract(9) as u16, - self.extract(10) as u16, - self.extract(11) as u16, - self.extract(12) as u16, - self.extract(13) as u16, - self.extract(14) as u16, - self.extract(15) as u16), - u16x16::new(self.extract(16) as u16, - self.extract(17) as u16, - self.extract(18) as u16, - self.extract(19) as u16, - self.extract(20) as u16, - self.extract(21) as u16, - self.extract(22) as u16, - self.extract(23) as u16, - self.extract(24) as u16, - self.extract(25) as u16, - self.extract(26) as u16, - self.extract(27) as u16, - self.extract(28) as u16, - self.extract(29) as u16, - self.extract(30) as u16, - self.extract(31) as u16)) + ( + u16x16::new( + self.extract(0) as u16, + self.extract(1) as u16, + self.extract(2) as u16, + self.extract(3) as u16, + self.extract(4) as u16, + self.extract(5) as u16, + self.extract(6) as u16, + self.extract(7) as u16, + self.extract(8) as u16, + self.extract(9) as u16, + self.extract(10) as u16, + self.extract(11) as u16, + self.extract(12) as u16, + self.extract(13) as u16, + self.extract(14) as u16, + self.extract(15) as u16, + ), + u16x16::new( + self.extract(16) as u16, + self.extract(17) as u16, + self.extract(18) as u16, + self.extract(19) as u16, + self.extract(20) as u16, + self.extract(21) as u16, + self.extract(22) as u16, + self.extract(23) as u16, + self.extract(24) as u16, + self.extract(25) as u16, + self.extract(26) as u16, + self.extract(27) as u16, + self.extract(28) as u16, + self.extract(29) as u16, + self.extract(30) as u16, + self.extract(31) as u16, + ), + ) } } @@ -191,15 +234,12 @@ impl Upcast for i8x32 { fn upcast(self) -> (i16x16, i16x16) { optimized!(); unsafe { - (_mm256_cvtepi8_epi16( - transmute( - _mm256_castsi256_si128( - transmute(self)))), - _mm256_cvtepi8_epi16( - transmute( - _mm256_castsi256_si128( - transmute( - _mm256_permute4x64_epi64(self.be_i64s(), 0x0E)))))) + ( + _mm256_cvtepi8_epi16(transmute(_mm256_castsi256_si128(transmute(self)))), + _mm256_cvtepi8_epi16(transmute(_mm256_castsi256_si128(transmute( + _mm256_permute4x64_epi64(self.be_i64s(), 0x0E), + )))), + ) } } @@ -207,38 +247,44 @@ impl Upcast for i8x32 { #[cfg(not(target_feature = "avx2"))] fn upcast(self) -> (i16x16, i16x16) { fallback!(); - (i16x16::new(self.extract(0) as i16, - self.extract(1) as i16, - self.extract(2) as i16, - self.extract(3) as i16, - self.extract(4) as i16, - self.extract(5) as i16, - self.extract(6) as i16, - self.extract(7) as i16, - self.extract(8) as i16, - self.extract(9) as i16, - self.extract(10) as i16, - self.extract(11) as i16, - self.extract(12) as i16, - self.extract(13) as i16, - self.extract(14) as i16, - self.extract(15) as i16), - i16x16::new(self.extract(16) as i16, - self.extract(17) as i16, - self.extract(18) as i16, - self.extract(19) as i16, - self.extract(20) as i16, - self.extract(21) as i16, - self.extract(22) as i16, - self.extract(23) as i16, - self.extract(24) as i16, - self.extract(25) as i16, - self.extract(26) as i16, - self.extract(27) as i16, - self.extract(28) as i16, - self.extract(29) as i16, - self.extract(30) as i16, - self.extract(31) as i16)) + ( + i16x16::new( + self.extract(0) as i16, + self.extract(1) as i16, + self.extract(2) as i16, + self.extract(3) as i16, + self.extract(4) as i16, + self.extract(5) as i16, + self.extract(6) as i16, + self.extract(7) as i16, + self.extract(8) as i16, + self.extract(9) as i16, + self.extract(10) as i16, + self.extract(11) as i16, + self.extract(12) as i16, + self.extract(13) as i16, + self.extract(14) as i16, + self.extract(15) as i16, + ), + i16x16::new( + self.extract(16) as i16, + self.extract(17) as i16, + self.extract(18) as i16, + self.extract(19) as i16, + self.extract(20) as i16, + self.extract(21) as i16, + self.extract(22) as i16, + self.extract(23) as i16, + self.extract(24) as i16, + self.extract(25) as i16, + self.extract(26) as i16, + self.extract(27) as i16, + self.extract(28) as i16, + self.extract(29) as i16, + self.extract(30) as i16, + self.extract(31) as i16, + ), + ) } } @@ -248,8 +294,13 @@ impl Upcast for u16x16 { fn upcast(self) -> (u32x8, u32x8) { optimized!(); unsafe { - (_mm256_cvtepu16_epi32(transmute(_mm256_castsi256_si128(transmute(self)))).be_u32s(), - _mm256_cvtepu16_epi32(transmute(_mm256_castsi256_si128(transmute(_mm256_permute4x64_epi64(self.be_i64s(), 0x0E).be_u32s())))).be_u32s()) + ( + _mm256_cvtepu16_epi32(transmute(_mm256_castsi256_si128(transmute(self)))).be_u32s(), + _mm256_cvtepu16_epi32(transmute(_mm256_castsi256_si128(transmute( + _mm256_permute4x64_epi64(self.be_i64s(), 0x0E).be_u32s(), + )))) + .be_u32s(), + ) } } @@ -257,23 +308,28 @@ impl Upcast for u16x16 { #[cfg(not(target_feature = "avx2"))] fn upcast(self) -> (u32x8, u32x8) { fallback!(); - (u32x8::new(self.extract(0) as u32, - self.extract(1) as u32, - self.extract(2) as u32, - self.extract(3) as u32, - self.extract(4) as u32, - self.extract(5) as u32, - self.extract(6) as u32, - self.extract(7) as u32), - u32x8::new(self.extract(8) as u32, - self.extract(9) as u32, - self.extract(10) as u32, - self.extract(11) as u32, - self.extract(12) as u32, - self.extract(13) as u32, - self.extract(14) as u32, - self.extract(15) as u32)) - + ( + u32x8::new( + self.extract(0) as u32, + self.extract(1) as u32, + self.extract(2) as u32, + self.extract(3) as u32, + self.extract(4) as u32, + self.extract(5) as u32, + self.extract(6) as u32, + self.extract(7) as u32, + ), + u32x8::new( + self.extract(8) as u32, + self.extract(9) as u32, + self.extract(10) as u32, + self.extract(11) as u32, + self.extract(12) as u32, + self.extract(13) as u32, + self.extract(14) as u32, + self.extract(15) as u32, + ), + ) } } @@ -283,15 +339,12 @@ impl Upcast for i16x16 { fn upcast(self) -> (i32x8, i32x8) { optimized!(); unsafe { - (_mm256_cvtepi16_epi32( - transmute( - _mm256_castsi256_si128( - transmute(self)))), - _mm256_cvtepi16_epi32( - transmute( - _mm256_castsi256_si128( - transmute( - _mm256_permute4x64_epi64(self.be_i64s(), 0x0E)))))) + ( + _mm256_cvtepi16_epi32(transmute(_mm256_castsi256_si128(transmute(self)))), + _mm256_cvtepi16_epi32(transmute(_mm256_castsi256_si128(transmute( + _mm256_permute4x64_epi64(self.be_i64s(), 0x0E), + )))), + ) } } @@ -299,22 +352,28 @@ impl Upcast for i16x16 { #[cfg(not(target_feature = "avx2"))] fn upcast(self) -> (i32x8, i32x8) { fallback!(); - (i32x8::new(self.extract(0) as i32, - self.extract(1) as i32, - self.extract(2) as i32, - self.extract(3) as i32, - self.extract(4) as i32, - self.extract(5) as i32, - self.extract(6) as i32, - self.extract(7) as i32), - i32x8::new(self.extract(8) as i32, - self.extract(9) as i32, - self.extract(10) as i32, - self.extract(11) as i32, - self.extract(12) as i32, - self.extract(13) as i32, - self.extract(14) as i32, - self.extract(15) as i32)) + ( + i32x8::new( + self.extract(0) as i32, + self.extract(1) as i32, + self.extract(2) as i32, + self.extract(3) as i32, + self.extract(4) as i32, + self.extract(5) as i32, + self.extract(6) as i32, + self.extract(7) as i32, + ), + i32x8::new( + self.extract(8) as i32, + self.extract(9) as i32, + self.extract(10) as i32, + self.extract(11) as i32, + self.extract(12) as i32, + self.extract(13) as i32, + self.extract(14) as i32, + self.extract(15) as i32, + ), + ) } } @@ -324,17 +383,22 @@ impl Upcast for f32x4 { fn upcast(self) -> (f64x2, f64x2) { // Shuffle the vector as i32s for better perf optimized!(); - unsafe { (_mm_cvtps_pd(self), _mm_cvtps_pd(_mm_shuffle_epi32(self.be_i32s(), 0x0E).be_f32s_unchecked())) } + unsafe { + ( + _mm_cvtps_pd(self), + _mm_cvtps_pd(_mm_shuffle_epi32(self.be_i32s(), 0x0E).be_f32s_unchecked()), + ) + } } #[inline(always)] #[cfg(not(target_feature = "sse2"))] fn upcast(self) -> (f64x2, f64x2) { fallback!(); - (f64x2::new(self.extract(0) as f64, - self.extract(1) as f64), - f64x2::new(self.extract(2) as f64, - self.extract(3) as f64)) + ( + f64x2::new(self.extract(0) as f64, self.extract(1) as f64), + f64x2::new(self.extract(2) as f64, self.extract(3) as f64), + ) } } @@ -343,17 +407,22 @@ impl Upcast for i32x4 { #[cfg(target_feature = "sse2")] fn upcast(self) -> (f64x2, f64x2) { optimized!(); - unsafe { (_mm_cvtepi32_pd(self), _mm_cvtepi32_pd(_mm_shuffle_epi32(self, 0x0E))) } + unsafe { + ( + _mm_cvtepi32_pd(self), + _mm_cvtepi32_pd(_mm_shuffle_epi32(self, 0x0E)), + ) + } } #[inline(always)] #[cfg(not(target_feature = "sse2"))] fn upcast(self) -> (f64x2, f64x2) { fallback!(); - (f64x2::new(self.extract(0) as f64, - self.extract(1) as f64), - f64x2::new(self.extract(2) as f64, - self.extract(3) as f64)) + ( + f64x2::new(self.extract(0) as f64, self.extract(1) as f64), + f64x2::new(self.extract(2) as f64, self.extract(3) as f64), + ) } } @@ -363,8 +432,10 @@ impl Upcast for i32x4 { fn upcast(self) -> (i64x2, i64x2) { optimized!(); unsafe { - (_mm_cvtepi32_epi64(self), - _mm_cvtepi32_epi64(_mm_shuffle_epi32(self, 0x0E))) + ( + _mm_cvtepi32_epi64(self), + _mm_cvtepi32_epi64(_mm_shuffle_epi32(self, 0x0E)), + ) } } @@ -372,10 +443,10 @@ impl Upcast for i32x4 { #[cfg(not(target_feature = "sse4.1"))] fn upcast(self) -> (i64x2, i64x2) { fallback!(); - (i64x2::new(self.extract(0) as i64, - self.extract(1) as i64), - i64x2::new(self.extract(2) as i64, - self.extract(3) as i64)) + ( + i64x2::new(self.extract(0) as i64, self.extract(1) as i64), + i64x2::new(self.extract(2) as i64, self.extract(3) as i64), + ) } } @@ -385,18 +456,21 @@ impl Upcast for u32x4 { fn upcast(self) -> (u64x2, u64x2) { optimized!(); unsafe { - (_mm_cvtepu32_epi64(self).be_u64s(), - _mm_cvtepu32_epi64(_mm_shuffle_epi32(self.be_i32s(), 0x0E).be_u32s()).be_u64s()) } + ( + _mm_cvtepu32_epi64(self).be_u64s(), + _mm_cvtepu32_epi64(_mm_shuffle_epi32(self.be_i32s(), 0x0E).be_u32s()).be_u64s(), + ) + } } #[inline(always)] #[cfg(not(target_feature = "sse4.1"))] fn upcast(self) -> (u64x2, u64x2) { fallback!(); - (u64x2::new(self.extract(0) as u64, - self.extract(1) as u64), - u64x2::new(self.extract(2) as u64, - self.extract(3) as u64)) + ( + u64x2::new(self.extract(0) as u64, self.extract(1) as u64), + u64x2::new(self.extract(2) as u64, self.extract(3) as u64), + ) } } @@ -407,15 +481,12 @@ impl Upcast for f32x8 { // Shuffle the vector as i32s for better perf optimized!(); unsafe { - (_mm256_cvtps_pd( - transmute( - _mm256_castsi256_si128( - transmute(self)))), - _mm256_cvtps_pd( - transmute( - _mm256_castsi256_si128( - transmute( - _mm256_permute4x64_epi64(self.be_i64s(), 0x0E)))))) + ( + _mm256_cvtps_pd(transmute(_mm256_castsi256_si128(transmute(self)))), + _mm256_cvtps_pd(transmute(_mm256_castsi256_si128(transmute( + _mm256_permute4x64_epi64(self.be_i64s(), 0x0E), + )))), + ) } } @@ -423,14 +494,20 @@ impl Upcast for f32x8 { #[cfg(not(target_feature = "avx2"))] fn upcast(self) -> (f64x4, f64x4) { fallback!(); - (f64x4::new(self.extract(0) as f64, - self.extract(1) as f64, - self.extract(2) as f64, - self.extract(3) as f64), - f64x4::new(self.extract(4) as f64, - self.extract(5) as f64, - self.extract(6) as f64, - self.extract(7) as f64)) + ( + f64x4::new( + self.extract(0) as f64, + self.extract(1) as f64, + self.extract(2) as f64, + self.extract(3) as f64, + ), + f64x4::new( + self.extract(4) as f64, + self.extract(5) as f64, + self.extract(6) as f64, + self.extract(7) as f64, + ), + ) } } @@ -440,15 +517,12 @@ impl Upcast for i32x8 { fn upcast(self) -> (f64x4, f64x4) { optimized!(); unsafe { - (_mm256_cvtepi32_pd( - transmute( - _mm256_castsi256_si128( - transmute(self)))), - _mm256_cvtepi32_pd( - transmute( - _mm256_castsi256_si128( - transmute( - _mm256_permute4x64_epi64(self.be_i64s(), 0x0E)))))) + ( + _mm256_cvtepi32_pd(transmute(_mm256_castsi256_si128(transmute(self)))), + _mm256_cvtepi32_pd(transmute(_mm256_castsi256_si128(transmute( + _mm256_permute4x64_epi64(self.be_i64s(), 0x0E), + )))), + ) } } @@ -456,14 +530,20 @@ impl Upcast for i32x8 { #[cfg(not(target_feature = "avx2"))] fn upcast(self) -> (f64x4, f64x4) { fallback!(); - (f64x4::new(self.extract(0) as f64, - self.extract(1) as f64, - self.extract(2) as f64, - self.extract(3) as f64), - f64x4::new(self.extract(4) as f64, - self.extract(5) as f64, - self.extract(6) as f64, - self.extract(7) as f64)) + ( + f64x4::new( + self.extract(0) as f64, + self.extract(1) as f64, + self.extract(2) as f64, + self.extract(3) as f64, + ), + f64x4::new( + self.extract(4) as f64, + self.extract(5) as f64, + self.extract(6) as f64, + self.extract(7) as f64, + ), + ) } } @@ -473,15 +553,12 @@ impl Upcast for i32x8 { fn upcast(self) -> (i64x4, i64x4) { optimized!(); unsafe { - (_mm256_cvtepi32_epi64( - transmute( - _mm256_castsi256_si128( - transmute(self)))), - _mm256_cvtepi32_epi64( - transmute( - _mm256_castsi256_si128( - transmute( - _mm256_permute4x64_epi64(self.be_i64s(), 0x0E)))))) + ( + _mm256_cvtepi32_epi64(transmute(_mm256_castsi256_si128(transmute(self)))), + _mm256_cvtepi32_epi64(transmute(_mm256_castsi256_si128(transmute( + _mm256_permute4x64_epi64(self.be_i64s(), 0x0E), + )))), + ) } } @@ -489,14 +566,20 @@ impl Upcast for i32x8 { #[cfg(not(target_feature = "avx2"))] fn upcast(self) -> (i64x4, i64x4) { fallback!(); - (i64x4::new(self.extract(0) as i64, - self.extract(1) as i64, - self.extract(2) as i64, - self.extract(3) as i64), - i64x4::new(self.extract(4) as i64, - self.extract(5) as i64, - self.extract(6) as i64, - self.extract(7) as i64)) + ( + i64x4::new( + self.extract(0) as i64, + self.extract(1) as i64, + self.extract(2) as i64, + self.extract(3) as i64, + ), + i64x4::new( + self.extract(4) as i64, + self.extract(5) as i64, + self.extract(6) as i64, + self.extract(7) as i64, + ), + ) } } @@ -506,22 +589,34 @@ impl Upcast for u32x8 { fn upcast(self) -> (u64x4, u64x4) { optimized!(); unsafe { - (_mm256_cvtepu32_epi64(transmute(_mm256_castsi256_si128(transmute(self)))).be_u64s(), - _mm256_cvtepu32_epi64(transmute(_mm256_castsi256_si128(transmute(_mm256_permute4x64_epi64(transmute(self), 0x0E).be_u32s())))).be_u64s()) } + ( + _mm256_cvtepu32_epi64(transmute(_mm256_castsi256_si128(transmute(self)))).be_u64s(), + _mm256_cvtepu32_epi64(transmute(_mm256_castsi256_si128(transmute( + _mm256_permute4x64_epi64(transmute(self), 0x0E).be_u32s(), + )))) + .be_u64s(), + ) + } } #[inline(always)] #[cfg(not(target_feature = "avx2"))] fn upcast(self) -> (u64x4, u64x4) { fallback!(); - (u64x4::new(self.extract(0) as u64, - self.extract(1) as u64, - self.extract(2) as u64, - self.extract(3) as u64), - u64x4::new(self.extract(4) as u64, - self.extract(5) as u64, - self.extract(6) as u64, - self.extract(7) as u64)) + ( + u64x4::new( + self.extract(0) as u64, + self.extract(1) as u64, + self.extract(2) as u64, + self.extract(3) as u64, + ), + u64x4::new( + self.extract(4) as u64, + self.extract(5) as u64, + self.extract(6) as u64, + self.extract(7) as u64, + ), + ) } } @@ -529,22 +624,28 @@ impl Upcast for f32x16 { #[inline(always)] fn upcast(self) -> (f64x8, f64x8) { fallback!(); - (f64x8::new(self.extract(0) as f64, - self.extract(1) as f64, - self.extract(2) as f64, - self.extract(3) as f64, - self.extract(4) as f64, - self.extract(5) as f64, - self.extract(6) as f64, - self.extract(7) as f64), - f64x8::new(self.extract(8) as f64, - self.extract(9) as f64, - self.extract(10) as f64, - self.extract(11) as f64, - self.extract(12) as f64, - self.extract(13) as f64, - self.extract(14) as f64, - self.extract(15) as f64)) + ( + f64x8::new( + self.extract(0) as f64, + self.extract(1) as f64, + self.extract(2) as f64, + self.extract(3) as f64, + self.extract(4) as f64, + self.extract(5) as f64, + self.extract(6) as f64, + self.extract(7) as f64, + ), + f64x8::new( + self.extract(8) as f64, + self.extract(9) as f64, + self.extract(10) as f64, + self.extract(11) as f64, + self.extract(12) as f64, + self.extract(13) as f64, + self.extract(14) as f64, + self.extract(15) as f64, + ), + ) } } @@ -552,22 +653,28 @@ impl Upcast for i32x16 { #[inline(always)] fn upcast(self) -> (f64x8, f64x8) { fallback!(); - (f64x8::new(self.extract(0) as f64, - self.extract(1) as f64, - self.extract(2) as f64, - self.extract(3) as f64, - self.extract(4) as f64, - self.extract(5) as f64, - self.extract(6) as f64, - self.extract(7) as f64), - f64x8::new(self.extract(8) as f64, - self.extract(9) as f64, - self.extract(10) as f64, - self.extract(11) as f64, - self.extract(12) as f64, - self.extract(13) as f64, - self.extract(14) as f64, - self.extract(15) as f64)) + ( + f64x8::new( + self.extract(0) as f64, + self.extract(1) as f64, + self.extract(2) as f64, + self.extract(3) as f64, + self.extract(4) as f64, + self.extract(5) as f64, + self.extract(6) as f64, + self.extract(7) as f64, + ), + f64x8::new( + self.extract(8) as f64, + self.extract(9) as f64, + self.extract(10) as f64, + self.extract(11) as f64, + self.extract(12) as f64, + self.extract(13) as f64, + self.extract(14) as f64, + self.extract(15) as f64, + ), + ) } } @@ -575,22 +682,28 @@ impl Upcast for i32x16 { #[inline(always)] fn upcast(self) -> (i64x8, i64x8) { fallback!(); - (i64x8::new(self.extract(0) as i64, - self.extract(1) as i64, - self.extract(2) as i64, - self.extract(3) as i64, - self.extract(4) as i64, - self.extract(5) as i64, - self.extract(6) as i64, - self.extract(7) as i64), - i64x8::new(self.extract(8) as i64, - self.extract(9) as i64, - self.extract(10) as i64, - self.extract(11) as i64, - self.extract(12) as i64, - self.extract(13) as i64, - self.extract(14) as i64, - self.extract(15) as i64)) + ( + i64x8::new( + self.extract(0) as i64, + self.extract(1) as i64, + self.extract(2) as i64, + self.extract(3) as i64, + self.extract(4) as i64, + self.extract(5) as i64, + self.extract(6) as i64, + self.extract(7) as i64, + ), + i64x8::new( + self.extract(8) as i64, + self.extract(9) as i64, + self.extract(10) as i64, + self.extract(11) as i64, + self.extract(12) as i64, + self.extract(13) as i64, + self.extract(14) as i64, + self.extract(15) as i64, + ), + ) } } @@ -598,21 +711,27 @@ impl Upcast for u32x16 { #[inline(always)] fn upcast(self) -> (u64x8, u64x8) { fallback!(); - (u64x8::new(self.extract(0) as u64, - self.extract(1) as u64, - self.extract(2) as u64, - self.extract(3) as u64, - self.extract(4) as u64, - self.extract(5) as u64, - self.extract(6) as u64, - self.extract(7) as u64), - u64x8::new(self.extract(8) as u64, - self.extract(9) as u64, - self.extract(10) as u64, - self.extract(11) as u64, - self.extract(12) as u64, - self.extract(13) as u64, - self.extract(14) as u64, - self.extract(15) as u64)) + ( + u64x8::new( + self.extract(0) as u64, + self.extract(1) as u64, + self.extract(2) as u64, + self.extract(3) as u64, + self.extract(4) as u64, + self.extract(5) as u64, + self.extract(6) as u64, + self.extract(7) as u64, + ), + u64x8::new( + self.extract(8) as u64, + self.extract(9) as u64, + self.extract(10) as u64, + self.extract(11) as u64, + self.extract(12) as u64, + self.extract(13) as u64, + self.extract(14) as u64, + self.extract(15) as u64, + ), + ) } } diff --git a/src/arch/x86/mod.rs b/src/arch/x86/mod.rs index a2be36e..9bc51e7 100644 --- a/src/arch/x86/mod.rs +++ b/src/arch/x86/mod.rs @@ -1,3 +1,3 @@ pub mod intrin; -pub mod vecs; pub mod vec_patterns; +pub mod vecs; diff --git a/src/arch/x86/vec_patterns.rs b/src/arch/x86/vec_patterns.rs index d5221cc..e7f36e8 100644 --- a/src/arch/x86/vec_patterns.rs +++ b/src/arch/x86/vec_patterns.rs @@ -17,685 +17,1846 @@ use crate::vecs::*; use vektor::x86::*; -const PART_MASK: [u8; 128] = [0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF]; +const PART_MASK: [u8; 128] = [ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, +]; impl Pattern for u8x64 { #[inline(always)] fn halfs(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo) + Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ) } #[inline(always)] fn interleave(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo) + Self::new( + hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, + hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, + hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, + ) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "avx512-notyet")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm512_mask_mov_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "avx512-notyet")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm512_mask_mov_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "avx512-notyet"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { assert!(off <= Self::WIDTH); fallback!(); match off { - 0 => Self::new(lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 1 => Self::new(hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 2 => Self::new(hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 3 => Self::new(hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 4 => Self::new(hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 5 => Self::new(hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 6 => Self::new(hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 7 => Self::new(hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 8 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 9 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 10 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 11 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 12 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 13 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 14 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 15 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 16 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 17 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 18 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 19 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 20 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 21 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 22 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 23 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 24 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 25 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 26 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 27 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 28 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 29 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 30 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 31 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 32 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 33 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 34 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 35 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 36 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 37 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 38 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 39 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 40 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 41 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 42 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 43 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 44 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 45 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 46 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 47 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 48 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 49 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 50 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 51 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 52 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 53 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 54 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 55 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 56 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo), - 57 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo), - 58 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo), - 59 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo), - 60 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo), - 61 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo), - 62 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo), - 63 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo), - 64 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi), - _ => unreachable!() + 0 => Self::new( + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 1 => Self::new( + hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 2 => Self::new( + hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 3 => Self::new( + hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 4 => Self::new( + hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 5 => Self::new( + hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 6 => Self::new( + hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 7 => Self::new( + hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 8 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 9 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 10 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 11 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 12 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 13 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 14 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 15 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 16 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 17 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 18 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 19 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 20 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 21 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 22 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 23 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 24 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 25 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 26 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 27 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 28 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 29 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 30 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 31 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 32 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 33 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 34 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 35 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 36 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 37 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 38 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 39 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 40 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 41 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 42 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 43 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 44 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 45 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 46 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 47 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 48 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 49 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 50 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 51 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 52 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 53 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 54 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 55 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 56 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 57 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, + lo, + ), + 58 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, + lo, + ), + 59 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, + lo, + ), + 60 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, + lo, + ), + 61 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, + lo, + ), + 62 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, + lo, + ), + 63 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + lo, + ), + 64 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, + ), + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFu8) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFu8) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x00u8) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x00u8) }) + } } impl Pattern for u8x32 { #[inline(always)] fn halfs(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo) + Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ) } #[inline(always)] fn interleave(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo) + Self::new( + hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, + hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, + ) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "avx2")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm256_blendv_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "avx2")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm256_blendv_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "avx2"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { assert!(off <= Self::WIDTH); fallback!(); match off { - 0 => Self::new(lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 1 => Self::new(hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 2 => Self::new(hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 3 => Self::new(hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 4 => Self::new(hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 5 => Self::new(hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 6 => Self::new(hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 7 => Self::new(hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 8 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 9 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 10 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 11 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 12 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 13 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 14 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 15 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 16 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 17 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 18 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 19 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 20 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 21 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 22 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 23 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 24 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo), - 25 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo), - 26 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo), - 27 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo), - 28 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo), - 29 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo), - 30 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo), - 31 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo), - 32 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi), - _ => unreachable!() + 0 => Self::new( + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 1 => Self::new( + hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 2 => Self::new( + hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 3 => Self::new( + hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 4 => Self::new( + hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 5 => Self::new( + hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 6 => Self::new( + hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 7 => Self::new( + hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 8 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 9 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 10 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 11 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 12 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 13 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 14 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 15 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 16 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 17 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 18 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 19 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 20 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 21 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 22 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 23 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 24 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 25 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, + ), + 26 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, + ), + 27 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, + ), + 28 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, + ), + 29 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, + ), + 30 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, + ), + 31 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, + ), + 32 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + ), + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFu8) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFu8) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x00u8) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x00u8) }) + } } impl Pattern for u8x16 { #[inline(always)] fn halfs(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo) + Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + ) } #[inline(always)] fn interleave(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo) + Self::new( + hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, + ) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "sse4.1")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm_blendv_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "sse4.1")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm_blendv_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "sse4.1"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { assert!(off <= Self::WIDTH); fallback!(); match off { - 0 => Self::new(lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 1 => Self::new(hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 2 => Self::new(hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 3 => Self::new(hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 4 => Self::new(hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 5 => Self::new(hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 6 => Self::new(hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 7 => Self::new(hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 8 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo), - 9 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo), - 10 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo), - 11 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo), - 12 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo), - 13 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo), - 14 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo), - 15 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo), - 16 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi), - _ => unreachable!() + 0 => Self::new( + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 1 => Self::new( + hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 2 => Self::new( + hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 3 => Self::new( + hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 4 => Self::new( + hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 5 => Self::new( + hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 6 => Self::new( + hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 7 => Self::new( + hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 8 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 9 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, + ), + 10 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, + ), + 11 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, + ), + 12 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, + ), + 13 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, + ), + 14 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, + ), + 15 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, + ), + 16 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + ), + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFu8) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFu8) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x00u8) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x00u8) }) + } } impl Pattern for i8x64 { #[inline(always)] fn halfs(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo) + Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ) } #[inline(always)] fn interleave(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo) + Self::new( + hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, + hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, + hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, + ) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "avx512-notyet")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm512_mask_mov_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "avx512-notyet")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm512_mask_mov_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "avx512-notyet"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { assert!(off <= Self::WIDTH); fallback!(); match off { - 0 => Self::new(lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 1 => Self::new(hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 2 => Self::new(hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 3 => Self::new(hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 4 => Self::new(hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 5 => Self::new(hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 6 => Self::new(hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 7 => Self::new(hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 8 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 9 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 10 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 11 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 12 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 13 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 14 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 15 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 16 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 17 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 18 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 19 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 20 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 21 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 22 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 23 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 24 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 25 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 26 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 27 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 28 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 29 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 30 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 31 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 32 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 33 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 34 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 35 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 36 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 37 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 38 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 39 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 40 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 41 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 42 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 43 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 44 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 45 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 46 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 47 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 48 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 49 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 50 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 51 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 52 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 53 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 54 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 55 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 56 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo), - 57 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo), - 58 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo), - 59 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo), - 60 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo), - 61 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo), - 62 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo), - 63 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo), - 64 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi), - _ => unreachable!() + 0 => Self::new( + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 1 => Self::new( + hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 2 => Self::new( + hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 3 => Self::new( + hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 4 => Self::new( + hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 5 => Self::new( + hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 6 => Self::new( + hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 7 => Self::new( + hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 8 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 9 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 10 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 11 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 12 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 13 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 14 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 15 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 16 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 17 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 18 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 19 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 20 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 21 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 22 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 23 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 24 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 25 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 26 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 27 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 28 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 29 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 30 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 31 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 32 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 33 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 34 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 35 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 36 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 37 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 38 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 39 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 40 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 41 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 42 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 43 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 44 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 45 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 46 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 47 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 48 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 49 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 50 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 51 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 52 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 53 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 54 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 55 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 56 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, + lo, + ), + 57 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, + lo, + ), + 58 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, + lo, + ), + 59 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, + lo, + ), + 60 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, + lo, + ), + 61 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, + lo, + ), + 62 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, + lo, + ), + 63 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + lo, + ), + 64 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, + ), + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFu8) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFu8) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x00u8) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x00u8) }) + } } impl Pattern for i8x32 { #[inline(always)] fn halfs(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo) + Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ) } #[inline(always)] fn interleave(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo) + Self::new( + hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, + hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, + ) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "avx2")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm256_blendv_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "avx2")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm256_blendv_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "avx2"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { assert!(off <= Self::WIDTH); fallback!(); match off { - 0 => Self::new(lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 1 => Self::new(hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 2 => Self::new(hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 3 => Self::new(hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 4 => Self::new(hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 5 => Self::new(hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 6 => Self::new(hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 7 => Self::new(hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 8 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 9 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 10 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 11 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 12 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 13 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 14 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 15 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 16 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 17 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 18 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 19 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 20 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 21 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 22 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 23 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 24 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo), - 25 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo), - 26 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo), - 27 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo), - 28 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo), - 29 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo), - 30 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo), - 31 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo), - 32 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi), - _ => unreachable!() + 0 => Self::new( + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 1 => Self::new( + hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 2 => Self::new( + hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 3 => Self::new( + hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 4 => Self::new( + hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 5 => Self::new( + hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 6 => Self::new( + hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 7 => Self::new( + hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 8 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 9 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 10 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 11 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 12 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 13 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 14 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 15 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 16 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 17 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 18 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 19 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 20 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 21 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 22 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 23 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 24 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 25 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, + ), + 26 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, + ), + 27 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, + ), + 28 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, + ), + 29 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, + ), + 30 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, + ), + 31 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, + ), + 32 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + ), + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFu8) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFu8) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x00u8) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x00u8) }) + } } impl Pattern for i8x16 { #[inline(always)] fn halfs(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo) + Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + ) } #[inline(always)] fn interleave(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo) + Self::new( + hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, + ) + } + + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + #[cfg(target_feature = "sse4.1")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm_blendv_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "sse4.1")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm_blendv_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "sse4.1"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { assert!(off <= Self::WIDTH); fallback!(); match off { - 0 => Self::new(lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 1 => Self::new(hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 2 => Self::new(hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 3 => Self::new(hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 4 => Self::new(hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 5 => Self::new(hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 6 => Self::new(hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 7 => Self::new(hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 8 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo), - 9 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo), - 10 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo), - 11 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo), - 12 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo), - 13 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo), - 14 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo), - 15 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo), - 16 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi), - _ => unreachable!() + 0 => Self::new( + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 1 => Self::new( + hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 2 => Self::new( + hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 3 => Self::new( + hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 4 => Self::new( + hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 5 => Self::new( + hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 6 => Self::new( + hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 7 => Self::new( + hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 8 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 9 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, + ), + 10 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, + ), + 11 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, + ), + 12 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, + ), + 13 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, + ), + 14 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, + ), + 15 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, + ), + 16 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + ), + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFu8) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFu8) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x00u8) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x00u8) }) + } } impl Pattern for u16x32 { #[inline(always)] fn halfs(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo) + Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ) } #[inline(always)] fn interleave(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo) + Self::new( + hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, + hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, + ) + } + + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + #[cfg(target_feature = "avx512-notyet")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm512_mask_mov_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "avx512-notyet")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm512_mask_mov_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "avx512-notyet"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { assert!(off <= Self::WIDTH); fallback!(); match off { - 0 => Self::new(lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 1 => Self::new(hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 2 => Self::new(hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 3 => Self::new(hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 4 => Self::new(hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 5 => Self::new(hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 6 => Self::new(hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 7 => Self::new(hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 8 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 9 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 10 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 11 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 12 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 13 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 14 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 15 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 16 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 17 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 18 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 19 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 20 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 21 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 22 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 23 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 24 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo), - 25 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo), - 26 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo), - 27 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo), - 28 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo), - 29 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo), - 30 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo), - 31 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo), - 32 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi), - _ => unreachable!() + 0 => Self::new( + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 1 => Self::new( + hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 2 => Self::new( + hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 3 => Self::new( + hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 4 => Self::new( + hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 5 => Self::new( + hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 6 => Self::new( + hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 7 => Self::new( + hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 8 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 9 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 10 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 11 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 12 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 13 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 14 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 15 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 16 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 17 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 18 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 19 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 20 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 21 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 22 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 23 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 24 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 25 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, + ), + 26 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, + ), + 27 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, + ), + 28 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, + ), + 29 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, + ), + 30 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, + ), + 31 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, + ), + 32 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + ), + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFu16) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFu16) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x0000u16) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x0000u16) }) + } } impl Pattern for u16x16 { #[inline(always)] fn halfs(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo) + Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + ) } #[inline(always)] fn interleave(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo) + Self::new( + hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, + ) + } + + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + #[cfg(target_feature = "avx2")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm256_blendv_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "avx2")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm256_blendv_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "avx2"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { assert!(off <= Self::WIDTH); fallback!(); match off { - 0 => Self::new(lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 1 => Self::new(hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 2 => Self::new(hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 3 => Self::new(hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 4 => Self::new(hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 5 => Self::new(hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 6 => Self::new(hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 7 => Self::new(hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 8 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo), - 9 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo), - 10 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo), - 11 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo), - 12 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo), - 13 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo), - 14 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo), - 15 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo), - 16 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi), - _ => unreachable!() + 0 => Self::new( + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 1 => Self::new( + hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 2 => Self::new( + hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 3 => Self::new( + hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 4 => Self::new( + hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 5 => Self::new( + hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 6 => Self::new( + hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 7 => Self::new( + hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 8 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 9 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, + ), + 10 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, + ), + 11 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, + ), + 12 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, + ), + 13 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, + ), + 14 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, + ), + 15 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, + ), + 16 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + ), + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFu16) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFu16) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x0000u16) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x0000u16) }) + } } impl Pattern for u16x8 { @@ -709,20 +1870,29 @@ impl Pattern for u16x8 { Self::new(hi, lo, hi, lo, hi, lo, hi, lo) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "sse4.1")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm_blendv_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "sse4.1")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm_blendv_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "sse4.1"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { @@ -738,167 +1908,328 @@ impl Pattern for u16x8 { 6 => Self::new(hi, hi, hi, hi, hi, hi, lo, lo), 7 => Self::new(hi, hi, hi, hi, hi, hi, hi, lo), 8 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi), - _ => unreachable!() + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFu16) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFu16) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x0000u16) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x0000u16) }) + } } impl Pattern for i16x32 { #[inline(always)] fn halfs(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo) + Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ) } #[inline(always)] fn interleave(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo) + Self::new( + hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, + hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, + ) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "avx512-notyet")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm512_mask_mov_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "avx512-notyet")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm512_mask_mov_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "avx512-notyet"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { assert!(off <= Self::WIDTH); fallback!(); match off { - 0 => Self::new(lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 1 => Self::new(hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 2 => Self::new(hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 3 => Self::new(hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 4 => Self::new(hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 5 => Self::new(hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 6 => Self::new(hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 7 => Self::new(hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 8 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 9 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 10 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 11 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 12 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 13 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 14 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 15 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 16 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 17 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 18 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 19 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 20 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 21 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 22 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 23 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 24 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo), - 25 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo), - 26 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo), - 27 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo), - 28 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo), - 29 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo), - 30 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo), - 31 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo), - 32 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi), - _ => unreachable!() + 0 => Self::new( + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 1 => Self::new( + hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 2 => Self::new( + hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 3 => Self::new( + hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 4 => Self::new( + hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 5 => Self::new( + hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 6 => Self::new( + hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 7 => Self::new( + hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 8 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 9 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 10 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 11 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 12 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 13 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 14 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 15 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 16 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 17 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 18 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 19 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 20 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 21 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 22 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 23 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 24 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 25 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, + ), + 26 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, + ), + 27 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, + ), + 28 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, + ), + 29 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, + ), + 30 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, + ), + 31 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, + ), + 32 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + ), + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFu16) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFu16) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x0000u16) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x0000u16) }) + } } impl Pattern for i16x16 { #[inline(always)] fn halfs(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo) + Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + ) } #[inline(always)] fn interleave(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo) + Self::new( + hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, + ) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "avx2")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm256_blendv_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "avx2")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm256_blendv_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "avx2"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { assert!(off <= Self::WIDTH); fallback!(); match off { - 0 => Self::new(lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 1 => Self::new(hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 2 => Self::new(hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 3 => Self::new(hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 4 => Self::new(hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 5 => Self::new(hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 6 => Self::new(hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 7 => Self::new(hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 8 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo), - 9 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo), - 10 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo), - 11 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo), - 12 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo), - 13 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo), - 14 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo), - 15 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo), - 16 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi), - _ => unreachable!() + 0 => Self::new( + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 1 => Self::new( + hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 2 => Self::new( + hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 3 => Self::new( + hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 4 => Self::new( + hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 5 => Self::new( + hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 6 => Self::new( + hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 7 => Self::new( + hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 8 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 9 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, + ), + 10 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, + ), + 11 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, + ), + 12 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, + ), + 13 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, + ), + 14 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, + ), + 15 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, + ), + 16 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + ), + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFu16) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFu16) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x0000u16) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x0000u16) }) + } } impl Pattern for i16x8 { @@ -912,20 +2243,29 @@ impl Pattern for i16x8 { Self::new(hi, lo, hi, lo, hi, lo, hi, lo) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "sse4.1")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm_blendv_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "sse4.1")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm_blendv_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "sse4.1"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { @@ -941,86 +2281,133 @@ impl Pattern for i16x8 { 6 => Self::new(hi, hi, hi, hi, hi, hi, lo, lo), 7 => Self::new(hi, hi, hi, hi, hi, hi, hi, lo), 8 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi), - _ => unreachable!() + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFu16) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFu16) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x0000u16) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x0000u16) }) + } } impl Pattern for u32x16 { #[inline(always)] fn halfs(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo) + Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + ) } #[inline(always)] fn interleave(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo) + Self::new( + hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, + ) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "avx512-notyet")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm512_mask_mov_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "avx512-notyet")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm512_mask_mov_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "avx512-notyet"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { assert!(off <= Self::WIDTH); fallback!(); match off { - 0 => Self::new(lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 1 => Self::new(hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 2 => Self::new(hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 3 => Self::new(hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 4 => Self::new(hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 5 => Self::new(hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 6 => Self::new(hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 7 => Self::new(hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 8 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo), - 9 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo), - 10 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo), - 11 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo), - 12 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo), - 13 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo), - 14 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo), - 15 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo), - 16 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi), - _ => unreachable!() + 0 => Self::new( + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 1 => Self::new( + hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 2 => Self::new( + hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 3 => Self::new( + hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 4 => Self::new( + hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 5 => Self::new( + hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 6 => Self::new( + hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 7 => Self::new( + hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 8 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 9 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, + ), + 10 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, + ), + 11 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, + ), + 12 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, + ), + 13 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, + ), + 14 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, + ), + 15 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, + ), + 16 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + ), + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFFFFFu32) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFFFFFu32) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x00000000u32) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x00000000u32) }) + } } impl Pattern for u32x8 { @@ -1034,20 +2421,29 @@ impl Pattern for u32x8 { Self::new(hi, lo, hi, lo, hi, lo, hi, lo) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "avx2")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm256_blendv_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "avx2")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm256_blendv_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "avx2"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { @@ -1063,21 +2459,21 @@ impl Pattern for u32x8 { 6 => Self::new(hi, hi, hi, hi, hi, hi, lo, lo), 7 => Self::new(hi, hi, hi, hi, hi, hi, hi, lo), 8 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi), - _ => unreachable!() + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFFFFFu32) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFFFFFu32) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x00000000u32) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x00000000u32) }) + } } impl Pattern for u32x4 { @@ -1091,20 +2487,29 @@ impl Pattern for u32x4 { Self::new(hi, lo, hi, lo) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "sse4.1")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm_blendv_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "sse4.1")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm_blendv_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "sse4.1"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { @@ -1116,86 +2521,133 @@ impl Pattern for u32x4 { 2 => Self::new(hi, hi, lo, lo), 3 => Self::new(hi, hi, hi, lo), 4 => Self::new(hi, hi, hi, hi), - _ => unreachable!() + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFFFFFu32) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFFFFFu32) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x00000000u32) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x00000000u32) }) + } } impl Pattern for i32x16 { #[inline(always)] fn halfs(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo) + Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + ) } #[inline(always)] fn interleave(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo) + Self::new( + hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, + ) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "avx512-notyet")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm512_mask_mov_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "avx512-notyet")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm512_mask_mov_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "avx512-notyet"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { assert!(off <= Self::WIDTH); fallback!(); match off { - 0 => Self::new(lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 1 => Self::new(hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 2 => Self::new(hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 3 => Self::new(hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 4 => Self::new(hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 5 => Self::new(hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 6 => Self::new(hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 7 => Self::new(hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 8 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo), - 9 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo), - 10 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo), - 11 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo), - 12 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo), - 13 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo), - 14 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo), - 15 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo), - 16 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi), - _ => unreachable!() + 0 => Self::new( + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 1 => Self::new( + hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 2 => Self::new( + hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 3 => Self::new( + hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 4 => Self::new( + hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 5 => Self::new( + hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 6 => Self::new( + hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 7 => Self::new( + hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 8 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 9 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, + ), + 10 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, + ), + 11 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, + ), + 12 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, + ), + 13 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, + ), + 14 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, + ), + 15 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, + ), + 16 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + ), + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFFFFFu32) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFFFFFu32) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x00000000u32) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x00000000u32) }) + } } impl Pattern for i32x8 { @@ -1209,20 +2661,29 @@ impl Pattern for i32x8 { Self::new(hi, lo, hi, lo, hi, lo, hi, lo) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "avx2")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm256_blendv_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "avx2")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm256_blendv_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "avx2"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { @@ -1238,21 +2699,21 @@ impl Pattern for i32x8 { 6 => Self::new(hi, hi, hi, hi, hi, hi, lo, lo), 7 => Self::new(hi, hi, hi, hi, hi, hi, hi, lo), 8 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi), - _ => unreachable!() + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFFFFFu32) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFFFFFu32) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x00000000u32) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x00000000u32) }) + } } impl Pattern for i32x4 { @@ -1266,20 +2727,29 @@ impl Pattern for i32x4 { Self::new(hi, lo, hi, lo) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "sse4.1")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm_blendv_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "sse4.1")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm_blendv_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "sse4.1"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { @@ -1291,86 +2761,133 @@ impl Pattern for i32x4 { 2 => Self::new(hi, hi, lo, lo), 3 => Self::new(hi, hi, hi, lo), 4 => Self::new(hi, hi, hi, hi), - _ => unreachable!() + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFFFFFu32) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFFFFFu32) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x00000000u32) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x00000000u32) }) + } } impl Pattern for f32x16 { #[inline(always)] fn halfs(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo) + Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + ) } #[inline(always)] fn interleave(hi: Self::Scalar, lo: Self::Scalar) -> Self { - Self::new(hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo) + Self::new( + hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, hi, lo, + ) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "avx512-notyet")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm512_mask_mov_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "avx512-notyet")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm512_mask_mov_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "avx512-notyet"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { assert!(off <= Self::WIDTH); fallback!(); match off { - 0 => Self::new(lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 1 => Self::new(hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 2 => Self::new(hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 3 => Self::new(hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 4 => Self::new(hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 5 => Self::new(hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 6 => Self::new(hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 7 => Self::new(hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo), - 8 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo), - 9 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo), - 10 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo), - 11 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo), - 12 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo), - 13 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo), - 14 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo), - 15 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo), - 16 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi), - _ => unreachable!() + 0 => Self::new( + lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 1 => Self::new( + hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 2 => Self::new( + hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 3 => Self::new( + hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 4 => Self::new( + hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 5 => Self::new( + hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 6 => Self::new( + hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 7 => Self::new( + hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 8 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, lo, + ), + 9 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, lo, + ), + 10 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, lo, + ), + 11 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, lo, + ), + 12 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, lo, + ), + 13 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, lo, + ), + 14 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, lo, + ), + 15 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, lo, + ), + 16 => Self::new( + hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, hi, + ), + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFFFFFu32) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFFFFFu32) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x00000000u32) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x00000000u32) }) + } } impl Pattern for f32x8 { @@ -1384,20 +2901,29 @@ impl Pattern for f32x8 { Self::new(hi, lo, hi, lo, hi, lo, hi, lo) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "avx2")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm256_blendv_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "avx2")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm256_blendv_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "avx2"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { @@ -1413,21 +2939,21 @@ impl Pattern for f32x8 { 6 => Self::new(hi, hi, hi, hi, hi, hi, lo, lo), 7 => Self::new(hi, hi, hi, hi, hi, hi, hi, lo), 8 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi), - _ => unreachable!() + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFFFFFu32) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFFFFFu32) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x00000000u32) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x00000000u32) }) + } } impl Pattern for f32x4 { @@ -1441,20 +2967,29 @@ impl Pattern for f32x4 { Self::new(hi, lo, hi, lo) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "sse4.1")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm_blendv_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "sse4.1")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm_blendv_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "sse4.1"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { @@ -1466,21 +3001,21 @@ impl Pattern for f32x4 { 2 => Self::new(hi, hi, lo, lo), 3 => Self::new(hi, hi, hi, lo), 4 => Self::new(hi, hi, hi, hi), - _ => unreachable!() + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFFFFFu32) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFFFFFu32) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x00000000u32) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x00000000u32) }) + } } impl Pattern for u64x8 { @@ -1494,20 +3029,29 @@ impl Pattern for u64x8 { Self::new(hi, lo, hi, lo, hi, lo, hi, lo) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "avx512-notyet")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm512_mask_mov_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "avx512-notyet")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm512_mask_mov_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "avx512-notyet"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { @@ -1523,21 +3067,21 @@ impl Pattern for u64x8 { 6 => Self::new(hi, hi, hi, hi, hi, hi, lo, lo), 7 => Self::new(hi, hi, hi, hi, hi, hi, hi, lo), 8 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi), - _ => unreachable!() + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFFFFFFFFFFFFFu64) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFFFFFFFFFFFFFu64) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x0000000000000000u64) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x0000000000000000u64) }) + } } impl Pattern for u64x4 { @@ -1551,20 +3095,29 @@ impl Pattern for u64x4 { Self::new(hi, lo, hi, lo) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "avx2")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm256_blendv_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "avx2")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm256_blendv_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "avx2"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { @@ -1576,21 +3129,21 @@ impl Pattern for u64x4 { 2 => Self::new(hi, hi, lo, lo), 3 => Self::new(hi, hi, hi, lo), 4 => Self::new(hi, hi, hi, hi), - _ => unreachable!() + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFFFFFFFFFFFFFu64) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFFFFFFFFFFFFFu64) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x0000000000000000u64) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x0000000000000000u64) }) + } } impl Pattern for u64x2 { @@ -1604,20 +3157,29 @@ impl Pattern for u64x2 { Self::new(hi, lo) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "sse4.1")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm_blendv_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "sse4.1")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm_blendv_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "sse4.1"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { @@ -1627,21 +3189,21 @@ impl Pattern for u64x2 { 0 => Self::new(lo, lo), 1 => Self::new(hi, lo), 2 => Self::new(hi, hi), - _ => unreachable!() + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFFFFFFFFFFFFFu64) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFFFFFFFFFFFFFu64) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x0000000000000000u64) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x0000000000000000u64) }) + } } impl Pattern for i64x8 { @@ -1655,20 +3217,29 @@ impl Pattern for i64x8 { Self::new(hi, lo, hi, lo, hi, lo, hi, lo) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "avx512-notyet")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm512_mask_mov_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "avx512-notyet")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm512_mask_mov_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "avx512-notyet"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { @@ -1684,21 +3255,21 @@ impl Pattern for i64x8 { 6 => Self::new(hi, hi, hi, hi, hi, hi, lo, lo), 7 => Self::new(hi, hi, hi, hi, hi, hi, hi, lo), 8 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi), - _ => unreachable!() + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFFFFFFFFFFFFFu64) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFFFFFFFFFFFFFu64) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x0000000000000000u64) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x0000000000000000u64) }) + } } impl Pattern for i64x4 { @@ -1712,20 +3283,29 @@ impl Pattern for i64x4 { Self::new(hi, lo, hi, lo) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "avx2")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm256_blendv_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "avx2")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm256_blendv_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "avx2"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { @@ -1737,21 +3317,21 @@ impl Pattern for i64x4 { 2 => Self::new(hi, hi, lo, lo), 3 => Self::new(hi, hi, hi, lo), 4 => Self::new(hi, hi, hi, hi), - _ => unreachable!() + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFFFFFFFFFFFFFu64) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFFFFFFFFFFFFFu64) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x0000000000000000u64) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x0000000000000000u64) }) + } } impl Pattern for i64x2 { @@ -1765,20 +3345,29 @@ impl Pattern for i64x2 { Self::new(hi, lo) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "sse4.1")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm_blendv_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "sse4.1")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm_blendv_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "sse4.1"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { @@ -1788,21 +3377,21 @@ impl Pattern for i64x2 { 0 => Self::new(lo, lo), 1 => Self::new(hi, lo), 2 => Self::new(hi, hi), - _ => unreachable!() + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFFFFFFFFFFFFFu64) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFFFFFFFFFFFFFu64) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x0000000000000000u64) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x0000000000000000u64) }) + } } impl Pattern for f64x8 { @@ -1816,20 +3405,29 @@ impl Pattern for f64x8 { Self::new(hi, lo, hi, lo, hi, lo, hi, lo) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "avx512-notyet")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm512_mask_mov_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "avx512-notyet")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm512_mask_mov_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "avx512-notyet"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { @@ -1845,21 +3443,21 @@ impl Pattern for f64x8 { 6 => Self::new(hi, hi, hi, hi, hi, hi, lo, lo), 7 => Self::new(hi, hi, hi, hi, hi, hi, hi, lo), 8 => Self::new(hi, hi, hi, hi, hi, hi, hi, hi), - _ => unreachable!() + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFFFFFFFFFFFFFu64) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFFFFFFFFFFFFFu64) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x0000000000000000u64) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x0000000000000000u64) }) + } } impl Pattern for f64x4 { @@ -1873,20 +3471,29 @@ impl Pattern for f64x4 { Self::new(hi, lo, hi, lo) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "avx2")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm256_blendv_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "avx2")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm256_blendv_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "avx2"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { @@ -1898,21 +3505,21 @@ impl Pattern for f64x4 { 2 => Self::new(hi, hi, lo, lo), 3 => Self::new(hi, hi, hi, lo), 4 => Self::new(hi, hi, hi, hi), - _ => unreachable!() + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFFFFFFFFFFFFFu64) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFFFFFFFFFFFFFu64) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x0000000000000000u64) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x0000000000000000u64) }) + } } impl Pattern for f64x2 { @@ -1926,20 +3533,29 @@ impl Pattern for f64x2 { Self::new(hi, lo) } - #[inline(always)] - fn partition_mask(off: usize) -> Self { - debug_assert!(off <= Self::WIDTH); - debug_assert!(off * Self::Scalar::SIZE <= 64); - Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off) - } + #[inline(always)] + fn partition_mask(off: usize) -> Self { + debug_assert!(off <= Self::WIDTH); + debug_assert!(off * Self::Scalar::SIZE <= 64); + Self::load( + unsafe { transmute(&PART_MASK[..]) }, + 64 / Self::Scalar::SIZE - off, + ) + } + + #[inline(always)] + #[cfg(target_feature = "sse4.1")] + fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { + optimized!(); + unsafe { + transmute(_mm_blendv_epi8( + transmute(Self::splat(hi)), + transmute(Self::splat(lo)), + transmute(Self::partition_mask(off)), + )) + } + } - #[inline(always)] - #[cfg(target_feature = "sse4.1")] - fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { - optimized!(); - unsafe { transmute(_mm_blendv_epi8(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) } - } - #[inline(always)] #[cfg(not(target_feature = "sse4.1"))] fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self { @@ -1949,20 +3565,19 @@ impl Pattern for f64x2 { 0 => Self::new(lo, lo), 1 => Self::new(hi, lo), 2 => Self::new(hi, hi), - _ => unreachable!() + _ => unreachable!(), } } - /// Return a vector made entirely of ones. - #[inline(always)] - fn ones() -> Self { - Self::splat(unsafe { transmute(0xFFFFFFFFFFFFFFFFu64) }) - } + /// Return a vector made entirely of ones. + #[inline(always)] + fn ones() -> Self { + Self::splat(unsafe { transmute(0xFFFFFFFFFFFFFFFFu64) }) + } - /// Return a vector made entirely of zeroes. - #[inline(always)] - fn zeroes() -> Self { - Self::splat(unsafe { transmute(0x0000000000000000u64) }) - } + /// Return a vector made entirely of zeroes. + #[inline(always)] + fn zeroes() -> Self { + Self::splat(unsafe { transmute(0x0000000000000000u64) }) + } } - diff --git a/src/arch/x86/vecs.rs b/src/arch/x86/vecs.rs index c9ef68a..3e3a0fa 100644 --- a/src/arch/x86/vecs.rs +++ b/src/arch/x86/vecs.rs @@ -1,5 +1,9 @@ pub use crate::vecs::*; -pub use packed_simd::{u8x64, u8x32, u8x16, i8x64, i8x32, i8x16, u16x32, u16x16, u16x8, i16x32, i16x16, i16x8, u32x16, u32x8, u32x4, i32x16, i32x8, i32x4, f32x16, f32x8, f32x4, u64x8, u64x4, u64x2, i64x8, i64x4, i64x2, f64x8, f64x4, f64x2}; +pub use packed_simd::{ + f32x16, f32x4, f32x8, f64x2, f64x4, f64x8, i16x16, i16x32, i16x8, i32x16, i32x4, i32x8, i64x2, + i64x4, i64x8, i8x16, i8x32, i8x64, u16x16, u16x32, u16x8, u32x16, u32x4, u32x8, u64x2, u64x4, + u64x8, u8x16, u8x32, u8x64, +}; impl_packed!(u8, u8s, u8x64, 1, 64, ["avx512"], ["avx1024"]); impl_packed!(u8, u8s, u8x32, 1, 32, ["avx2"], ["avx512"]); @@ -49,8 +53,51 @@ mod tests { } // TODO: Do we need better test cases for this? - test_product!((u8, u8, u8, i8, i8, i8, u16, u16, u16, i16, i16, i16, u32, u32, u32, i32, i32, i32, f32, f32, f32, u64, u64, u64, i64, i64, i64, f64, f64, f64), - (u8x64, u8x32, u8x16, i8x64, i8x32, i8x16, u16x32, u16x16, u16x8, i16x32, i16x16, i16x8, u32x16, u32x8, u32x4, i32x16, i32x8, i32x4, f32x16, f32x8, f32x4, u64x8, u64x4, u64x2, i64x8, i64x4, i64x2, f64x8, f64x4, f64x2), - (scalar_product_u8x64, scalar_product_u8x32, scalar_product_u8x16, scalar_product_i8x64, scalar_product_i8x32, scalar_product_i8x16, scalar_product_u16x32, scalar_product_u16x16, scalar_product_u16x8, scalar_product_i16x32, scalar_product_i16x16, scalar_product_i16x8, scalar_product_u32x16, scalar_product_u32x8, scalar_product_u32x4, scalar_product_i32x16, scalar_product_i32x8, scalar_product_i32x4, scalar_product_f32x16, scalar_product_f32x8, scalar_product_f32x4, scalar_product_u64x8, scalar_product_u64x4, scalar_product_u64x2, scalar_product_i64x8, scalar_product_i64x4, scalar_product_i64x2, scalar_product_f64x8, scalar_product_f64x4, scalar_product_f64x2), - (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)); + test_product!( + ( + u8, u8, u8, i8, i8, i8, u16, u16, u16, i16, i16, i16, u32, u32, u32, i32, i32, i32, + f32, f32, f32, u64, u64, u64, i64, i64, i64, f64, f64, f64 + ), + ( + u8x64, u8x32, u8x16, i8x64, i8x32, i8x16, u16x32, u16x16, u16x8, i16x32, i16x16, i16x8, + u32x16, u32x8, u32x4, i32x16, i32x8, i32x4, f32x16, f32x8, f32x4, u64x8, u64x4, u64x2, + i64x8, i64x4, i64x2, f64x8, f64x4, f64x2 + ), + ( + scalar_product_u8x64, + scalar_product_u8x32, + scalar_product_u8x16, + scalar_product_i8x64, + scalar_product_i8x32, + scalar_product_i8x16, + scalar_product_u16x32, + scalar_product_u16x16, + scalar_product_u16x8, + scalar_product_i16x32, + scalar_product_i16x16, + scalar_product_i16x8, + scalar_product_u32x16, + scalar_product_u32x8, + scalar_product_u32x4, + scalar_product_i32x16, + scalar_product_i32x8, + scalar_product_i32x4, + scalar_product_f32x16, + scalar_product_f32x8, + scalar_product_f32x4, + scalar_product_u64x8, + scalar_product_u64x4, + scalar_product_u64x2, + scalar_product_i64x8, + scalar_product_i64x4, + scalar_product_i64x2, + scalar_product_f64x8, + scalar_product_f64x4, + scalar_product_f64x2 + ), + ( + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1 + ) + ); } diff --git a/src/debug.rs b/src/debug.rs index b7e3a2a..3470db2 100644 --- a/src/debug.rs +++ b/src/debug.rs @@ -1,7 +1,7 @@ #![allow(unused_macros, dead_code)] -use std::collections::HashSet; use std::cell::RefCell; +use std::collections::HashSet; thread_local! { // Not perfect as it might print multiple times (once per thread), @@ -10,7 +10,6 @@ thread_local! { pub(crate) static OUTPUT_GUARD: RefCell> = RefCell::new(HashSet::new()); } - macro_rules! debug_append_log { ($str:expr) => { use std::io::Write; @@ -23,13 +22,12 @@ macro_rules! debug_append_log { .write(true) .create(true) .append(true) - .open(file_name).and_then(|mut file| { - writeln!(file, "{}", $str) - }).ok(); // `ok` suppresses warning about unused results, about which we don't care. - } + .open(file_name) + .and_then(|mut file| writeln!(file, "{}", $str)) + .ok(); // `ok` suppresses warning about unused results, about which we don't care. + }; } - /// Prints the given string once (for the current thread). /// Useful for not spamming the console. macro_rules! debug_output_once { @@ -49,33 +47,39 @@ macro_rules! debug_output_once { output_guard.insert(output); }); - } + }; } - /// Signal a software fallback is executed. -#[cfg(feature="trace")] +#[cfg(feature = "trace")] macro_rules! fallback { () => { - debug_output_once!(format!("⛔ faster is using SOFTWARE emulation here ({}:{}).", file!(), line!())); - } + debug_output_once!(format!( + "⛔ faster is using SOFTWARE emulation here ({}:{}).", + file!(), + line!() + )); + }; } /// Signal an optimized SIMD intrinsic is executed. -#[cfg(feature="trace")] +#[cfg(feature = "trace")] macro_rules! optimized { () => { - debug_output_once!(format!("🚄 faster is using HARDWARE acceleration here ({}:{}).", file!(), line!())); - } + debug_output_once!(format!( + "🚄 faster is using HARDWARE acceleration here ({}:{}).", + file!(), + line!() + )); + }; } -#[cfg(not(feature="trace"))] +#[cfg(not(feature = "trace"))] macro_rules! fallback { - () => { } + () => {}; } -#[cfg(not(feature="trace"))] +#[cfg(not(feature = "trace"))] macro_rules! optimized { - () => { } + () => {}; } - diff --git a/src/into_iters.rs b/src/into_iters.rs index b66ce24..c2aa0cd 100644 --- a/src/into_iters.rs +++ b/src/into_iters.rs @@ -5,15 +5,15 @@ // License, v. 2.0. If a copy owf the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::iters::{SIMDIter, SIMDIterator, SIMDObject}; +use crate::arch::current::vecs::*; #[allow(unused_imports)] // Remove for specialization use crate::iters::SIMDAdapter; -use crate::arch::current::vecs::*; +use crate::iters::{SIMDIter, SIMDIterator, SIMDObject}; /// A trait which transforms a contiguous collection into an owned stream of /// vectors. pub trait IntoSIMDIterator { - type Iter : SIMDIterator; + type Iter: SIMDIterator; /// Return an iterator over this data which will automatically pack /// values into SIMD vectors. See `SIMDIterator::simd_map` and @@ -24,7 +24,7 @@ pub trait IntoSIMDIterator { /// A trait which transforms a contiguous collection into a slice-backed stream /// of vectors. pub trait IntoSIMDRefIterator<'a> { - type Iter : SIMDIterator; + type Iter: SIMDIterator; /// Return an iterator over this data which will automatically pack /// values into SIMD vectors. See `SIMDIterator::simd_map` and @@ -35,7 +35,7 @@ pub trait IntoSIMDRefIterator<'a> { /// A trait which transforms a contiguous collection into a mutable slice-backed /// stream of vectors. pub trait IntoSIMDRefMutIterator<'a> { - type Iter : SIMDIterator; + type Iter: SIMDIterator; /// Return an iterator over this data which will automatically pack /// values into SIMD vectors. See `SIMDIterator::simd_map` and @@ -115,16 +115,10 @@ macro_rules! impl_array_intos { } } -impl_array_intos!(u8, u8s, - i8, i8s, - u16, u16s, - i16, i16s, - u32, u32s, - i32, i32s, - f32, f32s, - u64, u64s, - i64, i64s, - f64, f64s); +impl_array_intos!( + u8, u8s, i8, i8s, u16, u16s, i16, i16s, u32, u32s, i32, i32s, f32, f32s, u64, u64s, i64, i64s, + f64, f64s +); // TODO: Specialization // impl IntoSIMDIterator for I where I : ExactSizeIterator + Iterator, S : Packable { diff --git a/src/intrin/destride.rs b/src/intrin/destride.rs index 073073d..ebbb2fb 100644 --- a/src/intrin/destride.rs +++ b/src/intrin/destride.rs @@ -5,7 +5,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -pub trait Destride : Sized { +pub trait Destride: Sized { fn destride_two(self, other: Self) -> (Self, Self); fn destride_four(self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self); } diff --git a/src/intrin/endian.rs b/src/intrin/endian.rs index e7d33fd..174992b 100644 --- a/src/intrin/endian.rs +++ b/src/intrin/endian.rs @@ -5,7 +5,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -pub trait Reendianize : Sized + Copy { +pub trait Reendianize: Sized + Copy { /// Return a vector containing elements of `self` with switched endianness. /// /// ``` diff --git a/src/intrin/eq.rs b/src/intrin/eq.rs index 0d9f242..ca8d174 100644 --- a/src/intrin/eq.rs +++ b/src/intrin/eq.rs @@ -8,8 +8,8 @@ use crate::core::ops::BitXor; use crate::vecs::*; -pub trait Eq : Packed { - type Out : Pattern + BitXor; +pub trait Eq: Packed { + type Out: Pattern + BitXor; /// Return a vector where each element at an index i is filled with 1s if /// the elements of `self` and `other` at index i are equal, and filled with @@ -40,7 +40,9 @@ pub trait Eq : Packed { /// # } /// ``` #[inline(always)] - fn ne_mask(&self, other: Self) -> Self::Out { self.eq_mask(other) ^ Self::Out::ones() } + fn ne_mask(&self, other: Self) -> Self::Out { + self.eq_mask(other) ^ Self::Out::ones() + } } macro_rules! rust_fallback_eq { @@ -76,20 +78,28 @@ macro_rules! rust_fallback_eq { } macro_rules! test_packed_eq { - ($vec:tt, $el:tt, $mask:tt, $maskel:tt, $name:tt) => { - #[test] - fn $name() { - assert_eq!($vec::halfs(1 as $el, 0 as $el).eq_mask($vec::splat(0 as $el)), - $mask::halfs(0, $maskel::max_value())); + ($vec:tt, $el:tt, $mask:tt, $maskel:tt, $name:tt) => { + #[test] + fn $name() { + assert_eq!( + $vec::halfs(1 as $el, 0 as $el).eq_mask($vec::splat(0 as $el)), + $mask::halfs(0, $maskel::max_value()) + ); - assert_eq!($vec::interleave(1 as $el, 0 as $el).eq_mask($vec::splat(1 as $el)), - $mask::interleave($maskel::max_value(), 0)); + assert_eq!( + $vec::interleave(1 as $el, 0 as $el).eq_mask($vec::splat(1 as $el)), + $mask::interleave($maskel::max_value(), 0) + ); - assert_eq!($vec::halfs(1 as $el, 0 as $el).ne_mask($vec::splat(0 as $el)), - $mask::halfs($maskel::max_value(), 0)); + assert_eq!( + $vec::halfs(1 as $el, 0 as $el).ne_mask($vec::splat(0 as $el)), + $mask::halfs($maskel::max_value(), 0) + ); - assert_eq!($vec::interleave(1 as $el, 0 as $el).ne_mask($vec::splat(1 as $el)), - $mask::interleave(0, $maskel::max_value())); - } + assert_eq!( + $vec::interleave(1 as $el, 0 as $el).ne_mask($vec::splat(1 as $el)), + $mask::interleave(0, $maskel::max_value()) + ); } - } + }; +} diff --git a/src/intrin/hadd.rs b/src/intrin/hadd.rs index 712a3f5..e7046bb 100644 --- a/src/intrin/hadd.rs +++ b/src/intrin/hadd.rs @@ -20,60 +20,90 @@ mod tests { #[test] fn hadd_i8s() { assert_eq!(i8s(1).hadd(i8s(2)), i8s::interleave(2, 4)); - assert_eq!(i8s::interleave(1, 2).hadd(i8s::interleave(3, 4)), i8s::interleave(3, 7)); + assert_eq!( + i8s::interleave(1, 2).hadd(i8s::interleave(3, 4)), + i8s::interleave(3, 7) + ); } #[test] fn hadd_i16s() { assert_eq!(i16s(1).hadd(i16s(2)), i16s::interleave(2, 4)); - assert_eq!(i16s::interleave(1, 2).hadd(i16s::interleave(3, 4)), i16s::interleave(3, 7)); + assert_eq!( + i16s::interleave(1, 2).hadd(i16s::interleave(3, 4)), + i16s::interleave(3, 7) + ); } #[test] fn hadd_i32s() { assert_eq!(i32s(1).hadd(i32s(2)), i32s::interleave(2, 4)); - assert_eq!(i32s::interleave(1, 2).hadd(i32s::interleave(3, 4)), i32s::interleave(3, 7)); + assert_eq!( + i32s::interleave(1, 2).hadd(i32s::interleave(3, 4)), + i32s::interleave(3, 7) + ); } #[test] fn hadd_i64s() { assert_eq!(i64s(1).hadd(i64s(2)), i64s::interleave(2, 4)); - assert_eq!(i64s::interleave(1, 2).hadd(i64s::interleave(3, 4)), i64s::interleave(3, 7)); + assert_eq!( + i64s::interleave(1, 2).hadd(i64s::interleave(3, 4)), + i64s::interleave(3, 7) + ); } #[test] fn hadd_u8s() { assert_eq!(u8s(1).hadd(u8s(2)), u8s::interleave(2, 4)); - assert_eq!(u8s::interleave(1, 2).hadd(u8s::interleave(3, 4)), u8s::interleave(3, 7)); + assert_eq!( + u8s::interleave(1, 2).hadd(u8s::interleave(3, 4)), + u8s::interleave(3, 7) + ); } #[test] fn hadd_u16s() { assert_eq!(u16s(1).hadd(u16s(2)), u16s::interleave(2, 4)); - assert_eq!(u16s::interleave(1, 2).hadd(u16s::interleave(3, 4)), u16s::interleave(3, 7)); + assert_eq!( + u16s::interleave(1, 2).hadd(u16s::interleave(3, 4)), + u16s::interleave(3, 7) + ); } #[test] fn hadd_u32s() { assert_eq!(u32s(1).hadd(u32s(2)), u32s::interleave(2, 4)); - assert_eq!(u32s::interleave(1, 2).hadd(u32s::interleave(3, 4)), u32s::interleave(3, 7)); + assert_eq!( + u32s::interleave(1, 2).hadd(u32s::interleave(3, 4)), + u32s::interleave(3, 7) + ); } #[test] fn hadd_u64s() { assert_eq!(u64s(1).hadd(u64s(2)), u64s::interleave(2, 4)); - assert_eq!(u64s::interleave(1, 2).hadd(u64s::interleave(3, 4)), u64s::interleave(3, 7)); + assert_eq!( + u64s::interleave(1, 2).hadd(u64s::interleave(3, 4)), + u64s::interleave(3, 7) + ); } #[test] fn hadd_f32s() { assert_eq!(f32s(1.0).hadd(f32s(2.0)), f32s::interleave(2.0, 4.0)); - assert_eq!(f32s::interleave(1.0, 2.0).hadd(f32s::interleave(3.0, 4.0)), f32s::interleave(3.0, 7.0)); + assert_eq!( + f32s::interleave(1.0, 2.0).hadd(f32s::interleave(3.0, 4.0)), + f32s::interleave(3.0, 7.0) + ); } #[test] fn hadd_f64s() { assert_eq!(f64s(1.0).hadd(f64s(2.0)), f64s::interleave(2.0, 4.0)); - assert_eq!(f64s::interleave(1.0, 2.0).hadd(f64s::interleave(3.0, 4.0)), f64s::interleave(3.0, 7.0)); + assert_eq!( + f64s::interleave(1.0, 2.0).hadd(f64s::interleave(3.0, 4.0)), + f64s::interleave(3.0, 7.0) + ); } } diff --git a/src/intrin/hsub.rs b/src/intrin/hsub.rs index 1ee5488..25965c6 100644 --- a/src/intrin/hsub.rs +++ b/src/intrin/hsub.rs @@ -20,60 +20,90 @@ mod tests { #[test] fn hsub_i8s() { assert_eq!(i8s(1).hsub(i8s(2)), i8s::interleave(0, 0)); - assert_eq!(i8s::interleave(1, 2).hsub(i8s::interleave(3, 4)), i8s::interleave(-1, -1)); + assert_eq!( + i8s::interleave(1, 2).hsub(i8s::interleave(3, 4)), + i8s::interleave(-1, -1) + ); } #[test] fn hsub_i16s() { assert_eq!(i16s(1).hsub(i16s(2)), i16s::interleave(0, 0)); - assert_eq!(i16s::interleave(1, 2).hsub(i16s::interleave(3, 4)), i16s::interleave(-1, -1)); + assert_eq!( + i16s::interleave(1, 2).hsub(i16s::interleave(3, 4)), + i16s::interleave(-1, -1) + ); } #[test] fn hsub_i32s() { assert_eq!(i32s(1).hsub(i32s(2)), i32s::interleave(0, 0)); - assert_eq!(i32s::interleave(1, 2).hsub(i32s::interleave(3, 4)), i32s::interleave(-1, -1)); + assert_eq!( + i32s::interleave(1, 2).hsub(i32s::interleave(3, 4)), + i32s::interleave(-1, -1) + ); } #[test] fn hsub_i64s() { assert_eq!(i64s(1).hsub(i64s(2)), i64s::interleave(0, 0)); - assert_eq!(i64s::interleave(1, 2).hsub(i64s::interleave(3, 4)), i64s::interleave(-1, -1)); + assert_eq!( + i64s::interleave(1, 2).hsub(i64s::interleave(3, 4)), + i64s::interleave(-1, -1) + ); } #[test] fn hsub_u8s() { assert_eq!(u8s(1).hsub(u8s(2)), u8s::interleave(0, 0)); - assert_eq!(u8s::interleave(2, 1).hsub(u8s::interleave(4, 3)), u8s::interleave(1, 1)); + assert_eq!( + u8s::interleave(2, 1).hsub(u8s::interleave(4, 3)), + u8s::interleave(1, 1) + ); } #[test] fn hsub_u16s() { assert_eq!(u16s(1).hsub(u16s(2)), u16s::interleave(0, 0)); - assert_eq!(u16s::interleave(2, 1).hsub(u16s::interleave(4, 3)), u16s::interleave(1, 1)); + assert_eq!( + u16s::interleave(2, 1).hsub(u16s::interleave(4, 3)), + u16s::interleave(1, 1) + ); } #[test] fn hsub_u32s() { assert_eq!(u32s(1).hsub(u32s(2)), u32s::interleave(0, 0)); - assert_eq!(u32s::interleave(2, 1).hsub(u32s::interleave(4, 3)), u32s::interleave(1, 1)); + assert_eq!( + u32s::interleave(2, 1).hsub(u32s::interleave(4, 3)), + u32s::interleave(1, 1) + ); } #[test] fn hsub_u64s() { assert_eq!(u64s(1).hsub(u64s(2)), u64s::interleave(0, 0)); - assert_eq!(u64s::interleave(2, 1).hsub(u64s::interleave(4, 3)), u64s::interleave(1, 1)); + assert_eq!( + u64s::interleave(2, 1).hsub(u64s::interleave(4, 3)), + u64s::interleave(1, 1) + ); } #[test] fn hsub_f32s() { assert_eq!(f32s(1.0).hsub(f32s(2.0)), f32s::interleave(0.0, 0.0)); - assert_eq!(f32s::interleave(1.0, 2.0).hsub(f32s::interleave(3.0, 4.0)), f32s::interleave(-1.0, -1.0)); + assert_eq!( + f32s::interleave(1.0, 2.0).hsub(f32s::interleave(3.0, 4.0)), + f32s::interleave(-1.0, -1.0) + ); } #[test] fn hsub_f64s() { assert_eq!(f64s(1.0).hsub(f64s(2.0)), f64s::interleave(0.0, 0.0)); - assert_eq!(f64s::interleave(1.0, 2.0).hsub(f64s::interleave(3.0, 4.0)), f64s::interleave(-1.0, -1.0)); + assert_eq!( + f64s::interleave(1.0, 2.0).hsub(f64s::interleave(3.0, 4.0)), + f64s::interleave(-1.0, -1.0) + ); } } diff --git a/src/intrin/macros.rs b/src/intrin/macros.rs index 27ee745..986f073 100644 --- a/src/intrin/macros.rs +++ b/src/intrin/macros.rs @@ -5,7 +5,6 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. - macro_rules! rust_fallback_impl { (impl $trait:tt for $type:tt where $feat:tt { $($rustfn:ident => $mmfn:tt ( $($mmfnargs:expr),* ), [$($n:expr),+]);*;}) => ( diff --git a/src/intrin/mod.rs b/src/intrin/mod.rs index cbb0266..12ba546 100644 --- a/src/intrin/mod.rs +++ b/src/intrin/mod.rs @@ -9,25 +9,33 @@ pub mod abs; pub mod addsub; pub mod cast; pub mod cmp; -#[macro_use] pub mod destride; +#[macro_use] +pub mod destride; pub mod downcast; -#[macro_use] pub mod endian; -#[macro_use] pub mod eq; +#[macro_use] +pub mod endian; +#[macro_use] +pub mod eq; pub mod hadd; pub mod hsub; -#[macro_use] pub mod macros; -#[macro_use] pub mod merge; -#[macro_use] pub mod popcnt; +#[macro_use] +pub mod macros; +#[macro_use] +pub mod merge; +#[macro_use] +pub mod popcnt; pub mod recip; pub mod round; pub mod rsqrt; -#[macro_use] pub mod sum; +#[macro_use] +pub mod sum; pub mod saturating_add; pub mod saturating_hadd; pub mod saturating_hsub; pub mod saturating_sub; pub mod sqrt; -#[macro_use] pub mod transmute; +#[macro_use] +pub mod transmute; pub mod upcast; // We use an internal prelude not to clutter the namespace when we import @@ -48,12 +56,12 @@ pub(crate) mod prelude { pub use super::recip::*; pub use super::round::*; pub use super::rsqrt::*; - pub use super::sum::*; pub use super::saturating_add::*; pub use super::saturating_hadd::*; pub use super::saturating_hsub::*; pub use super::saturating_sub::*; pub use super::sqrt::*; + pub use super::sum::*; pub use super::transmute::*; pub use super::upcast::*; } diff --git a/src/intrin/popcnt.rs b/src/intrin/popcnt.rs index 8f53ba3..16f3ddd 100644 --- a/src/intrin/popcnt.rs +++ b/src/intrin/popcnt.rs @@ -7,7 +7,7 @@ use crate::vecs::*; -pub trait Popcnt : Packed { +pub trait Popcnt: Packed { fn count_ones(&self) -> usize; #[inline(always)] diff --git a/src/intrin/saturating_hadd.rs b/src/intrin/saturating_hadd.rs index 89f2688..e81ce13 100644 --- a/src/intrin/saturating_hadd.rs +++ b/src/intrin/saturating_hadd.rs @@ -20,56 +20,106 @@ mod tests { #[test] fn saturating_hadd_i8s() { assert_eq!(i8s(1).saturating_hadd(i8s(2)), i8s::interleave(2, 4)); - assert_eq!(i8s::interleave(1, 2).saturating_hadd(i8s::interleave(3, 4)), i8s::interleave(3, 7)); - assert_eq!(i8s::interleave(-100, -100).saturating_hadd(i8s::interleave(100, 100)), i8s::interleave(i8::min_value(), i8::max_value())); + assert_eq!( + i8s::interleave(1, 2).saturating_hadd(i8s::interleave(3, 4)), + i8s::interleave(3, 7) + ); + assert_eq!( + i8s::interleave(-100, -100).saturating_hadd(i8s::interleave(100, 100)), + i8s::interleave(i8::min_value(), i8::max_value()) + ); } #[test] fn saturating_hadd_i16s() { assert_eq!(i16s(1).saturating_hadd(i16s(2)), i16s::interleave(2, 4)); - assert_eq!(i16s::interleave(1, 2).saturating_hadd(i16s::interleave(3, 4)), i16s::interleave(3, 7)); - assert_eq!(i16s::interleave(-30000, -30000).saturating_hadd(i16s::interleave(30000, 30000)), i16s::interleave(i16::min_value(), i16::max_value())); + assert_eq!( + i16s::interleave(1, 2).saturating_hadd(i16s::interleave(3, 4)), + i16s::interleave(3, 7) + ); + assert_eq!( + i16s::interleave(-30000, -30000).saturating_hadd(i16s::interleave(30000, 30000)), + i16s::interleave(i16::min_value(), i16::max_value()) + ); } #[test] fn saturating_hadd_i32s() { assert_eq!(i32s(1).saturating_hadd(i32s(2)), i32s::interleave(2, 4)); - assert_eq!(i32s::interleave(1, 2).saturating_hadd(i32s::interleave(3, 4)), i32s::interleave(3, 7)); - assert_eq!(i32s::interleave(-2_000_000_000, -2_000_000_000).saturating_hadd(i32s::interleave(2_000_000_000, 2_000_000_000)), i32s::interleave(i32::min_value(), i32::max_value())); + assert_eq!( + i32s::interleave(1, 2).saturating_hadd(i32s::interleave(3, 4)), + i32s::interleave(3, 7) + ); + assert_eq!( + i32s::interleave(-2_000_000_000, -2_000_000_000) + .saturating_hadd(i32s::interleave(2_000_000_000, 2_000_000_000)), + i32s::interleave(i32::min_value(), i32::max_value()) + ); } #[test] fn saturating_hadd_i64s() { assert_eq!(i64s(1).saturating_hadd(i64s(2)), i64s::interleave(2, 4)); - assert_eq!(i64s::interleave(1, 2).saturating_hadd(i64s::interleave(3, 4)), i64s::interleave(3, 7)); - assert_eq!(i64s::interleave(-9_000_000_000_000_000_000, -9_000_000_000_000_000_000).saturating_hadd(i64s::interleave(9_000_000_000_000_000_000, 9_000_000_000_000_000_000)), i64s::interleave(i64::min_value(), i64::max_value())); + assert_eq!( + i64s::interleave(1, 2).saturating_hadd(i64s::interleave(3, 4)), + i64s::interleave(3, 7) + ); + assert_eq!( + i64s::interleave(-9_000_000_000_000_000_000, -9_000_000_000_000_000_000) + .saturating_hadd(i64s::interleave( + 9_000_000_000_000_000_000, + 9_000_000_000_000_000_000 + )), + i64s::interleave(i64::min_value(), i64::max_value()) + ); } #[test] fn saturating_hadd_u8s() { assert_eq!(u8s(1).saturating_hadd(u8s(2)), u8s::interleave(2, 4)); - assert_eq!(u8s::interleave(1, 2).saturating_hadd(u8s::interleave(3, 4)), u8s::interleave(3, 7)); + assert_eq!( + u8s::interleave(1, 2).saturating_hadd(u8s::interleave(3, 4)), + u8s::interleave(3, 7) + ); assert_eq!(u8s(200).saturating_hadd(u8s(200)), u8s(u8::max_value())); } #[test] fn saturating_hadd_u16s() { assert_eq!(u16s(1).saturating_hadd(u16s(2)), u16s::interleave(2, 4)); - assert_eq!(u16s::interleave(1, 2).saturating_hadd(u16s::interleave(3, 4)), u16s::interleave(3, 7)); - assert_eq!(u16s(60000).saturating_hadd(u16s(60000)), u16s(u16::max_value())); + assert_eq!( + u16s::interleave(1, 2).saturating_hadd(u16s::interleave(3, 4)), + u16s::interleave(3, 7) + ); + assert_eq!( + u16s(60000).saturating_hadd(u16s(60000)), + u16s(u16::max_value()) + ); } #[test] fn saturating_hadd_u32s() { assert_eq!(u32s(1).saturating_hadd(u32s(2)), u32s::interleave(2, 4)); - assert_eq!(u32s::interleave(1, 2).saturating_hadd(u32s::interleave(3, 4)), u32s::interleave(3, 7)); - assert_eq!(u32s(4_000_000_000).saturating_hadd(u32s(4_000_000_000)), u32s(u32::max_value())); + assert_eq!( + u32s::interleave(1, 2).saturating_hadd(u32s::interleave(3, 4)), + u32s::interleave(3, 7) + ); + assert_eq!( + u32s(4_000_000_000).saturating_hadd(u32s(4_000_000_000)), + u32s(u32::max_value()) + ); } #[test] fn saturating_hadd_u64s() { assert_eq!(u64s(1).saturating_hadd(u64s(2)), u64s::interleave(2, 4)); - assert_eq!(u64s::interleave(1, 2).saturating_hadd(u64s::interleave(3, 4)), u64s::interleave(3, 7)); - assert_eq!(u64s(18_000_000_000_000_000_000).saturating_hadd(u64s(18_000_000_000_000_000_000)), u64s(u64::max_value())); + assert_eq!( + u64s::interleave(1, 2).saturating_hadd(u64s::interleave(3, 4)), + u64s::interleave(3, 7) + ); + assert_eq!( + u64s(18_000_000_000_000_000_000).saturating_hadd(u64s(18_000_000_000_000_000_000)), + u64s(u64::max_value()) + ); } } diff --git a/src/intrin/saturating_hsub.rs b/src/intrin/saturating_hsub.rs index f55a22b..8216070 100644 --- a/src/intrin/saturating_hsub.rs +++ b/src/intrin/saturating_hsub.rs @@ -20,56 +20,109 @@ mod tests { #[test] fn saturating_hsub_i8s() { assert_eq!(i8s(1).saturating_hsub(i8s(2)), i8s::interleave(0, 0)); - assert_eq!(i8s::interleave(1, 2).saturating_hsub(i8s::interleave(3, 4)), i8s::interleave(-1, -1)); - assert_eq!(i8s::interleave(-100, 100).saturating_hsub(i8s::interleave(100, -100)), i8s::interleave(i8::min_value(), i8::max_value())); + assert_eq!( + i8s::interleave(1, 2).saturating_hsub(i8s::interleave(3, 4)), + i8s::interleave(-1, -1) + ); + assert_eq!( + i8s::interleave(-100, 100).saturating_hsub(i8s::interleave(100, -100)), + i8s::interleave(i8::min_value(), i8::max_value()) + ); } #[test] fn saturating_hsub_i16s() { assert_eq!(i16s(1).saturating_hsub(i16s(2)), i16s::interleave(0, 0)); - assert_eq!(i16s::interleave(1, 2).saturating_hsub(i16s::interleave(3, 4)), i16s::interleave(-1, -1)); - assert_eq!(i16s::interleave(-30000, 30000).saturating_hsub(i16s::interleave(30000, -30000)), i16s::interleave(i16::min_value(), i16::max_value())); + assert_eq!( + i16s::interleave(1, 2).saturating_hsub(i16s::interleave(3, 4)), + i16s::interleave(-1, -1) + ); + assert_eq!( + i16s::interleave(-30000, 30000).saturating_hsub(i16s::interleave(30000, -30000)), + i16s::interleave(i16::min_value(), i16::max_value()) + ); } #[test] fn saturating_hsub_i32s() { assert_eq!(i32s(1).saturating_hsub(i32s(2)), i32s::interleave(0, 0)); - assert_eq!(i32s::interleave(1, 2).saturating_hsub(i32s::interleave(3, 4)), i32s::interleave(-1, -1)); - assert_eq!(i32s::interleave(-2_000_000_000, 2_000_000_000).saturating_hsub(i32s::interleave(2_000_000_000, -2_000_000_000)), i32s::interleave(i32::min_value(), i32::max_value())); + assert_eq!( + i32s::interleave(1, 2).saturating_hsub(i32s::interleave(3, 4)), + i32s::interleave(-1, -1) + ); + assert_eq!( + i32s::interleave(-2_000_000_000, 2_000_000_000) + .saturating_hsub(i32s::interleave(2_000_000_000, -2_000_000_000)), + i32s::interleave(i32::min_value(), i32::max_value()) + ); } #[test] fn saturating_hsub_i64s() { assert_eq!(i64s(1).saturating_hsub(i64s(2)), i64s::interleave(0, 0)); - assert_eq!(i64s::interleave(1, 2).saturating_hsub(i64s::interleave(3, 4)), i64s::interleave(-1, -1)); - assert_eq!(i64s::interleave(-9_000_000_000_000_000_000, 9_000_000_000_000_000_000).saturating_hsub(i64s::interleave(9_000_000_000_000_000_000, -9_000_000_000_000_000_000)), i64s::interleave(i64::min_value(), i64::max_value())); + assert_eq!( + i64s::interleave(1, 2).saturating_hsub(i64s::interleave(3, 4)), + i64s::interleave(-1, -1) + ); + assert_eq!( + i64s::interleave(-9_000_000_000_000_000_000, 9_000_000_000_000_000_000) + .saturating_hsub(i64s::interleave( + 9_000_000_000_000_000_000, + -9_000_000_000_000_000_000 + )), + i64s::interleave(i64::min_value(), i64::max_value()) + ); } #[test] fn saturating_hsub_u8s() { assert_eq!(u8s(1).saturating_hsub(u8s(2)), u8s::interleave(0, 0)); - assert_eq!(u8s::interleave(1, 2).saturating_hsub(u8s::interleave(3, 4)), u8s::interleave(0, 0)); - assert_eq!(u8s::interleave(2, 1).saturating_hsub(u8s::interleave(4, 3)), u8s::interleave(1, 1)); + assert_eq!( + u8s::interleave(1, 2).saturating_hsub(u8s::interleave(3, 4)), + u8s::interleave(0, 0) + ); + assert_eq!( + u8s::interleave(2, 1).saturating_hsub(u8s::interleave(4, 3)), + u8s::interleave(1, 1) + ); } #[test] fn saturating_hsub_u16s() { assert_eq!(u16s(1).saturating_hsub(u16s(2)), u16s::interleave(0, 0)); - assert_eq!(u16s::interleave(1, 2).saturating_hsub(u16s::interleave(3, 4)), u16s::interleave(0, 0)); - assert_eq!(u16s::interleave(2, 1).saturating_hsub(u16s::interleave(4, 3)), u16s::interleave(1, 1)); + assert_eq!( + u16s::interleave(1, 2).saturating_hsub(u16s::interleave(3, 4)), + u16s::interleave(0, 0) + ); + assert_eq!( + u16s::interleave(2, 1).saturating_hsub(u16s::interleave(4, 3)), + u16s::interleave(1, 1) + ); } #[test] fn saturating_hsub_u32s() { assert_eq!(u32s(1).saturating_hsub(u32s(2)), u32s::interleave(0, 0)); - assert_eq!(u32s::interleave(1, 2).saturating_hsub(u32s::interleave(3, 4)), u32s::interleave(0, 0)); - assert_eq!(u32s::interleave(2, 1).saturating_hsub(u32s::interleave(4, 3)), u32s::interleave(1, 1)); + assert_eq!( + u32s::interleave(1, 2).saturating_hsub(u32s::interleave(3, 4)), + u32s::interleave(0, 0) + ); + assert_eq!( + u32s::interleave(2, 1).saturating_hsub(u32s::interleave(4, 3)), + u32s::interleave(1, 1) + ); } #[test] fn saturating_hsub_u64s() { assert_eq!(u64s(1).saturating_hsub(u64s(2)), u64s::interleave(0, 0)); - assert_eq!(u64s::interleave(1, 2).saturating_hsub(u64s::interleave(3, 4)), u64s::interleave(0, 0)); - assert_eq!(u64s::interleave(2, 1).saturating_hsub(u64s::interleave(4, 3)), u64s::interleave(1, 1)); + assert_eq!( + u64s::interleave(1, 2).saturating_hsub(u64s::interleave(3, 4)), + u64s::interleave(0, 0) + ); + assert_eq!( + u64s::interleave(2, 1).saturating_hsub(u64s::interleave(4, 3)), + u64s::interleave(1, 1) + ); } } diff --git a/src/intrin/sum.rs b/src/intrin/sum.rs index 79d4828..200ea6a 100644 --- a/src/intrin/sum.rs +++ b/src/intrin/sum.rs @@ -7,12 +7,12 @@ use crate::vecs::*; -pub trait Sum : Packed { +pub trait Sum: Packed { /// Return a scalar equivalent to the sum of all elements of this vector. fn sum(&self) -> Self::Scalar; } -pub trait UpcastSum : { +pub trait UpcastSum { /// Return a scalar equivalent to the sum of all elements of this vector, /// but collect the result in an i64 rather than the vector's type. fn sum_upcast(&self) -> i64; @@ -55,10 +55,11 @@ macro_rules! test_packed_sum_int { while i < $el::max_value() / 64 - 1 { let v = $vec::splat(i); - assert_eq!(v.sum(), - v.scalar_reduce(0 as $el, |acc, v| acc + v)); - assert_eq!(v.sum_upcast(), - v.scalar_reduce(0 as i64, |acc, v| acc + (v as i64))); + assert_eq!(v.sum(), v.scalar_reduce(0 as $el, |acc, v| acc + v)); + assert_eq!( + v.sum_upcast(), + v.scalar_reduce(0 as i64, |acc, v| acc + (v as i64)) + ); i += $el::max_value() / 20; } } @@ -71,10 +72,11 @@ macro_rules! test_packed_sum { fn $name() { for i in -100..100 { let v = $vec::splat(i as $el); - assert_eq!(v.sum(), - v.scalar_reduce(0 as $el, |acc, v| acc + v)); - assert_eq!(v.sum_upcast(), - v.scalar_reduce(0 as i64, |acc, v| acc + (v as i64))); + assert_eq!(v.sum(), v.scalar_reduce(0 as $el, |acc, v| acc + v)); + assert_eq!( + v.sum_upcast(), + v.scalar_reduce(0 as i64, |acc, v| acc + (v as i64)) + ); } } }; diff --git a/src/intrin/transmute.rs b/src/intrin/transmute.rs index 99ecbab..1cf3a36 100644 --- a/src/intrin/transmute.rs +++ b/src/intrin/transmute.rs @@ -106,7 +106,7 @@ mod tests { use crate::prelude::*; macro_rules! test_transmute { - ($name:ident, $val:expr, $xmute:ident) => ( + ($name:ident, $val:expr, $xmute:ident) => { #[test] fn $name() { #![allow(unused_unsafe)] @@ -119,7 +119,7 @@ mod tests { assert_eq!(unsafe { $val.be_i64s().$xmute() }, $val); assert_eq!(unsafe { $val.be_u64s().$xmute() }, $val); } - ) + }; } test_transmute!(transmute_u8s, u8s(1), be_u8s); diff --git a/src/intrin/upcast.rs b/src/intrin/upcast.rs index efd8fb0..8ad1c32 100644 --- a/src/intrin/upcast.rs +++ b/src/intrin/upcast.rs @@ -57,15 +57,27 @@ mod tests { #[test] fn upcast_i32s_i64s() { // TODO: Fix ugliness - assert_eq!(Upcast::::upcast(i32s::interleave(1, 2)).0, i64s::interleave(1, 2)); - assert_eq!(Upcast::::upcast(i32s::interleave(1, 2)).1, i64s::interleave(1, 2)); + assert_eq!( + Upcast::::upcast(i32s::interleave(1, 2)).0, + i64s::interleave(1, 2) + ); + assert_eq!( + Upcast::::upcast(i32s::interleave(1, 2)).1, + i64s::interleave(1, 2) + ); } #[test] fn upcast_i32s_f64s() { // TODO: Fix ugliness - assert_eq!(Upcast::::upcast(i32s::interleave(1, 2)).0, f64s::interleave(1.0, 2.0)); - assert_eq!(Upcast::::upcast(i32s::interleave(1, 2)).1, f64s::interleave(1.0, 2.0)); + assert_eq!( + Upcast::::upcast(i32s::interleave(1, 2)).0, + f64s::interleave(1.0, 2.0) + ); + assert_eq!( + Upcast::::upcast(i32s::interleave(1, 2)).1, + f64s::interleave(1.0, 2.0) + ); } #[test] @@ -76,8 +88,17 @@ mod tests { #[test] fn upcast_f32s() { - assert_eq!(f32s::interleave(1.0, 2.0).upcast(), (f64s::interleave(1.0, 2.0), f64s::interleave(1.0, 2.0))); - assert_eq!(f32s::interleave(1.0, 2.0).upcast().0, f64s::interleave(1.0, 2.0)); - assert_eq!(f32s::interleave(1.0, 2.0).upcast().1, f64s::interleave(1.0, 2.0)); + assert_eq!( + f32s::interleave(1.0, 2.0).upcast(), + (f64s::interleave(1.0, 2.0), f64s::interleave(1.0, 2.0)) + ); + assert_eq!( + f32s::interleave(1.0, 2.0).upcast().0, + f64s::interleave(1.0, 2.0) + ); + assert_eq!( + f32s::interleave(1.0, 2.0).upcast().1, + f64s::interleave(1.0, 2.0) + ); } } diff --git a/src/iters.rs b/src/iters.rs index 0c4c5a8..a88b7ef 100644 --- a/src/iters.rs +++ b/src/iters.rs @@ -5,12 +5,12 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::vecs::{Packable, Packed}; use crate::core::slice::from_raw_parts; +use crate::vecs::{Packable, Packed}; -pub trait SIMDObject : Sized { - type Scalar : Packable; - type Vector : Packed; +pub trait SIMDObject: Sized { + type Scalar: Packable; + type Vector: Packed; /// Return the vector length of this object. #[inline(always)] @@ -27,7 +27,9 @@ pub trait SIMDObject : Sized { /// An iterator which automatically packs the values it iterates over into SIMD /// vectors. -pub trait SIMDIterable : SIMDObject + SIMDSized + ExactSizeIterator::Vector> { +pub trait SIMDIterable: + SIMDObject + SIMDSized + ExactSizeIterator::Vector> +{ /// Return the current position of this iterator, measured in scalars fn scalar_pos(&self) -> usize; @@ -53,9 +55,7 @@ pub trait SIMDIterable : SIMDObject + SIMDSized + ExactSizeIterator Unpacked { - Unpacked { - iter: self, - } + Unpacked { iter: self } } #[inline(always)] @@ -65,7 +65,7 @@ pub trait SIMDIterable : SIMDObject + SIMDSized + ExactSizeIterator::Vector::default(); 8] + scratch: [::Vector::default(); 8], } } } @@ -73,7 +73,7 @@ pub trait SIMDIterable : SIMDObject + SIMDSized + ExactSizeIterator(self, func: F) -> SIMDMap - where F : FnMut(Self::Vector) -> A, A : Packed, B : Packable { + where + F: FnMut(Self::Vector) -> A, + A: Packed, + B: Packable, + { SIMDMap { iter: self, func: func, @@ -94,7 +98,9 @@ pub trait SIMDIterator : SIMDIterable { /// Pack and run `func` over the iterator, returning no value and not /// modifying the iterator. fn simd_do_each(&mut self, mut func: F) - where F : FnMut(Self::Vector) -> () { + where + F: FnMut(Self::Vector) -> (), + { while let Some(v) = self.next() { func(v); } @@ -155,8 +161,9 @@ pub trait SIMDIterator : SIMDIterable { /// [`Packed::sum`]: vecs/trait.Packed.html#tymethod.sum /// [`Packed::product`]: vecs/trait.Packed.html#tymethod.product fn simd_reduce(&mut self, mut start: A, mut func: F) -> A - where F : FnMut(A, Self::Vector) -> A { - + where + F: FnMut(A, Self::Vector) -> A, + { while let Some(v) = self.next() { start = func(start, v); } @@ -168,14 +175,15 @@ pub trait SIMDIterator : SIMDIterable { } /// A trait defining a SIMD iterator over a mutable blob of primitive data -pub trait SIMDIteratorMut : SIMDIterator { +pub trait SIMDIteratorMut: SIMDIterator { /// Pack and run `func` over the iterator, modifying each element in-place. fn simd_for_each(&mut self, func: F) - where F : FnMut(&mut Self::Vector) -> (); + where + F: FnMut(&mut Self::Vector) -> (); } /// A trait defining a sized blob of primitive data -pub trait SIMDSized : SIMDObject { +pub trait SIMDSized: SIMDObject { /// Return the length of this iterator, measured in scalars. fn scalar_len(&self) -> usize; @@ -187,7 +195,7 @@ pub trait SIMDSized : SIMDObject { } /// A trait defining a random-access blob of data which can be loaded via SIMD -pub trait SIMDArray : SIMDObject + SIMDSized { +pub trait SIMDArray: SIMDObject + SIMDSized { fn load(&self, offset: usize) -> Self::Vector; unsafe fn load_unchecked(&self, offset: usize) -> Self::Vector; fn load_scalar(&self, offset: usize) -> Self::Scalar; @@ -196,7 +204,7 @@ pub trait SIMDArray : SIMDObject + SIMDSized { /// A trait defining a random-access mutable blob of data which can be loaded /// and stored to via SIMD. -pub trait SIMDArrayMut : SIMDArray { +pub trait SIMDArrayMut: SIMDArray { fn store(&mut self, value: Self::Vector, offset: usize); unsafe fn store_unchecked(&mut self, value: Self::Vector, offset: usize); fn store_scalar(&mut self, value: Self::Scalar, offset: usize); @@ -206,7 +214,7 @@ pub trait SIMDArrayMut : SIMDArray { /// A slice-backed iterator which can automatically pack its constituent /// elements into vectors. #[derive(Clone, Debug)] -pub struct SIMDIter { +pub struct SIMDIter { pub position: usize, pub data: A, pub default: A::Vector, @@ -214,7 +222,10 @@ pub struct SIMDIter { /// A lazy mapping iterator which applies its function to a stream of vectors. #[derive(Debug)] -pub struct SIMDMap where I : SIMDIterable { +pub struct SIMDMap +where + I: SIMDIterable, +{ pub iter: I, pub func: F, } @@ -222,19 +233,31 @@ pub struct SIMDMap where I : SIMDIterable { /// An iterator which packs an iterator of scalars into an iterator of vectors. /// Cannot take advantage of vectorized loads, so it's very slow to gather data! #[derive(Clone)] -pub struct SIMDAdapter where I : ExactSizeIterator, V : Packed { +pub struct SIMDAdapter +where + I: ExactSizeIterator, + V: Packed, +{ pub iter: I, pub scratch: V, pub default: V, pub position: usize, } -impl SIMDObject for SIMDAdapter where I : ExactSizeIterator, V : Packed { +impl SIMDObject for SIMDAdapter +where + I: ExactSizeIterator, + V: Packed, +{ type Scalar = V::Scalar; type Vector = V; } -impl Iterator for SIMDAdapter where I : ExactSizeIterator, V : Packed { +impl Iterator for SIMDAdapter +where + I: ExactSizeIterator, + V: Packed, +{ type Item = V; #[inline(always)] @@ -244,7 +267,9 @@ impl Iterator for SIMDAdapter where I : ExactSizeIterator Iterator for SIMDAdapter where I : ExactSizeIterator SIMDIterator for SIMDAdapter where I : ExactSizeIterator, V : Packed { +impl SIMDIterator for SIMDAdapter +where + I: ExactSizeIterator, + V: Packed, +{ fn end(&mut self) -> Option<(Self::Vector, usize)> { if self.position < self.scalar_len() { // This is the last vector we can load, so we should load it @@ -277,21 +306,33 @@ impl SIMDIterator for SIMDAdapter where I : ExactSizeIterator ExactSizeIterator for SIMDAdapter where I : ExactSizeIterator, V : Packed { +impl ExactSizeIterator for SIMDAdapter +where + I: ExactSizeIterator, + V: Packed, +{ #[inline(always)] fn len(&self) -> usize { self.iter.len() / self.width() } } -impl SIMDSized for SIMDAdapter where I : ExactSizeIterator, V : Packed { +impl SIMDSized for SIMDAdapter +where + I: ExactSizeIterator, + V: Packed, +{ #[inline(always)] fn scalar_len(&self) -> usize { self.iter.len() } } -impl SIMDIterable for SIMDAdapter where I : ExactSizeIterator, V : Packed { +impl SIMDIterable for SIMDAdapter +where + I: ExactSizeIterator, + V: Packed, +{ #[inline(always)] fn scalar_pos(&self) -> usize { self.position @@ -308,7 +349,11 @@ impl SIMDIterable for SIMDAdapter where I : ExactSizeIterator SIMDArrayMut for &'a mut [S] where S : 'a + Packable, V : Packed { +impl<'a, S, V> SIMDArrayMut for &'a mut [S] +where + S: 'a + Packable, + V: Packed, +{ #[inline(always)] fn store(&mut self, value: Self::Vector, offset: usize) { value.store(self, offset) @@ -333,11 +378,17 @@ impl<'a, S, V> SIMDArrayMut for &'a mut [S] where S : 'a + Packable, /// A slice-backed iterator which yields scalar elements using the Iterator API. #[derive(Debug)] -pub struct Unpacked where T : SIMDIterable { - pub iter: T +pub struct Unpacked +where + T: SIMDIterable, +{ + pub iter: T, } -impl Iterator for Unpacked where T : SIMDIterable + SIMDArray { +impl Iterator for Unpacked +where + T: SIMDIterable + SIMDArray, +{ type Item = ::Scalar; #[inline(always)] @@ -352,7 +403,10 @@ impl Iterator for Unpacked where T : SIMDIterable + SIMDArray { } } -impl Unpacked where T : SIMDIterable { +impl Unpacked +where + T: SIMDIterable, +{ #[inline(always)] pub fn pack(self) -> T { self.iter @@ -361,13 +415,16 @@ impl Unpacked where T : SIMDIterable { /// An iterator which yields multiple elements of a PackedIter #[derive(Debug)] -pub struct Unrolled<'a, T : 'a + SIMDIterable> { +pub struct Unrolled<'a, T: 'a + SIMDIterable> { iter: &'a mut T, amt: usize, scratch: [T::Vector; 8], } -impl<'a, T> Unrolled<'a, T> where T : 'a + SIMDIterable { +impl<'a, T> Unrolled<'a, T> +where + T: 'a + SIMDIterable, +{ #[inline(always)] pub fn chunk_len(&self) -> usize { self.amt @@ -379,7 +436,10 @@ impl<'a, T> Unrolled<'a, T> where T : 'a + SIMDIterable { } } -impl<'a, T> Iterator for Unrolled<'a, T> where T : 'a + SIMDIterator { +impl<'a, T> Iterator for Unrolled<'a, T> +where + T: 'a + SIMDIterator, +{ type Item = &'a [T::Vector]; #[inline(always)] @@ -394,7 +454,8 @@ impl<'a, T> Iterator for Unrolled<'a, T> where T : 'a + SIMDIterator { } } if i > 0 { - unsafe { // TODO: Is this unsafe? Contravariant lifetimes? + unsafe { + // TODO: Is this unsafe? Contravariant lifetimes? Some(from_raw_parts((&mut self.scratch).as_mut_ptr(), i)) } } else { @@ -448,19 +509,34 @@ impl_iter!(Vec, ('a, S, V) where S : Packable, V : Packed, V : Packed); impl_iter!(&'a mut [S], ('a, S, V) where S : Packable, V : Packed); -impl SIMDObject for SIMDIter where A : SIMDArray, A::Vector : Packed, A::Scalar : Packable { +impl SIMDObject for SIMDIter +where + A: SIMDArray, + A::Vector: Packed, + A::Scalar: Packable, +{ type Vector = A::Vector; type Scalar = A::Scalar; } -impl ExactSizeIterator for SIMDIter where A : SIMDArray, A::Vector : Packed, A::Scalar : Packable { +impl ExactSizeIterator for SIMDIter +where + A: SIMDArray, + A::Vector: Packed, + A::Scalar: Packable, +{ #[inline(always)] fn len(&self) -> usize { self.data.scalar_len() / self.width() } } -impl Iterator for SIMDIter where A : SIMDArray, A::Vector : Packed, A::Scalar : Packable { +impl Iterator for SIMDIter +where + A: SIMDArray, + A::Vector: Packed, + A::Scalar: Packable, +{ type Item = ::Vector; #[inline(always)] @@ -476,7 +552,12 @@ impl Iterator for SIMDIter where A : SIMDArray, A::Vector : Packed, A::Sca } } -impl SIMDArray for SIMDIter where A : SIMDArray, A::Vector : Packed, A::Scalar : Packable { +impl SIMDArray for SIMDIter +where + A: SIMDArray, + A::Vector: Packed, + A::Scalar: Packable, +{ #[inline(always)] fn load(&self, offset: usize) -> Self::Vector { self.data.load(offset) @@ -498,14 +579,24 @@ impl SIMDArray for SIMDIter where A : SIMDArray, A::Vector : Packed, A::Sc } } -impl SIMDSized for SIMDIter where A : SIMDArray, A::Vector : Packed, A::Scalar : Packable { +impl SIMDSized for SIMDIter +where + A: SIMDArray, + A::Vector: Packed, + A::Scalar: Packable, +{ #[inline(always)] fn scalar_len(&self) -> usize { self.data.scalar_len() } } -impl SIMDIterable for SIMDIter where A : SIMDArray, A::Vector : Packed, A::Scalar : Packable { +impl SIMDIterable for SIMDIter +where + A: SIMDArray, + A::Vector: Packed, + A::Scalar: Packable, +{ #[inline(always)] fn scalar_pos(&self) -> usize { self.position @@ -522,7 +613,12 @@ impl SIMDIterable for SIMDIter where A : SIMDArray, A::Vector : Packed, A: } } -impl SIMDIterator for T where T : SIMDIterable + SIMDArray, S : Packable, V : Packed { +impl SIMDIterator for T +where + T: SIMDIterable + SIMDArray, + S: Packable, + V: Packed, +{ #[inline(always)] fn end(&mut self) -> Option<(Self::Vector, usize)> { if self.scalar_pos() < self.scalar_len() { @@ -545,19 +641,25 @@ impl SIMDIterator for T where T : SIMDIterable + SIMDArray SIMDIteratorMut for SIMDIter where T : SIMDArrayMut { +impl SIMDIteratorMut for SIMDIter +where + T: SIMDArrayMut, +{ fn simd_for_each(&mut self, mut func: F) - where F : FnMut(&mut Self::Vector) -> () { + where + F: FnMut(&mut Self::Vector) -> (), + { let mut lastvec = Self::Vector::default(); while let Some(mut v) = self.next() { func(&mut v); lastvec = v; let offset = self.scalar_pos() - self.width(); - unsafe { self.data.store_unchecked(v, offset); } + unsafe { + self.data.store_unchecked(v, offset); + } } let offset = self.scalar_pos(); if let Some((mut p, n)) = self.end() { @@ -572,7 +674,10 @@ impl SIMDIteratorMut for SIMDIter where T : SIMDArrayMut { } else { // The buffer won't fit one vector; store elementwise for i in 0..(width - n) { - unsafe { self.data.store_scalar_unchecked(p.extract_unchecked(i + n), offset + i); } + unsafe { + self.data + .store_scalar_unchecked(p.extract_unchecked(i + n), offset + i); + } } } } @@ -580,12 +685,17 @@ impl SIMDIteratorMut for SIMDIter where T : SIMDArrayMut { } #[doc(hidden)] -pub trait UnsafeIterator : Iterator + SIMDIterable { +pub trait UnsafeIterator: Iterator + SIMDIterable { unsafe fn next_unchecked(&mut self, offset: usize) -> Self::Item; unsafe fn end_unchecked(&mut self, offset: usize, empty_amt: usize) -> Self::Vector; } -impl UnsafeIterator for T where T : SIMDIterable + SIMDArray, S : Packable, V : Packed { +impl UnsafeIterator for T +where + T: SIMDIterable + SIMDArray, + S: Packable, + V: Packed, +{ #[inline(always)] unsafe fn next_unchecked(&mut self, offset: usize) -> Self::Item { debug_assert!(offset + self.width() <= self.scalar_len()); @@ -608,11 +718,15 @@ impl UnsafeIterator for T where T : SIMDIterable + SIMDArray Iterator for SIMDMap - where I : SIMDIterable, F : FnMut(I::Vector) -> A, A : Packed, B : Packable { +where + I: SIMDIterable, + F: FnMut(I::Vector) -> A, + A: Packed, + B: Packable, +{ type Item = A; #[inline(always)] @@ -621,7 +735,11 @@ impl Iterator for SIMDMap } } -impl ExactSizeIterator for SIMDMap where Self : Iterator, I : SIMDIterable { +impl ExactSizeIterator for SIMDMap +where + Self: Iterator, + I: SIMDIterable, +{ #[inline(always)] fn len(&self) -> usize { self.iter.len() @@ -629,7 +747,12 @@ impl ExactSizeIterator for SIMDMap where Self : Iterator, I : SIMDIt } impl SIMDObject for SIMDMap - where I : SIMDIterable, F : FnMut(I::Vector) -> A, A : Packed, B : Packable { +where + I: SIMDIterable, + F: FnMut(I::Vector) -> A, + A: Packed, + B: Packable, +{ type Vector = A; type Scalar = B; @@ -640,7 +763,12 @@ impl SIMDObject for SIMDMap } impl SIMDSized for SIMDMap - where I : SIMDIterable, F : FnMut(I::Vector) -> A, A : Packed, B : Packable { +where + I: SIMDIterable, + F: FnMut(I::Vector) -> A, + A: Packed, + B: Packable, +{ #[inline(always)] fn scalar_len(&self) -> usize { self.iter.len() @@ -648,7 +776,12 @@ impl SIMDSized for SIMDMap } impl SIMDIterable for SIMDMap - where I : SIMDIterable, F : FnMut(I::Vector) -> A, A : Packed, B : Packable { +where + I: SIMDIterable, + F: FnMut(I::Vector) -> A, + A: Packed, + B: Packable, +{ #[inline(always)] fn scalar_pos(&self) -> usize { self.iter.scalar_pos() @@ -667,7 +800,12 @@ impl SIMDIterable for SIMDMap } impl<'a, A, B, I, F> SIMDIterator for SIMDMap - where I : SIMDIterator, F : FnMut(I::Vector) -> A, A : Packed, B : Packable { +where + I: SIMDIterator, + F: FnMut(I::Vector) -> A, + A: Packed, + B: Packable, +{ #[inline(always)] fn end(&mut self) -> Option<(Self::Vector, usize)> { let (v, n) = self.iter.end()?; @@ -678,7 +816,10 @@ impl<'a, A, B, I, F> SIMDIterator for SIMDMap /// A trait which can transform a stream of vectors into a contiguous /// collection of scalars. -pub trait IntoScalar : SIMDObject where T : Packable { +pub trait IntoScalar: SIMDObject +where + T: Packable, +{ /// Take an iterator of SIMD vectors, and store them in-order in a Vec. #[cfg(feature = "std")] fn scalar_collect(&mut self) -> Vec; @@ -697,8 +838,11 @@ pub trait IntoScalar : SIMDObject where T : Packable { } impl<'a, T, I> IntoScalar for I - where I : SIMDIterator, I::Vector : Packed, T : Packable { - +where + I: SIMDIterator, + I::Vector: Packed, + T: Packable, +{ #[inline(always)] #[cfg(feature = "std")] fn scalar_collect(&mut self) -> Vec { @@ -739,7 +883,9 @@ impl<'a, T, I> IntoScalar for I let mut lastvec = Self::Vector::default(); while let Some(vec) = self.next() { - unsafe { vec.store_unchecked(fill, offset); } + unsafe { + vec.store_unchecked(fill, offset); + } offset += self.width(); lastvec = vec; } @@ -781,12 +927,16 @@ impl<'a, T, I> IntoScalar for I let mut offset = 0; while let Some(vec) = self.next() { - unsafe { vec.store_unchecked(fill, offset); } + unsafe { + vec.store_unchecked(fill, offset); + } offset += self.width(); } if let Some((vec, _)) = self.end() { - unsafe { vec.store_unchecked(fill, offset); } + unsafe { + vec.store_unchecked(fill, offset); + } } fill @@ -800,7 +950,8 @@ mod tests { #[test] #[cfg(feature = "std")] fn bitcast_map_width_doubles() { - let y = [1, 2, 3, 4, 5i64].simd_iter(i64s(0)) + let y = [1, 2, 3, 4, 5i64] + .simd_iter(i64s(0)) .simd_map(|v| v.to_le().be_u32s()) .scalar_collect(); @@ -810,24 +961,31 @@ mod tests { #[test] #[cfg(feature = "std")] fn bitcast_map_width_quadruples() { - let y = [1, 2, 3, 4, 5i64].simd_iter(i64s(0)) + let y = [1, 2, 3, 4, 5i64] + .simd_iter(i64s(0)) .simd_map(|v| v.to_le().be_u16s()) .scalar_collect(); - assert_eq!(y, [1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, - 4, 0, 0, 0, 5, 0, 0, 0]); + assert_eq!( + y, + [1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0] + ); } #[test] #[cfg(feature = "std")] fn bitcast_map_width_octuples() { - let y = [1, 2, 3, 4, 5i64].simd_iter(i64s(0)) + let y = [1, 2, 3, 4, 5i64] + .simd_iter(i64s(0)) .simd_map(|v| v.to_le().be_u8s()) .scalar_collect(); - assert_eq!(y.as_slice(), - &[1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, - 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, - 5, 0, 0, 0, 0, 0, 0, 0u8][..]); + assert_eq!( + y.as_slice(), + &[ + 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, + 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0u8 + ][..] + ); } } diff --git a/src/lib.rs b/src/lib.rs index ec6612d..0264f58 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -211,16 +211,21 @@ mod core { extern crate packed_simd; extern crate vektor; -#[macro_use] pub(crate) mod debug; -#[macro_use] pub mod zip; -#[macro_use] pub mod vecs; -pub mod vec_patterns; -pub mod iters; +#[macro_use] +pub(crate) mod debug; +#[macro_use] +pub mod zip; +#[macro_use] +pub mod vecs; pub mod into_iters; -#[macro_use] pub mod intrin; -#[macro_use] pub mod arch; +pub mod iters; +pub mod vec_patterns; +#[macro_use] +pub mod intrin; +#[macro_use] +pub mod arch; pub mod prelude; -pub mod stride_zip; pub mod stride; +pub mod stride_zip; pub use crate::prelude::*; diff --git a/src/prelude.rs b/src/prelude.rs index f4331d0..ef2c02f 100644 --- a/src/prelude.rs +++ b/src/prelude.rs @@ -5,12 +5,12 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -pub use crate::iters::*; -pub use crate::into_iters::*; -pub use crate::vecs::{Packed, Pattern}; -pub use crate::arch::current::vecs::{u8s, i8s, u16s, i16s, u32s, i32s, f32s, u64s, i64s, f64s}; pub use crate::arch::current::intrin::prelude::*; +pub use crate::arch::current::vecs::{f32s, f64s, i16s, i32s, i64s, i8s, u16s, u32s, u64s, u8s}; +pub use crate::into_iters::*; pub use crate::intrin::prelude::*; -pub use crate::zip::*; -pub use crate::stride_zip::*; +pub use crate::iters::*; pub use crate::stride::*; +pub use crate::stride_zip::*; +pub use crate::vecs::{Packed, Pattern}; +pub use crate::zip::*; diff --git a/src/stride.rs b/src/stride.rs index 4a27b95..45ee824 100644 --- a/src/stride.rs +++ b/src/stride.rs @@ -8,8 +8,8 @@ #![allow(unused_imports)] use crate::arch::current::vecs::*; -use crate::iters::{SIMDIterable, SIMDIterator, SIMDArray, SIMDObject, UnsafeIterator, SIMDSized}; -use crate::core::iter::{Iterator, ExactSizeIterator, FromIterator}; +use crate::core::iter::{ExactSizeIterator, FromIterator, Iterator}; +use crate::iters::{SIMDArray, SIMDIterable, SIMDIterator, SIMDObject, SIMDSized, UnsafeIterator}; use crate::vecs::*; // For AVX2 gathers @@ -19,15 +19,21 @@ use crate::intrin::transmute::*; /// A slice-backed iterator which packs every nth element of its constituent /// elements into a vector. #[derive(Clone)] -pub struct PackedStride<'a, A> where A : 'a + SIMDArray { +pub struct PackedStride<'a, A> +where + A: 'a + SIMDArray, +{ iter: &'a A, pos: usize, base: usize, // TODO: Can we get rid of this? stride: usize, - default: ::Vector + default: ::Vector, } -impl<'a, A> Iterator for PackedStride<'a, A> where A : 'a + SIMDArray { +impl<'a, A> Iterator for PackedStride<'a, A> +where + A: 'a + SIMDArray, +{ type Item = ::Vector; #[inline(always)] @@ -48,14 +54,20 @@ impl<'a, A> Iterator for PackedStride<'a, A> where A : 'a + SIMDArray { } } -impl<'a, A> ExactSizeIterator for PackedStride<'a, A> where A : SIMDArray { +impl<'a, A> ExactSizeIterator for PackedStride<'a, A> +where + A: SIMDArray, +{ #[inline(always)] fn len(&self) -> usize { self.iter.vector_len() / self.stride } } -pub trait Stride where A : SIMDArray { +pub trait Stride +where + A: SIMDArray, +{ /// Return a vec of iterators which pack every `count`th element into an /// iterator. The nth iterator of the tuple is offset by n - 1. Therefore, /// the 1st iterator will pack the 0th, `count`th, `count * 2`th... @@ -89,28 +101,77 @@ pub trait Stride where A : SIMDArray { /// iterator. The nth iterator of the tuple is offset by n - 1. Therefore, /// the 1st iterator will pack the 0th, 2nd, 4th... elements, while the 2nd /// iterator will pack the 1st, 3rd, 5th... elements. - fn stride_two(&self, default: (::Vector, ::Vector)) -> (PackedStride, PackedStride); + fn stride_two( + &self, + default: (::Vector, ::Vector), + ) -> (PackedStride, PackedStride); /// Return a tuple of iterators which pack every 3rd element into an /// iterator. The nth iterator of the tuple is offset by n - 1. Therefore, /// the 1st iterator will pack the 0th, 3rd, 6th... elements, while the 2nd /// iterator will pack the 1st, 4th, 7th... elements. - fn stride_three(&self, default: (::Vector, ::Vector , ::Vector)) -> (PackedStride, PackedStride , PackedStride); + fn stride_three( + &self, + default: ( + ::Vector, + ::Vector, + ::Vector, + ), + ) -> (PackedStride, PackedStride, PackedStride); /// Return a tuple of iterators which pack every 4th element into an /// iterator. The nth iterator of the tuple is offset by n - 1. Therefore, /// the 1st iterator will pack the 0th, 4th, 8th... elements, while the 2nd /// iterator will pack the 1st, 5th, 9th... elements. - fn stride_four(&self, default: (::Vector, ::Vector, ::Vector, ::Vector)) -> (PackedStride, PackedStride, PackedStride, PackedStride); + fn stride_four( + &self, + default: ( + ::Vector, + ::Vector, + ::Vector, + ::Vector, + ), + ) -> ( + PackedStride, + PackedStride, + PackedStride, + PackedStride, + ); /// Return a tuple of iterators which pack every 9th element into an /// iterator. The nth iterator of the tuple is offset by n - 1. Therefore, /// the 1st iterator will pack the 0th, 9th, 18th... elements, while the 2nd /// iterator will pack the 1st, 10th, 19th... elements. - fn stride_nine(&self, default: (::Vector, ::Vector, ::Vector, ::Vector, ::Vector, ::Vector, ::Vector, ::Vector, ::Vector)) -> (PackedStride, PackedStride, PackedStride, PackedStride, PackedStride, PackedStride, PackedStride, PackedStride, PackedStride); + fn stride_nine( + &self, + default: ( + ::Vector, + ::Vector, + ::Vector, + ::Vector, + ::Vector, + ::Vector, + ::Vector, + ::Vector, + ::Vector, + ), + ) -> ( + PackedStride, + PackedStride, + PackedStride, + PackedStride, + PackedStride, + PackedStride, + PackedStride, + PackedStride, + PackedStride, + ); } -impl Stride for A where A : SIMDArray { +impl Stride for A +where + A: SIMDArray, +{ #[inline(always)] fn stride_into<'s, C>(&'s self, count: usize, default: &[::Vector]) -> C where @@ -118,169 +179,218 @@ impl Stride for A where A : SIMDArray { A: 's, { assert!(default.len() == count); - (0..count).map(move |offset| { - PackedStride { + (0..count) + .map(move |offset| PackedStride { iter: self, pos: offset, base: offset, stride: count, - default: unsafe { *default.get_unchecked(offset) } - } - }).collect() + default: unsafe { *default.get_unchecked(offset) }, + }) + .collect() } #[inline(always)] - fn stride_two(&self, default: (::Vector, ::Vector)) -> (PackedStride, PackedStride) { + fn stride_two( + &self, + default: (::Vector, ::Vector), + ) -> (PackedStride, PackedStride) { ( PackedStride { iter: self, pos: 0, base: 0, stride: 2, - default: default.0 + default: default.0, }, PackedStride { iter: self, pos: 1, base: 1, stride: 2, - default: default.1 - } + default: default.1, + }, ) } #[inline(always)] - fn stride_three(&self, default: (::Vector, ::Vector , ::Vector)) -> (PackedStride, PackedStride , PackedStride) { + fn stride_three( + &self, + default: ( + ::Vector, + ::Vector, + ::Vector, + ), + ) -> (PackedStride, PackedStride, PackedStride) { ( PackedStride { iter: self, pos: 0, base: 0, stride: 3, - default: default.0 + default: default.0, }, PackedStride { iter: self, pos: 1, base: 1, stride: 3, - default: default.1 + default: default.1, }, PackedStride { iter: self, pos: 2, base: 2, stride: 3, - default: default.2 - } + default: default.2, + }, ) } #[inline(always)] - fn stride_four(&self, default: (::Vector, ::Vector, ::Vector, ::Vector)) -> (PackedStride, PackedStride, PackedStride, PackedStride) { + fn stride_four( + &self, + default: ( + ::Vector, + ::Vector, + ::Vector, + ::Vector, + ), + ) -> ( + PackedStride, + PackedStride, + PackedStride, + PackedStride, + ) { ( PackedStride { iter: self, pos: 0, base: 0, stride: 4, - default: default.0 + default: default.0, }, PackedStride { iter: self, pos: 1, base: 1, stride: 4, - default: default.1 + default: default.1, }, PackedStride { iter: self, pos: 2, base: 2, stride: 4, - default: default.2 + default: default.2, }, PackedStride { iter: self, pos: 3, base: 3, stride: 4, - default: default.3 - } + default: default.3, + }, ) } #[inline(always)] - fn stride_nine(&self, default: (::Vector, ::Vector, ::Vector, ::Vector, ::Vector, ::Vector, ::Vector, ::Vector, ::Vector)) -> (PackedStride, PackedStride, PackedStride, PackedStride, PackedStride, PackedStride, PackedStride, PackedStride, PackedStride) { + fn stride_nine( + &self, + default: ( + ::Vector, + ::Vector, + ::Vector, + ::Vector, + ::Vector, + ::Vector, + ::Vector, + ::Vector, + ::Vector, + ), + ) -> ( + PackedStride, + PackedStride, + PackedStride, + PackedStride, + PackedStride, + PackedStride, + PackedStride, + PackedStride, + PackedStride, + ) { ( PackedStride { iter: self, pos: 0, base: 0, stride: 9, - default: default.0 + default: default.0, }, PackedStride { iter: self, pos: 1, base: 1, stride: 9, - default: default.1 + default: default.1, }, PackedStride { iter: self, pos: 2, base: 2, stride: 9, - default: default.2 + default: default.2, }, PackedStride { iter: self, pos: 3, base: 3, stride: 9, - default: default.3 + default: default.3, }, PackedStride { iter: self, pos: 4, base: 4, stride: 9, - default: default.4 + default: default.4, }, PackedStride { iter: self, pos: 5, base: 5, stride: 9, - default: default.5 + default: default.5, }, PackedStride { iter: self, pos: 6, base: 6, stride: 9, - default: default.6 + default: default.6, }, PackedStride { iter: self, pos: 7, base: 7, stride: 9, - default: default.7 + default: default.7, }, PackedStride { iter: self, pos: 8, base: 8, stride: 9, - default: default.8 - } + default: default.8, + }, ) } } -impl<'a, A> SIMDObject for PackedStride<'a, A> where A : SIMDArray { +impl<'a, A> SIMDObject for PackedStride<'a, A> +where + A: SIMDArray, +{ type Scalar = ::Scalar; type Vector = ::Vector; @@ -290,7 +400,10 @@ impl<'a, A> SIMDObject for PackedStride<'a, A> where A : SIMDArray { } } -impl<'a, A> SIMDArray for PackedStride<'a, A> where A : SIMDArray { +impl<'a, A> SIMDArray for PackedStride<'a, A> +where + A: SIMDArray, +{ #[inline(always)] fn load(&self, offset: usize) -> Self::Vector { assert!(self.base + self.stride * (offset + (self.width() - 1)) < self.iter.scalar_len()); @@ -303,7 +416,11 @@ impl<'a, A> SIMDArray for PackedStride<'a, A> where A : SIMDArray { let mut ret = ::Vector::default(); for i in 0..self.width() { - ret = ret.replace(i, self.iter.load_scalar_unchecked(self.base + self.stride * (offset + i))); + ret = ret.replace( + i, + self.iter + .load_scalar_unchecked(self.base + self.stride * (offset + i)), + ); } ret } @@ -315,18 +432,25 @@ impl<'a, A> SIMDArray for PackedStride<'a, A> where A : SIMDArray { #[inline(always)] unsafe fn load_scalar_unchecked(&self, offset: usize) -> Self::Scalar { - self.iter.load_scalar_unchecked(self.base + offset * self.stride) + self.iter + .load_scalar_unchecked(self.base + offset * self.stride) } } -impl<'a, A> SIMDSized for PackedStride<'a, A> where A : SIMDArray { +impl<'a, A> SIMDSized for PackedStride<'a, A> +where + A: SIMDArray, +{ #[inline(always)] fn scalar_len(&self) -> usize { self.iter.scalar_len() / self.stride } } -impl<'a, A> SIMDIterable for PackedStride<'a, A> where A : SIMDArray { +impl<'a, A> SIMDIterable for PackedStride<'a, A> +where + A: SIMDArray, +{ #[inline(always)] fn scalar_pos(&self) -> usize { (self.pos - self.base) / self.stride @@ -350,8 +474,8 @@ impl<'a, A> SIMDIterable for PackedStride<'a, A> where A : SIMDArray { #[cfg(test)] mod tests { - use super::super::*; use super::super::zip::*; + use super::super::*; use super::*; diff --git a/src/stride_zip.rs b/src/stride_zip.rs index eed06d3..86ddb0d 100644 --- a/src/stride_zip.rs +++ b/src/stride_zip.rs @@ -1,32 +1,46 @@ -use crate::iters::{SIMDIterator}; -use crate::vecs::{Packed, Packable}; use crate::intrin::destride::*; +use crate::iters::SIMDIterator; +use crate::vecs::{Packable, Packed}; use crate::zip::{SIMDZippedIterable, SIMDZippedIterator, SIMDZippedObject}; -pub struct StrideZip where T : SIMDIterator, T::Vector : Destride { +pub struct StrideZip +where + T: SIMDIterator, + T::Vector: Destride, +{ base: usize, peek: Option, - iter: T + iter: T, } /// A trait which can transform a collection of iterators into a `Zip` -pub trait IntoStrideZip : Sized { +pub trait IntoStrideZip: Sized { /// Return an iterator which may iterate over `self` in lockstep. fn stride_zip(self) -> StrideZip - where Self : SIMDIterator, Self::Vector : Destride; + where + Self: SIMDIterator, + Self::Vector: Destride; } -impl IntoStrideZip for T where T : SIMDIterator, T::Vector : Destride { +impl IntoStrideZip for T +where + T: SIMDIterator, + T::Vector: Destride, +{ fn stride_zip(self) -> StrideZip { StrideZip { base: self.scalar_pos(), peek: None, - iter: self + iter: self, } } } -impl SIMDZippedObject for StrideZip where T : SIMDIterator, T::Vector : Destride { +impl SIMDZippedObject for StrideZip +where + T: SIMDIterator, + T::Vector: Destride, +{ type Scalars = (T::Scalar, T::Scalar); type Vectors = (T::Vector, T::Vector); @@ -43,14 +57,22 @@ impl SIMDZippedObject for StrideZip where T : SIMDIterator, T::Vector : De } } -impl ExactSizeIterator for StrideZip where T : SIMDIterator, T::Vector : Destride { +impl ExactSizeIterator for StrideZip +where + T: SIMDIterator, + T::Vector: Destride, +{ #[inline(always)] fn len(&self) -> usize { self.iter.len() / 2 } } -impl SIMDZippedIterable for StrideZip where T : SIMDIterator, T::Vector : Destride { +impl SIMDZippedIterable for StrideZip +where + T: SIMDIterator, + T::Vector: Destride, +{ #[inline(always)] fn scalar_pos(&self) -> usize { (self.iter.scalar_pos() - self.base) / 2 @@ -77,9 +99,13 @@ impl SIMDZippedIterable for StrideZip where T : SIMDIterator, T::Vector : } } -impl Iterator for StrideZip where T : SIMDIterator, T::Vector : Destride { +impl Iterator for StrideZip +where + T: SIMDIterator, + T::Vector: Destride, +{ type Item = ::Vectors; - + fn next(&mut self) -> Option { let first = self.iter.next()?; let second = self.iter.next(); @@ -92,7 +118,11 @@ impl Iterator for StrideZip where T : SIMDIterator, T::Vector : Destride { } } -impl SIMDZippedIterator for StrideZip where T : SIMDIterator, T::Vector : Destride { +impl SIMDZippedIterator for StrideZip +where + T: SIMDIterator, + T::Vector: Destride, +{ fn end(&mut self) -> Option<(Self::Vectors, usize)> { let first = self.iter.next(); let (end, n) = self.iter.end().unwrap_or((self.iter.default(), 0)); diff --git a/src/vec_patterns.rs b/src/vec_patterns.rs index 3e9256e..5bc83ef 100644 --- a/src/vec_patterns.rs +++ b/src/vec_patterns.rs @@ -7,10 +7,10 @@ // This file is machine-generated. See vec_patterns_gen.py for more info. -use crate::vecs::*; - +use crate::vecs::*; + /// Constructors which may be used to instantiate vectors with patterned data. -pub trait Pattern : Packed { +pub trait Pattern: Packed { /// Return a vector whose first `Self::WIDTH / 2` elements are `hi`, and /// whose last `Self::WIDTH / 2` elements are `lo`. fn halfs(hi: Self::Scalar, lo: Self::Scalar) -> Self; diff --git a/src/vecs.rs b/src/vecs.rs index f9b0241..8d6886b 100644 --- a/src/vecs.rs +++ b/src/vecs.rs @@ -6,14 +6,14 @@ // file, You can obtain one at http://mozilla.org/MPL/2.0/. #![allow(dead_code)] -pub use crate::vec_patterns::Pattern; use crate::core::fmt::Debug; use crate::intrin::merge::*; +pub use crate::vec_patterns::Pattern; /// A SIMD vector of some type. -pub trait Packed : Sized + Copy + Debug + Merge { +pub trait Packed: Sized + Copy + Debug + Merge { /// The type which fits into this SIMD vector - type Scalar : Packable; + type Scalar: Packable; /// The number of elements in this vector const WIDTH: usize; @@ -70,12 +70,16 @@ pub trait Packed : Sized + Copy + Debug + Merge { /// Return the result of a scalar reduction over this vector fn scalar_reduce(&self, acc: T, func: F) -> T - where F: FnMut(T, Self::Scalar) -> T; + where + F: FnMut(T, Self::Scalar) -> T; } /// A type that may be packed into a SIMD vector. -pub trait Packable where Self : Sized + Copy + Debug { - type Vector : Packed + Clone; +pub trait Packable +where + Self: Sized + Copy + Debug, +{ + type Vector: Packed + Clone; const SIZE: usize; } @@ -191,4 +195,3 @@ macro_rules! impl_packed { } ); } - diff --git a/src/zip.rs b/src/zip.rs index 960e2c4..7c8160d 100644 --- a/src/zip.rs +++ b/src/zip.rs @@ -5,8 +5,8 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -use crate::iters::{SIMDIterator, SIMDIterable, SIMDObject, UnsafeIterator, SIMDSized}; -use crate::vecs::{Packed, Packable}; +use crate::iters::{SIMDIterable, SIMDIterator, SIMDObject, SIMDSized, UnsafeIterator}; +use crate::vecs::{Packable, Packed}; /// A macro which takes a number n and an expression, and returns a tuple /// containing n copies of the expression. Only works for numbers less than or @@ -22,41 +22,69 @@ use crate::vecs::{Packed, Packable}; /// assert_eq!(tuplify!(3, i8s::splat(0)), (i8s::splat(0), i8s::splat(0), i8s::splat(0))); /// # } /// ``` -#[macro_export] macro_rules! tuplify { - (1, $i:expr) => { ($i) }; - (2, $i:expr) => { ($i, $i) }; - (3, $i:expr) => { ($i, $i, $i) }; - (4, $i:expr) => { ($i, $i, $i, $i) }; - (5, $i:expr) => { ($i, $i, $i, $i, $i) }; - (6, $i:expr) => { ($i, $i, $i, $i, $i, $i) }; - (7, $i:expr) => { ($i, $i, $i, $i, $i, $i, $i) }; - (8, $i:expr) => { ($i, $i, $i, $i, $i, $i, $i, $i) }; - (9, $i:expr) => { ($i, $i, $i, $i, $i, $i, $i, $i, $i) }; - (10, $i:expr) => { ($i, $i, $i, $i, $i, $i, $i, $i, $i, $i) }; - (11, $i:expr) => { ($i, $i, $i, $i, $i, $i, $i, $i, $i, $i, $i) }; - (12, $i:expr) => { ($i, $i, $i, $i, $i, $i, $i, $i, $i, $i, $i, $i) }; +#[macro_export] +macro_rules! tuplify { + (1, $i:expr) => { + ($i) + }; + (2, $i:expr) => { + ($i, $i) + }; + (3, $i:expr) => { + ($i, $i, $i) + }; + (4, $i:expr) => { + ($i, $i, $i, $i) + }; + (5, $i:expr) => { + ($i, $i, $i, $i, $i) + }; + (6, $i:expr) => { + ($i, $i, $i, $i, $i, $i) + }; + (7, $i:expr) => { + ($i, $i, $i, $i, $i, $i, $i) + }; + (8, $i:expr) => { + ($i, $i, $i, $i, $i, $i, $i, $i) + }; + (9, $i:expr) => { + ($i, $i, $i, $i, $i, $i, $i, $i, $i) + }; + (10, $i:expr) => { + ($i, $i, $i, $i, $i, $i, $i, $i, $i, $i) + }; + (11, $i:expr) => { + ($i, $i, $i, $i, $i, $i, $i, $i, $i, $i, $i) + }; + (12, $i:expr) => { + ($i, $i, $i, $i, $i, $i, $i, $i, $i, $i, $i, $i) + }; } /// A lazy iterator which returns tuples of the elements of its contained /// iterators. pub struct Zip { - iters: T + iters: T, } /// A lazy mapping iterator which applies its function to a stream of tuples of /// vectors. -pub struct SIMDZipMap where I : SIMDZippedIterator { +pub struct SIMDZipMap +where + I: SIMDZippedIterator, +{ iter: I, func: F, } /// A trait which can transform a collection of iterators into a `Zip` -pub trait IntoSIMDZip : Sized { +pub trait IntoSIMDZip: Sized { /// Return an iterator which may iterate over `self` in lockstep. fn zip(self) -> Zip; } -pub trait SIMDZippedObject : Sized { +pub trait SIMDZippedObject: Sized { type Scalars; type Vectors; @@ -69,7 +97,9 @@ pub trait SIMDZippedObject : Sized { /// An iterator which automatically packs the values it iterates over into SIMD /// vectors. -pub trait SIMDZippedIterable : SIMDZippedObject + ExactSizeIterator::Vectors> { +pub trait SIMDZippedIterable: + SIMDZippedObject + ExactSizeIterator::Vectors> +{ /// Return the current position of this iterator, measured in scalars fn scalar_pos(&self) -> usize; @@ -127,7 +157,7 @@ pub trait SIMDZippedIterable : SIMDZippedObject + ExactSizeIterator(self, func: F) -> SIMDZipMap - where F : FnMut(Self::Vectors) -> A, A : Packed, B : Packable { + where + F: FnMut(Self::Vectors) -> A, + A: Packed, + B: Packable, + { SIMDZipMap { iter: self, func: func, @@ -147,7 +181,9 @@ pub trait SIMDZippedIterator : SIMDZippedIterable { /// modifying the iterator. #[inline(always)] fn simd_do_each(&mut self, mut func: F) - where F : FnMut(Self::Vectors) -> () { + where + F: FnMut(Self::Vectors) -> (), + { while let Some(v) = self.next() { func(v); } @@ -208,8 +244,9 @@ pub trait SIMDZippedIterator : SIMDZippedIterable { /// [`Packed::product`]: vecs/trait.Packed.html#tymethod.product #[inline(always)] fn simd_reduce(&mut self, mut start: A, mut func: F) -> A - where F : FnMut(A, Self::Vectors) -> A { - + where + F: FnMut(A, Self::Vectors) -> A, + { while let Some(v) = self.next() { start = func(start, v); } @@ -303,7 +340,11 @@ macro_rules! impl_iter_zip { } impl Iterator for SIMDZipMap - where I : SIMDZippedIterator, F : FnMut(I::Vectors) -> A, A : Packed { +where + I: SIMDZippedIterator, + F: FnMut(I::Vectors) -> A, + A: Packed, +{ type Item = A; #[inline(always)] @@ -313,7 +354,11 @@ impl Iterator for SIMDZipMap } impl ExactSizeIterator for SIMDZipMap - where I : SIMDZippedIterator, F : FnMut(I::Vectors) -> A, A : Packed { +where + I: SIMDZippedIterator, + F: FnMut(I::Vectors) -> A, + A: Packed, +{ #[inline(always)] fn len(&self) -> usize { self.iter.len() @@ -321,13 +366,21 @@ impl ExactSizeIterator for SIMDZipMap } impl SIMDObject for SIMDZipMap - where I : SIMDZippedIterator, F : FnMut(I::Vectors) -> A, A : Packed { +where + I: SIMDZippedIterator, + F: FnMut(I::Vectors) -> A, + A: Packed, +{ type Vector = A; type Scalar = A::Scalar; } impl SIMDSized for SIMDZipMap - where I : SIMDZippedIterator, F : FnMut(I::Vectors) -> A, A : Packed { +where + I: SIMDZippedIterator, + F: FnMut(I::Vectors) -> A, + A: Packed, +{ /// Return the length of this iterator, measured in scalars. #[inline(always)] fn scalar_len(&self) -> usize { @@ -342,7 +395,11 @@ impl SIMDSized for SIMDZipMap } impl SIMDIterable for SIMDZipMap - where I : SIMDZippedIterator, F : FnMut(I::Vectors) -> A, A : Packed { +where + I: SIMDZippedIterator, + F: FnMut(I::Vectors) -> A, + A: Packed, +{ #[inline(always)] fn scalar_pos(&self) -> usize { self.iter.scalar_pos() @@ -361,7 +418,11 @@ impl SIMDIterable for SIMDZipMap } impl SIMDIterator for SIMDZipMap -where I : SIMDZippedIterator, F : FnMut(I::Vectors) -> A, A : Packed { +where + I: SIMDZippedIterator, + F: FnMut(I::Vectors) -> A, + A: Packed, +{ #[inline(always)] fn end(&mut self) -> Option<(Self::Vector, usize)> { let (v, n) = self.iter.end()?; @@ -370,39 +431,47 @@ where I : SIMDZippedIterator, F : FnMut(I::Vectors) -> A, A : Packed { } } -impl_iter_zip!((A, B), - (AA, BB), - (1)); -impl_iter_zip!((A, B, C), - (AA, BB, CC), - (1, 2)); -impl_iter_zip!((A, B, C, D), - (AA, BB, CC, DD), - (1, 2, 3)); -impl_iter_zip!((A, B, C, D, E), - (AA, BB, CC, DD, EE), - (1, 2, 3, 4)); -impl_iter_zip!((A, B, C, D, E, F), - (AA, BB, CC, DD, EE, FF), - (1, 2, 3, 4, 5)); -impl_iter_zip!((A, B, C, D, E, F, G), - (AA, BB, CC, DD, EE, FF, GG), - (1, 2, 3, 4, 5, 6)); -impl_iter_zip!((A, B, C, D, E, F, G, H), - (AA, BB, CC, DD, EE, FF, GG, HH), - (1, 2, 3, 4, 5, 6, 7)); -impl_iter_zip!((A, B, C, D, E, F, G, H, I), - (AA, BB, CC, DD, EE, FF, GG, HH, II), - (1, 2, 3, 4, 5, 6, 7, 8)); -impl_iter_zip!((A, B, C, D, E, F, G, H, I, J), - (AA, BB, CC, DD, EE, FF, GG, HH, II, JJ), - (1, 2, 3, 4, 5, 6, 7, 8, 9)); -impl_iter_zip!((A, B, C, D, E, F, G, H, I, J, K), - (AA, BB, CC, DD, EE, FF, GG, HH, II, JJ, KK), - (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)); -impl_iter_zip!((A, B, C, D, E, F, G, H, I, J, K, L), - (AA, BB, CC, DD, EE, FF, GG, HH, II, JJ, KK, LL), - (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)); -impl_iter_zip!((A, B, C, D, E, F, G, H, I, J, K, L, M), - (AA, BB, CC, DD, EE, FF, GG, HH, II, JJ, KK, LL, MM), - (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)); +impl_iter_zip!((A, B), (AA, BB), (1)); +impl_iter_zip!((A, B, C), (AA, BB, CC), (1, 2)); +impl_iter_zip!((A, B, C, D), (AA, BB, CC, DD), (1, 2, 3)); +impl_iter_zip!((A, B, C, D, E), (AA, BB, CC, DD, EE), (1, 2, 3, 4)); +impl_iter_zip!( + (A, B, C, D, E, F), + (AA, BB, CC, DD, EE, FF), + (1, 2, 3, 4, 5) +); +impl_iter_zip!( + (A, B, C, D, E, F, G), + (AA, BB, CC, DD, EE, FF, GG), + (1, 2, 3, 4, 5, 6) +); +impl_iter_zip!( + (A, B, C, D, E, F, G, H), + (AA, BB, CC, DD, EE, FF, GG, HH), + (1, 2, 3, 4, 5, 6, 7) +); +impl_iter_zip!( + (A, B, C, D, E, F, G, H, I), + (AA, BB, CC, DD, EE, FF, GG, HH, II), + (1, 2, 3, 4, 5, 6, 7, 8) +); +impl_iter_zip!( + (A, B, C, D, E, F, G, H, I, J), + (AA, BB, CC, DD, EE, FF, GG, HH, II, JJ), + (1, 2, 3, 4, 5, 6, 7, 8, 9) +); +impl_iter_zip!( + (A, B, C, D, E, F, G, H, I, J, K), + (AA, BB, CC, DD, EE, FF, GG, HH, II, JJ, KK), + (1, 2, 3, 4, 5, 6, 7, 8, 9, 10) +); +impl_iter_zip!( + (A, B, C, D, E, F, G, H, I, J, K, L), + (AA, BB, CC, DD, EE, FF, GG, HH, II, JJ, KK, LL), + (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) +); +impl_iter_zip!( + (A, B, C, D, E, F, G, H, I, J, K, L, M), + (AA, BB, CC, DD, EE, FF, GG, HH, II, JJ, KK, LL, MM), + (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12) +); diff --git a/tests/iters.rs b/tests/iters.rs index 674d7b8..55f56c4 100644 --- a/tests/iters.rs +++ b/tests/iters.rs @@ -14,8 +14,7 @@ mod tests { vec.simd_iter_mut(f32s(0.0)) .simd_for_each(|x| *x /= f32s(2f32)); - scl.iter_mut() - .for_each(|x| *x /= 2f32); + scl.iter_mut().for_each(|x| *x /= 2f32); assert_eq!(vec, scl); }; @@ -33,7 +32,10 @@ mod tests { #[test] fn simd_reduce() { let vec = [2u32; 129]; - let sum = vec.simd_iter(u32s(0u32)).simd_reduce(u32s(0u32), |acc, x| acc + x).sum(); + let sum = vec + .simd_iter(u32s(0u32)) + .simd_reduce(u32s(0u32), |acc, x| acc + x) + .sum(); assert_eq!(sum, 2 * 129); } } diff --git a/tests/kernel.rs b/tests/kernel.rs index 931f968..90ee342 100644 --- a/tests/kernel.rs +++ b/tests/kernel.rs @@ -5,7 +5,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -#![feature(test,stdsimd)] +#![feature(test, stdsimd)] extern crate faster; @@ -14,19 +14,21 @@ mod tests { use faster::*; macro_rules! kernel_definite { - ($name:ident, $native_type:ty, $simd_type:ident) => ( - + ($name:ident, $native_type:ty, $simd_type:ident) => { /// Tests a number of simple kernel computations with integer values. #[test] fn $name() { - for n in 0 .. 16 { - + for n in 0..16 { let vec_of_1 = vec![1 as $native_type; n]; let vec_of_3 = vec![3 as $native_type; n]; let mut out_vec = vec![0 as $native_type; n]; // Should produce n times (3 - 1) * (3 - 1) == n * 4 for each element - let sum: $native_type = ((&vec_of_3[..]).simd_iter($simd_type(0)), (&vec_of_1[..]).simd_iter($simd_type(0))).zip() + let sum: $native_type = ( + (&vec_of_3[..]).simd_iter($simd_type(0)), + (&vec_of_1[..]).simd_iter($simd_type(0)), + ) + .zip() .simd_map(|(a, b)| (a - b) * (a - b)) .scalar_fill(&mut out_vec) .iter() @@ -35,7 +37,11 @@ mod tests { assert_eq!(sum, (n * 4) as $native_type); // Same as above, but this time we reduce with simd_reduce - let sum: $native_type = ((&vec_of_3[..]).simd_iter($simd_type(0)), (&vec_of_1[..]).simd_iter($simd_type(0))).zip() + let sum: $native_type = ( + (&vec_of_3[..]).simd_iter($simd_type(0)), + (&vec_of_1[..]).simd_iter($simd_type(0)), + ) + .zip() .simd_map(|(a, b)| (a - b) * (a - b)) .simd_reduce($simd_type(0), |a, v| a + v) .sum(); @@ -43,7 +49,7 @@ mod tests { assert_eq!(sum, (n * 4) as $native_type); } } - ) + }; } kernel_definite!(kernel_i64, i64, i64s); @@ -57,24 +63,26 @@ mod tests { kernel_definite!(kernel_u8, u8, u8s); macro_rules! kernel_relative { - ($name:ident, $native_type:ty, $simd_type:ident) => ( - + ($name:ident, $native_type:ty, $simd_type:ident) => { /// Tests a number of simple kernel computations with float values. #[test] fn $name() { - for n in 0 .. 16 { + for n in 0..16 { let vec_of_1 = vec![1 as $native_type; n]; let vec_of_3 = vec![3 as $native_type; n]; // Should produce n times (1 - 3) * (1 - 3) == n * 4 for each element - let sum_scalar: $native_type = vec_of_1.iter() + let sum_scalar: $native_type = vec_of_1 + .iter() .zip(vec_of_3.iter()) .map(|(a, b)| (a - b) * (a - b)) .sum(); // Same as above, but this time we reduce with simd_reduce - let sum_simd: $native_type = (vec_of_1.simd_iter($simd_type(0.0 as $native_type)), - vec_of_3.simd_iter($simd_type(0.0 as $native_type))) + let sum_simd: $native_type = ( + vec_of_1.simd_iter($simd_type(0.0 as $native_type)), + vec_of_3.simd_iter($simd_type(0.0 as $native_type)), + ) .zip() .simd_map(|(a, b)| (a - b) * (a - b)) .simd_reduce($simd_type(0.0 as $native_type), |a, v| a + v) @@ -87,7 +95,7 @@ mod tests { assert!((sum_simd - (n * 4) as $native_type).abs() < 0.0001); } } - ) + }; } kernel_relative!(kernel_f32, f32, f32s); diff --git a/tests/zip.rs b/tests/zip.rs index 1f3b345..e9668f6 100644 --- a/tests/zip.rs +++ b/tests/zip.rs @@ -17,8 +17,15 @@ mod tests { #[test] #[cfg(feature = "std")] fn zipped_stride_iters() { - let matrices = [1i16, 2, 3, 4, 5, 6, 7, 8, 9][..].iter().cycle().take(9 * 100).map(|i| i.clone()).collect::>(); - let determinants = (&matrices[..]).stride_nine(tuplify!(9, i16s(0))).zip() + let matrices = [1i16, 2, 3, 4, 5, 6, 7, 8, 9][..] + .iter() + .cycle() + .take(9 * 100) + .map(|i| i.clone()) + .collect::>(); + let determinants = (&matrices[..]) + .stride_nine(tuplify!(9, i16s(0))) + .zip() .simd_map(|(a, b, c, d, e, f, g, h, i)| { assert_eq!(a.extract(a.width() - 1), 1); assert_eq!(b.extract(b.width() - 1), 2); @@ -30,49 +37,71 @@ mod tests { assert_eq!(h.extract(h.width() - 1), 8); assert_eq!(i.extract(i.width() - 1), 9); (a * e * i) + (b * f * g) + (c * d * h) - (c * e * g) - (b * d * i) - (a * f * h) - }).scalar_collect(); + }) + .scalar_collect(); assert!(determinants.iter().fold(true, |acc, x| acc && x == &0)); - let matrices = [1i64, 0, 0, 0, 5, 4, 2, 3, 0][..].iter().cycle().take(9 * 100).map(|i| i.clone()).collect::>(); - let determinants = (&matrices[..]).stride_nine(tuplify!(9, i64s(0))).zip() + let matrices = [1i64, 0, 0, 0, 5, 4, 2, 3, 0][..] + .iter() + .cycle() + .take(9 * 100) + .map(|i| i.clone()) + .collect::>(); + let determinants = (&matrices[..]) + .stride_nine(tuplify!(9, i64s(0))) + .zip() .simd_map(|(a, b, c, d, e, f, g, h, i)| { (a * e * i) + (b * f * g) + (c * d * h) - (c * e * g) - (b * d * i) - (a * f * h) - }).scalar_collect(); - assert!(determinants.iter().fold(true, |acc, x| { acc && x == &-12 })); + }) + .scalar_collect(); + assert!(determinants + .iter() + .fold(true, |acc, x| { acc && x == &-12 })); } #[test] #[cfg(feature = "std")] fn zipped_heterogeneous_iters() { - let to_stride = [1i8, 2, 3, 4, 5, 6, 7, 8][..].iter().cycle().take(512).map(|i| i.clone()).collect::>(); + let to_stride = [1i8, 2, 3, 4, 5, 6, 7, 8][..] + .iter() + .cycle() + .take(512) + .map(|i| i.clone()) + .collect::>(); let (a, b) = to_stride.stride_two(tuplify!(2, i8s(0))); - let standard_iter_a = vec!(3i8; 256).into_simd_iter(i8s(0)); - let standard_iter_b = vec!(7i8; 256).into_simd_iter(i8s(0)); + let standard_iter_a = vec![3i8; 256].into_simd_iter(i8s(0)); + let standard_iter_b = vec![7i8; 256].into_simd_iter(i8s(0)); - let a_times_three = (a, standard_iter_a).zip() + let a_times_three = (a, standard_iter_a) + .zip() .simd_map(|(s, c)| s * c) .scalar_collect(); - let b_times_three = (b, standard_iter_b).zip() + let b_times_three = (b, standard_iter_b) + .zip() .simd_map(|(s, c)| s * c) .scalar_collect(); let a_times_three_check = to_stride.chunks(2).map(|c| c[0] * 3); let b_times_three_check = to_stride.chunks(2).map(|c| c[1] * 7); - assert!(a_times_three_check.zip(a_times_three) - .fold(true, |acc, (a, b)| acc && a == b)); + assert!(a_times_three_check + .zip(a_times_three) + .fold(true, |acc, (a, b)| acc && a == b)); - assert!(b_times_three_check.zip(b_times_three) - .fold(true, |acc, (a, b)| acc && a == b)); + assert!(b_times_three_check + .zip(b_times_three) + .fold(true, |acc, (a, b)| acc && a == b)); } #[test] fn zip_simd_reduce() { let vec1 = [2u32; 129]; let vec2 = [3u32; 129]; - let result = (vec1.simd_iter(u32s(0u32)), vec2.simd_iter(u32s(0u32))).zip().simd_reduce(u32s(0u32), |acc, (x, y)| acc + x * y).sum(); + let result = (vec1.simd_iter(u32s(0u32)), vec2.simd_iter(u32s(0u32))) + .zip() + .simd_reduce(u32s(0u32), |acc, (x, y)| acc + x * y) + .sum(); assert_eq!(result, 2 * 3 * 129); } - }