From 74c165116d76e7b591194d1cc7fd5d4eeae22251 Mon Sep 17 00:00:00 2001 From: james7132 Date: Wed, 20 Mar 2024 22:42:09 -0700 Subject: [PATCH 1/6] Add support for AVX --- .github/workflows/rust.yml | 2 +- src/block/avx.rs | 88 ++++++++++++++++++++++++++++++++ src/block/avx2.rs | 21 ++------ src/block/default.rs | 19 +------ src/block/mod.rs | 66 ++++++++++++++++-------- src/block/sse2.rs | 17 +----- src/block/{wasm32.rs => wasm.rs} | 19 +------ src/lib.rs | 2 +- 8 files changed, 143 insertions(+), 91 deletions(-) create mode 100644 src/block/avx.rs rename src/block/{wasm32.rs => wasm.rs} (69%) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 367a703..d5a8653 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -17,7 +17,7 @@ jobs: strategy: matrix: rust: [stable, nightly] - features: ["+avx2", "+sse2"] + features: ["+avx2", "+avx", "+sse2,+sse4.1", "+sse2"] env: RUSTFLAGS: "-C target-feature=${{matrix.features}} -D warnings" steps: diff --git a/src/block/avx.rs b/src/block/avx.rs new file mode 100644 index 0000000..a097eb3 --- /dev/null +++ b/src/block/avx.rs @@ -0,0 +1,88 @@ +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; +use core::ops::{BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not}; + +#[derive(Copy, Clone, Debug)] +#[repr(transparent)] +pub struct Block(pub(super) __m256d); + +impl Block { + #[inline] + pub fn is_empty(self) -> bool { + unsafe { _mm256_testz_pd(self.0, self.0) == 1 } + } + + #[inline] + pub fn andnot(self, other: Self) -> Self { + unsafe { Self(_mm256_andnot_pd(other.0, self.0)) } + } +} + +impl Not for Block { + type Output = Block; + #[inline] + fn not(self) -> Self::Output { + unsafe { Self(_mm256_xor_pd(self.0, Self::ALL.0)) } + } +} + +impl BitAnd for Block { + type Output = Block; + #[inline] + fn bitand(self, other: Self) -> Self::Output { + unsafe { Self(_mm256_and_pd(self.0, other.0)) } + } +} + +impl BitAndAssign for Block { + #[inline] + fn bitand_assign(&mut self, other: Self) { + unsafe { + self.0 = _mm256_and_pd(self.0, other.0); + } + } +} + +impl BitOr for Block { + type Output = Block; + #[inline] + fn bitor(self, other: Self) -> Self::Output { + unsafe { Self(_mm256_or_pd(self.0, other.0)) } + } +} + +impl BitOrAssign for Block { + #[inline] + fn bitor_assign(&mut self, other: Self) { + unsafe { + self.0 = _mm256_or_pd(self.0, other.0); + } + } +} + +impl BitXor for Block { + type Output = Block; + #[inline] + fn bitxor(self, other: Self) -> Self::Output { + unsafe { Self(_mm256_xor_pd(self.0, other.0)) } + } +} + +impl BitXorAssign for Block { + #[inline] + fn bitxor_assign(&mut self, other: Self) { + unsafe { self.0 = _mm256_xor_pd(self.0, other.0) } + } +} + +impl PartialEq for Block { + #[inline] + fn eq(&self, other: &Self) -> bool { + unsafe { + let neq = _mm256_xor_pd(self.0, other.0); + _mm256_testz_pd(neq, neq) == 1 + } + } +} diff --git a/src/block/avx2.rs b/src/block/avx2.rs index 4258a5a..b359377 100644 --- a/src/block/avx2.rs +++ b/src/block/avx2.rs @@ -6,24 +6,9 @@ use core::ops::{BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, #[derive(Copy, Clone, Debug)] #[repr(transparent)] -pub struct Block(__m256i); +pub struct Block(pub(super) __m256i); impl Block { - pub const USIZE_COUNT: usize = core::mem::size_of::() / core::mem::size_of::(); - pub const NONE: Self = Self::from_usize_array([0; Self::USIZE_COUNT]); - pub const ALL: Self = Self::from_usize_array([core::usize::MAX; Self::USIZE_COUNT]); - pub const BITS: usize = core::mem::size_of::() * 8; - - #[inline] - pub fn into_usize_array(self) -> [usize; Self::USIZE_COUNT] { - unsafe { core::mem::transmute(self.0) } - } - - #[inline] - pub const fn from_usize_array(array: [usize; Self::USIZE_COUNT]) -> Self { - Self(unsafe { core::mem::transmute(array) }) - } - #[inline] pub fn is_empty(self) -> bool { unsafe { _mm256_testz_si256(self.0, self.0) == 1 } @@ -96,8 +81,8 @@ impl PartialEq for Block { #[inline] fn eq(&self, other: &Self) -> bool { unsafe { - let eq = _mm256_cmpeq_epi8(self.0, other.0); - _mm256_movemask_epi8(eq) == !(0i32) + let neq = _mm256_xor_si256(self.0, other.0); + _mm256_testz_si256(neq, neq) == 1 } } } diff --git a/src/block/default.rs b/src/block/default.rs index 7545ad1..7fc460f 100644 --- a/src/block/default.rs +++ b/src/block/default.rs @@ -2,26 +2,9 @@ use core::ops::{BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, #[derive(Copy, Clone, PartialEq, Debug)] #[repr(transparent)] -pub struct Block(usize); +pub struct Block(pub(super) usize); impl Block { - pub const USIZE_COUNT: usize = 1; - pub const NONE: Self = Block(0); - #[allow(dead_code)] - pub const ALL: Self = Block(!0); - pub const BITS: usize = core::mem::size_of::() * 8; - - #[inline] - pub fn into_usize_array(self) -> [usize; Self::USIZE_COUNT] { - [self.0] - } - - #[inline] - #[allow(dead_code)] - pub const fn from_usize_array(array: [usize; Self::USIZE_COUNT]) -> Self { - Self(array[0]) - } - #[inline] pub const fn is_empty(self) -> bool { self.0 == Self::NONE.0 diff --git a/src/block/mod.rs b/src/block/mod.rs index 52158ce..f4f06de 100644 --- a/src/block/mod.rs +++ b/src/block/mod.rs @@ -4,40 +4,74 @@ use core::cmp::Ordering; use core::hash::{Hash, Hasher}; #[cfg(all( - not(target_arch = "wasm32"), + not(all(target_family = "wasm", target_feature = "simd128")), not(target_feature = "sse2"), + not(target_feature = "avx"), not(target_feature = "avx2"), ))] mod default; #[cfg(all( - not(target_arch = "wasm32"), + not(all(target_family = "wasm", target_feature = "simd128")), not(target_feature = "sse2"), + not(target_feature = "avx"), not(target_feature = "avx2"), ))] pub use self::default::*; #[cfg(all( - not(target_arch = "wasm32"), + any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2", + not(target_feature = "avx"), not(target_feature = "avx2"), ))] mod sse2; #[cfg(all( - not(target_arch = "wasm32"), + any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2", + not(target_feature = "avx"), not(target_feature = "avx2"), ))] pub use self::sse2::*; -#[cfg(all(not(target_arch = "wasm32"), target_feature = "avx2",))] +#[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "avx", + not(target_feature = "avx2") +))] +mod avx; +#[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "avx", + not(target_feature = "avx2") +))] +pub use self::avx::*; + +#[cfg(all(not(target_arch = "wasm32"), target_feature = "avx2"))] mod avx2; -#[cfg(all(not(target_arch = "wasm32"), target_feature = "avx2",))] +#[cfg(all(not(target_arch = "wasm32"), target_feature = "avx2"))] pub use self::avx2::*; -#[cfg(target_arch = "wasm32")] -mod wasm32; -#[cfg(target_arch = "wasm32")] -pub use self::wasm32::*; +#[cfg(all(target_family = "wasm", target_feature="simd128"))] +mod wasm; +#[cfg(all(target_arch = "wasm", target_feature="simd128"))] +pub use self::wasm::*; + +impl Block { + pub const USIZE_COUNT: usize = core::mem::size_of::() / core::mem::size_of::(); + pub const NONE: Self = Self::from_usize_array([0; Self::USIZE_COUNT]); + pub const ALL: Self = Self::from_usize_array([core::usize::MAX; Self::USIZE_COUNT]); + pub const BITS: usize = core::mem::size_of::() * 8; + + #[inline] + pub fn into_usize_array(self) -> [usize; Self::USIZE_COUNT] { + unsafe { core::mem::transmute(self.0) } + } + + #[inline] + pub const fn from_usize_array(array: [usize; Self::USIZE_COUNT]) -> Self { + Self(unsafe { core::mem::transmute(array) }) + } +} impl Eq for Block {} @@ -51,15 +85,7 @@ impl PartialOrd for Block { impl Ord for Block { #[inline] fn cmp(&self, other: &Self) -> Ordering { - let a = self.into_usize_array(); - let b = other.into_usize_array(); - for i in 0..Self::USIZE_COUNT { - match a[i].cmp(&b[i]) { - Ordering::Equal => continue, - cmp => return cmp, - } - } - Ordering::Equal + self.into_usize_array().cmp(&other.into_usize_array()) } } @@ -73,6 +99,6 @@ impl Default for Block { impl Hash for Block { #[inline] fn hash(&self, hasher: &mut H) { - self.into_usize_array().hash(hasher) + Hash::hash_slice(&self.into_usize_array(), hasher); } } diff --git a/src/block/sse2.rs b/src/block/sse2.rs index 6f61948..e68d49f 100644 --- a/src/block/sse2.rs +++ b/src/block/sse2.rs @@ -8,24 +8,9 @@ use core::ops::{BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, #[derive(Copy, Clone, Debug)] #[repr(transparent)] -pub struct Block(__m128i); +pub struct Block(pub(super) __m128i); impl Block { - pub const USIZE_COUNT: usize = core::mem::size_of::() / core::mem::size_of::(); - pub const NONE: Self = Self::from_usize_array([0; Self::USIZE_COUNT]); - pub const ALL: Self = Self::from_usize_array([core::usize::MAX; Self::USIZE_COUNT]); - pub const BITS: usize = core::mem::size_of::() * 8; - - #[inline] - pub fn into_usize_array(self) -> [usize; Self::USIZE_COUNT] { - unsafe { core::mem::transmute(self.0) } - } - - #[inline] - pub const fn from_usize_array(array: [usize; Self::USIZE_COUNT]) -> Self { - Self(unsafe { core::mem::transmute(array) }) - } - #[inline] pub fn is_empty(self) -> bool { #[cfg(not(target_feature = "sse4.1"))] diff --git a/src/block/wasm32.rs b/src/block/wasm.rs similarity index 69% rename from src/block/wasm32.rs rename to src/block/wasm.rs index 2dac999..cef686d 100644 --- a/src/block/wasm32.rs +++ b/src/block/wasm.rs @@ -8,24 +8,9 @@ use core::{ #[derive(Copy, Clone, Debug)] #[repr(transparent)] -pub struct Block(v128); +pub struct Block(pub(super) v128); impl Block { - pub const USIZE_COUNT: usize = core::mem::size_of::() / core::mem::size_of::(); - pub const NONE: Self = Self::from_usize_array([0; Self::USIZE_COUNT]); - pub const ALL: Self = Self::from_usize_array([core::usize::MAX; Self::USIZE_COUNT]); - pub const BITS: usize = core::mem::size_of::() * 8; - - #[inline] - pub fn into_usize_array(self) -> [usize; Self::USIZE_COUNT] { - unsafe { core::mem::transmute(self.0) } - } - - #[inline] - pub const fn from_usize_array(array: [usize; Self::USIZE_COUNT]) -> Self { - Self(unsafe { core::mem::transmute(array) }) - } - #[inline] pub fn is_empty(self) -> bool { !v128_any_true(self.0) @@ -33,7 +18,7 @@ impl Block { #[inline] pub fn andnot(self, other: Self) -> Self { - Self(unsafe { v128_andnot(self.0, other.0) }) + Self(v128_andnot(self.0, other.0)) } } diff --git a/src/lib.rs b/src/lib.rs index 7008147..1e19140 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,7 +12,7 @@ //! When SIMD is not available on the target, the crate will gracefully fallback to a default implementation. It is intended to add support for other SIMD architectures //! once they appear in stable Rust. //! -//! Currently only SSE2/AVX2 on x86/x86_64 and wasm32 SIMD are supported as this is what stable Rust supports. +//! Currently only SSE2/AVX/AVX2 on x86/x86_64 and wasm32 SIMD are supported as this is what stable Rust supports. #![no_std] #![deny(clippy::undocumented_unsafe_blocks)] From 94e772f8dd4a756c069d504ee331db785a377f83 Mon Sep 17 00:00:00 2001 From: james7132 Date: Wed, 20 Mar 2024 22:51:04 -0700 Subject: [PATCH 2/6] Formatting --- src/block/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/block/mod.rs b/src/block/mod.rs index f4f06de..c419471 100644 --- a/src/block/mod.rs +++ b/src/block/mod.rs @@ -51,9 +51,9 @@ mod avx2; #[cfg(all(not(target_arch = "wasm32"), target_feature = "avx2"))] pub use self::avx2::*; -#[cfg(all(target_family = "wasm", target_feature="simd128"))] +#[cfg(all(target_family = "wasm", target_feature = "simd128"))] mod wasm; -#[cfg(all(target_arch = "wasm", target_feature="simd128"))] +#[cfg(all(target_arch = "wasm", target_feature = "simd128"))] pub use self::wasm::*; impl Block { From 6e4d588944c570297f359a154d1993552749bb59 Mon Sep 17 00:00:00 2001 From: james7132 Date: Wed, 20 Mar 2024 22:58:38 -0700 Subject: [PATCH 3/6] Try to fix CI --- src/block/avx.rs | 2 +- src/block/avx2.rs | 2 +- src/block/mod.rs | 10 ++++++++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/block/avx.rs b/src/block/avx.rs index a097eb3..f67338c 100644 --- a/src/block/avx.rs +++ b/src/block/avx.rs @@ -11,7 +11,7 @@ pub struct Block(pub(super) __m256d); impl Block { #[inline] pub fn is_empty(self) -> bool { - unsafe { _mm256_testz_pd(self.0, self.0) == 1 } + unsafe { _mm256_testz_pd(self.0, Self::ALL.0) == 1 } } #[inline] diff --git a/src/block/avx2.rs b/src/block/avx2.rs index b359377..d2a1ece 100644 --- a/src/block/avx2.rs +++ b/src/block/avx2.rs @@ -11,7 +11,7 @@ pub struct Block(pub(super) __m256i); impl Block { #[inline] pub fn is_empty(self) -> bool { - unsafe { _mm256_testz_si256(self.0, self.0) == 1 } + unsafe { _mm256_testz_si256(self.0, Self::ALL.0) == 1 } } #[inline] diff --git a/src/block/mod.rs b/src/block/mod.rs index c419471..23534e2 100644 --- a/src/block/mod.rs +++ b/src/block/mod.rs @@ -46,9 +46,15 @@ mod avx; ))] pub use self::avx::*; -#[cfg(all(not(target_arch = "wasm32"), target_feature = "avx2"))] +#[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "avx2" +))] mod avx2; -#[cfg(all(not(target_arch = "wasm32"), target_feature = "avx2"))] +#[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "avx2" +))] pub use self::avx2::*; #[cfg(all(target_family = "wasm", target_feature = "simd128"))] From c4dd0cc9dda50017266c0e6197b1db83a079047d Mon Sep 17 00:00:00 2001 From: james7132 Date: Wed, 20 Mar 2024 23:00:34 -0700 Subject: [PATCH 4/6] Fix aarch64 --- src/block/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/block/mod.rs b/src/block/mod.rs index 23534e2..226f379 100644 --- a/src/block/mod.rs +++ b/src/block/mod.rs @@ -1,4 +1,5 @@ #![allow(clippy::undocumented_unsafe_blocks)] +#![allow(dead_code)] use core::cmp::Ordering; use core::hash::{Hash, Hasher}; From dcd96dfa710fba4b262e1d90449e5dbbd5446dc6 Mon Sep 17 00:00:00 2001 From: james7132 Date: Wed, 20 Mar 2024 23:08:24 -0700 Subject: [PATCH 5/6] Try fixing tests again --- src/block/avx.rs | 6 +++--- src/block/avx2.rs | 6 +++--- src/block/sse2.rs | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/block/avx.rs b/src/block/avx.rs index f67338c..748f8f7 100644 --- a/src/block/avx.rs +++ b/src/block/avx.rs @@ -11,7 +11,7 @@ pub struct Block(pub(super) __m256d); impl Block { #[inline] pub fn is_empty(self) -> bool { - unsafe { _mm256_testz_pd(self.0, Self::ALL.0) == 1 } + unsafe { _mm256_testz_pd(self.0, self.0) == 1 } } #[inline] @@ -81,8 +81,8 @@ impl PartialEq for Block { #[inline] fn eq(&self, other: &Self) -> bool { unsafe { - let neq = _mm256_xor_pd(self.0, other.0); - _mm256_testz_pd(neq, neq) == 1 + let eq = _mm256_cmpeq_pd(self.0, other.0); + _mm256_movemask_pd(eq) == !(0i32) } } } diff --git a/src/block/avx2.rs b/src/block/avx2.rs index d2a1ece..43da2d7 100644 --- a/src/block/avx2.rs +++ b/src/block/avx2.rs @@ -11,7 +11,7 @@ pub struct Block(pub(super) __m256i); impl Block { #[inline] pub fn is_empty(self) -> bool { - unsafe { _mm256_testz_si256(self.0, Self::ALL.0) == 1 } + unsafe { _mm256_testz_si256(self.0, self.0) == 1 } } #[inline] @@ -81,8 +81,8 @@ impl PartialEq for Block { #[inline] fn eq(&self, other: &Self) -> bool { unsafe { - let neq = _mm256_xor_si256(self.0, other.0); - _mm256_testz_si256(neq, neq) == 1 + let eq = _mm256_cmpeq_si256(self.0, other.0); + _mm256_movemask_si256(eq) == !(0i32) } } } diff --git a/src/block/sse2.rs b/src/block/sse2.rs index e68d49f..6db08f7 100644 --- a/src/block/sse2.rs +++ b/src/block/sse2.rs @@ -19,7 +19,7 @@ impl Block { } #[cfg(target_feature = "sse4.1")] { - unsafe { _mm_test_all_zeros(self.0, Self::ALL.0) == 1 } + unsafe { _mm_test_all_zeros(self.0, self.0) == 1 } } } From 37f2f4101e22a9273c0635bb7fc7de8d1ed0d92f Mon Sep 17 00:00:00 2001 From: james7132 Date: Wed, 20 Mar 2024 23:35:38 -0700 Subject: [PATCH 6/6] Fix AVX --- src/block/avx.rs | 10 +++++++--- src/block/avx2.rs | 4 ++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/block/avx.rs b/src/block/avx.rs index 748f8f7..88c2704 100644 --- a/src/block/avx.rs +++ b/src/block/avx.rs @@ -11,7 +11,10 @@ pub struct Block(pub(super) __m256d); impl Block { #[inline] pub fn is_empty(self) -> bool { - unsafe { _mm256_testz_pd(self.0, self.0) == 1 } + unsafe { + let value = core::mem::transmute(self); + _mm256_testz_si256(value, value) == 1 + } } #[inline] @@ -81,8 +84,9 @@ impl PartialEq for Block { #[inline] fn eq(&self, other: &Self) -> bool { unsafe { - let eq = _mm256_cmpeq_pd(self.0, other.0); - _mm256_movemask_pd(eq) == !(0i32) + let new = _mm256_xor_pd(self.0, other.0); + let neq = core::mem::transmute(new); + _mm256_testz_si256(neq, neq) == 1 } } } diff --git a/src/block/avx2.rs b/src/block/avx2.rs index 43da2d7..b359377 100644 --- a/src/block/avx2.rs +++ b/src/block/avx2.rs @@ -81,8 +81,8 @@ impl PartialEq for Block { #[inline] fn eq(&self, other: &Self) -> bool { unsafe { - let eq = _mm256_cmpeq_si256(self.0, other.0); - _mm256_movemask_si256(eq) == !(0i32) + let neq = _mm256_xor_si256(self.0, other.0); + _mm256_testz_si256(neq, neq) == 1 } } }