diff --git a/crates/core_simd/src/swizzle_dyn.rs b/crates/core_simd/src/swizzle_dyn.rs index eaf297ba3e3..0619404e5f7 100644 --- a/crates/core_simd/src/swizzle_dyn.rs +++ b/crates/core_simd/src/swizzle_dyn.rs @@ -57,6 +57,13 @@ where target_endian = "little" ))] 16 => transize(vqtbl1q_u8, self, idxs), + #[cfg(all( + target_arch = "arm", + target_feature = "v7", + target_feature = "neon", + target_endian = "little" + ))] + 16 => transize(armv7_neon_swizzle_u8x16, self, idxs), #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))] 32 => transize(avx2_pshufb, self, idxs), #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))] @@ -98,6 +105,28 @@ where } } +/// armv7 neon supports swizzling `u8x16` by swizzling two u8x8 blocks +/// with a u8x8x2 lookup table. +/// +/// # Safety +/// This requires armv7 neon to work +#[cfg(all( + target_arch = "arm", + target_feature = "v7", + target_feature = "neon", + target_endian = "little" +))] +unsafe fn armv7_neon_swizzle_u8x16(bytes: Simd, idxs: Simd) -> Simd { + use core::arch::arm::{uint8x8x2_t, vcombine_u8, vget_high_u8, vget_low_u8, vtbl2_u8}; + // SAFETY: Caller promised arm neon support + unsafe { + let bytes = uint8x8x2_t(vget_low_u8(bytes.into()), vget_high_u8(bytes.into())); + let lo = vtbl2_u8(bytes, vget_low_u8(idxs.into())); + let hi = vtbl2_u8(bytes, vget_high_u8(idxs.into())); + vcombine_u8(lo, hi).into() + } +} + /// "vpshufb like it was meant to be" on AVX2 /// /// # Safety