Auto merge of #3640 - folkertdev:add-pclmulqdq, r=RalfJung

add support for `pclmulqdq` intrinsic This instruction is required in fast implementations of the crc32 checksum algorithm, and used in the https://crates.io/crates/crc32fast and https://crates.io/crates/zlib-rs crates. Some questions from my side - is my method for decomposing a `__m128i` into two separate `i64` values allright?
rust-lang · Jun 8, 2024 · e191fee · e191fee
2 parents c51b733 + 6a86688
commit e191fee
Show file tree

Hide file tree

Showing 2 changed files with 117 additions and 0 deletions.
diff --git a/src/shims/x86/mod.rs b/src/shims/x86/mod.rs
@@ -105,6 +105,13 @@ pub(super) trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
                 }
             }
 
+            "pclmulqdq" => {
+                let [left, right, imm] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                pclmulqdq(this, left, right, imm, dest)?;
+            }
+
             name if name.starts_with("sse.") => {
                 return sse::EvalContextExt::emulate_x86_sse_intrinsic(
                     this, link_name, abi, args, dest,
@@ -1133,6 +1140,68 @@ fn pmulhrsw<'tcx>(
     Ok(())
 }
 
+/// Perform a carry-less multiplication of two 64-bit integers, selected from `left` and `right` according to `imm8`,
+/// and store the results in `dst`.
+///
+/// `left` and `right` are both vectors of type 2 x i64. Only bits 0 and 4 of `imm8` matter;
+/// they select the element of `left` and `right`, respectively.
+///
+/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128>
+fn pclmulqdq<'tcx>(
+    this: &mut MiriInterpCx<'tcx>,
+    left: &OpTy<'tcx>,
+    right: &OpTy<'tcx>,
+    imm8: &OpTy<'tcx>,
+    dest: &MPlaceTy<'tcx>,
+) -> InterpResult<'tcx, ()> {
+    assert_eq!(left.layout, right.layout);
+    assert_eq!(left.layout.size, dest.layout.size);
+
+    // Transmute to `[u64; 2]`
+
+    let array_layout = this.layout_of(Ty::new_array(this.tcx.tcx, this.tcx.types.u64, 2))?;
+    let left = left.transmute(array_layout, this)?;
+    let right = right.transmute(array_layout, this)?;
+    let dest = dest.transmute(array_layout, this)?;
+
+    let imm8 = this.read_scalar(imm8)?.to_u8()?;
+
+    // select the 64-bit integer from left that the user specified (low or high)
+    let index = if (imm8 & 0x01) == 0 { 0 } else { 1 };
+    let left = this.read_scalar(&this.project_index(&left, index)?)?.to_u64()?;
+
+    // select the 64-bit integer from right that the user specified (low or high)
+    let index = if (imm8 & 0x10) == 0 { 0 } else { 1 };
+    let right = this.read_scalar(&this.project_index(&right, index)?)?.to_u64()?;
+
+    // Perform carry-less multiplication
+    //
+    // This operation is like long multiplication, but ignores all carries.
+    // That idea corresponds to the xor operator, which is used in the implementation.
+    //
+    // Wikipedia has an example https://en.wikipedia.org/wiki/Carry-less_product#Example
+    let mut result: u128 = 0;
+
+    for i in 0..64 {
+        // if the i-th bit in right is set
+        if (right & (1 << i)) != 0 {
+            // xor result with `left` shifted to the left by i positions
+            result ^= (left as u128) << i;
+        }
+    }
+
+    let result_low = (result & 0xFFFF_FFFF_FFFF_FFFF) as u64;
+    let result_high = (result >> 64) as u64;
+
+    let dest_low = this.project_index(&dest, 0)?;
+    this.write_scalar(Scalar::from_u64(result_low), &dest_low)?;
+
+    let dest_high = this.project_index(&dest, 1)?;
+    this.write_scalar(Scalar::from_u64(result_high), &dest_high)?;
+
+    Ok(())
+}
+
 /// Packs two N-bit integer vectors to a single N/2-bit integers.
 ///
 /// The conversion from N-bit to N/2-bit should be provided by `f`.

diff --git a/tests/pass/shims/x86/intrinsics-x86-pclmulqdq.rs b/tests/pass/shims/x86/intrinsics-x86-pclmulqdq.rs
@@ -0,0 +1,48 @@
+// Ignore everything except x86 and x86_64
+// Any new targets that are added to CI should be ignored here.
+// (We cannot use `cfg`-based tricks here since the `target-feature` flags below only work on x86.)
+//@ignore-target-aarch64
+//@ignore-target-arm
+//@ignore-target-avr
+//@ignore-target-s390x
+//@ignore-target-thumbv7em
+//@ignore-target-wasm32
+//@compile-flags: -C target-feature=+pclmulqdq
+
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+fn main() {
+    assert!(is_x86_feature_detected!("pclmulqdq"));
+
+    let a = (0x7fffffffffffffff, 0x4317e40ab4ddcf05);
+    let b = (0xdd358416f52ecd34, 0x633d11cc638ca16b);
+
+    unsafe {
+        assert_eq!(clmulepi64_si128::<0x00>(a, b), (13036940098130298092, 2704901987789626761));
+        assert_eq!(clmulepi64_si128::<0x01>(a, b), (6707488474444649956, 3901733953304450635));
+        assert_eq!(clmulepi64_si128::<0x10>(a, b), (11607166829323378905, 1191897396234301548));
+        assert_eq!(clmulepi64_si128::<0x11>(a, b), (7731954893213347271, 1760130762532070957));
+    }
+}
+
+#[target_feature(enable = "pclmulqdq")]
+unsafe fn clmulepi64_si128<const IMM8: i32>(
+    (a1, a2): (u64, u64),
+    (b1, b2): (u64, u64),
+) -> (u64, u64) {
+    // SAFETY: There are no safety requirements for calling `_mm_clmulepi64_si128`.
+    // It's just unsafe for API consistency with other intrinsics.
+    unsafe {
+        let a = core::mem::transmute::<_, __m128i>([a1, a2]);
+        let b = core::mem::transmute::<_, __m128i>([b1, b2]);
+
+        let out = _mm_clmulepi64_si128::<IMM8>(a, b);
+
+        let [c1, c2] = core::mem::transmute::<_, [u64; 2]>(out);
+
+        (c1, c2)
+    }
+}