Skip to content
This repository has been archived by the owner on Oct 23, 2024. It is now read-only.

Backport of 8324874: AArch64: crypto pmull based CRC32/CRC32C intrinsics clobber V8-V15 registers #2

Merged
merged 1 commit into from
Feb 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
173 changes: 91 additions & 82 deletions src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4258,108 +4258,117 @@ void MacroAssembler::kernel_crc32_common_fold_using_crypto_pmull(Register crc, R
}
add(table, table, table_offset);

// Registers v0..v7 are used as data registers.
// Registers v16..v31 are used as tmp registers.
sub(buf, buf, 0x10);
ldrq(v1, Address(buf, 0x10));
ldrq(v2, Address(buf, 0x20));
ldrq(v3, Address(buf, 0x30));
ldrq(v4, Address(buf, 0x40));
ldrq(v5, Address(buf, 0x50));
ldrq(v6, Address(buf, 0x60));
ldrq(v7, Address(buf, 0x70));
ldrq(v8, Address(pre(buf, 0x80)));

movi(v25, T4S, 0);
mov(v25, S, 0, crc);
eor(v1, T16B, v1, v25);

ldrq(v0, Address(table));
ldrq(v0, Address(buf, 0x10));
ldrq(v1, Address(buf, 0x20));
ldrq(v2, Address(buf, 0x30));
ldrq(v3, Address(buf, 0x40));
ldrq(v4, Address(buf, 0x50));
ldrq(v5, Address(buf, 0x60));
ldrq(v6, Address(buf, 0x70));
ldrq(v7, Address(pre(buf, 0x80)));

movi(v31, T4S, 0);
mov(v31, S, 0, crc);
eor(v0, T16B, v0, v31);

// Register v16 contains constants from the crc table.
ldrq(v16, Address(table));
b(CRC_by128_loop);

align(OptoLoopAlignment);
BIND(CRC_by128_loop);
pmull (v9, T1Q, v1, v0, T1D);
pmull2(v10, T1Q, v1, v0, T2D);
ldrq(v1, Address(buf, 0x10));
eor3(v1, T16B, v9, v10, v1);

pmull (v11, T1Q, v2, v0, T1D);
pmull2(v12, T1Q, v2, v0, T2D);
ldrq(v2, Address(buf, 0x20));
eor3(v2, T16B, v11, v12, v2);

pmull (v13, T1Q, v3, v0, T1D);
pmull2(v14, T1Q, v3, v0, T2D);
ldrq(v3, Address(buf, 0x30));
eor3(v3, T16B, v13, v14, v3);

pmull (v15, T1Q, v4, v0, T1D);
pmull2(v16, T1Q, v4, v0, T2D);
ldrq(v4, Address(buf, 0x40));
eor3(v4, T16B, v15, v16, v4);

pmull (v17, T1Q, v5, v0, T1D);
pmull2(v18, T1Q, v5, v0, T2D);
ldrq(v5, Address(buf, 0x50));
eor3(v5, T16B, v17, v18, v5);

pmull (v19, T1Q, v6, v0, T1D);
pmull2(v20, T1Q, v6, v0, T2D);
ldrq(v6, Address(buf, 0x60));
eor3(v6, T16B, v19, v20, v6);

pmull (v21, T1Q, v7, v0, T1D);
pmull2(v22, T1Q, v7, v0, T2D);
ldrq(v7, Address(buf, 0x70));
eor3(v7, T16B, v21, v22, v7);

pmull (v23, T1Q, v8, v0, T1D);
pmull2(v24, T1Q, v8, v0, T2D);
ldrq(v8, Address(pre(buf, 0x80)));
eor3(v8, T16B, v23, v24, v8);
pmull (v17, T1Q, v0, v16, T1D);
pmull2(v18, T1Q, v0, v16, T2D);
ldrq(v0, Address(buf, 0x10));
eor3(v0, T16B, v17, v18, v0);

pmull (v19, T1Q, v1, v16, T1D);
pmull2(v20, T1Q, v1, v16, T2D);
ldrq(v1, Address(buf, 0x20));
eor3(v1, T16B, v19, v20, v1);

pmull (v21, T1Q, v2, v16, T1D);
pmull2(v22, T1Q, v2, v16, T2D);
ldrq(v2, Address(buf, 0x30));
eor3(v2, T16B, v21, v22, v2);

pmull (v23, T1Q, v3, v16, T1D);
pmull2(v24, T1Q, v3, v16, T2D);
ldrq(v3, Address(buf, 0x40));
eor3(v3, T16B, v23, v24, v3);

pmull (v25, T1Q, v4, v16, T1D);
pmull2(v26, T1Q, v4, v16, T2D);
ldrq(v4, Address(buf, 0x50));
eor3(v4, T16B, v25, v26, v4);

pmull (v27, T1Q, v5, v16, T1D);
pmull2(v28, T1Q, v5, v16, T2D);
ldrq(v5, Address(buf, 0x60));
eor3(v5, T16B, v27, v28, v5);

pmull (v29, T1Q, v6, v16, T1D);
pmull2(v30, T1Q, v6, v16, T2D);
ldrq(v6, Address(buf, 0x70));
eor3(v6, T16B, v29, v30, v6);

// Reuse registers v23, v24.
// Using them won't block the first instruction of the next iteration.
pmull (v23, T1Q, v7, v16, T1D);
pmull2(v24, T1Q, v7, v16, T2D);
ldrq(v7, Address(pre(buf, 0x80)));
eor3(v7, T16B, v23, v24, v7);

subs(len, len, 0x80);
br(Assembler::GE, CRC_by128_loop);

// fold into 512 bits
ldrq(v0, Address(table, 0x10));
// Use v31 for constants because v16 can be still in use.
ldrq(v31, Address(table, 0x10));

pmull (v10, T1Q, v1, v0, T1D);
pmull2(v11, T1Q, v1, v0, T2D);
eor3(v1, T16B, v10, v11, v5);
pmull (v17, T1Q, v0, v31, T1D);
pmull2(v18, T1Q, v0, v31, T2D);
eor3(v0, T16B, v17, v18, v4);

pmull (v12, T1Q, v2, v0, T1D);
pmull2(v13, T1Q, v2, v0, T2D);
eor3(v2, T16B, v12, v13, v6);
pmull (v19, T1Q, v1, v31, T1D);
pmull2(v20, T1Q, v1, v31, T2D);
eor3(v1, T16B, v19, v20, v5);

pmull (v14, T1Q, v3, v0, T1D);
pmull2(v15, T1Q, v3, v0, T2D);
eor3(v3, T16B, v14, v15, v7);
pmull (v21, T1Q, v2, v31, T1D);
pmull2(v22, T1Q, v2, v31, T2D);
eor3(v2, T16B, v21, v22, v6);

pmull (v16, T1Q, v4, v0, T1D);
pmull2(v17, T1Q, v4, v0, T2D);
eor3(v4, T16B, v16, v17, v8);
pmull (v23, T1Q, v3, v31, T1D);
pmull2(v24, T1Q, v3, v31, T2D);
eor3(v3, T16B, v23, v24, v7);

// fold into 128 bits
ldrq(v5, Address(table, 0x20));
pmull (v10, T1Q, v1, v5, T1D);
pmull2(v11, T1Q, v1, v5, T2D);
eor3(v4, T16B, v4, v10, v11);

ldrq(v6, Address(table, 0x30));
pmull (v12, T1Q, v2, v6, T1D);
pmull2(v13, T1Q, v2, v6, T2D);
eor3(v4, T16B, v4, v12, v13);

ldrq(v7, Address(table, 0x40));
pmull (v14, T1Q, v3, v7, T1D);
pmull2(v15, T1Q, v3, v7, T2D);
eor3(v1, T16B, v4, v14, v15);
// Use v17 for constants because v31 can be still in use.
ldrq(v17, Address(table, 0x20));
pmull (v25, T1Q, v0, v17, T1D);
pmull2(v26, T1Q, v0, v17, T2D);
eor3(v3, T16B, v3, v25, v26);

// Use v18 for constants because v17 can be still in use.
ldrq(v18, Address(table, 0x30));
pmull (v27, T1Q, v1, v18, T1D);
pmull2(v28, T1Q, v1, v18, T2D);
eor3(v3, T16B, v3, v27, v28);

// Use v19 for constants because v18 can be still in use.
ldrq(v19, Address(table, 0x40));
pmull (v29, T1Q, v2, v19, T1D);
pmull2(v30, T1Q, v2, v19, T2D);
eor3(v0, T16B, v3, v29, v30);

add(len, len, 0x80);
add(buf, buf, 0x10);

mov(tmp0, v1, D, 0);
mov(tmp1, v1, D, 1);
mov(tmp0, v0, D, 0);
mov(tmp1, v0, D, 1);
}

SkipIfEqual::SkipIfEqual(
Expand Down
163 changes: 163 additions & 0 deletions test/hotspot/jtreg/compiler/intrinsics/zip/TestFpRegsABI.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
/*
* Copyright Amazon.com Inc. or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/


/**
* @test TestFpRegsABI
* @bug 8324874
* @summary The ABI for the Arm 64-bit Architecture requires a callee to preserve registers v8-v15 across subroutine calls
*
* @run main/othervm -XX:-TieredCompilation -Xbatch -XX:CompileCommand=inline,*::calcValue compiler.intrinsics.zip.TestFpRegsABI
* @run main/othervm -XX:-TieredCompilation -Xbatch -XX:CompileCommand=dontinline,*::calcValue compiler.intrinsics.zip.TestFpRegsABI
* @run main/othervm -XX:+TieredCompilation -XX:TieredStopAtLevel=1 -Xbatch -XX:CompileCommand=inline,*::calcValue compiler.intrinsics.zip.TestFpRegsABI
* @run main/othervm -XX:+TieredCompilation -XX:TieredStopAtLevel=1 -Xbatch -XX:CompileCommand=dontinline,*::calcValue compiler.intrinsics.zip.TestFpRegsABI
* @run main/othervm -Xbatch -XX:CompileCommand=inline,*::calcValue compiler.intrinsics.zip.TestFpRegsABI
* @run main/othervm -Xbatch -XX:CompileCommand=dontinline,*::calcValue compiler.intrinsics.zip.TestFpRegsABI
* @run main/othervm -Xint compiler.intrinsics.zip.TestFpRegsABI
*/

package compiler.intrinsics.zip;

import java.util.zip.Checksum;
import java.util.zip.CRC32;
import java.util.zip.CRC32C;

// Test for bug 8324874: checks that computing a CRC32/CRC32C checksum between
// two identical floating-point computations does not change the result of the
// second computation, and that the checksum itself has the expected value.
//
// The jtreg header of this file runs the class under several compiler
// configurations and uses CompileCommand patterns that match methods named
// calcValue, so the calcValue method names below must not be changed.
public class TestFpRegsABI {
// Input data shared by all test cases.
private static byte[] buf;

static {
// 1024 bytes holding the low 8 bits of their own index
// (the (byte) cast wraps values above 127).
buf = new byte[1024];
for (int i = 0; i < buf.length; ++i) {
buf[i] = (byte)i;
}
}

// Variant with a minimal floating-point computation around the checksum call.
private static class RegressionTest {
// Checksum implementation under test (CRC32 or CRC32C, see main).
Checksum checksum;

RegressionTest(Checksum checksum) {
this.checksum = checksum;
}

// Repeats the check 20_000 times.
// NOTE(review): presumably enough iterations for the methods to get
// JIT-compiled in the compiled-mode runs of the test header - confirm.
public void run(byte[] buf, long expectedValue) {
for (int i = 0; i < 20_000; ++i) {
runIteration(buf, expectedValue);
}
}

// If the checksum intrinsic does not preserve fp registers as the ABI requires,
// the second call of calcValue might produce a wrong result.
//
// Throws RuntimeException if the checksum of buf differs from expectedValue,
// or if calcValue returns different values before and after the checksum.
private void runIteration(byte[] buf, long expectedValue) {
// Reference value computed before the checksum code runs.
int v1 = calcValue(buf);
checksum.reset();
checksum.update(buf, 0, buf.length);
long checksumValue = checksum.getValue();
if (checksumValue != expectedValue) {
System.err.printf("ERROR: checksum = 0x%016x, expected = 0x%016x\n",
checksumValue, expectedValue);
throw new RuntimeException("Checksum Error");
}
// Same computation after the checksum code ran; it must match v1.
int v2 = calcValue(buf);
if (v1 != v2) {
throw new RuntimeException("Expect v2(" + v2 + ") to equal v1(" + v1 + ")");
}
}

// Single floating-point multiplication of the buffer length by 2.5,
// truncated to int.
private int calcValue(byte[] buf) {
return (int)(2.5 * buf.length);
}
}

// Variant whose calcValue performs many floating-point operations per loop
// iteration in order to create fp register pressure around the checksum call.
private static class TestIntrinsic {
// Checksum implementation under test (CRC32 or CRC32C, see main).
Checksum checksum;

TestIntrinsic(Checksum checksum) {
this.checksum = checksum;
}

// Repeats the check 20_000 times.
// NOTE(review): presumably enough iterations for the methods to get
// JIT-compiled in the compiled-mode runs of the test header - confirm.
public void run(byte[] buf, long expectedValue) {
for (int i = 0; i < 20_000; ++i) {
runIteration(buf, expectedValue);
}
}

// If the checksum intrinsic does not preserve fp registers as the ABI requires,
// the second call of calcValue might produce a wrong result.
//
// Throws RuntimeException if the checksum of buf differs from expectedValue,
// or if calcValue returns different values before and after the checksum.
private void runIteration(byte[] buf, long expectedValue) {
// Reference value computed before the checksum code runs.
int v1 = calcValue(buf);
checksum.reset();
checksum.update(buf, 0, buf.length);
long checksumValue = checksum.getValue();
if (checksumValue != expectedValue) {
System.err.printf("ERROR: checksum = 0x%016x, expected = 0x%016x\n",
checksumValue, expectedValue);
throw new RuntimeException("Checksum Error");
}
// Same computation after the checksum code ran; it must match v1.
int v2 = calcValue(buf);
if (v1 != v2) {
throw new RuntimeException("Expect v2(" + v2 + ") to equal v1(" + v1 + ")");
}
}

// An ABI can require some fp registers to be saved by a callee, e.g. v8-v15 in the ARM64 ABI.
// We create fp register pressure to get as many fp registers used as possible.
//
// Walks buf in groups of 24 bytes; for each byte at index k of a group it adds
// buf[k] * k + k (computed in double) to the accumulator. Trailing bytes that
// do not fill a whole group of 24 are not visited. The sum is truncated to int.
private int calcValue(byte[] buf) {
double v = 0.0;
// i is the exclusive end index of the current 24-byte group.
for (int i = 24; i <= buf.length; i += 24) {
v += buf[i - 1] * ((double)i - 1.0) + (double)i - 1.0;
v += buf[i - 2] * ((double)i - 2.0) + (double)i - 2.0;
v += buf[i - 3] * ((double)i - 3.0) + (double)i - 3.0;
v += buf[i - 4] * ((double)i - 4.0) + (double)i - 4.0;
v += buf[i - 5] * ((double)i - 5.0) + (double)i - 5.0;
v += buf[i - 6] * ((double)i - 6.0) + (double)i - 6.0;
v += buf[i - 7] * ((double)i - 7.0) + (double)i - 7.0;
v += buf[i - 8] * ((double)i - 8.0) + (double)i - 8.0;
v += buf[i - 9] * ((double)i - 9.0) + (double)i - 9.0;
v += buf[i - 10] * ((double)i - 10.0) + (double)i - 10.0;
v += buf[i - 11] * ((double)i - 11.0) + (double)i - 11.0;
v += buf[i - 12] * ((double)i - 12.0) + (double)i - 12.0;
v += buf[i - 13] * ((double)i - 13.0) + (double)i - 13.0;
v += buf[i - 14] * ((double)i - 14.0) + (double)i - 14.0;
v += buf[i - 15] * ((double)i - 15.0) + (double)i - 15.0;
v += buf[i - 16] * ((double)i - 16.0) + (double)i - 16.0;
v += buf[i - 17] * ((double)i - 17.0) + (double)i - 17.0;
v += buf[i - 18] * ((double)i - 18.0) + (double)i - 18.0;
v += buf[i - 19] * ((double)i - 19.0) + (double)i - 19.0;
v += buf[i - 20] * ((double)i - 20.0) + (double)i - 20.0;
v += buf[i - 21] * ((double)i - 21.0) + (double)i - 21.0;
v += buf[i - 22] * ((double)i - 22.0) + (double)i - 22.0;
v += buf[i - 23] * ((double)i - 23.0) + (double)i - 23.0;
v += buf[i - 24] * ((double)i - 24.0) + (double)i - 24.0;
}
return (int)v;
}
}

// Runs both variants against CRC32 and CRC32C. The second argument of each
// run call is the checksum value expected for the contents of buf.
public static void main(final String[] argv) {
new TestIntrinsic(new CRC32()).run(buf, 0x00000000b70b4c26L);
new TestIntrinsic(new CRC32C()).run(buf, 0x000000002cdf6e8fL);
new RegressionTest(new CRC32()).run(buf, 0x00000000b70b4c26L);
new RegressionTest(new CRC32C()).run(buf, 0x000000002cdf6e8fL);
}
}

Loading