Merge upstream-jdk
corretto-github-robot committed Jun 11, 2024
2 parents 1212f03 + 788b876 commit 2b9071b
Showing 51 changed files with 638 additions and 175 deletions.
62 changes: 42 additions & 20 deletions src/hotspot/cpu/aarch64/aarch64_vector.ad
@@ -135,9 +135,9 @@ source %{
(opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
(opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
(opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
// The vector implementation of Op_AddReductionVD/F is for the Vector API only.
// It is not suitable for auto-vectorization because it does not add the elements
// in the same order as sequential code, and FP addition is non-associative.
// The implementations of Op_AddReductionVD/F in Neon are for the Vector API only.
// They are not suitable for auto-vectorization because the result would not conform
// to the JLS, Section Evaluation Order.
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
opcode == Op_MulVL) {
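The comment change above rests on floating-point addition being non-associative: a pairwise SIMD reduction reassociates the adds and can therefore produce a different result from the left-to-right sum that sequential Java code (and hence auto-vectorization) must preserve. A standalone C++ sketch, not part of this commit, that shows the effect:

// Illustration only (not HotSpot code): float addition is not associative,
// so a reassociated reduce-add can differ from the strict left-to-right sum.
#include <cstdio>

int main() {
  float a = 1e8f, b = -1e8f, c = 1.0f;
  float strict       = (a + b) + c;   // left-to-right order: 1.0f
  float reassociated = a + (b + c);   // b + c rounds back to -1e8f: 0.0f
  std::printf("strict=%g reassociated=%g\n", strict, reassociated);
  return 0;
}

The Vector API explicitly allows this kind of reassociation for its add reduction, which is why the NEON rules below remain legal for Vector API nodes but not for auto-vectorized loops.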
@@ -2858,26 +2858,28 @@ instruct reduce_addL_sve(iRegLNoSp dst, iRegL isrc, vReg vsrc, vRegD tmp) %{
%}

// reduction addF
// Floating-point addition is not associative, so the rules for AddReductionVF
// on NEON can't be used to auto-vectorize floating-point reduce-add.
// Currently, on NEON, AddReductionVF is only generated by Vector API.
instruct reduce_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 2);

instruct reduce_non_strict_order_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
// Non-strictly ordered floating-point add reduction for a 64-bits-long vector. This rule is
// intended for the VectorAPI (which allows for non-strictly ordered add reduction).
predicate(Matcher::vector_length(n->in(2)) == 2 && !n->as_Reduction()->requires_strict_order());
match(Set dst (AddReductionVF fsrc vsrc));
effect(TEMP_DEF dst);
format %{ "reduce_add2F_neon $dst, $fsrc, $vsrc" %}
format %{ "reduce_non_strict_order_add2F_neon $dst, $fsrc, $vsrc" %}
ins_encode %{
__ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ S);
__ fadds($dst$$FloatRegister, $dst$$FloatRegister, $fsrc$$FloatRegister);
%}
ins_pipe(pipe_slow);
%}

instruct reduce_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 4);
instruct reduce_non_strict_order_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
// Non-strictly ordered floating-point add reduction for 128-bits-long vector. This rule is
// intended for the VectorAPI (which allows for non-strictly ordered add reduction).
predicate(Matcher::vector_length(n->in(2)) == 4 && !n->as_Reduction()->requires_strict_order());
match(Set dst (AddReductionVF fsrc vsrc));
effect(TEMP_DEF dst, TEMP tmp);
format %{ "reduce_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
format %{ "reduce_non_strict_order_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
ins_encode %{
__ faddp($tmp$$FloatRegister, __ T4S, $vsrc$$FloatRegister, $vsrc$$FloatRegister);
__ faddp($dst$$FloatRegister, $tmp$$FloatRegister, __ S);
@@ -2886,11 +2888,21 @@ instruct reduce_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
ins_pipe(pipe_slow);
%}

// This rule calculates the reduction result in strict order. Two cases will
// reach here:
// 1. Non strictly-ordered AddReductionVF when vector size > 128-bits. For example -
// AddReductionVF generated by Vector API. For vector size > 128-bits, it is more
// beneficial performance-wise to generate direct SVE instruction even if it is
// strictly ordered.
// 2. Strictly-ordered AddReductionVF. For example - AddReductionVF generated by
// auto-vectorization on SVE machine.
instruct reduce_addF_sve(vRegF dst_src1, vReg src2) %{
predicate(UseSVE > 0);
predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
n->as_Reduction()->requires_strict_order());
match(Set dst_src1 (AddReductionVF dst_src1 src2));
format %{ "reduce_addF_sve $dst_src1, $dst_src1, $src2" %}
ins_encode %{
assert(UseSVE > 0, "must be sve");
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2);
assert(length_in_bytes == MaxVectorSize, "invalid vector length");
__ sve_fadda($dst_src1$$FloatRegister, __ S, ptrue, $src2$$FloatRegister);
@@ -2899,26 +2911,36 @@ instruct reduce_addF_sve(vRegF dst_src1, vReg src2) %{
%}

// reduction addD
// Floating-point addition is not associative, so the rule for AddReductionVD
// on NEON can't be used to auto-vectorize floating-point reduce-add.
// Currently, on NEON, AddReductionVD is only generated by Vector API.
instruct reduce_addD_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
predicate(UseSVE == 0);

instruct reduce_non_strict_order_add2D_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
// Non-strictly ordered floating-point add reduction for doubles. This rule is
// intended for the VectorAPI (which allows for non-strictly ordered add reduction).
predicate(!n->as_Reduction()->requires_strict_order());
match(Set dst (AddReductionVD dsrc vsrc));
effect(TEMP_DEF dst);
format %{ "reduce_addD_neon $dst, $dsrc, $vsrc\t# 2D" %}
format %{ "reduce_non_strict_order_add2D_neon $dst, $dsrc, $vsrc\t# 2D" %}
ins_encode %{
__ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ D);
__ faddd($dst$$FloatRegister, $dst$$FloatRegister, $dsrc$$FloatRegister);
%}
ins_pipe(pipe_slow);
%}

// This rule calculates the reduction result in strict order. Two cases will
// reach here:
// 1. Non strictly-ordered AddReductionVD when vector size > 128-bits. For example -
// AddReductionVD generated by Vector API. For vector size > 128-bits, it is more
// beneficial performance-wise to generate direct SVE instruction even if it is
// strictly ordered.
// 2. Strictly-ordered AddReductionVD. For example - AddReductionVD generated by
// auto-vectorization on SVE machine.
instruct reduce_addD_sve(vRegD dst_src1, vReg src2) %{
predicate(UseSVE > 0);
predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
n->as_Reduction()->requires_strict_order());
match(Set dst_src1 (AddReductionVD dst_src1 src2));
format %{ "reduce_addD_sve $dst_src1, $dst_src1, $src2" %}
ins_encode %{
assert(UseSVE > 0, "must be sve");
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2);
assert(length_in_bytes == MaxVectorSize, "invalid vector length");
__ sve_fadda($dst_src1$$FloatRegister, __ D, ptrue, $src2$$FloatRegister);
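Taken together, the renamed rules and new predicates in this file split the float/double add reductions into two forms: a non-strictly ordered pairwise form on NEON (faddp), matched only when the Reduction node does not require strict order (i.e. for Vector API reductions), and a strictly ordered form on SVE (fadda), used both for auto-vectorized loops and for Vector API reductions wider than 128 bits. A rough C++ sketch of the two summation orders, illustrative only and assuming a 4-lane float vector plus an incoming scalar accumulator:

// Illustration only: the two summation orders implied by the rules above,
// for a 4-lane float vector v and the incoming scalar accumulator fsrc.

// Strict order: fsrc + v[0] + v[1] + v[2] + v[3], exactly as a sequential
// Java loop computes it; this is the order sve_fadda preserves.
float reduce_strict(float fsrc, const float v[4]) {
  float acc = fsrc;
  for (int i = 0; i < 4; i++) {
    acc += v[i];
  }
  return acc;
}

// Pairwise order: ((v[0] + v[1]) + (v[2] + v[3])) + fsrc, matching the faddp
// sequence emitted by reduce_non_strict_order_add4F_neon; legal only when the
// node does not require strict order.
float reduce_pairwise(float fsrc, const float v[4]) {
  float lo = v[0] + v[1];
  float hi = v[2] + v[3];
  return (lo + hi) + fsrc;
}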
50 changes: 31 additions & 19 deletions src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
@@ -125,9 +125,9 @@ source %{
(opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
(opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
(opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
// The vector implementation of Op_AddReductionVD/F is for the Vector API only.
// It is not suitable for auto-vectorization because it does not add the elements
// in the same order as sequential code, and FP addition is non-associative.
// The implementations of Op_AddReductionVD/F in Neon are for the Vector API only.
// They are not suitable for auto-vectorization because the result would not conform
// to the JLS, Section Evaluation Order.
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
opcode == Op_MulVL) {
@@ -1752,26 +1752,28 @@ REDUCE_ADD_INT_NEON_SVE_PAIRWISE(I, iRegIorL2I)
REDUCE_ADD_INT_NEON_SVE_PAIRWISE(L, iRegL)

// reduction addF
// Floating-point addition is not associative, so the rules for AddReductionVF
// on NEON can't be used to auto-vectorize floating-point reduce-add.
// Currently, on NEON, AddReductionVF is only generated by Vector API.
instruct reduce_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 2);

instruct reduce_non_strict_order_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
// Non-strictly ordered floating-point add reduction for a 64-bits-long vector. This rule is
// intended for the VectorAPI (which allows for non-strictly ordered add reduction).
predicate(Matcher::vector_length(n->in(2)) == 2 && !n->as_Reduction()->requires_strict_order());
match(Set dst (AddReductionVF fsrc vsrc));
effect(TEMP_DEF dst);
format %{ "reduce_add2F_neon $dst, $fsrc, $vsrc" %}
format %{ "reduce_non_strict_order_add2F_neon $dst, $fsrc, $vsrc" %}
ins_encode %{
__ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ S);
__ fadds($dst$$FloatRegister, $dst$$FloatRegister, $fsrc$$FloatRegister);
%}
ins_pipe(pipe_slow);
%}

instruct reduce_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 4);
instruct reduce_non_strict_order_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
// Non-strictly ordered floating-point add reduction for 128-bits-long vector. This rule is
// intended for the VectorAPI (which allows for non-strictly ordered add reduction).
predicate(Matcher::vector_length(n->in(2)) == 4 && !n->as_Reduction()->requires_strict_order());
match(Set dst (AddReductionVF fsrc vsrc));
effect(TEMP_DEF dst, TEMP tmp);
format %{ "reduce_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
format %{ "reduce_non_strict_order_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
ins_encode %{
__ faddp($tmp$$FloatRegister, __ T4S, $vsrc$$FloatRegister, $vsrc$$FloatRegister);
__ faddp($dst$$FloatRegister, $tmp$$FloatRegister, __ S);
@@ -1783,11 +1785,21 @@ dnl
dnl REDUCE_ADD_FP_SVE($1, $2 )
dnl REDUCE_ADD_FP_SVE(type, size)
define(`REDUCE_ADD_FP_SVE', `
// This rule calculates the reduction result in strict order. Two cases will
// reach here:
// 1. Non strictly-ordered AddReductionV$1 when vector size > 128-bits. For example -
// AddReductionV$1 generated by Vector API. For vector size > 128-bits, it is more
// beneficial performance-wise to generate direct SVE instruction even if it is
// strictly ordered.
// 2. Strictly-ordered AddReductionV$1. For example - AddReductionV$1 generated by
// auto-vectorization on SVE machine.
instruct reduce_add$1_sve(vReg$1 dst_src1, vReg src2) %{
predicate(UseSVE > 0);
predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
n->as_Reduction()->requires_strict_order());
match(Set dst_src1 (AddReductionV$1 dst_src1 src2));
format %{ "reduce_add$1_sve $dst_src1, $dst_src1, $src2" %}
ins_encode %{
assert(UseSVE > 0, "must be sve");
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2);
assert(length_in_bytes == MaxVectorSize, "invalid vector length");
__ sve_fadda($dst_src1$$FloatRegister, __ $2, ptrue, $src2$$FloatRegister);
@@ -1798,14 +1810,14 @@ dnl
REDUCE_ADD_FP_SVE(F, S)

// reduction addD
// Floating-point addition is not associative, so the rule for AddReductionVD
// on NEON can't be used to auto-vectorize floating-point reduce-add.
// Currently, on NEON, AddReductionVD is only generated by Vector API.
instruct reduce_addD_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
predicate(UseSVE == 0);

instruct reduce_non_strict_order_add2D_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
// Non-strictly ordered floating-point add reduction for doubles. This rule is
// intended for the VectorAPI (which allows for non-strictly ordered add reduction).
predicate(!n->as_Reduction()->requires_strict_order());
match(Set dst (AddReductionVD dsrc vsrc));
effect(TEMP_DEF dst);
format %{ "reduce_addD_neon $dst, $dsrc, $vsrc\t# 2D" %}
format %{ "reduce_non_strict_order_add2D_neon $dst, $dsrc, $vsrc\t# 2D" %}
ins_encode %{
__ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ D);
__ faddd($dst$$FloatRegister, $dst$$FloatRegister, $dsrc$$FloatRegister);
1 change: 1 addition & 0 deletions src/hotspot/cpu/aarch64/nativeInst_aarch64.hpp
@@ -168,6 +168,7 @@ class NativeCall: public NativeInstruction {
return_address_offset = 4
};

static int byte_size() { return instruction_size; }
address instruction_address() const { return addr_at(instruction_offset); }
address next_instruction_address() const { return addr_at(return_address_offset); }
int displacement() const { return (int_at(displacement_offset) << 6) >> 4; }
1 change: 1 addition & 0 deletions src/hotspot/cpu/arm/nativeInst_arm_32.hpp
@@ -415,6 +415,7 @@ inline NativeJump* nativeJump_at(address address) {

class NativeCall: public RawNativeCall {
public:
static int byte_size() { return instruction_size; }
// NativeCall::next_instruction_address() is used only to define the
// range where to look for the relocation information. We need not
// walk over composed instructions (as long as the relocation information
2 changes: 2 additions & 0 deletions src/hotspot/cpu/ppc/nativeInst_ppc.hpp
@@ -137,6 +137,8 @@ class NativeCall: public NativeInstruction {
instruction_size = 16 // Used in shared code for calls with reloc_info.
};

static int byte_size() { return instruction_size; }

static bool is_call_at(address a) {
return Assembler::is_bl(*(int*)(a));
}
1 change: 1 addition & 0 deletions src/hotspot/cpu/riscv/nativeInst_riscv.hpp
@@ -126,6 +126,7 @@ class NativeCall: public NativeInstruction {
return_address_offset = 4
};

static int byte_size() { return instruction_size; }
address instruction_address() const { return addr_at(instruction_offset); }
address next_instruction_address() const { return addr_at(return_address_offset); }
address return_address() const { return addr_at(return_address_offset); }
1 change: 1 addition & 0 deletions src/hotspot/cpu/s390/nativeInst_s390.hpp
@@ -212,6 +212,7 @@ class NativeCall: public NativeInstruction {
call_far_pcrelative_displacement_alignment = 4
};

static int byte_size() { return instruction_size; }

// Maximum size (in bytes) of a call to an absolute address.
// Used when emitting call to deopt handler blob, which is a
1 change: 1 addition & 0 deletions src/hotspot/cpu/x86/nativeInst_x86.hpp
@@ -160,6 +160,7 @@ class NativeCall: public NativeInstruction {
return_address_offset = 5
};

static int byte_size() { return instruction_size; }
address instruction_address() const { return addr_at(instruction_offset); }
address next_instruction_address() const { return addr_at(return_address_offset); }
int displacement() const { return (jint) int_at(displacement_offset); }
2 changes: 2 additions & 0 deletions src/hotspot/cpu/zero/nativeInst_zero.hpp
@@ -70,6 +70,8 @@ class NativeCall : public NativeInstruction {
instruction_size = 0 // not used within the interpreter
};

static int byte_size() { return instruction_size; }

address instruction_address() const {
ShouldNotCallThis();
return nullptr;
4 changes: 2 additions & 2 deletions src/hotspot/share/code/nmethod.inline.hpp
@@ -37,15 +37,15 @@ inline bool nmethod::is_deopt_pc(address pc) { return is_deopt_entry(pc) || is_d
inline bool nmethod::is_deopt_entry(address pc) {
return pc == deopt_handler_begin()
#if INCLUDE_JVMCI
|| (is_compiled_by_jvmci() && pc == (deopt_handler_begin() + NativeCall::instruction_size))
|| (is_compiled_by_jvmci() && pc == (deopt_handler_begin() + NativeCall::byte_size()))
#endif
;
}

inline bool nmethod::is_deopt_mh_entry(address pc) {
return pc == deopt_mh_handler_begin()
#if INCLUDE_JVMCI
|| (is_compiled_by_jvmci() && pc == (deopt_mh_handler_begin() + NativeCall::instruction_size))
|| (is_compiled_by_jvmci() && pc == (deopt_mh_handler_begin() + NativeCall::byte_size()))
#endif
;
}
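The nativeInst_*.hpp hunks above give every platform a uniform static accessor, NativeCall::byte_size(), that simply forwards the platform's instruction_size value, and the nmethod.inline.hpp hunk switches the JVMCI deopt-entry checks to call it instead of naming the enum directly. A simplified sketch of the resulting shape, with hypothetical standalone types rather than the real HotSpot declarations:

// Simplified sketch (not the actual HotSpot classes): the call width is now
// queried through a static function rather than the raw enum constant.
typedef unsigned char* address;

class NativeCall {
 public:
  enum { instruction_size = 4 };                       // AArch64 value shown
  static int byte_size() { return instruction_size; }
};

// Mirrors the updated JVMCI branch of nmethod::is_deopt_entry: the entry
// point sits one native call past the start of the deopt handler.
bool is_jvmci_deopt_entry(address pc, address deopt_handler_begin) {
  return pc == deopt_handler_begin + NativeCall::byte_size();
}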
2 changes: 1 addition & 1 deletion src/hotspot/share/gc/g1/g1CollectedHeap.inline.hpp
@@ -227,7 +227,7 @@ void G1CollectedHeap::register_region_with_region_attr(G1HeapRegion* r) {
void G1CollectedHeap::register_old_region_with_region_attr(G1HeapRegion* r) {
assert(!r->has_pinned_objects(), "must be");
assert(r->rem_set()->is_complete(), "must be");
_region_attr.set_in_old(r->hrm_index(), r->rem_set()->is_tracked());
_region_attr.set_in_old(r->hrm_index(), true);
_rem_set->exclude_region_from_scan(r->hrm_index());
}

4 changes: 2 additions & 2 deletions src/hotspot/share/opto/loopnode.hpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 1998, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1998, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -1460,7 +1460,7 @@ class PhaseIdealLoop : public PhaseTransform {
};
AutoVectorizeStatus auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared);

// Move UnorderedReduction out of loop if possible
// Move an unordered Reduction out of loop if possible
void move_unordered_reduction_out_of_loop(IdealLoopTree* loop);

// Create a scheduled list of nodes control dependent on ctrl set.