-
Notifications
You must be signed in to change notification settings - Fork 12.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU: MC support for v_cvt_sr_{f16|bf16}_f32 instructions #117796
Merged
Merged
Conversation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This was referenced Nov 26, 2024
Merged
arsenm
requested review from
jayfoad,
pravinjagtap,
rampitec,
shiltian,
Sisyph and
srpande
November 26, 2024 21:34
@llvm/pr-subscribers-mc @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) ChangesCo-authored-by: Shilei Tian <[email protected]> Full diff: https://github.com/llvm/llvm-project/pull/117796.diff 7 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 83a74b4a435909..6bac2d2b590ffa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -420,6 +420,12 @@ def FeatureF16BF16ToFP6BF6ConversionScaleInsts : SubtargetFeature<"f16bf16-to-fp
"Has f16bf16 to fp6bf6 conversion scale instructions"
>;
+def FeatureF32ToF16BF16ConversionSRInsts : SubtargetFeature<"f32-to-f16bf16-cvt-sr-insts",
+ "HasF32ToF16BF16ConversionSRInsts",
+ "true",
+ "Has f32 to f16bf16 conversion scale instructions"
+>;
+
def FeatureAshrPkInsts : SubtargetFeature<"ashr-pk-insts",
"HasAshrPkInsts",
"true",
@@ -438,6 +444,7 @@ def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts",
FeatureFP4ConversionScaleInsts,
FeatureFP6BF6ConversionScaleInsts,
FeatureF16BF16ToFP6BF6ConversionScaleInsts,
+ FeatureF32ToF16BF16ConversionSRInsts,
FeatureMinimum3Maximum3F32,
FeatureMinimum3Maximum3PKF16
]
@@ -2504,6 +2511,9 @@ def HasFP6BF6ConversionScaleInsts : Predicate<"Subtarget->hasFP6BF6ConversionSca
def HasF16BF16ToFP6BF6ConversionScaleInsts : Predicate<"Subtarget->hasF16BF16ToFP6BF6ConversionScaleInsts()">,
AssemblerPredicate<(all_of FeatureF16BF16ToFP6BF6ConversionScaleInsts)>;
+def HasF32ToF16BF16ConversionSRInsts : Predicate<"Subtarget->hasF32ToF16BF16ConversionSRInsts()">,
+ AssemblerPredicate<(all_of FeatureF32ToF16BF16ConversionSRInsts)>;
+
def HasGDS : Predicate<"Subtarget->hasGDS()">;
def HasGWS : Predicate<"Subtarget->hasGWS()">;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 742f4e6e80f1a9..c5c951b58b8d6d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -55,6 +55,7 @@ class AMDGPUSubtarget {
bool HasFP4ConversionScaleInsts = false;
bool HasFP6BF6ConversionScaleInsts = false;
bool HasF16BF16ToFP6BF6ConversionScaleInsts = false;
+ bool HasF32ToF16BF16ConversionSRInsts = false;
bool EnableRealTrue16Insts = false;
bool HasBF16ConversionInsts = false;
bool HasMadMixInsts = false;
@@ -190,6 +191,10 @@ class AMDGPUSubtarget {
bool hasF16BF16ToFP6BF6ConversionScaleInsts() const { return HasF16BF16ToFP6BF6ConversionScaleInsts; }
+ bool hasF32ToF16BF16ConversionSRInsts() const {
+ return HasF32ToF16BF16ConversionSRInsts;
+ }
+
bool hasMadMacF32Insts() const {
return HasMadMacF32Insts || !isGCN();
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index d08bda37292105..0938a11077cfb1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2850,6 +2850,8 @@ def VOP_V2I16_V2BF16_F32 : VOPProfile<[v2i16, v2bf16, f32, untyped]>;
def VOP_I32_F32_F32_F32 : VOPProfile<[i32, f32, f32, f32]>;
def VOP_I32_V2F16_F32_F32 : VOPProfile<[i32, v2f16, f32, f32]>;
def VOP_I32_V2BF16_F32_F32: VOPProfile<[i32, v2bf16, f32, f32]>;
+def VOP_BF16_F32_I32 : VOPProfile<[bf16, f32, i32, untyped]>;
+def VOP_F16_F32_I32 : VOPProfile<[f16, f32, i32, untyped]>;
def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 237bb0faffff8b..2c441910fe21a8 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1268,6 +1268,12 @@ let SubtargetPredicate = isGFX11Plus in {
defm V_CVT_PK_U16_F32 : VOP3Inst<"v_cvt_pk_u16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>;
} // End SubtargetPredicate = isGFX11Plus
+class VOP3_CVT_SR_FP16_TiedInput_Profile<VOPProfile P> : VOP3_CVT_SCALE_F1632_FP8BF8_TiedInput_Profile<P> {
+ let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0,
+ Int32InputMods:$src1_modifiers, Src1RC64:$src1,
+ VGPR_32:$vdst_in, op_sel0:$op_sel);
+}
+
// FIXME: GlobalISel cannot distinguish f16 and bf16 and may start using bf16 patterns
// instead of less complex f16. Disable GlobalISel for these for now.
def bf16_fpround : PatFrag <(ops node:$src0), (fpround $src0), [{ return true; }]> {
@@ -1292,6 +1298,13 @@ let SubtargetPredicate = HasBF16ConversionInsts in {
(V_CVT_PK_BF16_F32_e64 0, (f32 (V_CVT_F32_F64_e64 $src0_modifiers, $src0)), 0, (f32 (IMPLICIT_DEF)))>;
}
+let SubtargetPredicate = HasF32ToF16BF16ConversionSRInsts in {
+ let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
+ defm V_CVT_SR_F16_F32 : VOP3Inst<"v_cvt_sr_f16_f32", VOP3_CVT_SR_FP16_TiedInput_Profile<VOP_F16_F32_I32>>;
+ defm V_CVT_SR_BF16_F32 : VOP3Inst<"v_cvt_sr_bf16_f32", VOP3_CVT_SR_FP16_TiedInput_Profile<VOP_BF16_F32_I32>>;
+ }
+}
+
let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
defm V_MAXIMUMMINIMUM_F32 : VOP3Inst<"v_maximumminimum_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
defm V_MINIMUMMAXIMUM_F32 : VOP3Inst<"v_minimummaximum_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
@@ -2164,6 +2177,11 @@ defm V_CVT_SCALEF32_PK32_BF6_F16 : VOP3_Real_gfx9<0x25a, "v_cvt_scalef32_pk32_b
defm V_CVT_SCALEF32_PK32_BF6_BF16 : VOP3_Real_gfx9<0x25b, "v_cvt_scalef32_pk32_bf6_bf16">;
}
+let OtherPredicates = [HasF32ToF16BF16ConversionSRInsts] in {
+defm V_CVT_SR_F16_F32 : VOP3OpSel_Real_gfx9 <0x2a6>;
+defm V_CVT_SR_BF16_F32: VOP3OpSel_Real_gfx9 <0x2a7>;
+}
+
defm V_ASHR_PK_I8_I32 : VOP3OpSel_Real_gfx9 <0x265>;
defm V_ASHR_PK_U8_I32 : VOP3OpSel_Real_gfx9 <0x266>;
diff --git a/llvm/test/MC/AMDGPU/gfx950_asm_features.s b/llvm/test/MC/AMDGPU/gfx950_asm_features.s
index 1a776c1050578c..dac7c33eaa6b2e 100644
--- a/llvm/test/MC/AMDGPU/gfx950_asm_features.s
+++ b/llvm/test/MC/AMDGPU/gfx950_asm_features.s
@@ -1375,3 +1375,35 @@ v_cvt_scalef32_sr_pk_fp4_f32 v0, |v[2:3]|, v4, v5
// NOT-GFX950: error: instruction not supported on this GPU
// GFX950: v_cvt_scalef32_sr_pk_fp4_f32 v0, v[2:3], v4, |v5| ; encoding: [0x00,0x04,0x3e,0xd2,0x02,0x09,0x16,0x04]
v_cvt_scalef32_sr_pk_fp4_f32 v0, v[2:3], v4, |v5|
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: v_cvt_sr_f16_f32 v0, v1, v2 ; encoding: [0x00,0x00,0xa6,0xd2,0x01,0x05,0x02,0x00]
+v_cvt_sr_f16_f32 v0, v1, v2
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: v_cvt_sr_bf16_f32 v0, v1, v2 ; encoding: [0x00,0x00,0xa7,0xd2,0x01,0x05,0x02,0x00]
+v_cvt_sr_bf16_f32 v0, v1, v2
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: v_cvt_sr_f16_f32 v0, v1, v2 op_sel:[0,0,1] ; encoding: [0x00,0x40,0xa6,0xd2,0x01,0x05,0x02,0x00]
+v_cvt_sr_f16_f32 v0, v1, v2 op_sel:[0,0,1]
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: v_cvt_sr_bf16_f32 v0, v1, v2 op_sel:[0,0,1] ; encoding: [0x00,0x40,0xa7,0xd2,0x01,0x05,0x02,0x00]
+v_cvt_sr_bf16_f32 v0, v1, v2 op_sel:[0,0,1]
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: v_cvt_sr_f16_f32 v0, -v1, v2 ; encoding: [0x00,0x00,0xa6,0xd2,0x01,0x05,0x02,0x20]
+v_cvt_sr_f16_f32 v0, -v1, v2
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: v_cvt_sr_f16_f32 v0, |v1|, v2 ; encoding: [0x00,0x01,0xa6,0xd2,0x01,0x05,0x02,0x00]
+v_cvt_sr_f16_f32 v0, |v1|, v2
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: v_cvt_sr_bf16_f32 v0, -v1, v2 ; encoding: [0x00,0x00,0xa7,0xd2,0x01,0x05,0x02,0x20]
+v_cvt_sr_bf16_f32 v0, -v1, v2
+
+// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
+// GFX950: v_cvt_sr_bf16_f32 v0, |v1|, v2 ; encoding: [0x00,0x01,0xa7,0xd2,0x01,0x05,0x02,0x00]
+v_cvt_sr_bf16_f32 v0, |v1|, v2
diff --git a/llvm/test/MC/AMDGPU/gfx950_err.s b/llvm/test/MC/AMDGPU/gfx950_err.s
index c5450e48558bfd..55cd57a1bc398e 100644
--- a/llvm/test/MC/AMDGPU/gfx950_err.s
+++ b/llvm/test/MC/AMDGPU/gfx950_err.s
@@ -392,3 +392,9 @@ v_pk_minimum3_f16 v0, s1, s2, v3
// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand (violates constant bus restrictions)
v_pk_maximum3_f16 v0, s1, s2, v3
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_sr_f16_f32 v1, v2, v3 clamp
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_sr_bf16_f32 v1, v2, v3 clamp
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt
index 2b5b4c7770924d..ee9a7c5d2006fd 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt
@@ -1014,3 +1014,27 @@
# GFX950: v_cvt_scalef32_sr_pk_fp4_f32 v0, v[2:3], v4, |v5| ; encoding: [0x00,0x04,0x3e,0xd2,0x02,0x09,0x16,0x04]
0x00,0x04,0x3e,0xd2,0x02,0x09,0x16,0x04
+
+# GFX950: v_cvt_sr_f16_f32 v0, v1, v2 ; encoding: [0x00,0x00,0xa6,0xd2,0x01,0x05,0x02,0x00]
+0x00,0x00,0xa6,0xd2,0x01,0x05,0x02,0x00
+
+# GFX950: v_cvt_sr_bf16_f32 v0, v1, v2 ; encoding: [0x00,0x00,0xa7,0xd2,0x01,0x05,0x02,0x00]
+0x00,0x00,0xa7,0xd2,0x01,0x05,0x02,0x00
+
+# GFX950: v_cvt_sr_f16_f32 v0, v1, v2 op_sel:[0,0,1] ; encoding: [0x00,0x40,0xa6,0xd2,0x01,0x05,0x02,0x00]
+0x00,0x40,0xa6,0xd2,0x01,0x05,0x02,0x00
+
+# GFX950: v_cvt_sr_bf16_f32 v0, v1, v2 op_sel:[0,0,1] ; encoding: [0x00,0x40,0xa7,0xd2,0x01,0x05,0x02,0x00]
+0x00,0x40,0xa7,0xd2,0x01,0x05,0x02,0x00
+
+# GFX950: v_cvt_sr_f16_f32 v0, -v1, v2 ; encoding: [0x00,0x00,0xa6,0xd2,0x01,0x05,0x02,0x20]
+0x00,0x00,0xa6,0xd2,0x01,0x05,0x02,0x20
+
+# GFX950: v_cvt_sr_f16_f32 v0, |v1|, v2 ; encoding: [0x00,0x01,0xa6,0xd2,0x01,0x05,0x02,0x00]
+0x00,0x01,0xa6,0xd2,0x01,0x05,0x02,0x00
+
+# GFX950: v_cvt_sr_bf16_f32 v0, -v1, v2 ; encoding: [0x00,0x00,0xa7,0xd2,0x01,0x05,0x02,0x20]
+0x00,0x00,0xa7,0xd2,0x01,0x05,0x02,0x20
+
+# GFX950: v_cvt_sr_bf16_f32 v0, |v1|, v2 ; encoding: [0x00,0x01,0xa7,0xd2,0x01,0x05,0x02,0x00]
+0x00,0x01,0xa7,0xd2,0x01,0x05,0x02,0x00
|
shiltian
approved these changes
Nov 26, 2024
Merge activity
|
arsenm
force-pushed
the
users/arsenm/gfx950/mc-v_cvt_scale_sr_fp4
branch
from
November 27, 2024 00:38
50432fb
to
244a560
Compare
Base automatically changed from
users/arsenm/gfx950/mc-v_cvt_scale_sr_fp4
to
main
November 27, 2024 00:41
arsenm
force-pushed
the
users/arsenm/gfx950/mc-v_cvt_sr_f16_bf16_f32
branch
from
November 27, 2024 00:42
d880fd9
to
9fddf8c
Compare
Co-authored-by: Shilei Tian <[email protected]>
arsenm
force-pushed
the
users/arsenm/gfx950/mc-v_cvt_sr_f16_bf16_f32
branch
from
November 27, 2024 00:45
9fddf8c
to
d286f79
Compare
This was referenced Nov 27, 2024
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Co-authored-by: Shilei Tian [email protected]