Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AMDGPU: MC support for v_cvt_sr_{f16|bf16}_f32 instructions #117796

Merged
merged 1 commit into from
Nov 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,12 @@ def FeatureF16BF16ToFP6BF6ConversionScaleInsts : SubtargetFeature<"f16bf16-to-fp
"Has f16bf16 to fp6bf6 conversion scale instructions"
>;

// Subtarget feature gating the f32 -> f16/bf16 "SR" conversion instructions
// (v_cvt_sr_f16_f32 / v_cvt_sr_bf16_f32). Setting it stores "true" into the
// HasF32ToF16BF16ConversionSRInsts subtarget field.
// The description previously said "conversion scale instructions", which was
// carried over from the neighbouring scale-conversion features and does not
// describe these instructions.
def FeatureF32ToF16BF16ConversionSRInsts : SubtargetFeature<"f32-to-f16bf16-cvt-sr-insts",
"HasF32ToF16BF16ConversionSRInsts",
"true",
"Has f32 to f16bf16 conversion SR (stochastic rounding) instructions"
>;

def FeatureAshrPkInsts : SubtargetFeature<"ashr-pk-insts",
"HasAshrPkInsts",
"true",
Expand All @@ -438,6 +444,7 @@ def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts",
FeatureFP4ConversionScaleInsts,
FeatureFP6BF6ConversionScaleInsts,
FeatureF16BF16ToFP6BF6ConversionScaleInsts,
FeatureF32ToF16BF16ConversionSRInsts,
FeatureMinimum3Maximum3F32,
FeatureMinimum3Maximum3PKF16
]
Expand Down Expand Up @@ -2504,6 +2511,9 @@ def HasFP6BF6ConversionScaleInsts : Predicate<"Subtarget->hasFP6BF6ConversionSca
def HasF16BF16ToFP6BF6ConversionScaleInsts : Predicate<"Subtarget->hasF16BF16ToFP6BF6ConversionScaleInsts()">,
AssemblerPredicate<(all_of FeatureF16BF16ToFP6BF6ConversionScaleInsts)>;

// Predicate for the f32 -> f16/bf16 SR conversion instructions.
// The Predicate string queries Subtarget->hasF32ToF16BF16ConversionSRInsts();
// the AssemblerPredicate requires FeatureF32ToF16BF16ConversionSRInsts to be
// enabled.
def HasF32ToF16BF16ConversionSRInsts : Predicate<"Subtarget->hasF32ToF16BF16ConversionSRInsts()">,
AssemblerPredicate<(all_of FeatureF32ToF16BF16ConversionSRInsts)>;

def HasGDS : Predicate<"Subtarget->hasGDS()">;

def HasGWS : Predicate<"Subtarget->hasGWS()">;
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ class AMDGPUSubtarget {
bool HasFP4ConversionScaleInsts = false;
bool HasFP6BF6ConversionScaleInsts = false;
bool HasF16BF16ToFP6BF6ConversionScaleInsts = false;
bool HasF32ToF16BF16ConversionSRInsts = false;
bool EnableRealTrue16Insts = false;
bool HasBF16ConversionInsts = false;
bool HasMadMixInsts = false;
Expand Down Expand Up @@ -190,6 +191,10 @@ class AMDGPUSubtarget {

bool hasF16BF16ToFP6BF6ConversionScaleInsts() const { return HasF16BF16ToFP6BF6ConversionScaleInsts; }

/// Returns the value of the HasF32ToF16BF16ConversionSRInsts subtarget flag.
bool hasF32ToF16BF16ConversionSRInsts() const { return HasF32ToF16BF16ConversionSRInsts; }

bool hasMadMacF32Insts() const {
return HasMadMacF32Insts || !isGCN();
}
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -2850,6 +2850,8 @@ def VOP_V2I16_V2BF16_F32 : VOPProfile<[v2i16, v2bf16, f32, untyped]>;
def VOP_I32_F32_F32_F32 : VOPProfile<[i32, f32, f32, f32]>;
def VOP_I32_V2F16_F32_F32 : VOPProfile<[i32, v2f16, f32, f32]>;
def VOP_I32_V2BF16_F32_F32: VOPProfile<[i32, v2bf16, f32, f32]>;
// Profiles used by the v_cvt_sr_{bf16|f16}_f32 definitions.
// NOTE(review): presumably the type list is [result, src0, src1, src2], i.e. a
// bf16/f16 result from an f32 src0 and an i32 src1 with no src2 -- confirm
// against the VOPProfile definition.
def VOP_BF16_F32_I32 : VOPProfile<[bf16, f32, i32, untyped]>;
def VOP_F16_F32_I32 : VOPProfile<[f16, f32, i32, untyped]>;

def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
Expand Down
18 changes: 18 additions & 0 deletions llvm/lib/Target/AMDGPU/VOP3Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1268,6 +1268,12 @@ let SubtargetPredicate = isGFX11Plus in {
defm V_CVT_PK_U16_F32 : VOP3Inst<"v_cvt_pk_u16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>;
} // End SubtargetPredicate = isGFX11Plus

// Profile for the v_cvt_sr_{f16|bf16}_f32 instructions.
// Derives from VOP3_CVT_SCALE_F1632_FP8BF8_TiedInput_Profile and overrides only
// the op_sel input operand list: src0 takes FP32 input modifiers, src1 takes
// Int32 input modifiers, followed by a VGPR_32 $vdst_in operand and the op_sel
// operand.
class VOP3_CVT_SR_FP16_TiedInput_Profile<VOPProfile P> : VOP3_CVT_SCALE_F1632_FP8BF8_TiedInput_Profile<P> {
let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0,
Int32InputMods:$src1_modifiers, Src1RC64:$src1,
VGPR_32:$vdst_in, op_sel0:$op_sel);
}

// FIXME: GlobalISel cannot distinguish f16 and bf16 and may start using bf16 patterns
// instead of less complex f16. Disable GlobalISel for these for now.
def bf16_fpround : PatFrag <(ops node:$src0), (fpround $src0), [{ return true; }]> {
Expand All @@ -1292,6 +1298,13 @@ let SubtargetPredicate = HasBF16ConversionInsts in {
(V_CVT_PK_BF16_F32_e64 0, (f32 (V_CVT_F32_F64_e64 $src0_modifiers, $src0)), 0, (f32 (IMPLICIT_DEF)))>;
}

// Pseudo definitions of the f32 -> f16/bf16 SR conversions, available only when
// HasF32ToF16BF16ConversionSRInsts holds.
let SubtargetPredicate = HasF32ToF16BF16ConversionSRInsts in {
// $vdst is tied to the $vdst_in input operand, and $vdst_in is excluded from
// the encoding.
// NOTE(review): presumably the tie exists because op_sel selects which half of
// the 32-bit destination is written and the other half must be preserved --
// confirm.
let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
defm V_CVT_SR_F16_F32 : VOP3Inst<"v_cvt_sr_f16_f32", VOP3_CVT_SR_FP16_TiedInput_Profile<VOP_F16_F32_I32>>;
defm V_CVT_SR_BF16_F32 : VOP3Inst<"v_cvt_sr_bf16_f32", VOP3_CVT_SR_FP16_TiedInput_Profile<VOP_BF16_F32_I32>>;
}
}

let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
defm V_MAXIMUMMINIMUM_F32 : VOP3Inst<"v_maximumminimum_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
defm V_MINIMUMMAXIMUM_F32 : VOP3Inst<"v_minimummaximum_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
Expand Down Expand Up @@ -2164,6 +2177,11 @@ defm V_CVT_SCALEF32_PK32_BF6_F16 : VOP3_Real_gfx9<0x25a, "v_cvt_scalef32_pk32_b
defm V_CVT_SCALEF32_PK32_BF6_BF16 : VOP3_Real_gfx9<0x25b, "v_cvt_scalef32_pk32_bf6_bf16">;
}

// Real (encoded) definitions of the f32 -> f16/bf16 SR conversions, using the
// op_sel-capable gfx9 VOP3 multiclass with opcodes 0x2a6 (f16) and 0x2a7 (bf16).
// Guarded by HasF32ToF16BF16ConversionSRInsts via OtherPredicates.
let OtherPredicates = [HasF32ToF16BF16ConversionSRInsts] in {
defm V_CVT_SR_F16_F32 : VOP3OpSel_Real_gfx9 <0x2a6>;
defm V_CVT_SR_BF16_F32: VOP3OpSel_Real_gfx9 <0x2a7>;
}

defm V_ASHR_PK_I8_I32 : VOP3OpSel_Real_gfx9 <0x265>;
defm V_ASHR_PK_U8_I32 : VOP3OpSel_Real_gfx9 <0x266>;

Expand Down
32 changes: 32 additions & 0 deletions llvm/test/MC/AMDGPU/gfx950_asm_features.s
Original file line number Diff line number Diff line change
Expand Up @@ -1375,3 +1375,35 @@ v_cvt_scalef32_sr_pk_fp4_f32 v0, |v[2:3]|, v4, v5
// NOT-GFX950: error: instruction not supported on this GPU
// GFX950: v_cvt_scalef32_sr_pk_fp4_f32 v0, v[2:3], v4, |v5| ; encoding: [0x00,0x04,0x3e,0xd2,0x02,0x09,0x16,0x04]
v_cvt_scalef32_sr_pk_fp4_f32 v0, v[2:3], v4, |v5|

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_cvt_sr_f16_f32 v0, v1, v2 ; encoding: [0x00,0x00,0xa6,0xd2,0x01,0x05,0x02,0x00]
v_cvt_sr_f16_f32 v0, v1, v2

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_cvt_sr_bf16_f32 v0, v1, v2 ; encoding: [0x00,0x00,0xa7,0xd2,0x01,0x05,0x02,0x00]
v_cvt_sr_bf16_f32 v0, v1, v2

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_cvt_sr_f16_f32 v0, v1, v2 op_sel:[0,0,1] ; encoding: [0x00,0x40,0xa6,0xd2,0x01,0x05,0x02,0x00]
v_cvt_sr_f16_f32 v0, v1, v2 op_sel:[0,0,1]

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_cvt_sr_bf16_f32 v0, v1, v2 op_sel:[0,0,1] ; encoding: [0x00,0x40,0xa7,0xd2,0x01,0x05,0x02,0x00]
v_cvt_sr_bf16_f32 v0, v1, v2 op_sel:[0,0,1]

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_cvt_sr_f16_f32 v0, -v1, v2 ; encoding: [0x00,0x00,0xa6,0xd2,0x01,0x05,0x02,0x20]
v_cvt_sr_f16_f32 v0, -v1, v2

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_cvt_sr_f16_f32 v0, |v1|, v2 ; encoding: [0x00,0x01,0xa6,0xd2,0x01,0x05,0x02,0x00]
v_cvt_sr_f16_f32 v0, |v1|, v2

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_cvt_sr_bf16_f32 v0, -v1, v2 ; encoding: [0x00,0x00,0xa7,0xd2,0x01,0x05,0x02,0x20]
v_cvt_sr_bf16_f32 v0, -v1, v2

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_cvt_sr_bf16_f32 v0, |v1|, v2 ; encoding: [0x00,0x01,0xa7,0xd2,0x01,0x05,0x02,0x00]
v_cvt_sr_bf16_f32 v0, |v1|, v2
6 changes: 6 additions & 0 deletions llvm/test/MC/AMDGPU/gfx950_err.s
Original file line number Diff line number Diff line change
Expand Up @@ -392,3 +392,9 @@ v_pk_minimum3_f16 v0, s1, s2, v3

// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand (violates constant bus restrictions)
v_pk_maximum3_f16 v0, s1, s2, v3

// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
v_cvt_sr_f16_f32 v1, v2, v3 clamp

// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
v_cvt_sr_bf16_f32 v1, v2, v3 clamp
24 changes: 24 additions & 0 deletions llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1014,3 +1014,27 @@

# GFX950: v_cvt_scalef32_sr_pk_fp4_f32 v0, v[2:3], v4, |v5| ; encoding: [0x00,0x04,0x3e,0xd2,0x02,0x09,0x16,0x04]
0x00,0x04,0x3e,0xd2,0x02,0x09,0x16,0x04

# GFX950: v_cvt_sr_f16_f32 v0, v1, v2 ; encoding: [0x00,0x00,0xa6,0xd2,0x01,0x05,0x02,0x00]
0x00,0x00,0xa6,0xd2,0x01,0x05,0x02,0x00

# GFX950: v_cvt_sr_bf16_f32 v0, v1, v2 ; encoding: [0x00,0x00,0xa7,0xd2,0x01,0x05,0x02,0x00]
0x00,0x00,0xa7,0xd2,0x01,0x05,0x02,0x00

# GFX950: v_cvt_sr_f16_f32 v0, v1, v2 op_sel:[0,0,1] ; encoding: [0x00,0x40,0xa6,0xd2,0x01,0x05,0x02,0x00]
0x00,0x40,0xa6,0xd2,0x01,0x05,0x02,0x00

# GFX950: v_cvt_sr_bf16_f32 v0, v1, v2 op_sel:[0,0,1] ; encoding: [0x00,0x40,0xa7,0xd2,0x01,0x05,0x02,0x00]
0x00,0x40,0xa7,0xd2,0x01,0x05,0x02,0x00

# GFX950: v_cvt_sr_f16_f32 v0, -v1, v2 ; encoding: [0x00,0x00,0xa6,0xd2,0x01,0x05,0x02,0x20]
0x00,0x00,0xa6,0xd2,0x01,0x05,0x02,0x20

# GFX950: v_cvt_sr_f16_f32 v0, |v1|, v2 ; encoding: [0x00,0x01,0xa6,0xd2,0x01,0x05,0x02,0x00]
0x00,0x01,0xa6,0xd2,0x01,0x05,0x02,0x00

# GFX950: v_cvt_sr_bf16_f32 v0, -v1, v2 ; encoding: [0x00,0x00,0xa7,0xd2,0x01,0x05,0x02,0x20]
0x00,0x00,0xa7,0xd2,0x01,0x05,0x02,0x20

# GFX950: v_cvt_sr_bf16_f32 v0, |v1|, v2 ; encoding: [0x00,0x01,0xa7,0xd2,0x01,0x05,0x02,0x00]
0x00,0x01,0xa7,0xd2,0x01,0x05,0x02,0x00