Skip to content

Commit

Permalink
AMDGPU: MC support for v_cvt_sr_{f16|bf16}_f32 instructions
Browse files Browse the repository at this point in the history
Co-authored-by: Shilei Tian <[email protected]>
  • Loading branch information
shiltian authored and arsenm committed Nov 27, 2024
1 parent e335563 commit d286f79
Show file tree
Hide file tree
Showing 7 changed files with 97 additions and 0 deletions.
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,12 @@ def FeatureF16BF16ToFP6BF6ConversionScaleInsts : SubtargetFeature<"f16bf16-to-fp
"Has f16bf16 to fp6bf6 conversion scale instructions"
>;

// Subtarget feature that gates the f32 -> f16/bf16 "cvt_sr" conversion
// instructions (v_cvt_sr_f16_f32 / v_cvt_sr_bf16_f32). Setting it assigns
// "true" to the HasF32ToF16BF16ConversionSRInsts subtarget field.
// The description previously said "conversion scale instructions", which was
// copied from the neighbouring *ConversionScaleInsts features; these are the
// stochastic-rounding (SR) conversions, not the scale conversions.
def FeatureF32ToF16BF16ConversionSRInsts : SubtargetFeature<"f32-to-f16bf16-cvt-sr-insts",
"HasF32ToF16BF16ConversionSRInsts",
"true",
"Has f32 to f16bf16 stochastic rounding conversion instructions"
>;

def FeatureAshrPkInsts : SubtargetFeature<"ashr-pk-insts",
"HasAshrPkInsts",
"true",
Expand All @@ -438,6 +444,7 @@ def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts",
FeatureFP4ConversionScaleInsts,
FeatureFP6BF6ConversionScaleInsts,
FeatureF16BF16ToFP6BF6ConversionScaleInsts,
FeatureF32ToF16BF16ConversionSRInsts,
FeatureMinimum3Maximum3F32,
FeatureMinimum3Maximum3PKF16
]
Expand Down Expand Up @@ -2504,6 +2511,9 @@ def HasFP6BF6ConversionScaleInsts : Predicate<"Subtarget->hasFP6BF6ConversionSca
def HasF16BF16ToFP6BF6ConversionScaleInsts : Predicate<"Subtarget->hasF16BF16ToFP6BF6ConversionScaleInsts()">,
AssemblerPredicate<(all_of FeatureF16BF16ToFP6BF6ConversionScaleInsts)>;

// Predicate for the f32 -> f16/bf16 cvt_sr instructions: the code-generation
// side queries Subtarget->hasF32ToF16BF16ConversionSRInsts(), and the
// assembler side requires FeatureF32ToF16BF16ConversionSRInsts.
def HasF32ToF16BF16ConversionSRInsts : Predicate<"Subtarget->hasF32ToF16BF16ConversionSRInsts()">,
AssemblerPredicate<(all_of FeatureF32ToF16BF16ConversionSRInsts)>;

def HasGDS : Predicate<"Subtarget->hasGDS()">;

def HasGWS : Predicate<"Subtarget->hasGWS()">;
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ class AMDGPUSubtarget {
bool HasFP4ConversionScaleInsts = false;
bool HasFP6BF6ConversionScaleInsts = false;
bool HasF16BF16ToFP6BF6ConversionScaleInsts = false;
bool HasF32ToF16BF16ConversionSRInsts = false;
bool EnableRealTrue16Insts = false;
bool HasBF16ConversionInsts = false;
bool HasMadMixInsts = false;
Expand Down Expand Up @@ -190,6 +191,10 @@ class AMDGPUSubtarget {

bool hasF16BF16ToFP6BF6ConversionScaleInsts() const { return HasF16BF16ToFP6BF6ConversionScaleInsts; }

// Returns the HasF32ToF16BF16ConversionSRInsts subtarget flag.
bool hasF32ToF16BF16ConversionSRInsts() const { return HasF32ToF16BF16ConversionSRInsts; }

// True when the HasMadMacF32Insts flag is set, or whenever isGCN() returns
// false.
bool hasMadMacF32Insts() const {
return HasMadMacF32Insts || !isGCN();
}
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -2850,6 +2850,8 @@ def VOP_V2I16_V2BF16_F32 : VOPProfile<[v2i16, v2bf16, f32, untyped]>;
def VOP_I32_F32_F32_F32 : VOPProfile<[i32, f32, f32, f32]>;
def VOP_I32_V2F16_F32_F32 : VOPProfile<[i32, v2f16, f32, f32]>;
def VOP_I32_V2BF16_F32_F32: VOPProfile<[i32, v2bf16, f32, f32]>;
// Profiles used by the v_cvt_sr_bf16_f32 / v_cvt_sr_f16_f32 definitions.
// NOTE(review): the type list is presumably [result, src0, src1, src2], i.e. a
// bf16/f16 result from an f32 src0 and an i32 src1 with no src2 -- confirm
// against the VOPProfile class.
def VOP_BF16_F32_I32 : VOPProfile<[bf16, f32, i32, untyped]>;
def VOP_F16_F32_I32 : VOPProfile<[f16, f32, i32, untyped]>;

def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
Expand Down
18 changes: 18 additions & 0 deletions llvm/lib/Target/AMDGPU/VOP3Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1268,6 +1268,12 @@ let SubtargetPredicate = isGFX11Plus in {
defm V_CVT_PK_U16_F32 : VOP3Inst<"v_cvt_pk_u16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>;
} // End SubtargetPredicate = isGFX11Plus

// Profile for the cvt_sr f16/bf16 instructions. It inherits everything from
// VOP3_CVT_SCALE_F1632_FP8BF8_TiedInput_Profile and overrides only the op_sel
// input operand list: src0 with FP32 input modifiers, src1 with Int32 input
// modifiers, a VGPR_32 $vdst_in operand, and the op_sel operand.
class VOP3_CVT_SR_FP16_TiedInput_Profile<VOPProfile P> : VOP3_CVT_SCALE_F1632_FP8BF8_TiedInput_Profile<P> {
let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0,
Int32InputMods:$src1_modifiers, Src1RC64:$src1,
VGPR_32:$vdst_in, op_sel0:$op_sel);
}

// FIXME: GlobalISel cannot distinguish f16 and bf16 and may start using bf16 patterns
// instead of less complex f16. Disable GlobalISel for these for now.
def bf16_fpround : PatFrag <(ops node:$src0), (fpround $src0), [{ return true; }]> {
Expand All @@ -1292,6 +1298,13 @@ let SubtargetPredicate = HasBF16ConversionInsts in {
(V_CVT_PK_BF16_F32_e64 0, (f32 (V_CVT_F32_F64_e64 $src0_modifiers, $src0)), 0, (f32 (IMPLICIT_DEF)))>;
}

// v_cvt_sr_f16_f32 / v_cvt_sr_bf16_f32, available only when
// HasF32ToF16BF16ConversionSRInsts holds. $vdst is constrained to be the same
// register as the $vdst_in input, and $vdst_in is left out of the encoding.
let SubtargetPredicate = HasF32ToF16BF16ConversionSRInsts in {
let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
defm V_CVT_SR_F16_F32 : VOP3Inst<"v_cvt_sr_f16_f32", VOP3_CVT_SR_FP16_TiedInput_Profile<VOP_F16_F32_I32>>;
defm V_CVT_SR_BF16_F32 : VOP3Inst<"v_cvt_sr_bf16_f32", VOP3_CVT_SR_FP16_TiedInput_Profile<VOP_BF16_F32_I32>>;
} // End Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in"
} // End SubtargetPredicate = HasF32ToF16BF16ConversionSRInsts

let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
defm V_MAXIMUMMINIMUM_F32 : VOP3Inst<"v_maximumminimum_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
defm V_MINIMUMMAXIMUM_F32 : VOP3Inst<"v_minimummaximum_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
Expand Down Expand Up @@ -2164,6 +2177,11 @@ defm V_CVT_SCALEF32_PK32_BF6_F16 : VOP3_Real_gfx9<0x25a, "v_cvt_scalef32_pk32_b
defm V_CVT_SCALEF32_PK32_BF6_BF16 : VOP3_Real_gfx9<0x25b, "v_cvt_scalef32_pk32_bf6_bf16">;
}

// gfx9 real definitions for the cvt_sr f16/bf16 instructions, gated on
// HasF32ToF16BF16ConversionSRInsts.
// NOTE(review): the template argument is presumably the VOP3 opcode
// (0x2a6 for f16, 0x2a7 for bf16) -- confirm against VOP3OpSel_Real_gfx9.
let OtherPredicates = [HasF32ToF16BF16ConversionSRInsts] in {
defm V_CVT_SR_F16_F32 : VOP3OpSel_Real_gfx9 <0x2a6>;
defm V_CVT_SR_BF16_F32: VOP3OpSel_Real_gfx9 <0x2a7>;
} // End OtherPredicates = [HasF32ToF16BF16ConversionSRInsts]

defm V_ASHR_PK_I8_I32 : VOP3OpSel_Real_gfx9 <0x265>;
defm V_ASHR_PK_U8_I32 : VOP3OpSel_Real_gfx9 <0x266>;

Expand Down
32 changes: 32 additions & 0 deletions llvm/test/MC/AMDGPU/gfx950_asm_features.s
Original file line number Diff line number Diff line change
Expand Up @@ -1375,3 +1375,35 @@ v_cvt_scalef32_sr_pk_fp4_f32 v0, |v[2:3]|, v4, v5
// NOT-GFX950: error: instruction not supported on this GPU
// GFX950: v_cvt_scalef32_sr_pk_fp4_f32 v0, v[2:3], v4, |v5| ; encoding: [0x00,0x04,0x3e,0xd2,0x02,0x09,0x16,0x04]
v_cvt_scalef32_sr_pk_fp4_f32 v0, v[2:3], v4, |v5|

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_cvt_sr_f16_f32 v0, v1, v2 ; encoding: [0x00,0x00,0xa6,0xd2,0x01,0x05,0x02,0x00]
v_cvt_sr_f16_f32 v0, v1, v2

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_cvt_sr_bf16_f32 v0, v1, v2 ; encoding: [0x00,0x00,0xa7,0xd2,0x01,0x05,0x02,0x00]
v_cvt_sr_bf16_f32 v0, v1, v2

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_cvt_sr_f16_f32 v0, v1, v2 op_sel:[0,0,1] ; encoding: [0x00,0x40,0xa6,0xd2,0x01,0x05,0x02,0x00]
v_cvt_sr_f16_f32 v0, v1, v2 op_sel:[0,0,1]

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_cvt_sr_bf16_f32 v0, v1, v2 op_sel:[0,0,1] ; encoding: [0x00,0x40,0xa7,0xd2,0x01,0x05,0x02,0x00]
v_cvt_sr_bf16_f32 v0, v1, v2 op_sel:[0,0,1]

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_cvt_sr_f16_f32 v0, -v1, v2 ; encoding: [0x00,0x00,0xa6,0xd2,0x01,0x05,0x02,0x20]
v_cvt_sr_f16_f32 v0, -v1, v2

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_cvt_sr_f16_f32 v0, |v1|, v2 ; encoding: [0x00,0x01,0xa6,0xd2,0x01,0x05,0x02,0x00]
v_cvt_sr_f16_f32 v0, |v1|, v2

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_cvt_sr_bf16_f32 v0, -v1, v2 ; encoding: [0x00,0x00,0xa7,0xd2,0x01,0x05,0x02,0x20]
v_cvt_sr_bf16_f32 v0, -v1, v2

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_cvt_sr_bf16_f32 v0, |v1|, v2 ; encoding: [0x00,0x01,0xa7,0xd2,0x01,0x05,0x02,0x00]
v_cvt_sr_bf16_f32 v0, |v1|, v2
6 changes: 6 additions & 0 deletions llvm/test/MC/AMDGPU/gfx950_err.s
Original file line number Diff line number Diff line change
Expand Up @@ -392,3 +392,9 @@ v_pk_minimum3_f16 v0, s1, s2, v3

// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand (violates constant bus restrictions)
v_pk_maximum3_f16 v0, s1, s2, v3

// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
v_cvt_sr_f16_f32 v1, v2, v3 clamp

// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
v_cvt_sr_bf16_f32 v1, v2, v3 clamp
24 changes: 24 additions & 0 deletions llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1014,3 +1014,27 @@

# GFX950: v_cvt_scalef32_sr_pk_fp4_f32 v0, v[2:3], v4, |v5| ; encoding: [0x00,0x04,0x3e,0xd2,0x02,0x09,0x16,0x04]
0x00,0x04,0x3e,0xd2,0x02,0x09,0x16,0x04

# GFX950: v_cvt_sr_f16_f32 v0, v1, v2 ; encoding: [0x00,0x00,0xa6,0xd2,0x01,0x05,0x02,0x00]
0x00,0x00,0xa6,0xd2,0x01,0x05,0x02,0x00

# GFX950: v_cvt_sr_bf16_f32 v0, v1, v2 ; encoding: [0x00,0x00,0xa7,0xd2,0x01,0x05,0x02,0x00]
0x00,0x00,0xa7,0xd2,0x01,0x05,0x02,0x00

# GFX950: v_cvt_sr_f16_f32 v0, v1, v2 op_sel:[0,0,1] ; encoding: [0x00,0x40,0xa6,0xd2,0x01,0x05,0x02,0x00]
0x00,0x40,0xa6,0xd2,0x01,0x05,0x02,0x00

# GFX950: v_cvt_sr_bf16_f32 v0, v1, v2 op_sel:[0,0,1] ; encoding: [0x00,0x40,0xa7,0xd2,0x01,0x05,0x02,0x00]
0x00,0x40,0xa7,0xd2,0x01,0x05,0x02,0x00

# GFX950: v_cvt_sr_f16_f32 v0, -v1, v2 ; encoding: [0x00,0x00,0xa6,0xd2,0x01,0x05,0x02,0x20]
0x00,0x00,0xa6,0xd2,0x01,0x05,0x02,0x20

# GFX950: v_cvt_sr_f16_f32 v0, |v1|, v2 ; encoding: [0x00,0x01,0xa6,0xd2,0x01,0x05,0x02,0x00]
0x00,0x01,0xa6,0xd2,0x01,0x05,0x02,0x00

# GFX950: v_cvt_sr_bf16_f32 v0, -v1, v2 ; encoding: [0x00,0x00,0xa7,0xd2,0x01,0x05,0x02,0x20]
0x00,0x00,0xa7,0xd2,0x01,0x05,0x02,0x20

# GFX950: v_cvt_sr_bf16_f32 v0, |v1|, v2 ; encoding: [0x00,0x01,0xa7,0xd2,0x01,0x05,0x02,0x00]
0x00,0x01,0xa7,0xd2,0x01,0x05,0x02,0x00

0 comments on commit d286f79

Please sign in to comment.