-
Notifications
You must be signed in to change notification settings - Fork 14.3k
AMDGPU: Allocate different registers for vdst & src in v_cvt_scalef32* #117822
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
arsenm
merged 1 commit into
main
from
users/arsenm/gfx950/allocate-different-regs-v_cvt_scalef32
Nov 27, 2024
Merged
AMDGPU: Allocate different registers for vdst & src in v_cvt_scalef32* #117822
arsenm
merged 1 commit into
main
from
users/arsenm/gfx950/allocate-different-regs-v_cvt_scalef32
Nov 27, 2024
Conversation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This was referenced Nov 27, 2024
Merged
This was referenced Nov 27, 2024
@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes: For multipass instructions, overlap on VDST and SRC’s would result in HW race & undefined results. Co-authored-by: Pravin Jagtap <[email protected]> Patch is 68.02 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/117822.diff 5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 00caea1f923391..9ef52c0feb7233 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1088,9 +1088,11 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in
defm V_CVT_SCALEF32_PK_F32_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f32_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f32>>;
let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f32", VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<VOP_I32_F32_F32_F32>>;
- defm V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>;
- defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>;
- defm V_CVT_SCALEF32_SR_PK_FP4_F32: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile>;
+ let Constraints = "@earlyclobber $vdst" in {
+ defm V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>;
+ defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>;
+ defm V_CVT_SCALEF32_SR_PK_FP4_F32: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile>;
+ }
}
defm V_CVT_SCALEF32_PK_F16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f16>>;
defm V_CVT_SCALEF32_PK_BF16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_bf16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2bf16>>;
@@ -1103,7 +1105,7 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in
}
}
-let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0 in {
+let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0, Constraints = "@earlyclobber $vdst" in {
defm V_CVT_SCALEF32_PK32_F32_FP6 : VOP3Inst<"v_cvt_scalef32_pk32_f32_fp6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F32_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_f32_fp6>;
defm V_CVT_SCALEF32_PK32_F32_BF6 : VOP3Inst<"v_cvt_scalef32_pk32_f32_bf6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F32_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_f32_bf6>;
defm V_CVT_SCALEF32_PK32_F16_FP6 : VOP3Inst<"v_cvt_scalef32_pk32_f16_fp6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F16_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_f16_fp6>;
@@ -1112,7 +1114,7 @@ let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0
defm V_CVT_SCALEF32_PK32_BF16_BF6 : VOP3Inst<"v_cvt_scalef32_pk32_bf16_bf6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32BF16_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_bf16_bf6>;
}
-let SubtargetPredicate = HasF16BF16ToFP6BF6ConversionScaleInsts, mayRaiseFPException = 0 in {
+let SubtargetPredicate = HasF16BF16ToFP6BF6ConversionScaleInsts, mayRaiseFPException = 0, Constraints = "@earlyclobber $vdst" in {
defm V_CVT_SCALEF32_PK32_FP6_F16 : VOP3Inst<"v_cvt_scalef32_pk32_fp6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_F32>, int_amdgcn_cvt_scalef32_pk32_fp6_f16>;
defm V_CVT_SCALEF32_PK32_BF6_F16 : VOP3Inst<"v_cvt_scalef32_pk32_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_F32>, int_amdgcn_cvt_scalef32_pk32_bf6_f16>;
defm V_CVT_SCALEF32_PK32_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_pk32_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32BF16_F32>, int_amdgcn_cvt_scalef32_pk32_fp6_bf16>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
index 6d627186d25816..f80f2935856e36 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
@@ -864,31 +864,91 @@ define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte3(i32 %src, float %scale) {
}
define <32 x float> @test_cvt_scale_pk32_f32_fp6(<6 x i32> %src, float %scale) {
-; GCN-LABEL: test_cvt_scale_pk32_f32_fp6:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[0:5], v6
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-SDAG-LABEL: test_cvt_scale_pk32_f32_fp6:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v38, v6
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v37, v5
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v36, v4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v35, v3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v34, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v33, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v32, v0
+; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[32:37], v38
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: test_cvt_scale_pk32_f32_fp6:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v32, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v33, v1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v34, v2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v35, v3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v36, v4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v37, v5
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v38, v6
+; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[32:37], v38
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.fp6(<6 x i32> %src, float %scale)
ret <32 x float> %ret
}
define <32 x float> @test_cvt_scale_pk32_f32_bf6(<6 x i32> %src, float %scale) {
-; GCN-LABEL: test_cvt_scale_pk32_f32_bf6:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[0:5], v6
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-SDAG-LABEL: test_cvt_scale_pk32_f32_bf6:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v38, v6
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v37, v5
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v36, v4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v35, v3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v34, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v33, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v32, v0
+; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[32:37], v38
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: test_cvt_scale_pk32_f32_bf6:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v32, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v33, v1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v34, v2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v35, v3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v36, v4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v37, v5
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v38, v6
+; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[32:37], v38
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.bf6(<6 x i32> %src, float %scale)
ret <32 x float> %ret
}
define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_vv(<6 x i32> %src, float %scale) {
-; GCN-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[0:5], v6
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0
+; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, v1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, v2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, v3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v5
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, v6
+; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> %src, float %scale)
ret <32 x half> %ret
}
@@ -897,14 +957,14 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_sl(<6 x i32> inreg %src) {
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_fp6_sl:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s3
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s16
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s17
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17
; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000
-; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[0:5], s0
+; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], s0
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_fp6_sl:
@@ -912,11 +972,11 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_sl(<6 x i32> inreg %src) {
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: s_mov_b32 s4, s16
; GFX950-GISEL-NEXT: s_mov_b32 s5, s17
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, 0x42c80000
-; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[0:5], v6
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, 0x42c80000
+; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> %src, float 100.0)
ret <32 x half> %ret
@@ -926,7 +986,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_vv(<6 x i32> %src, float %
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[0:5], v6
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0
+; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], v22
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv:
@@ -958,14 +1025,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_sl(<6 x i32> inreg %src) {
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s3
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s16
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s17
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17
; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000
-; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[0:5], s0
+; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], s0
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl:
@@ -1000,11 +1067,31 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_sl(<6 x i32> inreg %src) {
}
define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_vv(<6 x i32> %src, float %scale) {
-; GCN-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[0:5], v6
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0
+; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, v1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, v2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, v3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v5
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, v6
+; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> %src, float %scale)
ret <32 x half> %ret
}
@@ -1013,14 +1100,14 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_sl(<6 x i32> inreg %src) {
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_bf6_sl:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s3
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s16
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s17
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17
; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000
-; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[0:5], s0
+; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], s0
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_bf6_sl:
@@ -1028,11 +1115,11 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_sl(<6 x i32> inreg %src) {
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: s_mov_b32 s4, s16
; GFX950-GISEL-NEXT: s_mov_b32 s5, s17
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, 0x42c80000
-; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[0:5], v6
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, 0x42c80000
+; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> %src, float 100.0)
ret <32 x half> %ret
@@ -1042,7 +1129,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_vv(<6 x i32> %src, float %
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[0:5], v6
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0
+; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], v22
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv:
@@ -1074,14 +1168,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_sl(<6 x i32> inreg %src) {
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_bf6_sl:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s3
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s16
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s17
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17
; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000
-; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[0:5], s0
+; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], s0
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_bf6_sl:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll
index 4153bc8f43563b..f9fd7e253b1243 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll
@@ -10,24 +10,24 @@ declare <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.f16(<32 x half> %src, float
define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_vv(<32 x bfloat> %src, float %scale, ptr addrspace(1) %out) {
; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_bf16_vv:
; GFX950-SDAG: ; %bb.0:
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v18
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v17
-; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[0:5], v[0:15], v16
-; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16
-; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v25, v18
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v24, v17
+; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[0:15], v16
+; GFX950-SDAG-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v[24:25], v[18:21], off
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_bf16_vv:
; GFX950-GISEL: ; %bb.0:
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v17
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v18
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, v17
+; GFX950-GISEL-NEXT: v_mov_b...
[truncated]
|
3e03537
to
b26cbd2
Compare
418ffe2
to
bbc7178
Compare
shiltian
approved these changes
Nov 27, 2024
b26cbd2
to
f123c3b
Compare
bbc7178
to
7ce3533
Compare
This was referenced Nov 27, 2024
f123c3b
to
1d0d9b2
Compare
Base automatically changed from
users/arsenm/gfx950/codegen-v_cvt_scalef32_sr_bf8_fp8_f16_bf16_f32
to
main
November 27, 2024 04:24
For multipass instructions, overlap on VDST and SRC’s would result in HW race & undefined results. Co-authored-by: Pravin Jagtap <[email protected]>
7ce3533
to
ddb0c55
Compare
This was referenced Dec 2, 2024
searlmc1
pushed a commit
to ROCm/llvm-project
that referenced
this pull request
Feb 3, 2025
AMDGPU: Allocate different registers for vdst & src in v_cvt_scalef32* (llvm#117822) For multipass instructions, overlap on VDST and SRC’s would result in HW race & undefined results. Co-authored-by: Pravin Jagtap <[email protected]>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
For multipass instructions, overlap on VDST and SRC’s
would result in HW race & undefined results.
Co-authored-by: Pravin Jagtap [email protected]