Skip to content

Commit 5208f72

Browse files
authored
[AMDGPU] Fix SIFoldOperandsImpl::canUseImmWithOpSel() for VOP3 packed [B]F16 imms. (#142142)
VOP3 instructions ignore opsel source modifiers, so a constant that contains two different [B]F16 imms cannot be encoded into an instruction with an src opsel. E.g., without the fix, the following instructions `s_mov_b32 s0, 0x40003c00 // <half 1.0, half 2.0>` `v_cvt_scalef32_pk_fp8_f16 v0, s0, v2` lose the `2.0` imm and are folded into `v_cvt_scalef32_pk_fp8_f16 v1, 1.0, 1.0`. Fixes SWDEV-531672.
1 parent 597340b commit 5208f72

File tree

2 files changed

+158
-0
lines changed

2 files changed

+158
-0
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,12 @@ bool SIFoldOperandsImpl::canUseImmWithOpSel(FoldCandidate &Fold) const {
374374
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
375375
case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
376376
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
377+
// VOP3 packed instructions ignore op_sel source modifiers, so we cannot
378+
// encode two different constants.
379+
if ((TSFlags & SIInstrFlags::VOP3) && !(TSFlags & SIInstrFlags::VOP3P) &&
380+
static_cast<uint16_t>(Fold.ImmToFold) !=
381+
static_cast<uint16_t>(Fold.ImmToFold >> 16))
382+
return false;
377383
break;
378384
}
379385

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -601,6 +601,34 @@ define <2 x i16> @test_cvt_scalef32_pk_fp8_f16_word1(<2 x i16> %old, <2 x half>
601601
ret <2 x i16> %ret
602602
}
603603

604+
define <2 x i16> @test_cvt_scalef32_pk_fp8_f16_imm1(<2 x i16> %old, float %scale) {
605+
; GCN-LABEL: test_cvt_scalef32_pk_fp8_f16_imm1:
606+
; GCN: ; %bb.0:
607+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
608+
; GCN-NEXT: v_cvt_scalef32_pk_fp8_f16 v0, 4.0, v1
609+
; GCN-NEXT: s_setpc_b64 s[30:31]
610+
%ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16> %old, <2 x half> <half 4.0, half 4.0>, float %scale, i1 false)
611+
ret <2 x i16> %ret
612+
}
613+
614+
define <2 x i16> @test_cvt_scalef32_pk_fp8_f16_imm2(<2 x i16> %old, float %scale) {
615+
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk_fp8_f16_imm2:
616+
; GFX950-SDAG: ; %bb.0:
617+
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
618+
; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x40004400
619+
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk_fp8_f16 v0, s0, v1
620+
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
621+
;
622+
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk_fp8_f16_imm2:
623+
; GFX950-GISEL: ; %bb.0:
624+
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
625+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0x40004400
626+
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk_fp8_f16 v0, v2, v1
627+
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
628+
%ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16> %old, <2 x half> <half 4.0, half 2.0>, float %scale, i1 false)
629+
ret <2 x i16> %ret
630+
}
631+
604632
define <2 x i16> @test_cvt_scalef32_pk_fp8_bf16_word0(<2 x i16> %old, <2 x bfloat> %src, float %scale) {
605633
; GCN-LABEL: test_cvt_scalef32_pk_fp8_bf16_word0:
606634
; GCN: ; %bb.0:
@@ -621,6 +649,27 @@ define <2 x i16> @test_cvt_scalef32_pk_fp8_bf16_word1(<2 x i16> %old, <2 x bfloa
621649
ret <2 x i16> %ret
622650
}
623651

652+
define <2 x i16> @test_cvt_scalef32_pk_fp8_bf16_imm1(<2 x i16> %old, float %scale) {
653+
; GCN-LABEL: test_cvt_scalef32_pk_fp8_bf16_imm1:
654+
; GCN: ; %bb.0:
655+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
656+
; GCN-NEXT: v_cvt_scalef32_pk_fp8_bf16 v0, 4.0, v1
657+
; GCN-NEXT: s_setpc_b64 s[30:31]
658+
%ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.bf16(<2 x i16> %old, <2 x bfloat> <bfloat 4.0, bfloat 4.0>, float %scale, i1 false)
659+
ret <2 x i16> %ret
660+
}
661+
662+
define <2 x i16> @test_cvt_scalef32_pk_fp8_bf16_imm2(<2 x i16> %old, float %scale) {
663+
; GCN-LABEL: test_cvt_scalef32_pk_fp8_bf16_imm2:
664+
; GCN: ; %bb.0:
665+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
666+
; GCN-NEXT: s_mov_b32 s0, 0x40004080
667+
; GCN-NEXT: v_cvt_scalef32_pk_fp8_bf16 v0, s0, v1
668+
; GCN-NEXT: s_setpc_b64 s[30:31]
669+
%ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.bf16(<2 x i16> %old, <2 x bfloat> <bfloat 4.0, bfloat 2.0>, float %scale, i1 false)
670+
ret <2 x i16> %ret
671+
}
672+
624673
define <2 x i16> @test_cvt_scalef32_pk_bf8_f16_word0(<2 x i16> %old, <2 x half> %src, float %scale) {
625674
; GCN-LABEL: test_cvt_scalef32_pk_bf8_f16_word0:
626675
; GCN: ; %bb.0:
@@ -641,6 +690,34 @@ define <2 x i16> @test_cvt_scalef32_pk_bf8_f16_word1(<2 x i16> %old, <2 x half>
641690
ret <2 x i16> %ret
642691
}
643692

693+
define <2 x i16> @test_cvt_scalef32_pk_bf8_f16_imm1(<2 x i16> %old, float %scale) {
694+
; GCN-LABEL: test_cvt_scalef32_pk_bf8_f16_imm1:
695+
; GCN: ; %bb.0:
696+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
697+
; GCN-NEXT: v_cvt_scalef32_pk_bf8_f16 v0, 4.0, v1
698+
; GCN-NEXT: s_setpc_b64 s[30:31]
699+
%ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16> %old, <2 x half> <half 4.0, half 4.0>, float %scale, i1 false)
700+
ret <2 x i16> %ret
701+
}
702+
703+
define <2 x i16> @test_cvt_scalef32_pk_bf8_f16_imm2(<2 x i16> %old, float %scale) {
704+
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk_bf8_f16_imm2:
705+
; GFX950-SDAG: ; %bb.0:
706+
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
707+
; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x40004400
708+
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk_bf8_f16 v0, s0, v1
709+
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
710+
;
711+
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk_bf8_f16_imm2:
712+
; GFX950-GISEL: ; %bb.0:
713+
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
714+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0x40004400
715+
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk_bf8_f16 v0, v2, v1
716+
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
717+
%ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16> %old, <2 x half> <half 4.0, half 2.0>, float %scale, i1 false)
718+
ret <2 x i16> %ret
719+
}
720+
644721
define <2 x i16> @test_cvt_scalef32_pk_bf8_bf16_word0(<2 x i16> %old, <2 x bfloat> %src, float %scale) {
645722
; GCN-LABEL: test_cvt_scalef32_pk_bf8_bf16_word0:
646723
; GCN: ; %bb.0:
@@ -661,6 +738,27 @@ define <2 x i16> @test_cvt_scalef32_pk_bf8_bf16_word1(<2 x i16> %old, <2 x bfloa
661738
ret <2 x i16> %ret
662739
}
663740

741+
define <2 x i16> @test_cvt_scalef32_pk_bf8_bf16_imm1(<2 x i16> %old, float %scale) {
742+
; GCN-LABEL: test_cvt_scalef32_pk_bf8_bf16_imm1:
743+
; GCN: ; %bb.0:
744+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
745+
; GCN-NEXT: v_cvt_scalef32_pk_bf8_bf16 v0, 4.0, v1
746+
; GCN-NEXT: s_setpc_b64 s[30:31]
747+
%ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.bf16(<2 x i16> %old, <2 x bfloat> <bfloat 4.0, bfloat 4.0>, float %scale, i1 false)
748+
ret <2 x i16> %ret
749+
}
750+
751+
define <2 x i16> @test_cvt_scalef32_pk_bf8_bf16_imm2(<2 x i16> %old, float %scale) {
752+
; GCN-LABEL: test_cvt_scalef32_pk_bf8_bf16_imm2:
753+
; GCN: ; %bb.0:
754+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
755+
; GCN-NEXT: s_mov_b32 s0, 0x40004080
756+
; GCN-NEXT: v_cvt_scalef32_pk_bf8_bf16 v0, s0, v1
757+
; GCN-NEXT: s_setpc_b64 s[30:31]
758+
%ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.bf16(<2 x i16> %old, <2 x bfloat> <bfloat 4.0, bfloat 2.0>, float %scale, i1 false)
759+
ret <2 x i16> %ret
760+
}
761+
664762
define <2 x float> @test_cvt_scale_f32_fp4_byte0(i32 %src, float %scale) {
665763
; GCN-LABEL: test_cvt_scale_f32_fp4_byte0:
666764
; GCN: ; %bb.0:
@@ -1236,6 +1334,37 @@ define i32 @test_cvt_scalef32_fp4_f16_byte3(<2 x half> %src0, float %scale, i32
12361334
ret i32 %ret
12371335
}
12381336

1337+
define i32 @test_cvt_scalef32_fp4_f16_imm1(float %scale, i32 %old) {
1338+
; GCN-LABEL: test_cvt_scalef32_fp4_f16_imm1:
1339+
; GCN: ; %bb.0:
1340+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1341+
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, 4.0, v0
1342+
; GCN-NEXT: v_mov_b32_e32 v0, v1
1343+
; GCN-NEXT: s_setpc_b64 s[30:31]
1344+
%ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 %old, <2 x half> <half 4.0, half 4.0>, float %scale, i32 0)
1345+
ret i32 %ret
1346+
}
1347+
1348+
define i32 @test_cvt_scalef32_fp4_f16_imm2(float %scale, i32 %old) {
1349+
; GFX950-SDAG-LABEL: test_cvt_scalef32_fp4_f16_imm2:
1350+
; GFX950-SDAG: ; %bb.0:
1351+
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1352+
; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x40004400
1353+
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0
1354+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, v1
1355+
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
1356+
;
1357+
; GFX950-GISEL-LABEL: test_cvt_scalef32_fp4_f16_imm2:
1358+
; GFX950-GISEL: ; %bb.0:
1359+
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1360+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0x40004400
1361+
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, v2, v0
1362+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, v1
1363+
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
1364+
%ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 %old, <2 x half> <half 4.0, half 2.0>, float %scale, i32 0)
1365+
ret i32 %ret
1366+
}
1367+
12391368
define i32 @test_cvt_scalef32_fp4_bf16_byte0(<2 x bfloat> %src0, float %scale, i32 %old) {
12401369
; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte0:
12411370
; GCN: ; %bb.0:
@@ -1283,6 +1412,29 @@ define i32 @test_cvt_scalef32_fp4_bf16_byte3(<2 x bfloat> %src0, float %scale, i
12831412
ret i32 %ret
12841413
}
12851414

1415+
define i32 @test_cvt_scalef32_fp4_bf16_imm1(float %scale, i32 %old) {
1416+
; GCN-LABEL: test_cvt_scalef32_fp4_bf16_imm1:
1417+
; GCN: ; %bb.0:
1418+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1419+
; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v1, 4.0, v0
1420+
; GCN-NEXT: v_mov_b32_e32 v0, v1
1421+
; GCN-NEXT: s_setpc_b64 s[30:31]
1422+
%ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 %old, <2 x bfloat> <bfloat 4.0, bfloat 4.0>, float %scale, i32 0)
1423+
ret i32 %ret
1424+
}
1425+
1426+
define i32 @test_cvt_scalef32_fp4_bf16_imm2(float %scale, i32 %old) {
1427+
; GCN-LABEL: test_cvt_scalef32_fp4_bf16_imm2:
1428+
; GCN: ; %bb.0:
1429+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1430+
; GCN-NEXT: s_mov_b32 s0, 0x40004080
1431+
; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0
1432+
; GCN-NEXT: v_mov_b32_e32 v0, v1
1433+
; GCN-NEXT: s_setpc_b64 s[30:31]
1434+
%ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 %old, <2 x bfloat> <bfloat 4.0, bfloat 2.0>, float %scale, i32 0)
1435+
ret i32 %ret
1436+
}
1437+
12861438
define amdgpu_ps void @test_scalef32_pk32_fp6_f32_vv_inreg_src(<16 x float> inreg %src, float %scale, ptr addrspace(1) %out) {
12871439
; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_f32_vv_inreg_src:
12881440
; GFX950-SDAG: ; %bb.0:

0 commit comments

Comments
 (0)