Skip to content

Commit 616db45

Browse files
committed
fix cndmask pattern
1 parent df544b7 commit 616db45

File tree

7 files changed

+37
-23
lines changed

7 files changed

+37
-23
lines changed

llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,13 @@ static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
257257
return false;
258258
}
259259
// Change VGPR to SGPR destination.
260-
MRI.setRegClass(DstReg, TRI->getEquivalentSGPRClass(MRI.getRegClass(DstReg)));
260+
const auto *RC = MRI.getRegClass(DstReg);
261+
const auto *TRC = TRI->getEquivalentSGPRClass(RC);
262+
// 16-bit SGPRs are not legal operands in True16 instructions. Convert them to
263+
// 32-bit SGPRs.
264+
if (RC == &AMDGPU::VGPR_16RegClass)
265+
TRC = &AMDGPU::SGPR_32RegClass;
266+
MRI.setRegClass(DstReg, TRC);
261267
return true;
262268
}
263269

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6297,6 +6297,12 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
62976297
continue;
62986298
}
62996299

6300+
// True16 operands cannot contain VGPR_32 (typically occurs during
6301+
// SIFixSGPRCopies). True16 instructions are always selected as VOP3.
6302+
if (ST.useRealTrue16Insts() && AMDGPU::isTrue16Inst(Opc) && MO.isReg() &&
6303+
MRI.getRegClass(MO.getReg()) == &AMDGPU::VGPR_32RegClass)
6304+
legalizeOpWithMove(MI, Idx);
6305+
63006306
if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
63016307
continue; // VGPRs are legal
63026308

@@ -8632,7 +8638,8 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
86328638
break;
86338639
}
86348640

8635-
if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) {
8641+
if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo)) ||
8642+
AMDGPU::isTrue16Inst(UseMI.getOpcode())) {
86368643
Worklist.insert(&UseMI);
86378644

86388645
do {

llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -698,9 +698,10 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x
698698
; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
699699
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, s0, v0.l
700700
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
701-
; GFX11-SAFE-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v1
702-
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v0/*Invalid register, operand has 'VS_16' register class*/, s0, vcc_lo
703-
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
701+
; GFX11-SAFE-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
702+
; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
703+
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
704+
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, s0, vcc_lo
704705
; GFX11-SAFE-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0.l
705706
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, 0, vcc_lo
706707
; GFX11-SAFE-TRUE16-NEXT: ; return to shader part epilog

llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -255,15 +255,15 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
255255
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
256256
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
257257
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
258-
; GFX12-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3]
258+
; GFX12-TRUE16-NEXT: global_load_b32 v0, v1, s[2:3]
259259
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
260-
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v2.l
260+
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
261+
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l
262+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
261263
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.h, 0.15915494, v2.l
262-
; GFX12-TRUE16-NEXT: ; kill: def $vgpr2 killed $vgpr2_lo16 killed $exec
263-
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
264264
; GFX12-TRUE16-NEXT: v_cos_f16_e32 v0.l, v0.l
265+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
265266
; GFX12-TRUE16-NEXT: v_cos_f16_e32 v0.h, v0.h
266-
; GFX12-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
267267
; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
268268
; GFX12-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
269269
; GFX12-TRUE16-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -259,13 +259,13 @@ define amdgpu_kernel void @rint_v2f16(
259259
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2
260260
; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3
261261
; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0
262-
; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[8:11], null
262+
; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], null
263263
; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
264264
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
265-
; GFX12-TRUE16-NEXT: v_rndne_f16_e32 v0.l, v1.l
265+
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
266+
; GFX12-TRUE16-NEXT: v_rndne_f16_e32 v0.l, v0.l
267+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
266268
; GFX12-TRUE16-NEXT: v_rndne_f16_e32 v0.h, v1.l
267-
; GFX12-TRUE16-NEXT: ; kill: def $vgpr1 killed $vgpr1_lo16 killed $exec
268-
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
269269
; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
270270
; GFX12-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], null
271271
; GFX12-TRUE16-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -255,15 +255,15 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
255255
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
256256
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
257257
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
258-
; GFX12-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3]
258+
; GFX12-TRUE16-NEXT: global_load_b32 v0, v1, s[2:3]
259259
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
260-
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v2.l
260+
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
261+
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l
262+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
261263
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.h, 0.15915494, v2.l
262-
; GFX12-TRUE16-NEXT: ; kill: def $vgpr2 killed $vgpr2_lo16 killed $exec
263-
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
264264
; GFX12-TRUE16-NEXT: v_sin_f16_e32 v0.l, v0.l
265+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
265266
; GFX12-TRUE16-NEXT: v_sin_f16_e32 v0.h, v0.h
266-
; GFX12-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
267267
; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
268268
; GFX12-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
269269
; GFX12-TRUE16-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -238,13 +238,13 @@ define amdgpu_kernel void @trunc_v2f16(
238238
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2
239239
; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3
240240
; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0
241-
; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[8:11], null
241+
; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], null
242242
; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
243243
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
244-
; GFX12-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v1.l
244+
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
245+
; GFX12-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
246+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
245247
; GFX12-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v1.l
246-
; GFX12-TRUE16-NEXT: ; kill: def $vgpr1 killed $vgpr1_lo16 killed $exec
247-
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
248248
; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
249249
; GFX12-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], null
250250
; GFX12-TRUE16-NEXT: s_endpgm

0 commit comments

Comments
 (0)