Commit 1a892c7

broxigarchen authored and GeorgeARM committed
[AMDGPU][True16][CodeGen] readfirstlane for vgpr16 copy to sgpr32 (llvm#118037)
In true16 mode, i16 can be selected into either an sgpr32 or a vgpr16 during ISel lowering. This creates cases where we copy from a vgpr16 to an sgpr32 when selecting extensions, which seems unavoidable without sgpr16 support. Legalize the src/dst registers when lowering this special copy to a readfirstlane in the SIFixSGPRCopies pass, and add a lit test.
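As an illustration (not part of the patch; the virtual register names %src, %dst, %undef, and %wide below are made up), the lowering performed by the pass looks roughly like this in MIR:

  ; before: a 16-bit VGPR copied straight into a 32-bit SGPR
  %dst:sreg_32 = COPY %src:vgpr_16

  ; after SIFixSGPRCopies: widen the source with an undef hi16 half,
  ; reclassify the destination, and read the first lane as 32 bits
  %undef:vgpr_16 = IMPLICIT_DEF
  %wide:vgpr_32 = REG_SEQUENCE %src:vgpr_16, %subreg.lo16, %undef:vgpr_16, %subreg.hi16
  %dst:sreg_32_xm0 = V_READFIRSTLANE_B32 %wide:vgpr_32, implicit $exec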
1 parent f3f0049 commit 1a892c7

File tree

4 files changed (+128, -12 lines)


llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

Lines changed: 19 additions & 4 deletions
@@ -1086,10 +1086,25 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
           TRI->getRegClassForOperandReg(*MRI, MI->getOperand(1));
       size_t SrcSize = TRI->getRegSizeInBits(*SrcRC);
       if (SrcSize == 16) {
-        // HACK to handle possible 16bit VGPR source
-        auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
-                           TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
-        MIB.addReg(SrcReg, 0, AMDGPU::NoSubRegister);
+        assert(MF.getSubtarget<GCNSubtarget>().useRealTrue16Insts() &&
+               "We do not expect to see 16-bit copies from VGPR to SGPR unless "
+               "we have 16-bit VGPRs");
+        assert(MRI->getRegClass(DstReg) == &AMDGPU::SGPR_LO16RegClass ||
+               MRI->getRegClass(DstReg) == &AMDGPU::SReg_32RegClass ||
+               MRI->getRegClass(DstReg) == &AMDGPU::SReg_32_XM0RegClass);
+        // There is no V_READFIRSTLANE_B16, so legalize the dst/src reg to 32 bits
+        MRI->setRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
+        Register VReg32 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        const DebugLoc &DL = MI->getDebugLoc();
+        Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_16RegClass);
+        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef);
+        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), VReg32)
+            .addReg(SrcReg, 0, SubReg)
+            .addImm(AMDGPU::lo16)
+            .addReg(Undef)
+            .addImm(AMDGPU::hi16);
+        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+            .addReg(VReg32);
       } else if (SrcSize == 32) {
         auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
                            TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 1 addition & 8 deletions
@@ -1472,16 +1472,9 @@ def : GCNPat <
 
 } // End OtherPredicates = [isGFX8Plus, p]
 
-let True16Predicate = UseFakeTrue16Insts in {
-def : GCNPat<
-  (i32 (DivergentUnaryFrag<anyext> i16:$src)),
-  (COPY $src)
->;
-} // End True16Predicate = UseFakeTrue16Insts
-
 let True16Predicate = UseRealTrue16Insts in {
 def : GCNPat<
-  (i32 (UniformUnaryFrag<anyext> (i16 SReg_32:$src))),
+  (i32 (UniformUnaryFrag<anyext> i16:$src)),
   (COPY $src)
 >;

llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir

Lines changed: 64 additions & 0 deletions
@@ -53,3 +53,67 @@ body: |
     %3:sreg_32 = S_OR_B32 %2:sreg_32, %2:sreg_32, implicit-def $scc
     %4:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %3:sreg_32, 0, 0, 0, implicit $mode, implicit $exec
 ...
+
+---
+name: vgpr16_to_spgr32
+body: |
+  ; GCN-LABEL: name: vgpr16_to_spgr32
+  ; GCN: bb.0.entry:
+  ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN-NEXT: {{ $}}
+  ; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+  ; GCN-NEXT: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 killed [[COPY]], 0, 1, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) poison` + 8, align 4, addrspace 3)
+  ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DS_READ2_B32_gfx9_]].sub0
+  ; GCN-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, killed [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+  ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+  ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CVT_F16_F32_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+  ; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]], implicit $exec
+  ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[S_MOV_B32_]], killed [[V_READFIRSTLANE_B32_]], implicit-def dead $scc
+  ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 5
+  ; GCN-NEXT: [[S_MUL_I32_:%[0-9]+]]:sreg_32 = S_MUL_I32 killed [[S_AND_B32_]], killed [[S_MOV_B32_1]]
+  ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2
+  ; GCN-NEXT: S_CMP_LG_U32 killed [[S_MUL_I32_]], killed [[S_MOV_B32_2]], implicit-def $scc
+  ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+  ; GCN-NEXT: S_BRANCH %bb.1
+  ; GCN-NEXT: {{ $}}
+  ; GCN-NEXT: bb.1:
+  ; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; GCN-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[S_MOV_B32_3]]
+  ; GCN-NEXT: $sgpr0 = COPY [[S_MOV_B32_4]]
+  ; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0
+  ; GCN-NEXT: {{ $}}
+  ; GCN-NEXT: bb.2:
+  ; GCN-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 2
+  ; GCN-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[S_MOV_B32_5]]
+  ; GCN-NEXT: $sgpr0 = COPY [[S_MOV_B32_6]]
+  ; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0
+  bb.0.entry:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000); %bb.1(50.00%), %bb.2(50.00%)
+
+    %5:sreg_32 = IMPLICIT_DEF
+    %6:vgpr_32 = COPY %5:sreg_32
+    %4:vreg_64 = DS_READ2_B32_gfx9 killed %6:vgpr_32, 0, 1, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) poison` + 8, align 4, addrspace 3)
+    %7:sgpr_32 = COPY %4.sub0:vreg_64
+    %8:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, killed %7:sgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+    %9:sreg_32 = S_MOV_B32 65535
+    %11:sreg_32 = COPY %8:vgpr_16
+    %10:sreg_32 = S_AND_B32 killed %9:sreg_32, killed %11:sreg_32, implicit-def dead $scc
+    %12:sreg_32 = S_MOV_B32 5
+    %13:sreg_32 = S_MUL_I32 killed %10:sreg_32, killed %12:sreg_32
+    %14:sreg_32 = S_MOV_B32 2
+    S_CMP_LG_U32 killed %13:sreg_32, killed %14:sreg_32, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.2, implicit $scc
+    S_BRANCH %bb.1
+  bb.1:
+    %17:sreg_32 = S_MOV_B32 1
+    %18:sreg_32 = S_MOV_B32 killed %17:sreg_32
+    $sgpr0 = COPY %18:sreg_32
+    SI_RETURN_TO_EPILOG $sgpr0
+  bb.2:
+    %15:sreg_32 = S_MOV_B32 2
+    %16:sreg_32 = S_MOV_B32 killed %15:sreg_32
+    $sgpr0 = COPY %16:sreg_32
+    SI_RETURN_TO_EPILOG $sgpr0
+...
Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s
+
+@lds = external local_unnamed_addr addrspace(3) global [4 x float], align 4
+
+; expect readfirstlane to pick the 32bit register
+define amdgpu_gs i32 @vgpr16_copyto_sgpr() {
+; CHECK-LABEL: vgpr16_copyto_sgpr:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    v_mov_b32_e32 v0, lds@abs32@lo
+; CHECK-NEXT:    ds_load_2addr_b32 v[0:1], v0 offset1:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-NEXT:    s_and_b32 s0, 0xffff, s0
+; CHECK-NEXT:    s_mul_i32 s0, s0, 5
+; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 2
+; CHECK-NEXT:    s_cbranch_scc1 .LBB0_2
+; CHECK-NEXT:  ; %bb.1: ; %a1
+; CHECK-NEXT:    s_mov_b32 s0, 1
+; CHECK-NEXT:    s_branch .LBB0_3
+; CHECK-NEXT:  .LBB0_2: ; %a2
+; CHECK-NEXT:    s_mov_b32 s0, 2
+; CHECK-NEXT:    s_branch .LBB0_3
+; CHECK-NEXT:  .LBB0_3:
+entry:
+  %ptr = load <4 x float>, ptr addrspace(3) @lds, align 4
+  %f = extractelement <4 x float> %ptr, i32 0
+  %half = fptrunc float %f to half
+  %i16 = bitcast half %half to i16
+  %i32 = zext i16 %i16 to i32
+  %add = add i32 %i32, 1
+  %mul = mul i32 %add, 5
+  %icmp = icmp eq i32 %mul, 7
+  br i1 %icmp, label %a1, label %a2
+
+a1:
+  ret i32 1
+
+a2:
+  ret i32 2
+}
