Skip to content

Commit bc0eb44

Browse files
committed
Support V2S copy with True16 inst format.
A V2S COPY can be emitted as either sgpr_32 = COPY vgpr_16 or sgpr_lo16 = COPY vgpr_16. For a 16-bit source, emit a REG_SEQUENCE whose hi16 half is undefined and use it as the source of V_READFIRSTLANE_B32.
1 parent c6c864d commit bc0eb44

File tree

2 files changed

+137
-4
lines changed

2 files changed

+137
-4
lines changed

llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1075,10 +1075,25 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
10751075
TRI->getRegClassForOperandReg(*MRI, MI->getOperand(1));
10761076
size_t SrcSize = TRI->getRegSizeInBits(*SrcRC);
10771077
if (SrcSize == 16) {
1078-
// HACK to handle possible 16bit VGPR source
1079-
auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
1080-
TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
1081-
MIB.addReg(SrcReg, 0, AMDGPU::NoSubRegister);
1078+
assert(MF.getSubtarget<GCNSubtarget>().useRealTrue16Insts() &&
1079+
"We do not expect to see 16-bit copies from VGPR to SGPR unless "
1080+
"we have 16-bit VGPRs");
1081+
assert(MRI->getRegClass(DstReg) == &AMDGPU::SGPR_LO16RegClass ||
1082+
MRI->getRegClass(DstReg) == &AMDGPU::SReg_32RegClass);
1083+
// There is no V_READFIRSTLANE_B16, so widen the destination scalar
1084+
// value to 32 bits
1085+
MRI->setRegClass(DstReg, &AMDGPU::SGPR_32RegClass);
1086+
Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1087+
const DebugLoc &DL = MI->getDebugLoc();
1088+
Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_16RegClass);
1089+
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef);
1090+
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), TmpReg)
1091+
.addReg(SrcReg, 0, SubReg)
1092+
.addImm(AMDGPU::lo16)
1093+
.addReg(Undef)
1094+
.addImm(AMDGPU::hi16);
1095+
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
1096+
.addReg(TmpReg);
10821097
} else if (SrcSize == 32) {
10831098
auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
10841099
TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
2+
# RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s
3+
4+
# Ensure READFIRSTLANE is generated, and that its src is REG_SEQUENCE.
5+
6+
---
7+
name: test4
8+
tracksRegLiveness: true
9+
body: |
10+
; CHECK-LABEL: name: test4
11+
; CHECK: bb.0:
12+
; CHECK-NEXT: successors: %bb.1(0x80000000)
13+
; CHECK-NEXT: {{ $}}
14+
; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
15+
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
16+
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
17+
; CHECK-NEXT: S_BRANCH %bb.1
18+
; CHECK-NEXT: {{ $}}
19+
; CHECK-NEXT: bb.1:
20+
; CHECK-NEXT: successors: %bb.2(0x80000000)
21+
; CHECK-NEXT: {{ $}}
22+
; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.3
23+
; CHECK-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %6, %bb.3
24+
; CHECK-NEXT: {{ $}}
25+
; CHECK-NEXT: bb.2:
26+
; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.3(0x40000000)
27+
; CHECK-NEXT: {{ $}}
28+
; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
29+
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY [[PHI]]
30+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[PHI]]
31+
; CHECK-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY1]], [[DEF]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec
32+
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[PHI]]
33+
; CHECK-NEXT: [[BUFFER_LOAD_USHORT_OFFEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY2]], [[DEF]], [[S_MOV_B32_1]], 2, 0, 0, implicit $exec
34+
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[BUFFER_LOAD_USHORT_OFFEN]].lo16
35+
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_16 = COPY [[BUFFER_LOAD_USHORT_OFFEN1]].lo16
36+
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY3]]
37+
; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
38+
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
39+
; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[V_MOV_B32_e32_]], [[COPY5]], implicit $exec
40+
; CHECK-NEXT: [[V_LSHL_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[COPY6]], 16, killed [[V_AND_B32_e64_]], implicit $exec
41+
; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_lo16 = COPY [[PHI1]].lo16
42+
; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_16 = COPY [[COPY7]]
43+
; CHECK-NEXT: [[V_SUB_NC_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_SUB_NC_U16_t16_e64 0, [[COPY8]], 0, killed [[COPY3]], 0, 0, implicit $exec
44+
; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
45+
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_SUB_NC_U16_t16_e64_]], %subreg.lo16, [[DEF2]], %subreg.hi16
46+
; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]], implicit $exec
47+
; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 255
48+
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[V_READFIRSTLANE_B32_]], killed [[S_MOV_B32_2]], implicit-def dead $scc
49+
; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 12
50+
; CHECK-NEXT: S_CMP_LT_I32 [[S_AND_B32_]], killed [[S_MOV_B32_3]], implicit-def $scc
51+
; CHECK-NEXT: S_CBRANCH_SCC1 %bb.4, implicit $scc
52+
; CHECK-NEXT: S_BRANCH %bb.3
53+
; CHECK-NEXT: {{ $}}
54+
; CHECK-NEXT: bb.3:
55+
; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.1(0x40000000)
56+
; CHECK-NEXT: {{ $}}
57+
; CHECK-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
58+
; CHECK-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
59+
; CHECK-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 18
60+
; CHECK-NEXT: S_CMP_LT_I32 [[S_AND_B32_]], killed [[S_MOV_B32_5]], implicit-def $scc
61+
; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc
62+
; CHECK-NEXT: S_BRANCH %bb.4
63+
; CHECK-NEXT: {{ $}}
64+
; CHECK-NEXT: bb.4:
65+
; CHECK-NEXT: S_ENDPGM 0
66+
67+
bb.0:
68+
successors: %bb.1(0x80000000)
69+
70+
%0:sgpr_128 = IMPLICIT_DEF
71+
%2:sreg_32 = S_MOV_B32 0
72+
%3:sgpr_128 = IMPLICIT_DEF
73+
S_BRANCH %bb.1
74+
75+
bb.1:
76+
successors: %bb.2(0x80000000)
77+
78+
%5:sreg_32 = PHI %2, %bb.0, %6, %bb.3
79+
%7:sreg_32 = PHI %2, %bb.0, %8, %bb.3
80+
81+
bb.2:
82+
successors: %bb.4(0x40000000), %bb.3(0x40000000)
83+
84+
%9:sreg_32 = S_MOV_B32 0
85+
%10:vgpr_32 = COPY %5
86+
%11:vgpr_32 = COPY %5
87+
%12:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %11, %0, %9, 0, 0, 0, implicit $exec
88+
%13:vgpr_32 = COPY %5
89+
%14:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN %13, %0, %9, 2, 0, 0, implicit $exec
90+
%15:vgpr_16 = COPY %12.lo16
91+
%16:vgpr_16 = COPY %14.lo16
92+
%17:sreg_32 = COPY %15
93+
%18:sreg_32 = COPY %16
94+
%19:sreg_32 = S_PACK_LL_B32_B16 %17, %18
95+
%20:sgpr_lo16 = COPY %7.lo16
96+
%21:vgpr_16 = COPY %20
97+
%22:vgpr_16 = V_SUB_NC_U16_t16_e64 0, %21, 0, killed %15, 0, 0, implicit $exec
98+
%23:sreg_32 = COPY killed %22
99+
%24:sreg_32 = S_MOV_B32 255
100+
%25:sreg_32 = S_AND_B32 killed %23, killed %24, implicit-def dead $scc
101+
%26:sreg_32 = S_MOV_B32 12
102+
S_CMP_LT_I32 %25, killed %26, implicit-def $scc
103+
S_CBRANCH_SCC1 %bb.4, implicit $scc
104+
S_BRANCH %bb.3
105+
106+
bb.3:
107+
successors: %bb.4(0x40000000), %bb.1(0x40000000)
108+
109+
%6:sreg_32 = S_MOV_B32 -1
110+
%8:sreg_32 = IMPLICIT_DEF
111+
%27:sreg_32 = S_MOV_B32 18
112+
S_CMP_LT_I32 %25, killed %27, implicit-def $scc
113+
S_CBRANCH_SCC1 %bb.1, implicit $scc
114+
S_BRANCH %bb.4
115+
116+
bb.4:
117+
S_ENDPGM 0
118+
...

0 commit comments

Comments
 (0)