Skip to content

Commit 9adcefe

Browse files
committed
fix vgpr16 copy to sgpr32
1 parent 0b2ab11 commit 9adcefe

File tree

3 files changed

+59
-12
lines changed

3 files changed

+59
-12
lines changed

llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1086,10 +1086,23 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
10861086
TRI->getRegClassForOperandReg(*MRI, MI->getOperand(1));
10871087
size_t SrcSize = TRI->getRegSizeInBits(*SrcRC);
10881088
if (SrcSize == 16) {
1089-
// HACK to handle possible 16bit VGPR source
1090-
auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
1091-
TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
1092-
MIB.addReg(SrcReg, 0, AMDGPU::NoSubRegister);
1089+
assert(MF.getSubtarget<GCNSubtarget>().useRealTrue16Insts() &&
1090+
"We do not expect to see 16-bit copies from VGPR to SGPR unless "
1091+
"we have 16-bit VGPRs");
1092+
assert(MRI->getRegClass(DstReg) == &AMDGPU::SGPR_LO16RegClass ||
1093+
MRI->getRegClass(DstReg) == &AMDGPU::SReg_32RegClass ||
1094+
MRI->getRegClass(DstReg) == &AMDGPU::SReg_32_XM0RegClass);
1095+
// There is no V_READFIRSTLANE_B16, so legalize the dst/src reg to 32 bits
1096+
if (MRI->getRegClass(DstReg) == &AMDGPU::SGPR_LO16RegClass)
1097+
MRI->setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
1098+
Register VReg32 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1099+
const DebugLoc &DL = MI->getDebugLoc();
1100+
BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::SUBREG_TO_REG), VReg32)
1101+
.addImm(0)
1102+
.addReg(SrcReg, 0)
1103+
.addImm(AMDGPU::lo16);
1104+
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
1105+
.addReg(VReg32);
10931106
} else if (SrcSize == 32) {
10941107
auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
10951108
TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1472,16 +1472,9 @@ def : GCNPat <
14721472

14731473
} // End OtherPredicates = [isGFX8Plus, p]
14741474

1475-
let True16Predicate = UseFakeTrue16Insts in {
1476-
def : GCNPat<
1477-
(i32 (DivergentUnaryFrag<anyext> i16:$src)),
1478-
(COPY $src)
1479-
>;
1480-
} // End True16Predicate = UseFakeTrue16Insts
1481-
14821475
let True16Predicate = UseRealTrue16Insts in {
14831476
def : GCNPat<
1484-
(i32 (UniformUnaryFrag<anyext> (i16 SReg_32:$src))),
1477+
(i32 (UniformUnaryFrag<anyext> i16:$src)),
14851478
(COPY $src)
14861479
>;
14871480

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s
3+
4+
; Expect v_readfirstlane_b32 to read the full 32-bit VGPR (v0), not the 16-bit half (v0.l).
5+
define amdgpu_gs i32 @vgpr16_copyto_sgpr(ptr addrspace(3) %a, i32 %b, ptr addrspace(1) %out) {
6+
; CHECK-LABEL: vgpr16_copyto_sgpr:
7+
; CHECK: ; %bb.0: ; %entry
8+
; CHECK-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
9+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
10+
; CHECK-NEXT: v_cvt_f16_f32_e32 v0.l, v0
11+
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
12+
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
13+
; CHECK-NEXT: s_and_b32 s0, 0xffff, s0
14+
; CHECK-NEXT: s_mul_i32 s0, s0, 5
15+
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
16+
; CHECK-NEXT: s_cmp_lg_u32 s0, 2
17+
; CHECK-NEXT: s_cbranch_scc1 .LBB0_2
18+
; CHECK-NEXT: ; %bb.1: ; %a1
19+
; CHECK-NEXT: s_mov_b32 s0, 1
20+
; CHECK-NEXT: s_branch .LBB0_3
21+
; CHECK-NEXT: .LBB0_2: ; %a2
22+
; CHECK-NEXT: s_mov_b32 s0, 2
23+
; CHECK-NEXT: s_branch .LBB0_3
24+
; CHECK-NEXT: .LBB0_3:
25+
entry:
26+
%1 = load <4 x float>, ptr addrspace(3) poison, align 4
27+
%2 = extractelement <4 x float> %1, i32 0
28+
%3 = fptrunc float %2 to half
29+
%4 = bitcast half %3 to i16
30+
%5 = zext i16 %4 to i32
31+
%6 = add i32 %5, 1
32+
%7 = mul i32 %6, 5
33+
%8 = icmp eq i32 %7, 7
34+
br i1 %8, label %a1, label %a2
35+
36+
a1:
37+
ret i32 1
38+
39+
a2:
40+
ret i32 2
41+
}

0 commit comments

Comments
 (0)