Skip to content

Commit 8005ee6

Browse files
authored
[AMDGPU] CodeGen for GFX12 64-bit scalar add/sub (#75070)
1 parent 671fa91 commit 8005ee6

File tree

7 files changed

+1833
-112
lines changed

7 files changed

+1833
-112
lines changed

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -681,13 +681,23 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
681681

682682
if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
683683
// Full set of gfx9 features.
684-
getActionDefinitionsBuilder({G_ADD, G_SUB})
685-
.legalFor({S32, S16, V2S16})
686-
.clampMaxNumElementsStrict(0, S16, 2)
687-
.scalarize(0)
688-
.minScalar(0, S16)
689-
.widenScalarToNextMultipleOf(0, 32)
690-
.maxScalar(0, S32);
684+
if (ST.hasScalarAddSub64()) {
685+
getActionDefinitionsBuilder({G_ADD, G_SUB})
686+
.legalFor({S64, S32, S16, V2S16})
687+
.clampMaxNumElementsStrict(0, S16, 2)
688+
.scalarize(0)
689+
.minScalar(0, S16)
690+
.widenScalarToNextMultipleOf(0, 32)
691+
.maxScalar(0, S32);
692+
} else {
693+
getActionDefinitionsBuilder({G_ADD, G_SUB})
694+
.legalFor({S32, S16, V2S16})
695+
.clampMaxNumElementsStrict(0, S16, 2)
696+
.scalarize(0)
697+
.minScalar(0, S16)
698+
.widenScalarToNextMultipleOf(0, 32)
699+
.maxScalar(0, S32);
700+
}
691701

692702
getActionDefinitionsBuilder(G_MUL)
693703
.legalFor({S32, S16, V2S16})

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -677,6 +677,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
677677
return AddNoCarryInsts;
678678
}
679679

680+
/// \returns true if the subtarget provides 64-bit scalar add/sub
/// instructions; this is decided purely by getGeneration() being GFX12 or
/// later.
bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
681+
680682
bool hasUnpackedD16VMem() const {
681683
return HasUnpackedD16VMem;
682684
}

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 39 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -4555,40 +4555,51 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
45554555
}
45564556
case AMDGPU::S_ADD_U64_PSEUDO:
45574557
case AMDGPU::S_SUB_U64_PSEUDO: {
4558-
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4558+
// For targets older than GFX12, we emit a sequence of 32-bit operations.
4559+
// For GFX12, we emit s_add_u64 and s_sub_u64.
45594560
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4560-
const SIRegisterInfo *TRI = ST.getRegisterInfo();
4561-
const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4561+
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
45624562
const DebugLoc &DL = MI.getDebugLoc();
4563-
45644563
MachineOperand &Dest = MI.getOperand(0);
45654564
MachineOperand &Src0 = MI.getOperand(1);
45664565
MachineOperand &Src1 = MI.getOperand(2);
4567-
4568-
Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4569-
Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4570-
4571-
MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
4572-
MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4573-
MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
4574-
MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4575-
4576-
MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
4577-
MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4578-
MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
4579-
MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4580-
45814566
bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
4582-
4583-
unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
4584-
unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
4585-
BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
4586-
BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
4587-
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
4588-
.addReg(DestSub0)
4589-
.addImm(AMDGPU::sub0)
4590-
.addReg(DestSub1)
4591-
.addImm(AMDGPU::sub1);
4567+
if (Subtarget->hasScalarAddSub64()) {
4568+
unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
4569+
// Forward both source operands unchanged instead of going through getReg():
// the 32-bit fallback below accepts immediate sources (it uses
// buildExtractSubRegOrImm), and getReg() asserts on a non-register operand.
BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
4570+
.add(Src0)
4571+
.add(Src1);
4572+
} else {
4573+
const SIRegisterInfo *TRI = ST.getRegisterInfo();
4574+
const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4575+
4576+
Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4577+
Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4578+
4579+
MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
4580+
MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4581+
MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
4582+
MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4583+
4584+
MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
4585+
MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4586+
MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
4587+
MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4588+
4589+
unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
4590+
unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
4591+
BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
4592+
.add(Src0Sub0)
4593+
.add(Src1Sub0);
4594+
BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
4595+
.add(Src0Sub1)
4596+
.add(Src1Sub1);
4597+
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
4598+
.addReg(DestSub0)
4599+
.addImm(AMDGPU::sub0)
4600+
.addReg(DestSub1)
4601+
.addImm(AMDGPU::sub1);
4602+
}
45924603
MI.eraseFromParent();
45934604
return BB;
45944605
}
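[New file; its path was not captured in this extract. The RUN lines show it is an llc -global-isel AMDGPU test.]

Lines changed: 107 additions & 0 deletions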
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2+
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s
3+
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX12 %s
4+
5+
; 64-bit add in an amdgpu_kernel whose operands are loaded into SGPRs.
; The checks expect an s_add_u32/s_addc_u32 pair for gfx1100 (GFX11) and a
; single s_add_nc_u64 for gfx1200 (GFX12).
define amdgpu_kernel void @s_add_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
6+
; GFX11-LABEL: s_add_u64:
7+
; GFX11: ; %bb.0: ; %entry
8+
; GFX11-NEXT: s_clause 0x1
9+
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
10+
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
11+
; GFX11-NEXT: v_mov_b32_e32 v2, 0
12+
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
13+
; GFX11-NEXT: s_add_u32 s0, s6, s0
14+
; GFX11-NEXT: s_addc_u32 s1, s7, s1
15+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
16+
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
17+
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
18+
; GFX11-NEXT: s_nop 0
19+
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
20+
; GFX11-NEXT: s_endpgm
21+
;
22+
; GFX12-LABEL: s_add_u64:
23+
; GFX12: ; %bb.0: ; %entry
24+
; GFX12-NEXT: s_clause 0x1
25+
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
26+
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
27+
; GFX12-NEXT: v_mov_b32_e32 v2, 0
28+
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
29+
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
30+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
31+
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
32+
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
33+
; GFX12-NEXT: s_nop 0
34+
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
35+
; GFX12-NEXT: s_endpgm
36+
entry:
37+
%add = add i64 %a, %b
38+
store i64 %add, i64 addrspace(1)* %out
39+
ret void
40+
}
41+
42+
; 64-bit add in an amdgpu_ps function. The shared GCN checks (same output for
; gfx1100 and gfx1200) expect the operands in VGPRs and a
; v_add_co_u32 / v_add_co_ci_u32 carry chain rather than a scalar add.
define amdgpu_ps void @v_add_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
43+
; GCN-LABEL: v_add_u64:
44+
; GCN: ; %bb.0: ; %entry
45+
; GCN-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
46+
; GCN-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
47+
; GCN-NEXT: global_store_b64 v[0:1], v[2:3], off
48+
; GCN-NEXT: s_nop 0
49+
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
50+
; GCN-NEXT: s_endpgm
51+
entry:
52+
%add = add i64 %a, %b
53+
store i64 %add, i64 addrspace(1)* %out
54+
ret void
55+
}
56+
57+
; 64-bit subtract in an amdgpu_kernel whose operands are loaded into SGPRs.
; The checks expect an s_sub_u32/s_subb_u32 pair for gfx1100 (GFX11) and a
; single s_sub_nc_u64 for gfx1200 (GFX12).
define amdgpu_kernel void @s_sub_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
58+
; GFX11-LABEL: s_sub_u64:
59+
; GFX11: ; %bb.0: ; %entry
60+
; GFX11-NEXT: s_clause 0x1
61+
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
62+
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
63+
; GFX11-NEXT: v_mov_b32_e32 v2, 0
64+
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
65+
; GFX11-NEXT: s_sub_u32 s0, s6, s0
66+
; GFX11-NEXT: s_subb_u32 s1, s7, s1
67+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
68+
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
69+
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
70+
; GFX11-NEXT: s_nop 0
71+
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
72+
; GFX11-NEXT: s_endpgm
73+
;
74+
; GFX12-LABEL: s_sub_u64:
75+
; GFX12: ; %bb.0: ; %entry
76+
; GFX12-NEXT: s_clause 0x1
77+
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
78+
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
79+
; GFX12-NEXT: v_mov_b32_e32 v2, 0
80+
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
81+
; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[6:7], s[0:1]
82+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
83+
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
84+
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
85+
; GFX12-NEXT: s_nop 0
86+
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
87+
; GFX12-NEXT: s_endpgm
88+
entry:
89+
%sub = sub i64 %a, %b
90+
store i64 %sub, i64 addrspace(1)* %out
91+
ret void
92+
}
93+
94+
; 64-bit subtract in an amdgpu_ps function. The shared GCN checks (same output
; for gfx1100 and gfx1200) expect the operands in VGPRs and a
; v_sub_co_u32 / v_sub_co_ci_u32 borrow chain rather than a scalar subtract.
define amdgpu_ps void @v_sub_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
95+
; GCN-LABEL: v_sub_u64:
96+
; GCN: ; %bb.0: ; %entry
97+
; GCN-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v4
98+
; GCN-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
99+
; GCN-NEXT: global_store_b64 v[0:1], v[2:3], off
100+
; GCN-NEXT: s_nop 0
101+
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
102+
; GCN-NEXT: s_endpgm
103+
entry:
104+
%sub = sub i64 %a, %b
105+
store i64 %sub, i64 addrspace(1)* %out
106+
ret void
107+
}

0 commit comments

Comments
 (0)