Skip to content

Commit 7a62a5b

Browse files
committed
[AMDGPU] Legalize initialized LDS variables
We don't allow an initializer for LDS variables and there is an early abort during instruction selection. This patch legalizes them by ignoring the init values. During assembly emission, proper error reporting already exists for such instances. Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D109901
1 parent afab3c4 commit 7a62a5b

File tree

4 files changed

+90
-46
lines changed

4 files changed

+90
-46
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1378,16 +1378,11 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
13781378
"Do not know what to do with an non-zero offset");
13791379

13801380
// TODO: We could emit code to handle the initialization somewhere.
1381-
if (!hasDefinedInitializer(GV)) {
1382-
unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1383-
return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1384-
}
1381+
// We ignore the initializer for now and legalize it to allow selection.
1382+
// The initializer will anyway get errored out during assembly emission.
1383+
unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1384+
return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
13851385
}
1386-
1387-
const Function &Fn = DAG.getMachineFunction().getFunction();
1388-
DiagnosticInfoUnsupported BadInit(
1389-
Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
1390-
DAG.getContext()->diagnose(BadInit);
13911386
return SDValue();
13921387
}
13931388

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 26 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -2420,43 +2420,36 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
24202420
}
24212421

24222422
// TODO: We could emit code to handle the initialization somewhere.
2423-
if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2424-
const SITargetLowering *TLI = ST.getTargetLowering();
2425-
if (!TLI->shouldUseLDSConstAddress(GV)) {
2426-
MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2427-
return true; // Leave in place;
2428-
}
2423+
// We ignore the initializer for now and legalize it to allow selection.
2424+
// The initializer will anyway get errored out during assembly emission.
2425+
const SITargetLowering *TLI = ST.getTargetLowering();
2426+
if (!TLI->shouldUseLDSConstAddress(GV)) {
2427+
MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2428+
return true; // Leave in place;
2429+
}
24292430

2430-
if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2431-
Type *Ty = GV->getValueType();
2432-
// HIP uses an unsized array `extern __shared__ T s[]` or similar
2433-
// zero-sized type in other languages to declare the dynamic shared
2434-
// memory which size is not known at the compile time. They will be
2435-
// allocated by the runtime and placed directly after the static
2436-
// allocated ones. They all share the same offset.
2437-
if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2438-
// Adjust alignment for that dynamic shared memory array.
2439-
MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2440-
LLT S32 = LLT::scalar(32);
2441-
auto Sz =
2442-
B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2443-
B.buildIntToPtr(DstReg, Sz);
2444-
MI.eraseFromParent();
2445-
return true;
2446-
}
2431+
if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2432+
Type *Ty = GV->getValueType();
2433+
// HIP uses an unsized array `extern __shared__ T s[]` or similar
2434+
// zero-sized type in other languages to declare the dynamic shared
2435+
// memory which size is not known at the compile time. They will be
2436+
// allocated by the runtime and placed directly after the static
2437+
// allocated ones. They all share the same offset.
2438+
if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2439+
// Adjust alignment for that dynamic shared memory array.
2440+
MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2441+
LLT S32 = LLT::scalar(32);
2442+
auto Sz =
2443+
B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2444+
B.buildIntToPtr(DstReg, Sz);
2445+
MI.eraseFromParent();
2446+
return true;
24472447
}
2448-
2449-
B.buildConstant(
2450-
DstReg,
2451-
MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2452-
MI.eraseFromParent();
2453-
return true;
24542448
}
24552449

2456-
const Function &Fn = MF.getFunction();
2457-
DiagnosticInfoUnsupported BadInit(
2458-
Fn, "unsupported initializer for address space", MI.getDebugLoc());
2459-
Fn.getContext().diagnose(BadInit);
2450+
B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2451+
*cast<GlobalVariable>(GV)));
2452+
MI.eraseFromParent();
24602453
return true;
24612454
}
24622455

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,38 @@
1-
; RUN: not llc -global-isel -march=amdgcn -mcpu=tonga < %S/../lds-zero-initializer.ll 2>&1 | FileCheck %s
1+
; RUN: llc -march=amdgcn -mcpu=tahiti -global-isel -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s
2+
; RUN: llc -march=amdgcn -mcpu=tonga -global-isel -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s
23

3-
; CHECK: <unknown>:0: error: lds: unsupported initializer for address space
4+
; RUN: not llc -march=amdgcn -mcpu=tahiti -global-isel < %s 2>&1 | FileCheck %s
5+
; RUN: not llc -march=amdgcn -mcpu=tonga -global-isel < %s 2>&1 | FileCheck %s
6+
7+
; CHECK: error: lds: unsupported initializer for address space
8+
9+
@lds = addrspace(3) global [256 x i32] zeroinitializer
10+
11+
define amdgpu_kernel void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) {
12+
; GCN-LABEL: name: load_zeroinit_lds_global
13+
; GCN: bb.1 (%ir-block.0):
14+
; GCN: liveins: $sgpr0_sgpr1
15+
; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
16+
; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 40
17+
; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @lds
18+
; GFX8: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[S_MOV_B32_1]], [[S_MOV_B32_]], implicit-def $scc
19+
; GFX8: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 9, 0
20+
; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0
21+
; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_U32_]]
22+
; GCN: $m0 = S_MOV_B32 -1
23+
; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
24+
; GFX8: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY1]], 0, 0, implicit $m0, implicit $exec
25+
; GFX9: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY1]], 40, 0, implicit $m0, implicit $exec
26+
; GFX8: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295
27+
; GFX8: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
28+
; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1
29+
; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_LOAD_DWORDX2_IMM]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
30+
; GFX8: BUFFER_STORE_DWORD_OFFSET [[DS_READ_B32_]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec
31+
; GFX9: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
32+
; GFX9: FLAT_STORE_DWORD [[COPY2]], [[DS_READ_B32_]], 0, 0, implicit $exec, implicit $flat_scr
33+
; GCN: S_ENDPGM 0
34+
%gep = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds, i32 0, i32 10
35+
%ld = load i32, i32 addrspace(3)* %gep
36+
store i32 %ld, i32 addrspace(1)* %out
37+
ret void
38+
}

llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,32 @@
1-
; RUN: not llc -march=amdgcn -mcpu=tahiti -filetype=null < %s 2>&1 | FileCheck %s
2-
; RUN: not llc -march=amdgcn -mcpu=tonga -filetype=null < %s 2>&1 | FileCheck %s
1+
; RUN: llc -march=amdgcn -mcpu=tahiti -stop-after=amdgpu-isel -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s
2+
; RUN: llc -march=amdgcn -mcpu=tonga -stop-after=amdgpu-isel -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s
3+
4+
; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck %s
5+
; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s
36

47
; CHECK: error: lds: unsupported initializer for address space
58

69
@lds = addrspace(3) global [256 x i32] zeroinitializer
710

811
define amdgpu_kernel void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) {
12+
; GCN-LABEL: name: load_zeroinit_lds_global
13+
; GCN: bb.0 (%ir-block.0):
14+
; GCN: liveins: $sgpr0_sgpr1
15+
; GCN: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
16+
; GFX8: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0
17+
; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
18+
; GFX8: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
19+
; GFX8: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
20+
; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
21+
; GFX8: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
22+
; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
23+
; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec
24+
; GCN: SI_INIT_M0 -1, implicit-def $m0
25+
; GCN: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 killed [[V_MOV_B32_e32_]], 40, 0, implicit $m0, implicit $exec
26+
; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
27+
; GFX8: BUFFER_STORE_DWORD_OFFSET killed [[DS_READ_B32_]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, 0, implicit $exec
28+
; GFX9: FLAT_STORE_DWORD killed [[COPY1]], killed [[DS_READ_B32_]], 0, 0, implicit $exec, implicit $flat_scr
29+
; GCN: S_ENDPGM 0
930
%gep = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds, i32 0, i32 10
1031
%ld = load i32, i32 addrspace(3)* %gep
1132
store i32 %ld, i32 addrspace(1)* %out

0 commit comments

Comments
 (0)