Skip to content

Commit 47ef3a0

Browse files
authored
[DirectX] Eliminate resource global variables from module (#114105)
By giving these intrinsics their appropriate attributes, loads of globals that are stored on the other side of these calls can be eliminated by the EarlyCSE pass. Stores to the same globals and the globals themselves require more direct intervention as part of the create/annotated handle lowering. Adds a test that verifies that the unneeded globals and their uses can be eliminated and also that the attributes are set properly. Fixes #104271
1 parent 63fb980 commit 47ef3a0

File tree

3 files changed

+80
-3
lines changed

3 files changed

+80
-3
lines changed

llvm/include/llvm/IR/IntrinsicsDirectX.td

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,14 @@ def int_dx_handle_fromBinding
2828
[IntrNoMem]>;
2929

3030
def int_dx_typedBufferLoad
31-
: DefaultAttrsIntrinsic<[llvm_any_ty], [llvm_any_ty, llvm_i32_ty]>;
31+
: DefaultAttrsIntrinsic<[llvm_any_ty], [llvm_any_ty, llvm_i32_ty],
32+
[IntrReadMem]>;
3233
def int_dx_typedBufferLoad_checkbit
3334
: DefaultAttrsIntrinsic<[llvm_any_ty, llvm_i1_ty],
34-
[llvm_any_ty, llvm_i32_ty]>;
35+
[llvm_any_ty, llvm_i32_ty], [IntrReadMem]>;
3536
def int_dx_typedBufferStore
36-
: DefaultAttrsIntrinsic<[], [llvm_any_ty, llvm_i32_ty, llvm_anyvector_ty]>;
37+
: DefaultAttrsIntrinsic<[], [llvm_any_ty, llvm_i32_ty, llvm_anyvector_ty],
38+
[IntrWriteMem]>;
3739

3840
def int_dx_updateCounter
3941
: DefaultAttrsIntrinsic<[], [llvm_any_ty, llvm_i8_ty]>;

llvm/lib/Target/DirectX/DXILOpLowering.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,25 @@ class OpLowerer {
204204
CleanupCasts.clear();
205205
}
206206

207+
// Remove the resource global associated with the handleFromBinding call
208+
// instruction and their uses as they aren't needed anymore.
209+
// TODO: We should verify that all the globals get removed.
210+
// It's expected we'll need a custom pass in the future that will eliminate
211+
// the need for this here.
212+
void removeResourceGlobals(CallInst *CI) {
213+
for (User *User : make_early_inc_range(CI->users())) {
214+
if (StoreInst *Store = dyn_cast<StoreInst>(User)) {
215+
Value *V = Store->getOperand(1);
216+
Store->eraseFromParent();
217+
if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
218+
if (GV->use_empty()) {
219+
GV->removeDeadConstantUsers();
220+
GV->eraseFromParent();
221+
}
222+
}
223+
}
224+
}
225+
207226
[[nodiscard]] bool lowerToCreateHandle(Function &F) {
208227
IRBuilder<> &IRB = OpBuilder.getIRB();
209228
Type *Int8Ty = IRB.getInt8Ty();
@@ -228,6 +247,8 @@ class OpLowerer {
228247

229248
Value *Cast = createTmpHandleCast(*OpCall, CI->getType());
230249

250+
removeResourceGlobals(CI);
251+
231252
CI->replaceAllUsesWith(Cast);
232253
CI->eraseFromParent();
233254
return Error::success();
@@ -272,6 +293,8 @@ class OpLowerer {
272293

273294
Value *Cast = createTmpHandleCast(*OpAnnotate, CI->getType());
274295

296+
removeResourceGlobals(CI);
297+
275298
CI->replaceAllUsesWith(Cast);
276299
CI->eraseFromParent();
277300

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
; RUN: opt -S -passes='early-cse<memssa>' %s -o %t
2+
; RUN: FileCheck --check-prefixes=CSE,CHECK %s < %t
3+
; Finish compiling to verify that dxil-op-lower removes the globals entirely.
4+
; RUN: opt -mtriple=dxil-pc-shadermodel6.0-compute -S -dxil-op-lower %t -o - | FileCheck --check-prefixes=DXOP,CHECK %s
5+
; RUN: opt -mtriple=dxil-pc-shadermodel6.6-compute -S -dxil-op-lower %t -o - | FileCheck --check-prefixes=DXOP,CHECK %s
6+
; RUN: llc -mtriple=dxil-pc-shadermodel6.0-compute --filetype=asm -o - %t | FileCheck --check-prefixes=DXOP,CHECK %s
7+
; RUN: llc -mtriple=dxil-pc-shadermodel6.6-compute --filetype=asm -o - %t | FileCheck --check-prefixes=DXOP,CHECK %s
8+
9+
; Ensure that EarlyCSE is able to eliminate unneeded loads of resource globals across typedBufferLoad.
10+
; Also that DXILOpLowering eliminates the globals entirely.
11+
12+
%"class.hlsl::RWBuffer" = type { target("dx.TypedBuffer", <4 x float>, 1, 0, 0) }
13+
14+
; DXOP-NOT: @In = global
15+
; DXOP-NOT: @Out = global
16+
@In = global %"class.hlsl::RWBuffer" zeroinitializer, align 4
17+
@Out = global %"class.hlsl::RWBuffer" zeroinitializer, align 4
18+
19+
; CHECK-LABEL: define void @main()
20+
define void @main() local_unnamed_addr #0 {
21+
entry:
22+
; DXOP: %In_h.i1 = call %dx.types.Handle @dx.op.createHandle
23+
; DXOP: %Out_h.i2 = call %dx.types.Handle @dx.op.createHandle
24+
%In_h.i = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0) @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f32_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false)
25+
store target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %In_h.i, ptr @In, align 4
26+
%Out_h.i = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0) @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f32_1_0_0t(i32 4, i32 1, i32 1, i32 0, i1 false)
27+
store target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %Out_h.i, ptr @Out, align 4
28+
; CSE: call i32 @llvm.dx.flattened.thread.id.in.group()
29+
%0 = call i32 @llvm.dx.flattened.thread.id.in.group()
30+
; CHECK-NOT: load {{.*}} ptr @In
31+
%1 = load target("dx.TypedBuffer", <4 x float>, 1, 0, 0), ptr @In, align 4
32+
; CSE: call noundef <4 x float> @llvm.dx.typedBufferLoad.v4f32.tdx.TypedBuffer_v4f32_1_0_0t
33+
%2 = call noundef <4 x float> @llvm.dx.typedBufferLoad.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %1, i32 %0)
34+
; CHECK-NOT: load {{.*}} ptr @In
35+
%3 = load target("dx.TypedBuffer", <4 x float>, 1, 0, 0), ptr @In, align 4
36+
%4 = call noundef <4 x float> @llvm.dx.typedBufferLoad.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %3, i32 %0)
37+
%add.i = fadd <4 x float> %2, %4
38+
call void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %Out_h.i, i32 %0, <4 x float> %add.i)
39+
; CHECK: ret void
40+
ret void
41+
}
42+
43+
; CSE-DAG: declare <4 x float> @llvm.dx.typedBufferLoad.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0), i32) [[ROAttr:#[0-9]+]]
44+
; CSE-DAG: declare void @llvm.dx.typedBufferStore.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0), i32, <4 x float>) [[WOAttr:#[0-9]+]]
45+
46+
attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="8,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
47+
48+
; Just need to split up the DAG searches.
49+
; CSE: attributes #0
50+
51+
; CSE-DAG: attributes [[ROAttr]] = { {{.*}} memory(read) }
52+
; CSE-DAG: attributes [[WOAttr]] = { {{.*}} memory(write) }

0 commit comments

Comments
 (0)