Skip to content

Commit 90f9085

Browse files
stefan-iligcbot
authored andcommitted
Use MergeAllocas pass to merge private memory
Functions with multiple inlined function using private memory would otherwise require sum of inlined functions privatFunctions with multiple inlined function using private memory would otherwise require sum of inlined functions private memory. With the change we can reuse non-overlapping memory.
1 parent 4cc8dff commit 90f9085

File tree

8 files changed

+244
-1
lines changed

8 files changed

+244
-1
lines changed

IGC/AdaptorCommon/RayTracing/MergeAllocas.cpp

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ SPDX-License-Identifier: MIT
1414
#include "common/LLVMWarningsPush.hpp"
1515
#include <llvm/ADT/SetVector.h>
1616
#include <llvm/ADT/SetOperations.h>
17+
#include <llvm/ADT/SmallSet.h>
1718
#include <llvm/Analysis/LoopInfo.h>
1819
#include <llvm/IR/Constants.h>
1920
#include <llvm/IR/Dominators.h>
@@ -167,6 +168,17 @@ AllocationBasedLivenessAnalysis::LivenessData::LivenessData(Instruction* allocat
167168
worklist.push_back(I->getParent());
168169
}
169170

171+
// Keep track of loop header of blocks that contain allocation instruction
172+
auto* allocationParent = allocationInstruction->getParent();
173+
llvm::SmallPtrSet<llvm::BasicBlock*, 4> containedLoopHeaders;
174+
if (const auto* parentLoop = LI.getLoopFor(allocationParent);
175+
parentLoop != nullptr) {
176+
containedLoopHeaders.insert(parentLoop->getHeader());
177+
while (parentLoop->getParentLoop() != nullptr) {
178+
parentLoop = parentLoop->getParentLoop();
179+
containedLoopHeaders.insert(parentLoop->getHeader());
180+
}
181+
}
170182
// perform data flow analysis
171183
while (!worklist.empty())
172184
{
@@ -175,8 +187,17 @@ AllocationBasedLivenessAnalysis::LivenessData::LivenessData(Instruction* allocat
175187
if (bbIn.contains(currbb) || currbb == userDominatorBlock)
176188
continue;
177189

178-
bbIn.insert(currbb);
190+
// If alloca is defined in the loop, we skip loop header
191+
// so that we don't escape loop scope.
192+
if (containedLoopHeaders.count(currbb) != 0)
193+
{
194+
continue;
195+
}
179196

197+
if (currbb != allocationParent)
198+
{
199+
bbIn.insert(currbb);
200+
}
180201
for (auto* pbb : llvm::predecessors(currbb))
181202
{
182203
bbOut.insert(pbb);

IGC/AdaptorCommon/RayTracing/RayTracingPasses.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,5 @@ llvm::Pass* CreateTraceRayInlineLoweringPass();
2323
llvm::Pass* CreateDynamicRayManagementPass();
2424
llvm::Pass* CreateRTGlobalsPointerLoweringPass();
2525
llvm::Pass* createOverrideTMaxPass(unsigned OverrideValue);
26+
llvm::Pass* createMergeAllocas();
2627
}

IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,10 @@ void AddAnalysisPasses(CodeGenContext& ctx, IGCPassManager& mpm)
313313
}
314314
}
315315
mpm.add(createPromoteMemoryToRegisterPass());
316+
if (IGC_IS_FLAG_DISABLED(DisableMergeAllocasPrivateMemory))
317+
{
318+
mpm.add(createMergeAllocas());
319+
}
316320
if (ctx.type == ShaderType::OPENCL_SHADER &&
317321
!isOptDisabled &&
318322
IGC_IS_FLAG_ENABLED(EnableExplicitCopyForByVal))
@@ -706,6 +710,10 @@ void AddLegalizationPasses(CodeGenContext& ctx, IGCPassManager& mpm, PSSignature
706710
if (!(IGC_IS_FLAG_ENABLED(EnableUnmaskedFunctions) &&
707711
IGC_IS_FLAG_ENABLED(LateInlineUnmaskedFunc)))
708712
{
713+
if (IGC_IS_FLAG_DISABLED(DisableMergeAllocasPrivateMemory))
714+
{
715+
mpm.add(createMergeAllocas());
716+
}
709717
if (ctx.type == ShaderType::OPENCL_SHADER &&
710718
!isOptDisabled &&
711719
IGC_IS_FLAG_ENABLED(EnableExplicitCopyForByVal))
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2025 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
; RUN: igc_opt --igc-merge-allocas --igc-private-mem-resolution -S %s --platformpvc | FileCheck %s
9+
; ------------------------------------------------
10+
; PrivateMemoryResolution
11+
; ------------------------------------------------
12+
13+
; Check that allocas are merged before private memory resolution
14+
declare spir_func void @__itt_offload_wi_start_wrapper()
15+
16+
; Function Attrs: argmemonly nofree nosync nounwind willreturn
17+
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #0
18+
19+
; Function Attrs: argmemonly nofree nosync nounwind willreturn
20+
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #0
21+
22+
; Function Attrs: inaccessiblememonly nofree nosync nounwind willreturn
23+
declare void @llvm.assume(i1 noundef) #1
24+
25+
declare spir_func void @__itt_offload_wi_finish_wrapper()
26+
27+
define spir_kernel void @_ZTS43Kernel_NoReusePrivMem_SameFunc_AlwaysInline(float addrspace(1)* %0, i64 %1, i64 %2, i32 %3, i32 %4, i32 %5) {
28+
; CHECK-LABEL: _ZTS43Kernel_NoReusePrivMem_SameFunc_AlwaysInline
29+
; CHECK-NEXT: alloca [128 x float], align 4
30+
; CHECK-NOT: alloca [128 x float], align 4
31+
%7 = alloca [128 x float], align 4
32+
%8 = alloca [128 x float], align 4
33+
%9 = call spir_func i64 @_Z13get_global_idj()
34+
%10 = call spir_func i64 @_Z17get_global_offsetj()
35+
%11 = sub i64 %1, 0
36+
%12 = trunc i64 %1 to i32
37+
%13 = and i32 %3, 31
38+
%14 = and i32 %3, 1
39+
br label %15
40+
41+
15: ; preds = %19, %6
42+
%16 = phi i32 [ 0, %6 ], [ 1, %19 ]
43+
%17 = phi i32 [ 0, %6 ], [ 1, %19 ]
44+
%18 = icmp ult i32 %16, %3
45+
br i1 %18, label %19, label %.preheader1
46+
47+
.preheader1: ; preds = %15
48+
br label %22
49+
50+
19: ; preds = %15
51+
%20 = add nuw nsw i32 0, 1
52+
%21 = add nuw nsw i32 0, 1
53+
br label %15
54+
55+
22: ; preds = %27, %.preheader1
56+
%23 = phi i32 [ 0, %27 ], [ 0, %.preheader1 ]
57+
%24 = phi float [ %31, %27 ], [ 0.000000e+00, %.preheader1 ]
58+
%25 = phi i32 [ 1, %27 ], [ 0, %.preheader1 ]
59+
%26 = icmp ult i32 %25, %4
60+
br i1 %26, label %27, label %34
61+
62+
27: ; preds = %22
63+
%28 = add nsw i32 0, 0
64+
%29 = sext i32 %17 to i64
65+
%30 = getelementptr inbounds [128 x float], [128 x float]* %7, i64 0, i64 %29
66+
%31 = load float, float* %30, align 4
67+
%32 = fadd reassoc nsz arcp contract float 0.000000e+00, %31
68+
%33 = add nuw nsw i32 0, 1
69+
br label %22
70+
71+
34: ; preds = %22
72+
br label %35
73+
74+
35: ; preds = %34
75+
br label %.preheader
76+
77+
.preheader: ; preds = %35
78+
br label %36
79+
80+
36: ; preds = %41, %.preheader
81+
%37 = phi i32 [ %42, %41 ], [ 0, %.preheader ]
82+
%38 = phi float [ %45, %41 ], [ 0.000000e+00, %.preheader ]
83+
%39 = phi i32 [ 1, %41 ], [ 0, %.preheader ]
84+
%40 = icmp ult i32 %39, %3
85+
br i1 %40, label %41, label %48
86+
87+
41: ; preds = %36
88+
%42 = add nsw i32 %37, 1
89+
%43 = sext i32 %42 to i64
90+
%44 = getelementptr inbounds [128 x float], [128 x float]* %8, i64 0, i64 %43
91+
%45 = load float, float* %44, align 4
92+
%46 = fadd reassoc nsz arcp contract float 0.000000e+00, %45
93+
%47 = add nuw nsw i32 0, 1
94+
br label %36
95+
96+
48: ; preds = %36
97+
%49 = fadd reassoc nsz arcp contract float %24, %38
98+
%50 = getelementptr inbounds float, float addrspace(1)* %0, i64 %2
99+
store float %49, float addrspace(1)* null, align 4
100+
ret void
101+
}
102+
103+
declare spir_func i64 @_Z13get_global_idj()
104+
105+
declare spir_func i64 @_Z17get_global_offsetj()
106+
107+
attributes #0 = { argmemonly nofree nosync nounwind willreturn }
108+
attributes #1 = { inaccessiblememonly nofree nosync nounwind willreturn }
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2025 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
; RUN: igc_opt --igc-lower-byval-attribute --igc-merge-allocas --igc-private-mem-resolution -S %s --platformpvc
9+
; ------------------------------------------------
10+
; PrivateMemoryResolution
11+
; ------------------------------------------------
12+
13+
; Check that merge allocas can process loop with allocas inside without crashing.
14+
15+
%"class.sycl::_V1::vec.73" = type { <3 x double> }
16+
17+
; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
18+
declare void @llvm.dbg.declare(metadata, metadata, metadata) #0
19+
20+
; Function Attrs: argmemonly nofree nounwind willreturn
21+
declare void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* noalias nocapture writeonly, i8 addrspace(4)* noalias nocapture readonly, i64, i1 immarg) #1
22+
23+
; Function Attrs: noinline optnone
24+
define internal spir_func i1 @_ZN12_GLOBAL__N_117check_elems_equalIdLi3EEEbRKN4sycl3_V13vecIT_XT0_EEES7_.28(i1 %0) #2 {
25+
%2 = alloca %"class.sycl::_V1::vec.73", i32 0, align 32
26+
%3 = addrspacecast %"class.sycl::_V1::vec.73"* null to %"class.sycl::_V1::vec.73" addrspace(4)*
27+
br label %4
28+
29+
4: ; preds = %10, %1
30+
%5 = icmp slt i32 undef, 0
31+
br i1 %0, label %6, label %11
32+
33+
6: ; preds = %4
34+
%7 = addrspacecast %"class.sycl::_V1::vec.73" addrspace(4)* null to %"class.sycl::_V1::vec.73"*
35+
%8 = call spir_func double undef(%"class.sycl::_V1::vec.73"* byval(%"class.sycl::_V1::vec.73") null, i32 0)
36+
br label %9
37+
38+
9: ; preds = %6
39+
br label %10
40+
41+
10: ; preds = %9
42+
br label %4
43+
44+
11: ; preds = %4
45+
ret i1 false
46+
}
47+
48+
define spir_kernel void @_ZTSN16accessor_utility34buffer_accessor_get_pointer_kernelIN25accessor_api_local_fp64__11kernel_nameIN4sycl3_V13vecIdLi3EEEEELi0ELNS4_6access4modeE1026ELNS8_6targetE2016ELNS8_11placeholderE0EEE() {
49+
%1 = call spir_func i1 @_ZN12_GLOBAL__N_117check_elems_equalIdLi3EEEbRKN4sycl3_V13vecIT_XT0_EEES7_.28(i1 undef)
50+
ret void
51+
}
52+
53+
attributes #0 = { nofree nosync nounwind readnone speculatable willreturn }
54+
attributes #1 = { argmemonly nofree nounwind willreturn }
55+
attributes #2 = { noinline optnone }
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2025 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
; RUN: igc_opt --igc-lower-byval-attribute --igc-merge-allocas --igc-private-mem-resolution -S %s --platformpvc
9+
; ------------------------------------------------
10+
; PrivateMemoryResolution
11+
; ------------------------------------------------
12+
13+
; Check that merge allocas can process loop with alloca use inside in case that
14+
; backedge points to block before loop header.
15+
16+
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
17+
target triple = "spir64-unknown-unknown"
18+
19+
%"class.sycl::_V1::vec.73" = type { <3 x double> }
20+
21+
; Function Attrs: noinline optnone
22+
define internal spir_func i1 @_ZN12_GLOBAL__N_117check_elems_equalIdLi3EEEbRKN4sycl3_V13vecIT_XT0_EEES7_.28() #0 {
23+
%1 = alloca %"class.sycl::_V1::vec.73", i32 0, align 32
24+
br label %2
25+
26+
2: ; preds = %7, %0
27+
%3 = load i32, i32 addrspace(4)* null, align 4
28+
%4 = icmp slt i32 %3, 0
29+
br i1 %4, label %5, label %8
30+
31+
5: ; preds = %2
32+
%6 = call spir_func double null(%"class.sycl::_V1::vec.73"* %1)
33+
br label %7
34+
35+
7: ; preds = %5
36+
br label %2
37+
38+
8: ; preds = %2
39+
ret i1 false
40+
}
41+
42+
define spir_kernel void @_ZTSN16accessor_utility34buffer_accessor_get_pointer_kernelIN25accessor_api_local_fp64__11kernel_nameIN4sycl3_V13vecIdLi3EEEEELi0ELNS4_6access4modeE1026ELNS8_6targetE2016ELNS8_11placeholderE0EEE() {
43+
%1 = call spir_func i1 @_ZN12_GLOBAL__N_117check_elems_equalIdLi3EEEbRKN4sycl3_V13vecIT_XT0_EEES7_.28()
44+
ret void
45+
}
46+
47+
attributes #0 = { noinline optnone }

IGC/DriverInterface/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ endif()
5858
"${CMAKE_CURRENT_SOURCE_DIR}/../AdaptorCommon/RayTracing/TraceRayInlineLoweringPass.cpp"
5959
"${CMAKE_CURRENT_SOURCE_DIR}/../AdaptorCommon/RayTracing/DynamicRayManagementPass.cpp"
6060
"${CMAKE_CURRENT_SOURCE_DIR}/../AdaptorCommon/RayTracing/SplitAsyncUtils.cpp"
61+
"${CMAKE_CURRENT_SOURCE_DIR}/../AdaptorCommon/RayTracing/MergeAllocas.cpp"
6162
"${CMAKE_CURRENT_SOURCE_DIR}/../AdaptorCommon/RayTracing/CrossingAnalysis.cpp"
6263
"${CMAKE_CURRENT_SOURCE_DIR}/../AdaptorCommon/RayTracing/TraceRayInlinePrepPass.cpp"
6364
"${CMAKE_CURRENT_SOURCE_DIR}/../AdaptorCommon/RayTracing/TraceRayInlineLatencySchedulerPass.cpp"
@@ -140,6 +141,7 @@ set(IGC_BUILD__HDR__DriverInterface
140141
"${CMAKE_CURRENT_SOURCE_DIR}/../AdaptorCommon/RayTracing/RayTracingInterface.h"
141142
"${CMAKE_CURRENT_SOURCE_DIR}/../AdaptorCommon/RayTracing/CrossingAnalysis.h"
142143
"${CMAKE_CURRENT_SOURCE_DIR}/../AdaptorCommon/RayTracing/SplitAsyncUtils.h"
144+
"${CMAKE_CURRENT_SOURCE_DIR}/../AdaptorCommon/RayTracing/MergeAllocas.h"
143145
)
144146

145147
set(IGC_BUILD__HDR__RAYTRACING__API

IGC/common/igc_flags.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -977,6 +977,7 @@ DECLARE_IGC_GROUP("Raytracing Options")
977977
DECLARE_IGC_REGKEY(bool, DisablePrepareLoadsStores, false, "Disable preparation for MemOpt", true)
978978
DECLARE_IGC_REGKEY(bool, DisableRayTracingConstantCoalescing, false, "Disable coalescing", true)
979979
DECLARE_IGC_REGKEY(bool, DisableMergeAllocas, true, "Do not merge allocas prior to SplitAsyncPass", false)
980+
DECLARE_IGC_REGKEY(bool, DisableMergeAllocasPrivateMemory, false, "Do not merge allocas prior to PrivateMemoryResolution", false)
980981
DECLARE_IGC_REGKEY(DWORD, RayTracingConstantCoalescingMinBlockSize, 4, "Set the minimum load size in # OWords = [1,2,4,8,16].", true)
981982
DECLARE_IGC_REGKEY(bool, DisableRayTracingOptimizations, false, "Disable RayTracing Optimizations for debugging", true)
982983
DECLARE_IGC_REGKEY(DWORD, RayTracingCustomTileXDim1D, 0, "X dimension of tile (default: DG2=256, Xe2+=512)", true)

0 commit comments

Comments
 (0)