Skip to content

Commit 6dcada8

Browse files
pkwasnie-intel authored and igcbot committed
optimizations to global_id_offset/local_size implicit args
Compute workloads add the following implicit arguments: 1. payloadHeader - 8 x i32, packing global_id_offset (3 x i32), local_size (3 x i32) and 2 x i32 reserved. 2. enqueued_local_size - 3 x i32. local_size is never used, in favour of enqueued_local_size. As a result, payloadHeader has 20 unused bytes. This change introduces the following optimizations: 1. Reduces payloadHeader to 3 x i32, packing only global_id_offset. 2. Removes global_id_offset and enqueued_local_size from the finalizer and zeinfo if the arguments are unused.
1 parent 868baf0 commit 6dcada8

12 files changed

+35
-38
lines changed

IGC/AdaptorCommon/ImplicitArgs.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ using namespace IGC::IGCMD;
2424
static const std::vector<ImplicitArg> IMPLICIT_ARGS = {
2525
ImplicitArg(ImplicitArg::R0, "r0", ImplicitArg::INT, WIAnalysis::UNIFORM_THREAD, 8, ImplicitArg::ALIGN_GRF, false, GenISAIntrinsic::GenISA_getR0),
2626

27-
ImplicitArg(ImplicitArg::PAYLOAD_HEADER, "payloadHeader", ImplicitArg::INT, WIAnalysis::UNIFORM_WORKGROUP, 8, ImplicitArg::ALIGN_GRF, true, GenISAIntrinsic::GenISA_getPayloadHeader),
27+
ImplicitArg(ImplicitArg::PAYLOAD_HEADER, "payloadHeader", ImplicitArg::INT, WIAnalysis::UNIFORM_WORKGROUP, 3, ImplicitArg::ALIGN_DWORD, true, GenISAIntrinsic::GenISA_getPayloadHeader),
2828
ImplicitArg(ImplicitArg::WORK_DIM, "workDim", ImplicitArg::INT, WIAnalysis::UNIFORM_GLOBAL, 1, ImplicitArg::ALIGN_DWORD, true, GenISAIntrinsic::GenISA_getWorkDim),
2929

3030
ImplicitArg(ImplicitArg::NUM_GROUPS, "numWorkGroups", ImplicitArg::INT, WIAnalysis::UNIFORM_GLOBAL, 3, ImplicitArg::ALIGN_DWORD, true, GenISAIntrinsic::GenISA_getNumWorkGroups),

IGC/Compiler/CISACodeGen/OpenCLKernelCodeGen.cpp

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -907,16 +907,11 @@ namespace IGC
907907
switch (kernelArg->getArgType()) {
908908

909909
case KernelArg::ArgType::IMPLICIT_PAYLOAD_HEADER:{
910-
// PayloadHeader contains global work offset x,y,z and local size x,y,z
911-
// global work offset, size is int32x3
912-
uint cur_pos = payloadPosition;
910+
// PayloadHeader contains global work offset x,y,z
911+
// global work offset size is int32x3
913912
uint32_t size = iOpenCL::DATA_PARAMETER_DATA_SIZE * 3;
914913
zebin::ZEInfoBuilder::addPayloadArgumentImplicit(m_kernelInfo.m_zePayloadArgs,
915-
zebin::PreDefinedAttrGetter::ArgType::global_id_offset, cur_pos, size);
916-
cur_pos += size;
917-
// local size, size is int32x3, the same as above
918-
zebin::ZEInfoBuilder::addPayloadArgumentImplicit(m_kernelInfo.m_zePayloadArgs,
919-
zebin::PreDefinedAttrGetter::ArgType::local_size, cur_pos, size);
914+
zebin::PreDefinedAttrGetter::ArgType::global_id_offset, payloadPosition, size);
920915
break;
921916
}
922917
case KernelArg::ArgType::IMPLICIT_PRIVATE_BASE:
@@ -1512,17 +1507,15 @@ namespace IGC
15121507
break;
15131508

15141509
case KernelArg::ArgType::IMPLICIT_PAYLOAD_HEADER:
1515-
// PayloadHeader contains global work offset x,y,z and local size x,y,z -->
1516-
// total of 6 annotations, 3 of each type
1517-
for (int i = 0; i < 6; ++i)
1510+
// PayloadHeader contains global work offset x,y,z -->
1511+
// total of 3 annotations of each type
1512+
for (int i = 0; i < 3; ++i)
15181513
{
15191514
auto constInput = std::make_unique<iOpenCL::ConstantInputAnnotation>();
15201515

15211516
DWORD sizeInBytes = iOpenCL::DATA_PARAMETER_DATA_SIZE;
15221517

1523-
constInput->ConstantType = (i < 3 ?
1524-
iOpenCL::DATA_PARAMETER_GLOBAL_WORK_OFFSET :
1525-
iOpenCL::DATA_PARAMETER_LOCAL_WORK_SIZE);
1518+
constInput->ConstantType = iOpenCL::DATA_PARAMETER_GLOBAL_WORK_OFFSET;
15261519
constInput->Offset = (i % 3) * sizeInBytes;
15271520
constInput->PayloadPosition = payloadPosition;
15281521
constInput->PayloadSizeInBytes = sizeInBytes;
@@ -2441,6 +2434,8 @@ namespace IGC
24412434
bool IsUnusedArg =
24422435
(arg.getArgType() == KernelArg::ArgType::IMPLICIT_BUFFER_OFFSET ||
24432436
arg.getArgType() == KernelArg::ArgType::IMPLICIT_BINDLESS_OFFSET ||
2437+
arg.getArgType() == KernelArg::ArgType::IMPLICIT_PAYLOAD_HEADER || // contains global_id_offset
2438+
arg.getArgType() == KernelArg::ArgType::IMPLICIT_ENQUEUED_LOCAL_WORK_SIZE ||
24442439
arg.getArgType() == KernelArg::ArgType::IMPLICIT_BUFFER_SIZE) &&
24452440
arg.getArg()->use_empty();
24462441

IGC/Compiler/Optimizer/OpenCLPasses/KernelArgs/KernelArgs.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -802,6 +802,7 @@ KernelArgsOrder::KernelArgsOrder(InputType layout)
802802

803803
KernelArg::ArgType::RUNTIME_VALUE,
804804
KernelArg::ArgType::IMPLICIT_PAYLOAD_HEADER,
805+
KernelArg::ArgType::IMPLICIT_ENQUEUED_LOCAL_WORK_SIZE,
805806

806807
KernelArg::ArgType::PTR_LOCAL,
807808
KernelArg::ArgType::PTR_GLOBAL,
@@ -823,7 +824,6 @@ KernelArgsOrder::KernelArgsOrder(InputType layout)
823824
KernelArg::ArgType::IMPLICIT_LOCAL_SIZE,
824825
KernelArg::ArgType::IMPLICIT_STAGE_IN_GRID_ORIGIN,
825826
KernelArg::ArgType::IMPLICIT_STAGE_IN_GRID_SIZE,
826-
KernelArg::ArgType::IMPLICIT_ENQUEUED_LOCAL_WORK_SIZE,
827827
KernelArg::ArgType::IMPLICIT_BINDLESS_OFFSET,
828828

829829
KernelArg::ArgType::IMPLICIT_IMAGE_HEIGHT,
@@ -928,6 +928,7 @@ KernelArgsOrder::KernelArgsOrder(InputType layout)
928928

929929
KernelArg::ArgType::RUNTIME_VALUE,
930930
KernelArg::ArgType::IMPLICIT_PAYLOAD_HEADER,
931+
KernelArg::ArgType::IMPLICIT_ENQUEUED_LOCAL_WORK_SIZE,
931932
KernelArg::ArgType::PTR_LOCAL,
932933
KernelArg::ArgType::PTR_GLOBAL,
933934
KernelArg::ArgType::PTR_CONSTANT,
@@ -947,7 +948,6 @@ KernelArgsOrder::KernelArgsOrder(InputType layout)
947948
KernelArg::ArgType::IMPLICIT_LOCAL_SIZE,
948949
KernelArg::ArgType::IMPLICIT_STAGE_IN_GRID_ORIGIN,
949950
KernelArg::ArgType::IMPLICIT_STAGE_IN_GRID_SIZE,
950-
KernelArg::ArgType::IMPLICIT_ENQUEUED_LOCAL_WORK_SIZE,
951951
KernelArg::ArgType::IMPLICIT_BINDLESS_OFFSET,
952952

953953
KernelArg::ArgType::IMPLICIT_ARG_BUFFER,

IGC/Compiler/Optimizer/OpenCLPasses/WIFuncs/WIFuncResolution.cpp

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -209,12 +209,14 @@ void WIFuncResolution::visitCallInst(CallInst& CI)
209209
210210
PayloadHeader:
211211
212-
-----------------------------------------------------------------------------------------------
213-
| Global | Global | Global | Local | Local | Local | Reserved | Num |
214-
| offset | offset | offset | size | size | size | | HW |
215-
| X | Y | Z | X | Y | Z | | Threads |
216-
| 32bit | 32bit | 32bit | 32bit | 32bit | 32bit | | 32bit |
217-
-----------------------------------------------------------------------------------------------
212+
Note: PayloadHeader used to be 8xi32, but unused bytes got removed and now it is 3xi32.
213+
214+
------------------------------------
215+
| Global | Global | Global |
216+
| offset | offset | offset |
217+
| X | Y | Z |
218+
| 32bit | 32bit | 32bit |
219+
-----------------------------------
218220
<low> <high>
219221
220222
*************************************************************************************************/
@@ -509,7 +511,7 @@ Value* WIFuncResolution::getGlobalOffset(CallInst& CI)
509511
// call i32 @__builtin_IB_get_global_offset(i32 %dim)
510512

511513
// Creates:
512-
// %globalOffset = extractelement <8 x i32> %payloadHeader, i32 %dim
514+
// %globalOffset = extractelement <3 x i32> %payloadHeader, i32 %dim
513515

514516
auto F = CI.getFunction();
515517
Value* V = m_implicitArgs.getImplicitArgValue(*F, ImplicitArg::PAYLOAD_HEADER, m_pMdUtils);

IGC/Compiler/tests/AddImplicitArgs/signatiure_changed_PayloadHeader-typed-pointers.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
;=========================== begin_copyright_notice ============================
22
;
3-
; Copyright (C) 2017-2021 Intel Corporation
3+
; Copyright (C) 2017-2025 Intel Corporation
44
;
55
; SPDX-License-Identifier: MIT
66
;
@@ -21,5 +21,5 @@ define i32 @foo(i32 %x) nounwind {
2121
!4 = !{ !"implicit_arg_desc", !6}
2222
!6 = !{i32 1}
2323

24-
; CHECK: define i32 @foo(i32 %x, <8 x i32> %payloadHeader)
24+
; CHECK: define i32 @foo(i32 %x, <3 x i32> %payloadHeader)
2525
; CHECK-NOT: define i32 @foo(i32 %x)

IGC/Compiler/tests/AddImplicitArgs/signatiure_changed_PayloadHeader.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
;=========================== begin_copyright_notice ============================
22
;
3-
; Copyright (C) 2017-2024 Intel Corporation
3+
; Copyright (C) 2017-2025 Intel Corporation
44
;
55
; SPDX-License-Identifier: MIT
66
;
@@ -22,5 +22,5 @@ define i32 @foo(i32 %x) nounwind {
2222
!4 = !{ !"implicit_arg_desc", !6}
2323
!6 = !{i32 1}
2424

25-
; CHECK: define i32 @foo(i32 %x, <8 x i32> %payloadHeader)
25+
; CHECK: define i32 @foo(i32 %x, <3 x i32> %payloadHeader)
2626
; CHECK-NOT: define i32 @foo(i32 %x)

IGC/Compiler/tests/AddImplicitArgs/signatiure_changed_R0_PayloadHeader_Localds-typed-pointers.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
;=========================== begin_copyright_notice ============================
22
;
3-
; Copyright (C) 2017-2021 Intel Corporation
3+
; Copyright (C) 2017-2025 Intel Corporation
44
;
55
; SPDX-License-Identifier: MIT
66
;
@@ -25,5 +25,5 @@ define i32 @foo(i32 %x) nounwind {
2525
!9 = !{i32 8}
2626
!10 = !{i32 9}
2727

28-
; CHECK: define i32 @foo(i32 %x, <8 x i32> %r0, <8 x i32> %payloadHeader, i16 %localIdX, i16 %localIdY, i16 %localIdZ)
28+
; CHECK: define i32 @foo(i32 %x, <8 x i32> %r0, <3 x i32> %payloadHeader, i16 %localIdX, i16 %localIdY, i16 %localIdZ)
2929
; CHECK-NOT: define i32 @foo(i32 %x)

IGC/Compiler/tests/AddImplicitArgs/signatiure_changed_R0_PayloadHeader_Localds.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
;=========================== begin_copyright_notice ============================
22
;
3-
; Copyright (C) 2017-2024 Intel Corporation
3+
; Copyright (C) 2017-2025 Intel Corporation
44
;
55
; SPDX-License-Identifier: MIT
66
;
@@ -26,5 +26,5 @@ define i32 @foo(i32 %x) nounwind {
2626
!9 = !{i32 8}
2727
!10 = !{i32 9}
2828

29-
; CHECK: define i32 @foo(i32 %x, <8 x i32> %r0, <8 x i32> %payloadHeader, i16 %localIdX, i16 %localIdY, i16 %localIdZ)
29+
; CHECK: define i32 @foo(i32 %x, <8 x i32> %r0, <3 x i32> %payloadHeader, i16 %localIdX, i16 %localIdY, i16 %localIdZ)
3030
; CHECK-NOT: define i32 @foo(i32 %x)

IGC/Compiler/tests/DebugInfo/AddImplicitArgs-typed-pointers.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
;=========================== begin_copyright_notice ============================
22
;
3-
; Copyright (C) 2017-2024 Intel Corporation
3+
; Copyright (C) 2017-2025 Intel Corporation
44
;
55
; SPDX-License-Identifier: MIT
66
;
@@ -20,7 +20,7 @@ entry:
2020
store float 1.000000e+00, float addrspace(1)* %dst, align 4
2121
ret void
2222

23-
; CHECK: define void @test(float addrspace(1)* %dst, <8 x i32> %r0, <8 x i32> %payloadHeader)
23+
; CHECK: define void @test(float addrspace(1)* %dst, <8 x i32> %r0, <3 x i32> %payloadHeader)
2424
; CHECK: call void @llvm.dbg.value({{.*}})
2525
}
2626

IGC/Compiler/tests/DebugInfo/AddImplicitArgs.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
;=========================== begin_copyright_notice ============================
22
;
3-
; Copyright (C) 2017-2024 Intel Corporation
3+
; Copyright (C) 2017-2025 Intel Corporation
44
;
55
; SPDX-License-Identifier: MIT
66
;
@@ -21,7 +21,7 @@ entry:
2121
store float 1.000000e+00, ptr addrspace(1) %dst, align 4
2222
ret void
2323

24-
; CHECK: define void @test(ptr addrspace(1) %dst, <8 x i32> %r0, <8 x i32> %payloadHeader)
24+
; CHECK: define void @test(ptr addrspace(1) %dst, <8 x i32> %r0, <3 x i32> %payloadHeader)
2525
; CHECK: call void @llvm.dbg.value({{.*}})
2626
}
2727

IGC/ocloc_tests/features/fp64_conv_emu/fp64_conv_emu.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ SPDX-License-Identifier: MIT
1414
// CHECK-LABEL: @conversion_kernel(
1515
// CHECK-BASE: entry:
1616
// CHECK-BASE: [[DPEmuFlag:%.*]] = alloca i32, align 4
17-
// CHECK-BASE: [[TMP0:%.*]] = extractelement <8 x i32> %payloadHeader, i64 0
17+
// CHECK-BASE: [[TMP0:%.*]] = extractelement <3 x i32> %payloadHeader, i64 0
1818
// CHECK-BASE: [[TMP1:%.*]] = extractelement <3 x i32> %enqueuedLocalSize, i64 0
1919
// CHECK-BASE: [[TMP2:%.*]] = extractelement <8 x i32> %r0, i64 1
2020
// CHECK-BASE: [[MUL:%.*]] = mul i32 [[TMP1]], [[TMP2]]

IGC/ocloc_tests/features/fp64_conv_emu/fp64_conv_emu_fcmp.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ SPDX-License-Identifier: MIT
1313

1414
// CHECK-LABEL: @fcmp_kernel(
1515
// CHECK-BASE: entry:
16-
// CHECK-BASE: [[TMP0:%.*]] = extractelement <8 x i32> %payloadHeader, i64 0
16+
// CHECK-BASE: [[TMP0:%.*]] = extractelement <3 x i32> %payloadHeader, i64 0
1717
// CHECK-BASE: [[TMP1:%.*]] = extractelement <3 x i32> %enqueuedLocalSize, i64 0
1818
// CHECK-BASE: [[TMP2:%.*]] = extractelement <8 x i32> %r0, i64 1
1919
// CHECK-BASE: [[MUL:%.*]] = mul i32 [[TMP1]], [[TMP2]]

0 commit comments

Comments
 (0)