Skip to content

Commit ef12a59

Browse files
vmustyaigcbot
authored and committed
Fold immediate offsets for 2d block load/store operations
The hardware supports immediate offsets for 2d block load, store and prefetch operations. So, VC can eliminate the add and sub instructions which apply the immediate offset to the X and Y indices of the block.
1 parent 281e286 commit ef12a59

File tree

3 files changed

+230
-0
lines changed

3 files changed

+230
-0
lines changed

IGC/VectorCompiler/lib/GenXCodeGen/GenXLscAddrCalcFolding.cpp

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ SPDX-License-Identifier: MIT
1616

1717
#include "llvm/CodeGen/TargetPassConfig.h"
1818
#include "llvm/IR/InstVisitor.h"
19+
#include "llvm/InitializePasses.h"
1920
#include "llvm/Pass.h"
2021

2122
#define DEBUG_TYPE "genx-lsc-addr-calc-folding"
@@ -47,8 +48,17 @@ class GenXLscAddrCalcFolding : public FunctionPass,
4748

4849
Value *applyLscAddrFolding(Value *Offsets, APInt &Scale, APInt &Offset);
4950

51+
static constexpr unsigned Block2DIndexX = 10;
52+
static constexpr unsigned Block2DIndexY = 11;
53+
static constexpr unsigned Block2DOffsetX = 12;
54+
static constexpr unsigned Block2DOffsetY = 13;
55+
56+
bool foldLscBlock2DAddrCalculation(CallInst &CI, unsigned IndexArg,
57+
unsigned OffsetArg);
58+
5059
const GenXSubtarget *ST = nullptr;
5160

61+
unsigned Supported2DOffsetBits = 0;
5262
bool Changed = false;
5363
};
5464

@@ -62,6 +72,7 @@ void initializeGenXLscAddrCalcFoldingPass(PassRegistry &);
6272

6373
INITIALIZE_PASS_BEGIN(GenXLscAddrCalcFolding, "GenXLscAddrCalcFolding",
6474
"GenXLscAddrCalcFolding", false, false)
75+
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
6576
INITIALIZE_PASS_END(GenXLscAddrCalcFolding, "GenXLscAddrCalcFolding",
6677
"GenXLscAddrCalcFolding", false, false)
6778

@@ -112,9 +123,83 @@ void GenXLscAddrCalcFolding::visitCallInst(CallInst &CI) {
112123
case vc::InternalIntrinsic::lsc_store_quad_bti:
113124
Changed |= foldLscAddrCalculation(CI);
114125
break;
126+
case vc::InternalIntrinsic::lsc_load_block_2d_ugm:
127+
case vc::InternalIntrinsic::lsc_load_block_2d_ugm_transposed:
128+
case vc::InternalIntrinsic::lsc_load_block_2d_ugm_vnni:
129+
case vc::InternalIntrinsic::lsc_prefetch_block_2d_ugm:
130+
case vc::InternalIntrinsic::lsc_store_block_2d_ugm:
131+
Changed |= foldLscBlock2DAddrCalculation(CI, Block2DIndexX, Block2DOffsetX);
132+
Changed |= foldLscBlock2DAddrCalculation(CI, Block2DIndexY, Block2DOffsetY);
133+
break;
115134
}
116135
}
117136

137+
// Folds constant add/sub chains feeding a 2d block X/Y index operand into
// the matching immediate offset operand of the intrinsic call.
//
// Walks the chain of (index +/- constant) binary operators reaching the
// IndexArg operand of CI and accumulates the constants into the immediate
// offset already held in OffsetArg. Folding stops at the first non-add/sub
// operation, non-constant right operand, or signed overflow. The resulting
// offset is only committed if, scaled by the memory element size, it stays
// dword-aligned (presumably a hardware encoding restriction — see the
// alignment check below).
//
// \param CI        2d block load/store/prefetch internal intrinsic call.
// \param IndexArg  operand index of the block X or Y index.
// \param OffsetArg operand index of the corresponding immediate offset.
// \return true if CI was modified, false otherwise.
bool GenXLscAddrCalcFolding::foldLscBlock2DAddrCalculation(CallInst &CI,
                                                           unsigned IndexArg,
                                                           unsigned OffsetArg) {
  IGC_ASSERT(ST->hasLSCMessages() && ST->hasLSCOffset());

  auto *CurIndex = CI.getArgOperand(IndexArg);
  auto *const OrigIndex = CurIndex;
  auto AccOffset = cast<ConstantInt>(CI.getArgOperand(OffsetArg))->getValue();

  // Peel add/sub-by-constant operations off the index, folding each
  // constant into the accumulated immediate offset.
  for (;;) {
    auto *BinOp = dyn_cast<BinaryOperator>(CurIndex);
    if (!BinOp)
      break;

    const auto Opcode = BinOp->getOpcode();
    if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
      break;

    auto *Addend = dyn_cast<ConstantInt>(BinOp->getOperand(1));
    if (!Addend)
      break;

    // Stop folding as soon as the signed accumulation would overflow.
    bool Overflow = false;
    APInt Folded = Opcode == Instruction::Add
                       ? AccOffset.sadd_ov(Addend->getValue(), Overflow)
                       : AccOffset.ssub_ov(Addend->getValue(), Overflow);
    if (Overflow)
      break;

    AccOffset = std::move(Folded);
    CurIndex = BinOp->getOperand(0);

    LLVM_DEBUG(dbgs() << "LSC address folding found, index: " << *CurIndex
                      << ", offset: " << AccOffset.getSExtValue() << "\n");
  }

  // Nothing could be folded.
  if (CurIndex == OrigIndex)
    return false;

  const auto OffsetV = AccOffset.getSExtValue();
  const auto ElementSizeBits =
      vc::InternalIntrinsic::getMemoryRegisterElementSize(&CI);
  // The folded offset, in bits of memory data, must be dword-aligned;
  // otherwise keep the original index/offset operands untouched.
  if (OffsetV * ElementSizeBits % genx::DWordBits != 0) {
    LLVM_DEBUG(dbgs() << "Offset is not dword-aligned\n");
    return false;
  }

  IRBuilder<> Builder(&CI);

  LLVM_DEBUG(dbgs() << "Folding LSC address calculation for instruction: " << CI
                    << "\n");
  CI.setArgOperand(IndexArg, CurIndex);
  CI.setArgOperand(OffsetArg, Builder.getInt32(OffsetV));
  LLVM_DEBUG(dbgs() << "Updated instruction: " << CI << "\n");

  return true;
}
202+
118203
bool GenXLscAddrCalcFolding::foldLscAddrCalculation(CallInst &Inst) {
119204
constexpr unsigned AddrIndex = 6, ScaleIndex = 7, OffsetIndex = 8;
120205

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
;=========================== begin_copyright_notice ============================
;
; Copyright (C) 2024 Intel Corporation
;
; SPDX-License-Identifier: MIT
;
;============================ end_copyright_notice =============================

; End-to-end test: the 2d block load/store/prefetch intrinsics below carry
; constant immediate X/Y offsets, and the generated visa asm must encode them
; directly in the block address operand (e.g. [[X]]+16, [[Y]]-32).

; RUN: %opt %use_old_pass_manager% -GenXModule -GenXCategoryWrapper -GenXCisaBuilderPass -GenXFinalizer \
; RUN: -march=genx64 -mtriple=spir64-unknown-unknown -finalizer-opts="-dumpcommonisa -isaasmToConsole" \
; RUN: -mcpu=Xe2 -o /dev/null < %s | FileCheck %s

declare <16 x i32> @llvm.vc.internal.lsc.load.block.2d.ugm.v16i32.v2i8(i1, i8, <2 x i8>, i8, i16, i16, i64, i32, i32, i32, i32, i32, i32, i32, <16 x i32>)
declare <32 x i16> @llvm.vc.internal.lsc.load.block.2d.ugm.v32i16.v2i8(i1, i8, <2 x i8>, i8, i16, i16, i64, i32, i32, i32, i32, i32, i32, i32, <32 x i16>)
declare <16 x i32> @llvm.vc.internal.lsc.load.block.2d.ugm.transposed.v16i32.v2i8(i1, i8, <2 x i8>, i8, i16, i16, i64, i32, i32, i32, i32, i32, i32, i32, <16 x i32>)
declare <64 x i8> @llvm.vc.internal.lsc.load.block.2d.ugm.vnni.v64i8.v2i8(i1, i8, <2 x i8>, i8, i16, i16, i64, i32, i32, i32, i32, i32, i32, i32, <64 x i8>)

declare void @llvm.vc.internal.lsc.prefetch.block.2d.ugm.v2i8(i1, i8, <2 x i8>, i8, i16, i16, i64, i32, i32, i32, i32, i32, i32, i32)

declare void @llvm.vc.internal.lsc.store.block.2d.ugm.v2i8.v16i32(i1, i8, <2 x i8>, i8, i16, i16, i64, i32, i32, i32, i32, i32, i32, i32, <16 x i32>)

; CHECK-LABEL: .kernel "test"

; Kernel argument declarations: base pointer, surface width/height/pitch and
; the X/Y block indices, all passed as kernel inputs.
; CHECK: .decl [[BASE:V[0-9]+]] v_type=G type=uq num_elts=1 alias=<[[IBASE:V[0-9]+]], 0>
; CHECK: .decl [[WIDTH:V[0-9]+]] v_type=G type=ud num_elts=1 alias=<[[IWIDTH:V[0-9]+]], 0>
; CHECK: .decl [[HEIGHT:V[0-9]+]] v_type=G type=ud num_elts=1 alias=<[[IHEIGHT:V[0-9]+]], 0>
; CHECK: .decl [[PITCH:V[0-9]+]] v_type=G type=ud num_elts=1 alias=<[[IPITCH:V[0-9]+]], 0>
; CHECK: .decl [[X:V[0-9]+]] v_type=G type=d num_elts=1 alias=<[[IX:V[0-9]+]], 0>
; CHECK: .decl [[Y:V[0-9]+]] v_type=G type=d num_elts=1 alias=<[[IY:V[0-9]+]], 0>
; CHECK: .input [[IBASE]] offset=64 size=8
; CHECK: .input [[IWIDTH]] offset=72 size=4
; CHECK: .input [[IHEIGHT]] offset=76 size=4
; CHECK: .input [[IPITCH]] offset=80 size=4
; CHECK: .input [[IX]] offset=84 size=4
; CHECK: .input [[IY]] offset=88 size=4

define dllexport spir_kernel void @test(i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y) {
; Plain 2d block load with offsets (+16, +32).
; CHECK: lsc_load_block2d.ugm.uc.ca (M1, 1) [[LOAD:V[0-9]+]]:d32.8x2nn flat[[[BASE]],[[WIDTH]],[[HEIGHT]],[[PITCH]],[[X]]+16,[[Y]]+32]
  %load = call <16 x i32> @llvm.vc.internal.lsc.load.block.2d.ugm.v16i32.v2i8(i1 true, i8 3, <2 x i8> <i8 1, i8 2>, i8 1, i16 8, i16 2, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y, i32 16, i32 32, <16 x i32> undef)
; Multi-block (2x) load with a negative Y offset (+128, -32).
; CHECK: lsc_load_block2d.ugm.ca.uc (M1, 1) [[LOAD2:V[0-9]+]]:d16.2x8x2nn flat[[[BASE]],[[WIDTH]],[[HEIGHT]],[[PITCH]],[[X]]+128,[[Y]]-32]
  %load.a2 = call <32 x i16> @llvm.vc.internal.lsc.load.block.2d.ugm.v32i16.v2i8(i1 true, i8 2, <2 x i8> <i8 2, i8 1>, i8 2, i16 8, i16 2, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y, i32 128, i32 -32, <32 x i16> undef)
; Transposed load with a negative X offset (-192, +64).
; CHECK: lsc_load_block2d.ugm.st.uc (M1, 1) [[LOADT:V[0-9]+]]:d32.2x8tn flat[[[BASE]],[[WIDTH]],[[HEIGHT]],[[PITCH]],[[X]]-192,[[Y]]+64]
  %load.t = call <16 x i32> @llvm.vc.internal.lsc.load.block.2d.ugm.transposed.v16i32.v2i8(i1 true, i8 3, <2 x i8> <i8 5, i8 1>, i8 1, i16 2, i16 8, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y, i32 -192, i32 64, <16 x i32> undef)
; VNNI load with a zero X offset (0, +128).
; CHECK: lsc_load_block2d.ugm.st.ca (M1, 1) [[LOADV:V[0-9]+]]:d8.4x16nt flat[[[BASE]],[[WIDTH]],[[HEIGHT]],[[PITCH]],[[X]],[[Y]]+128]
  %load.v = call <64 x i8> @llvm.vc.internal.lsc.load.block.2d.ugm.vnni.v64i8.v2i8(i1 true, i8 1, <2 x i8> <i8 5, i8 2>, i8 1, i16 4, i16 16, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y, i32 0, i32 128, <64 x i8> undef)

; Prefetch with offsets (+256, 0).
; CHECK: lsc_load_block2d.ugm.uc.ca (M1, 1) %null:d64.8x2nn flat[[[BASE]],[[WIDTH]],[[HEIGHT]],[[PITCH]],[[X]]+256,[[Y]]]
  call void @llvm.vc.internal.lsc.prefetch.block.2d.ugm.v2i8(i1 true, i8 4, <2 x i8> <i8 1, i8 2>, i8 1, i16 8, i16 2, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y, i32 256, i32 0)

; Store with both offsets negative (-256, -512).
; CHECK: lsc_store_block2d.ugm.wt.wb (M1, 1) flat[[[BASE]],[[WIDTH]],[[HEIGHT]],[[PITCH]],[[X]]-256,[[Y]]-512] [[LOAD:V[0-9]+]]:d32.8x2nn
  call void @llvm.vc.internal.lsc.store.block.2d.ugm.v2i8.v16i32(i1 true, i8 3, <2 x i8> <i8 4, i8 3>, i8 1, i16 8, i16 2, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y, i32 -256, i32 -512, <16 x i32> %load)
  ret void
}

attributes #1 = { noinline nounwind "CMGenxMain" "VC.Stack.Amount"="0" "target-cpu"="XeHPC" }

!spirv.Source = !{!0}
!opencl.spir.version = !{!1}
!opencl.ocl.version = !{!0}
!opencl.used.extensions = !{!2}
!opencl.used.optional.core.features = !{!2}
!spirv.Generator = !{!3}
!genx.kernels = !{!4}
!genx.kernel.internal = !{!8}

!0 = !{i32 0, i32 0}
!1 = !{i32 1, i32 2}
!2 = !{}
!3 = !{i16 6, i16 14}
!4 = !{void (i64, i32, i32, i32, i32, i32)* @test, !"test", !5, i32 0, !6, !0, !7, i32 0}
!5 = !{i32 0, i32 0, i32 0, i32 0, i32 0, i32 0}
!6 = !{i32 64, i32 72, i32 76, i32 80, i32 84, i32 88}
!7 = !{!"svmptr_t"}
!8 = !{void (i64, i32, i32, i32, i32, i32)* @test, null, null, null, null}
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
;=========================== begin_copyright_notice ============================
;
; Copyright (C) 2024 Intel Corporation
;
; SPDX-License-Identifier: MIT
;
;============================ end_copyright_notice =============================

; Tests that the GenXLscAddrCalcFolding pass folds constant add/sub operations
; applied to the X/Y block indices of the 2d block load/store/prefetch
; intrinsics into their immediate offset operands.

; RUN: %opt %use_old_pass_manager% -GenXLscAddrCalcFolding -march=genx64 -mcpu=Xe2 -mtriple=spir64-unknown-unknown -S < %s | FileCheck %s

declare <16 x i32> @llvm.vc.internal.lsc.load.block.2d.ugm.v16i32.v2i8(i1, i8, <2 x i8>, i8, i16, i16, i64, i32, i32, i32, i32, i32, i32, i32, <16 x i32>)
declare <16 x i32> @llvm.vc.internal.lsc.load.block.2d.ugm.transposed.v16i32.v2i8(i1, i8, <2 x i8>, i8, i16, i16, i64, i32, i32, i32, i32, i32, i32, i32, <16 x i32>)
declare <64 x i8> @llvm.vc.internal.lsc.load.block.2d.ugm.vnni.v64i8.v2i8(i1, i8, <2 x i8>, i8, i16, i16, i64, i32, i32, i32, i32, i32, i32, i32, <64 x i8>)

declare void @llvm.vc.internal.lsc.prefetch.block.2d.ugm.v2i8(i1, i8, <2 x i8>, i8, i16, i16, i64, i32, i32, i32, i32, i32, i32, i32)

declare void @llvm.vc.internal.lsc.store.block.2d.ugm.v2i8.v16i32(i1, i8, <2 x i8>, i8, i16, i16, i64, i32, i32, i32, i32, i32, i32, i32, <16 x i32>)

; Plain load: %x+16 and %y-32 become immediate offsets 16 and -32.
; CHECK-LABEL: @test1(
define <16 x i32> @test1(i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y) {
  %xoff = add i32 %x, 16
  %yoff = sub i32 %y, 32
; CHECK: %load = call <16 x i32> @llvm.vc.internal.lsc.load.block.2d.ugm.v16i32.v2i8(i1 true, i8 3, <2 x i8> <i8 1, i8 2>, i8 1, i16 8, i16 2, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y, i32 16, i32 -32, <16 x i32> undef)
  %load = call <16 x i32> @llvm.vc.internal.lsc.load.block.2d.ugm.v16i32.v2i8(i1 true, i8 3, <2 x i8> <i8 1, i8 2>, i8 1, i16 8, i16 2, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %xoff, i32 %yoff, i32 0, i32 0, <16 x i32> undef)
  ret <16 x i32> %load
}

; Transposed load: same folding applies.
; CHECK-LABEL: @test2(
define <16 x i32> @test2(i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y) {
  %xoff = add i32 %x, 16
  %yoff = sub i32 %y, 32
; CHECK: %load = call <16 x i32> @llvm.vc.internal.lsc.load.block.2d.ugm.transposed.v16i32.v2i8(i1 true, i8 3, <2 x i8> <i8 1, i8 2>, i8 1, i16 2, i16 8, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y, i32 16, i32 -32, <16 x i32> undef)
  %load = call <16 x i32> @llvm.vc.internal.lsc.load.block.2d.ugm.transposed.v16i32.v2i8(i1 true, i8 3, <2 x i8> <i8 1, i8 2>, i8 1, i16 2, i16 8, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %xoff, i32 %yoff, i32 0, i32 0, <16 x i32> undef)
  ret <16 x i32> %load
}

; VNNI load: same folding applies.
; CHECK-LABEL: @test3(
define <64 x i8> @test3(i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y) {
  %xoff = add i32 %x, 16
  %yoff = sub i32 %y, 32
; CHECK: %load = call <64 x i8> @llvm.vc.internal.lsc.load.block.2d.ugm.vnni.v64i8.v2i8(i1 true, i8 1, <2 x i8> <i8 1, i8 2>, i8 1, i16 4, i16 16, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y, i32 16, i32 -32, <64 x i8> undef)
  %load = call <64 x i8> @llvm.vc.internal.lsc.load.block.2d.ugm.vnni.v64i8.v2i8(i1 true, i8 1, <2 x i8> <i8 1, i8 2>, i8 1, i16 4, i16 16, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %xoff, i32 %yoff, i32 0, i32 0, <64 x i8> undef)
  ret <64 x i8> %load
}

; Store: same folding applies.
; CHECK-LABEL: @test4(
define void @test4(i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y, <16 x i32> %data) {
  %xoff = add i32 %x, 16
  %yoff = sub i32 %y, 32
; CHECK: call void @llvm.vc.internal.lsc.store.block.2d.ugm.v2i8.v16i32(i1 true, i8 3, <2 x i8> <i8 1, i8 2>, i8 1, i16 8, i16 2, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y, i32 16, i32 -32, <16 x i32> %data)
  call void @llvm.vc.internal.lsc.store.block.2d.ugm.v2i8.v16i32(i1 true, i8 3, <2 x i8> <i8 1, i8 2>, i8 1, i16 8, i16 2, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %xoff, i32 %yoff, i32 0, i32 0, <16 x i32> %data)
  ret void
}

; Prefetch: same folding applies.
; CHECK-LABEL: @test5(
define void @test5(i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y) {
  %xoff = add i32 %x, 16
  %yoff = sub i32 %y, 32
; CHECK: call void @llvm.vc.internal.lsc.prefetch.block.2d.ugm.v2i8(i1 true, i8 3, <2 x i8> <i8 1, i8 2>, i8 1, i16 8, i16 2, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y, i32 16, i32 -32)
  call void @llvm.vc.internal.lsc.prefetch.block.2d.ugm.v2i8(i1 true, i8 3, <2 x i8> <i8 1, i8 2>, i8 1, i16 8, i16 2, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %xoff, i32 %yoff, i32 0, i32 0)
  ret void
}

; Negative case: the X offset of 17 d8 elements (17 * 8 = 136 bits) is not
; dword-aligned, so %xoff must stay unfolded; the Y offset (-32) still folds.
; CHECK-LABEL: @test6(
define <64 x i8> @test6(i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y) {
  %xoff = add i32 %x, 17
  %yoff = sub i32 %y, 32
; CHECK: %load = call <64 x i8> @llvm.vc.internal.lsc.load.block.2d.ugm.vnni.v64i8.v2i8(i1 true, i8 1, <2 x i8> <i8 1, i8 2>, i8 1, i16 4, i16 16, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %xoff, i32 %y, i32 0, i32 -32, <64 x i8> undef)
  %load = call <64 x i8> @llvm.vc.internal.lsc.load.block.2d.ugm.vnni.v64i8.v2i8(i1 true, i8 1, <2 x i8> <i8 1, i8 2>, i8 1, i16 4, i16 16, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %xoff, i32 %yoff, i32 0, i32 0, <64 x i8> undef)
  ret <64 x i8> %load
}

0 commit comments

Comments
 (0)