Skip to content

Commit 42ddbf6

Browse files
aratajew and igcbot
authored and committed
Fix LSC immediate offset pattern match
Starting with Xe2, hardware is able to handle the following load:

```c
load [VAR+IMM]
```

Formerly, a separate add instruction had to be emitted:

```c
add tmp VAR IMM
load [tmp]
```

However, HW does an early bounds check on VAR. Thus, if VAR is negative, the bounds check fails early even though the immediate offset would bring the final address calculation to a positive number. This shows up when one indexes from the top of an SLM allocation:

```c
float slm[IMM];
slm[IMM - var] = ...
```

This change improves the LSC immediate offset pattern match so that it is applied only if VAR can be proven to be a positive value.
1 parent cb5ea20 commit 42ddbf6

File tree

2 files changed

+107
-4
lines changed

2 files changed

+107
-4
lines changed

IGC/Compiler/CISACodeGen/PatternMatchPass.cpp

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2039,7 +2039,7 @@ namespace IGC
20392039
pass->PredAdd(pred, invertPred, sources, modf);
20402040
}
20412041

2042-
// Cannot use the ".sat" modifier as it is applied only to data lanes enabled by the predication
2042+
// Cannot use the ".sat" modifier as it is applied only to data lanes enabled by the predication
20432043
// while saturation operation matched MatchFloatingPointSatModifier should be executed
20442044
// irrespective of the predicate value matched by MatchPredAdd.
20452045
bool supportsSaturate() override
@@ -2717,6 +2717,17 @@ namespace IGC
27172717
IGC_ASSERT_MESSAGE(!isConstant0 || !isConstant1,
27182718
"Both operands are immediate - constants should be folded elsewhere.");
27192719

2720+
llvm::Value* varOffset = isConstant0 ?
2721+
addSubInst->getOperand(1) : addSubInst->getOperand(0);
2722+
2723+
// HW does an early bounds check on varOffset for A32 messages. Thus, if varOffset
2724+
// is negative, then the bounds check fails early even though the immediate offset
2725+
// would bring the final calculation to a positive number.
2726+
if (!isA64AddressingModel && !valueIsPositive(varOffset, m_DL))
2727+
return false;
2728+
2729+
MarkAsSource(varOffset, IsSourceOfSample(&I));
2730+
27202731
unsigned numSources = GetNbSources(I);
27212732
for (unsigned i = 0; i < numSources; i++) {
27222733
if (I.getOperand(i) != intToPtrInst && I.getOperand(i) != addSubInst) {
@@ -2726,9 +2737,6 @@ namespace IGC
27262737

27272738
llvm::Value* immOffset = isConstant0 ?
27282739
addSubInst->getOperand(0) : addSubInst->getOperand(1);
2729-
llvm::Value* varOffset = isConstant0 ?
2730-
addSubInst->getOperand(1) : addSubInst->getOperand(0);
2731-
MarkAsSource(varOffset, IsSourceOfSample(&I));
27322740

27332741
LSCImmOffsetPattern* pattern = new (m_allocator)
27342742
LSCImmOffsetPattern(&I, varOffset, llvm::dyn_cast<llvm::ConstantInt>(immOffset));
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2024 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
; REQUIRES: regkeys
9+
;
10+
; RUN: igc_opt -platformbmg -igc-emit-visa %s -regkey DumpVISAASMToConsole | FileCheck %s
11+
; ------------------------------------------------
12+
; EmitVISAPass
13+
; ------------------------------------------------
14+
15+
; Starting with Xe2, hardware is able to handle the following load:
16+
;
17+
; load [VAR+IMM]
18+
;
19+
; Formerly emitting an add instruction would be required:
20+
;
21+
; add tmp VAR IMM
22+
; load [tmp]
23+
;
24+
; However, HW does an early bounds check on VAR. Thus, if VAR is negative,
25+
; then the bounds check fails early even though the immediate offset would
26+
; bring the final calculation to a positive number.
27+
;
28+
; This shows up when one indexes from the top of an SLM allocation:
29+
;
30+
; float slm[IMM];
31+
; slm[IMM - var] = ...
32+
33+
; This test verifies that the immediate offset pattern match is applied ONLY
34+
; when VAR is proven to be a positive value.
35+
36+
define spir_kernel void @test(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, <8 x i32> %r0, <8 x i32> %payloadHeader, i32 %bufferOffset, i16 %localIdX, i16 %localIdY, i16 %localIdZ) {
37+
entry:
38+
%0 = zext i16 %localIdX to i32
39+
%1 = zext i16 %localIdY to i32
40+
41+
%2 = shl nuw nsw i32 %1, 6
42+
%3 = shl nuw nsw i32 %0, 2
43+
%4 = sub i32 0, %3
44+
%5 = sub i32 %4, %2
45+
%6 = add nsw i32 %5, 1020
46+
%7 = inttoptr i32 %6 to i32 addrspace(3)*
47+
; COM: VAR (%5) is a negative value, so immediate global offset cannot be applied
48+
; CHECK-NOT: lsc_load.slm (M1_NM, 1) {{V[0-9]+}}:d32t flat[{{V[0-9]+}}+0x3FC]:a32
49+
; CHECK: lsc_load.slm (M1_NM, 1) {{V[0-9]+}}:d32t flat[{{V[0-9]+}}]:a32
50+
%8 = load i32, i32 addrspace(3)* %7, align 4
51+
store i32 %8, i32 addrspace(1)* %out0
52+
53+
; Test positive var offset
54+
%9 = add nuw nsw i32 %0, 256
55+
%10 = inttoptr i32 %9 to i32 addrspace(3)*
56+
; COM: VAR (%0) is a positive value, so immediate global offset can be applied
57+
; CHECK: lsc_load.slm (M1_NM, 1) {{V[0-9]+}}:d32t flat[{{.*}}+0x100]:a32
58+
%11 = load i32, i32 addrspace(3)* %10, align 4
59+
store i32 %11, i32 addrspace(1)* %out1
60+
ret void
61+
}
62+
63+
!IGCMetadata = !{!0}
64+
!igc.functions = !{!22}
65+
66+
!0 = !{!"ModuleMD", !1, !21}
67+
!1 = !{!"FuncMD", !2, !3}
68+
!2 = !{!"FuncMDMap[0]", void (i32 addrspace(1)*, i32 addrspace(1)*, <8 x i32>, <8 x i32>, i32, i16, i16, i16)* @test}
69+
!3 = !{!"FuncMDValue[0]", !4, !17}
70+
!4 = !{!"resAllocMD", !5}
71+
!5 = !{!"argAllocMDList", !6, !10, !11, !14, !15, !16}
72+
!6 = !{!"argAllocMDListVec[0]", !7, !8, !9}
73+
!7 = !{!"type", i32 0}
74+
!8 = !{!"extensionType", i32 -1}
75+
!9 = !{!"indexType", i32 -1}
76+
!10 = !{!"argAllocMDListVec[1]", !7, !8, !9}
77+
!11 = !{!"argAllocMDListVec[2]", !12, !8, !13}
78+
!12 = !{!"type", i32 1}
79+
!13 = !{!"indexType", i32 0}
80+
!14 = !{!"argAllocMDListVec[3]", !7, !8, !9}
81+
!15 = !{!"argAllocMDListVec[4]", !7, !8, !9}
82+
!16 = !{!"argAllocMDListVec[5]", !7, !8, !9}
83+
!17 = !{!"m_OpenCLArgTypeQualifiers", !18, !19, !20}
84+
!18 = !{!"m_OpenCLArgTypeQualifiersVec[0]", !""}
85+
!19 = !{!"m_OpenCLArgTypeQualifiersVec[1]", !""}
86+
!20 = !{!"m_OpenCLArgTypeQualifiersVec[2]", !""}
87+
!21 = !{!"isHDCFastClearShader", i1 false}
88+
!22 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, <8 x i32>, <8 x i32>, i32, i16, i16, i16)* @test, !23}
89+
!23 = !{!24, !25}
90+
!24 = !{!"function_type", i32 0}
91+
!25 = !{!"implicit_arg_desc", !26, !27, !28}
92+
!26 = !{i32 0}
93+
!27 = !{i32 1}
94+
!28 = !{i32 14, !29}
95+
!29 = !{!"explicit_arg_num", i32 2}

0 commit comments

Comments
 (0)