Skip to content

Commit 6c202ab

Browse files
adam-bzowski authored and
igcbot committed
[IGC Core] LSC 2D load splits: block loads
Coalescing block reads can improve performance significantly, as it decreases the number of block read instructions, decreases the address payload required by the block reads, etc. However, coalescing block reads is not always profitable: e.g., under high register pressure, issuing more block reads can shorten live ranges. This is the first in a series of PRs that split large block reads into smaller blocks. It provides an interface and a pass for splitting block loads (i.e., old-style intrinsics) into smaller loads. Follow-up PRs will extend the functionality of the pass to more general cases.
1 parent 968e644 commit 6c202ab

14 files changed

+5808
-0
lines changed

IGC/Compiler/CISACodeGen/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ set(IGC_BUILD__SRC__CISACodeGen_Common
7676
"${CMAKE_CURRENT_SOURCE_DIR}/PromoteConstantStructs.cpp"
7777
"${CMAKE_CURRENT_SOURCE_DIR}/PromoteInt8Type.cpp"
7878
"${CMAKE_CURRENT_SOURCE_DIR}/SinkCommonOffsetFromGEP.cpp"
79+
"${CMAKE_CURRENT_SOURCE_DIR}/SplitLoads.cpp"
7980
"${CMAKE_CURRENT_SOURCE_DIR}/PruneUnusedArguments.cpp"
8081
"${CMAKE_CURRENT_SOURCE_DIR}/PullConstantHeuristics.cpp"
8182
"${CMAKE_CURRENT_SOURCE_DIR}/PushAnalysis.cpp"
@@ -201,6 +202,7 @@ set(IGC_BUILD__HDR__CISACodeGen_Common
201202
"${CMAKE_CURRENT_SOURCE_DIR}/ScalarizerCodeGen.hpp"
202203
"${CMAKE_CURRENT_SOURCE_DIR}/ShaderCodeGen.hpp"
203204
"${CMAKE_CURRENT_SOURCE_DIR}/ShaderUnits.hpp"
205+
"${CMAKE_CURRENT_SOURCE_DIR}/SplitLoads.h"
204206
"${CMAKE_CURRENT_SOURCE_DIR}/Simd32Profitability.hpp"
205207
"${CMAKE_CURRENT_SOURCE_DIR}/SinkCommonOffsetFromGEP.h"
206208
"${CMAKE_CURRENT_SOURCE_DIR}/TimeStatsCounter.h"

IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ SPDX-License-Identifier: MIT
3939
#include "Compiler/CISACodeGen/MemOpt.h"
4040
#include "Compiler/CISACodeGen/MemOpt2.h"
4141
#include "Compiler/CISACodeGen/MergeUniformStores.hpp"
42+
#include "Compiler/CISACodeGen/SplitLoads.h"
4243
#include "Compiler/CISACodeGen/PreRARematFlag.h"
4344
#include "Compiler/CISACodeGen/PromoteConstantStructs.hpp"
4445
#include "Compiler/Optimizer/OpenCLPasses/Decompose2DBlockFuncs/Decompose2DBlockFuncs.hpp"
@@ -220,6 +221,10 @@ void AddAnalysisPasses(CodeGenContext& ctx, IGCPassManager& mpm)
220221
mpm.add(createMemOpt2Pass(16));
221222
}
222223

224+
if (!isOptDisabled) {
225+
mpm.add(createSplitLoadsPass());
226+
}
227+
223228
// only limited code-sinking to several shader-type
224229
// vs input has the URB-reuse issue to be resolved.
225230
// Also need to understand the performance benefit better.

IGC/Compiler/CISACodeGen/SplitLoads.cpp

Lines changed: 2090 additions & 0 deletions
Large diffs are not rendered by default.

IGC/Compiler/CISACodeGen/SplitLoads.h

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
/*========================== begin_copyright_notice ============================
2+
3+
Copyright (C) 2025 Intel Corporation
4+
5+
SPDX-License-Identifier: MIT
6+
7+
============================= end_copyright_notice ===========================*/
8+
9+
#pragma once
10+
11+
#include "Compiler/CISACodeGen/IGCLivenessAnalysis.h"
12+
#include "Compiler/CodeGenPublic.h"
13+
#include "GenISAIntrinsics/GenIntrinsicInst.h"
14+
15+
#include "common/LLVMWarningsPush.hpp"
16+
#include "llvm/ADT/SmallPtrSet.h"
17+
#include "llvm/ADT/SmallVector.h"
18+
#include "llvm/IR/BasicBlock.h"
19+
#include "llvm/IR/Function.h"
20+
#include "llvm/IR/Instruction.h"
21+
#include "llvm/IR/Value.h"
22+
#include "llvm/Pass.h"
23+
#include "common/LLVMWarningsPop.hpp"
24+
25+
#include <memory>
26+
#include <set>
27+
#include <utility>
28+
29+
namespace llvm {
30+
class FunctionPass;
31+
}
32+
33+
namespace IGC {
34+
namespace LS {
35+
36+
/// A `struct` containing two dimensions of a block.
37+
struct Dims {
38+
unsigned grSize, numOfGr;
39+
unsigned size() const { return grSize * numOfGr; }
40+
41+
bool operator<(const Dims &rhs) const {
42+
return grSize < rhs.grSize ||
43+
(grSize == rhs.grSize && numOfGr < rhs.numOfGr);
44+
}
45+
};
46+
47+
using PossibleDims = std::set<Dims>;
48+
49+
struct Config {
50+
Module *M = nullptr; // for debug info
51+
CodeGenContext *CGC = nullptr;
52+
IGCLivenessAnalysis *RPE = nullptr;
53+
54+
bool isLegitW8 = false;
55+
unsigned sizeOfRegs_B = 0;
56+
unsigned numOfRegs = 0;
57+
unsigned defaultSimd = 0;
58+
unsigned actualSimd = 0;
59+
60+
/// Turns on the splitting pass.
61+
bool enableLoadSplitting = IGC_IS_FLAG_ENABLED(LS_enableLoadSplitting);
62+
63+
/// If `true`, the register pressure data is ignored and the pass splits all
64+
/// loads.
65+
bool ignoreSplitThreshold = IGC_IS_FLAG_ENABLED(LS_ignoreSplitThreshold);
66+
67+
/// Minimal split size in terms of GRFs, used in determination of the possible
68+
/// split dimensions.
69+
unsigned minSplitSize_GRF = IGC_GET_FLAG_VALUE(LS_minSplitSize_GRF);
70+
71+
/// Minimal split size in terms of vector elements (bit width-independent),
72+
/// used in determination of the possible split dimensions.
73+
unsigned minSplitSize_E = IGC_GET_FLAG_VALUE(LS_minSplitSize_E);
74+
75+
/// If `ignoreSplitThreshold = false`, the pass splits loads in a given basic
76+
/// block only if the maximal register pressure exceeds total GRFs by this
77+
/// much.
78+
int splitThresholdDelta_GRF = IGC_GET_FLAG_VALUE(LS_splitThresholdDelta_GRF);
79+
80+
/// Minimal split size in bytes, to be calculated from minSplitSize_GRF.
81+
unsigned minSplitSize_B = 0;
82+
83+
/// Absolute split threshold in bytes.
84+
int splitThreshold_B = 0;
85+
86+
Config(const Config &) = delete;
87+
Config(Config &&) = delete;
88+
89+
/// Value of `SIMD` as reported by metadata.
90+
unsigned SIMD() const { return actualSimd ? actualSimd : defaultSimd; }
91+
92+
static Config &get() {
93+
static Config config;
94+
return config;
95+
}
96+
97+
bool initialize(Function *inF, CodeGenContext *inCGC,
98+
IGCLivenessAnalysis *inRPE);
99+
100+
private:
101+
Config() = default;
102+
};
103+
104+
Config &config();
105+
106+
/// The class `LoadSplitter` is responsible for splitting loads in an LLVM
107+
/// function.
108+
class LoadSplitter {
109+
public:
110+
/// @brief Factory function to create an instance of `LoadSplitter`.
111+
/// @param inF LLVM function pointer.
112+
/// @param inCGC The code generation context.
113+
/// @param inRPE The register pressure estimator.
114+
static std::unique_ptr<LoadSplitter>
115+
Create(Function *inF, CodeGenContext *inCGC, IGCLivenessAnalysis *inRPE);
116+
117+
LoadSplitter(const LoadSplitter &) = delete;
118+
LoadSplitter &operator=(const LoadSplitter &) = delete;
119+
120+
/// @brief Returns `true` if the register pressure for the basic block exceeds
121+
/// the threshold given by the flag IGC_LS_splitThresholdDelta_GRF. The
122+
/// pressure must also exceed the goal, IGC_LS_goalPressureDelta_GRF.
123+
/// @param BB The basic block to check.
124+
bool isRPHigh(BasicBlock *BB);
125+
126+
/// @brief Returns the set of all possible dimensions into which the load or AP
127+
/// loads can be split.
128+
/// @param GII The load or the address payload to split. If `GII` is an AP
129+
/// Load, all loads associated with its AP are considered.
130+
PossibleDims possibleDims(GenIntrinsicInst *GII);
131+
132+
/// @brief Splits the block load into the specified dimensions.
133+
/// @param GII The load or the address payload to split. If `GII` is an AP
134+
/// Load, all loads associated with its AP are considered.
135+
/// @param dims Size of the new blocks.
136+
/// @return Returns `true` on success, `false` otherwise.
137+
bool split(GenIntrinsicInst *GII, Dims dims);
138+
139+
/// @brief Splits all loads in the basic block to the smallest size possible.
140+
/// @param BB The basic block.
141+
/// @return Returns `true` on success, `false` otherwise.
142+
bool splitAllToSmallest(BasicBlock *BB);
143+
144+
private:
145+
LoadSplitter() = default;
146+
struct Impl;
147+
std::unique_ptr<Impl> impl;
148+
};
149+
150+
} // namespace LS
151+
152+
FunctionPass *createSplitLoadsPass();
153+
} // namespace IGC

IGC/Compiler/InitializePasses.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ void initializeSPIRMetaDataTranslationPass(llvm::PassRegistry&);
156156
void initializeSplitStructurePhisPassPass(llvm::PassRegistry&);
157157
void initializeSpv2dBlockIOResolutionPass(llvm::PassRegistry&);
158158
void initializeSpvSubgroupMMAResolutionPass(llvm::PassRegistry&);
159+
void initializeSplitLoadsPass(llvm::PassRegistry&);
159160
void initializeStatelessToStatefulPass(llvm::PassRegistry&);
160161
void initializeSubGroupFuncsResolutionPass(llvm::PassRegistry&);
161162
void initializeSubGroupReductionPatternPass(llvm::PassRegistry&);
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2+
;=========================== begin_copyright_notice ============================
3+
;
4+
; Copyright (C) 2025 Intel Corporation
5+
;
6+
; SPDX-License-Identifier: MIT
7+
;
8+
;============================ end_copyright_notice =============================
9+
10+
; REQUIRES: regkeys
11+
; RUN: igc_opt -S --igc-split-loads -platformpvc --regkey=LS_enableLoadSplitting=1 --regkey=LS_ignoreSplitThreshold=1 --regkey=LS_minSplitSize_GRF=0 --regkey=LS_minSplitSize_E=0 %s | FileCheck %s --check-prefix=SPLIT
12+
; RUN: igc_opt -S --igc-split-loads -platformpvc --regkey=LS_enableLoadSplitting=1 --regkey=LS_ignoreSplitThreshold=1 --regkey=LS_minSplitSize_GRF=100 --regkey=LS_minSplitSize_E=0 %s | FileCheck %s --check-prefix=GRF
13+
; RUN: igc_opt -S --igc-split-loads -platformpvc --regkey=LS_enableLoadSplitting=1 --regkey=LS_ignoreSplitThreshold=1 --regkey=LS_minSplitSize_GRF=0 --regkey=LS_minSplitSize_E=4 %s | FileCheck %s --check-prefix=ELTS4
14+
; RUN: igc_opt -S --igc-split-loads -platformpvc --regkey=LS_enableLoadSplitting=1 --regkey=LS_ignoreSplitThreshold=1 --regkey=LS_minSplitSize_GRF=0 --regkey=LS_minSplitSize_E=8 %s | FileCheck %s --check-prefix=ELTS8
15+
; RUN: igc_opt -S --igc-split-loads -platformpvc --regkey=LS_enableLoadSplitting=1 --regkey=LS_ignoreSplitThreshold=1 --regkey=LS_minSplitSize_GRF=0 --regkey=LS_minSplitSize_E=16 %s | FileCheck %s --check-prefix=ELTS16
16+
; RUN: igc_opt -S --igc-split-loads -platformpvc --regkey=LS_enableLoadSplitting=1 --regkey=LS_ignoreSplitThreshold=0 --regkey=LS_splitThresholdDelta_GRF=-1000 %s | FileCheck %s --check-prefix=THRESHOLD
17+
18+
declare spir_func void @fun_v4i32(<4 x i32>)
19+
20+
declare spir_func <16 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v16i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32)
21+
22+
define spir_kernel void @test_threshold(i64 %ptr) {
23+
; SPLIT-LABEL: @test_threshold(
24+
; SPLIT-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v4i32(i64 [[PTR:%.*]], i32 127, i32 63, i32 127, i32 0, i32 0, i32 32, i32 16, i32 4, i32 1, i1 false, i1 false, i32 0)
25+
; SPLIT-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v4i32(i64 [[PTR]], i32 127, i32 63, i32 127, i32 0, i32 4, i32 32, i32 16, i32 4, i32 1, i1 false, i1 false, i32 0)
26+
; SPLIT-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v4i32(i64 [[PTR]], i32 127, i32 63, i32 127, i32 0, i32 8, i32 32, i32 16, i32 4, i32 1, i1 false, i1 false, i32 0)
27+
; SPLIT-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v4i32(i64 [[PTR]], i32 127, i32 63, i32 127, i32 0, i32 12, i32 32, i32 16, i32 4, i32 1, i1 false, i1 false, i32 0)
28+
; SPLIT-NEXT: call void @fun_v4i32(<4 x i32> [[TMP1]])
29+
; SPLIT-NEXT: call void @fun_v4i32(<4 x i32> [[TMP2]])
30+
; SPLIT-NEXT: call void @fun_v4i32(<4 x i32> [[TMP3]])
31+
; SPLIT-NEXT: call void @fun_v4i32(<4 x i32> [[TMP4]])
32+
; SPLIT-NEXT: ret void
33+
;
34+
; GRF-LABEL: @test_threshold(
35+
; GRF-NEXT: [[VEC1:%.*]] = call <16 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v16i32(i64 [[PTR:%.*]], i32 127, i32 63, i32 127, i32 0, i32 0, i32 32, i32 16, i32 16, i32 1, i1 false, i1 false, i32 0)
36+
; GRF-NEXT: [[PICK1_1:%.*]] = shufflevector <16 x i32> [[VEC1]], <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
37+
; GRF-NEXT: [[PICK1_2:%.*]] = shufflevector <16 x i32> [[VEC1]], <16 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
38+
; GRF-NEXT: [[PICK1_3:%.*]] = shufflevector <16 x i32> [[VEC1]], <16 x i32> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
39+
; GRF-NEXT: [[PICK1_4:%.*]] = shufflevector <16 x i32> [[VEC1]], <16 x i32> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
40+
; GRF-NEXT: call void @fun_v4i32(<4 x i32> [[PICK1_1]])
41+
; GRF-NEXT: call void @fun_v4i32(<4 x i32> [[PICK1_2]])
42+
; GRF-NEXT: call void @fun_v4i32(<4 x i32> [[PICK1_3]])
43+
; GRF-NEXT: call void @fun_v4i32(<4 x i32> [[PICK1_4]])
44+
; GRF-NEXT: ret void
45+
;
46+
; ELTS4-LABEL: @test_threshold(
47+
; ELTS4-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v4i32(i64 [[PTR:%.*]], i32 127, i32 63, i32 127, i32 0, i32 0, i32 32, i32 16, i32 4, i32 1, i1 false, i1 false, i32 0)
48+
; ELTS4-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v4i32(i64 [[PTR]], i32 127, i32 63, i32 127, i32 0, i32 4, i32 32, i32 16, i32 4, i32 1, i1 false, i1 false, i32 0)
49+
; ELTS4-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v4i32(i64 [[PTR]], i32 127, i32 63, i32 127, i32 0, i32 8, i32 32, i32 16, i32 4, i32 1, i1 false, i1 false, i32 0)
50+
; ELTS4-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v4i32(i64 [[PTR]], i32 127, i32 63, i32 127, i32 0, i32 12, i32 32, i32 16, i32 4, i32 1, i1 false, i1 false, i32 0)
51+
; ELTS4-NEXT: call void @fun_v4i32(<4 x i32> [[TMP1]])
52+
; ELTS4-NEXT: call void @fun_v4i32(<4 x i32> [[TMP2]])
53+
; ELTS4-NEXT: call void @fun_v4i32(<4 x i32> [[TMP3]])
54+
; ELTS4-NEXT: call void @fun_v4i32(<4 x i32> [[TMP4]])
55+
; ELTS4-NEXT: ret void
56+
;
57+
; ELTS8-LABEL: @test_threshold(
58+
; ELTS8-NEXT: [[TMP1:%.*]] = call <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64 [[PTR:%.*]], i32 127, i32 63, i32 127, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0)
59+
; ELTS8-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i64 0
60+
; ELTS8-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i64 0
61+
; ELTS8-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i64 1
62+
; ELTS8-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP4]], i64 1
63+
; ELTS8-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP1]], i64 2
64+
; ELTS8-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP6]], i64 2
65+
; ELTS8-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP1]], i64 3
66+
; ELTS8-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP8]], i64 3
67+
; ELTS8-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP1]], i64 4
68+
; ELTS8-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> undef, i32 [[TMP10]], i64 0
69+
; ELTS8-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[TMP1]], i64 5
70+
; ELTS8-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP12]], i64 1
71+
; ELTS8-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[TMP1]], i64 6
72+
; ELTS8-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP14]], i64 2
73+
; ELTS8-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP1]], i64 7
74+
; ELTS8-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP16]], i64 3
75+
; ELTS8-NEXT: [[TMP18:%.*]] = call <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64 [[PTR]], i32 127, i32 63, i32 127, i32 0, i32 8, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0)
76+
; ELTS8-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP18]], i64 0
77+
; ELTS8-NEXT: [[TMP20:%.*]] = insertelement <4 x i32> undef, i32 [[TMP19]], i64 0
78+
; ELTS8-NEXT: [[TMP21:%.*]] = extractelement <8 x i32> [[TMP18]], i64 1
79+
; ELTS8-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP21]], i64 1
80+
; ELTS8-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP18]], i64 2
81+
; ELTS8-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP23]], i64 2
82+
; ELTS8-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP18]], i64 3
83+
; ELTS8-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP24]], i32 [[TMP25]], i64 3
84+
; ELTS8-NEXT: [[TMP27:%.*]] = extractelement <8 x i32> [[TMP18]], i64 4
85+
; ELTS8-NEXT: [[TMP28:%.*]] = insertelement <4 x i32> undef, i32 [[TMP27]], i64 0
86+
; ELTS8-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP18]], i64 5
87+
; ELTS8-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP29]], i64 1
88+
; ELTS8-NEXT: [[TMP31:%.*]] = extractelement <8 x i32> [[TMP18]], i64 6
89+
; ELTS8-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP31]], i64 2
90+
; ELTS8-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[TMP18]], i64 7
91+
; ELTS8-NEXT: [[TMP34:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP33]], i64 3
92+
; ELTS8-NEXT: call void @fun_v4i32(<4 x i32> [[TMP9]])
93+
; ELTS8-NEXT: call void @fun_v4i32(<4 x i32> [[TMP17]])
94+
; ELTS8-NEXT: call void @fun_v4i32(<4 x i32> [[TMP26]])
95+
; ELTS8-NEXT: call void @fun_v4i32(<4 x i32> [[TMP34]])
96+
; ELTS8-NEXT: ret void
97+
;
98+
; ELTS16-LABEL: @test_threshold(
99+
; ELTS16-NEXT: [[VEC1:%.*]] = call <16 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v16i32(i64 [[PTR:%.*]], i32 127, i32 63, i32 127, i32 0, i32 0, i32 32, i32 16, i32 16, i32 1, i1 false, i1 false, i32 0)
100+
; ELTS16-NEXT: [[PICK1_1:%.*]] = shufflevector <16 x i32> [[VEC1]], <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
101+
; ELTS16-NEXT: [[PICK1_2:%.*]] = shufflevector <16 x i32> [[VEC1]], <16 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
102+
; ELTS16-NEXT: [[PICK1_3:%.*]] = shufflevector <16 x i32> [[VEC1]], <16 x i32> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
103+
; ELTS16-NEXT: [[PICK1_4:%.*]] = shufflevector <16 x i32> [[VEC1]], <16 x i32> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
104+
; ELTS16-NEXT: call void @fun_v4i32(<4 x i32> [[PICK1_1]])
105+
; ELTS16-NEXT: call void @fun_v4i32(<4 x i32> [[PICK1_2]])
106+
; ELTS16-NEXT: call void @fun_v4i32(<4 x i32> [[PICK1_3]])
107+
; ELTS16-NEXT: call void @fun_v4i32(<4 x i32> [[PICK1_4]])
108+
; ELTS16-NEXT: ret void
109+
;
110+
; THRESHOLD-LABEL: @test_threshold(
111+
; THRESHOLD-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v4i32(i64 [[PTR:%.*]], i32 127, i32 63, i32 127, i32 0, i32 0, i32 32, i32 16, i32 4, i32 1, i1 false, i1 false, i32 0)
112+
; THRESHOLD-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v4i32(i64 [[PTR]], i32 127, i32 63, i32 127, i32 0, i32 4, i32 32, i32 16, i32 4, i32 1, i1 false, i1 false, i32 0)
113+
; THRESHOLD-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v4i32(i64 [[PTR]], i32 127, i32 63, i32 127, i32 0, i32 8, i32 32, i32 16, i32 4, i32 1, i1 false, i1 false, i32 0)
114+
; THRESHOLD-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v4i32(i64 [[PTR]], i32 127, i32 63, i32 127, i32 0, i32 12, i32 32, i32 16, i32 4, i32 1, i1 false, i1 false, i32 0)
115+
; THRESHOLD-NEXT: call void @fun_v4i32(<4 x i32> [[TMP1]])
116+
; THRESHOLD-NEXT: call void @fun_v4i32(<4 x i32> [[TMP2]])
117+
; THRESHOLD-NEXT: call void @fun_v4i32(<4 x i32> [[TMP3]])
118+
; THRESHOLD-NEXT: call void @fun_v4i32(<4 x i32> [[TMP4]])
119+
; THRESHOLD-NEXT: ret void
120+
;
121+
; DEFAULT-LABEL: @test_threshold(
122+
; DEFAULT-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v4i32(i64 [[PTR:%.*]], i32 127, i32 63, i32 127, i32 0, i32 0, i32 32, i32 16, i32 4, i32 1, i1 false, i1 false, i32 0)
123+
; DEFAULT-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v4i32(i64 [[PTR]], i32 127, i32 63, i32 127, i32 0, i32 4, i32 32, i32 16, i32 4, i32 1, i1 false, i1 false, i32 0)
124+
; DEFAULT-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v4i32(i64 [[PTR]], i32 127, i32 63, i32 127, i32 0, i32 8, i32 32, i32 16, i32 4, i32 1, i1 false, i1 false, i32 0)
125+
; DEFAULT-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v4i32(i64 [[PTR]], i32 127, i32 63, i32 127, i32 0, i32 12, i32 32, i32 16, i32 4, i32 1, i1 false, i1 false, i32 0)
126+
; DEFAULT-NEXT: call void @fun_v4i32(<4 x i32> [[TMP1]])
127+
; DEFAULT-NEXT: call void @fun_v4i32(<4 x i32> [[TMP2]])
128+
; DEFAULT-NEXT: call void @fun_v4i32(<4 x i32> [[TMP3]])
129+
; DEFAULT-NEXT: call void @fun_v4i32(<4 x i32> [[TMP4]])
130+
; DEFAULT-NEXT: ret void
131+
%vec1 = call <16 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v16i32(i64 %ptr, i32 127, i32 63, i32 127, i32 0, i32 0, i32 32, i32 16, i32 16, i32 1, i1 false, i1 false, i32 0)
132+
%pick1.1 = shufflevector <16 x i32> %vec1, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
133+
%pick1.2 = shufflevector <16 x i32> %vec1, <16 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
134+
%pick1.3 = shufflevector <16 x i32> %vec1, <16 x i32> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
135+
%pick1.4 = shufflevector <16 x i32> %vec1, <16 x i32> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
136+
call void @fun_v4i32(<4 x i32> %pick1.1)
137+
call void @fun_v4i32(<4 x i32> %pick1.2)
138+
call void @fun_v4i32(<4 x i32> %pick1.3)
139+
call void @fun_v4i32(<4 x i32> %pick1.4)
140+
ret void
141+
}

0 commit comments

Comments
 (0)