Skip to content

Commit 0882965

Browse files
admitricpszymich
authored andcommitted
Optimize sub-group shuffle xor built-in
Replace sub_group shuffle with index = sub_group_id ^ xor_value, where xor_value is a compile-time constant to intrinsic, which will produce sequence of movs instead of using indirect access
1 parent 2a1e219 commit 0882965

File tree

9 files changed

+314
-2
lines changed

9 files changed

+314
-2
lines changed

IGC/Compiler/CISACodeGen/CheckInstrTypes.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,7 @@ void CheckInstrTypes::visitCallInst(CallInst& C)
278278
case GenISAIntrinsic::GenISA_WaveClustered:
279279
case GenISAIntrinsic::GenISA_QuadPrefix:
280280
case GenISAIntrinsic::GenISA_simdShuffleDown:
281+
case GenISAIntrinsic::GenISA_simdShuffleXor:
281282
g_InstrTypes->numWaveIntrinsics++;
282283
break;
283284
case GenISAIntrinsic::GenISA_DCL_inputVec:

IGC/Compiler/CISACodeGen/EmitVISAPass.cpp

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5175,6 +5175,162 @@ void EmitPass::emitSimdShuffleDown(llvm::Instruction* inst)
51755175
}
51765176
}
51775177

5178+
void EmitPass::emitSimdShuffleXor(llvm::Instruction* inst)
5179+
{
5180+
CVariable* pData = m_currShader->GetSymbol(inst->getOperand(0));
5181+
CVariable* pXorValue = m_currShader->GetSymbol(inst->getOperand(1));
5182+
5183+
IGC_ASSERT_MESSAGE(pXorValue->IsImmediate(), "simdShuffleXor must have \
5184+
constant xorValue parameter");
5185+
5186+
// emit move sequence for 1 bit
5187+
// case 0: 1 2 3 4 5 6 7 8 => 2 1 4 3 6 5 8 7
5188+
// case 1: 1 2 3 4 5 6 7 8 => 3 4 1 2 7 8 5 6
5189+
// case 1: 1 2 3 4 5 6 7 8 => 3 4 1 2 7 8 5 6
5190+
// case 2: 1 2 3 4 5 6 7 8 => 5 6 7 8 1 2 3 4
5191+
// case 3: 1 2 .. 8 9 .. 15 16 => 9 10 .. 15 16 1 2 .. 7 8
5192+
auto emitShuffleXor1Bit = [&](CVariable* pData, uint xorBit) -> CVariable*
5193+
{
5194+
VISA_Type type = pData->GetType();
5195+
bool is64bitType = type == ISA_TYPE_Q || type == ISA_TYPE_UQ || type == ISA_TYPE_DF;
5196+
5197+
CVariable* pResult = m_currShader->GetNewVariable(
5198+
pData->GetNumberElement(),
5199+
pData->GetType(),
5200+
pData->GetAlign(),
5201+
false,
5202+
1,
5203+
"simdShuffleXorTmp");
5204+
5205+
if (xorBit == 0 || (xorBit == 1 && !is64bitType)) {
5206+
// Use strided access of max possible length
5207+
// For simd16 and xorBit == 0
5208+
// mov (M1_NM, 8) simdShuffleXorTmp(0,0)<2> V0040(0,1)<2;1,0> /// $11
5209+
// mov (M1_NM, 8) simdShuffleXorTmp(0,1)<2> V0040(0,0)<2;1,0> /// $12
5210+
// For 32-bit it will be just 2 movs, for 64-bit double type let the finalizer to split the vars:
5211+
// r10 is the source
5212+
// (W) mov (16|M0) r19.0<1>:ud r10.2<4;2,1>:ud {$4.dst} // $13
5213+
// (W) mov (8|M0) r18.0<1>:df r19.0<1;1,0>:df {I@1} // $13
5214+
// (W) mov (8|M0) r12.0<4>:ud r18.0<2;1,0>:ud {Compacted,L@1} // $13
5215+
// (W) mov (8|M0) r12.1<4>:ud r18.1<2;1,0>:ud // $13
5216+
// (W) mov (16|M0) r21.0<1>:ud r10.0<4;2,1>:ud // $14
5217+
// (W) mov (8|M0) r20.0<1>:df r21.0<1;1,0>:df {I@1} // $14
5218+
// (W) mov (8|M0) r12.2<4>:ud r20.0<2;1,0>:ud {Compacted,L@1} // $14
5219+
// (W) mov (8|M0) r12.3<4>:ud r20.1<2;1,0>:ud // $14
5220+
5221+
// For int32 and xorBit == 1
5222+
// mov (M1_NM, 4) simdShuffleXorTmp(0,0)<4> V0040(0,2)<4;1,0> /// $11
5223+
// mov (M1_NM, 4) simdShuffleXorTmp(0,2)<4> V0040(0,0)<4;1,0> /// $12
5224+
// mov (M1_NM, 4) simdShuffleXorTmp(0,1)<4> V0040(0,3)<4;1,0> /// $13
5225+
// mov (M1_NM, 4) simdShuffleXorTmp(0,3)<4> V0040(0,1)<4;1,0> /// $14
5226+
// for xorBit == 1 strided moves are beneficial only if the type is less that 64-bit
5227+
// (fewer moves will be generated)
5228+
5229+
// for xorBit > 1 is it always more beneficial to copy with subsequent chunks
5230+
5231+
auto stride = (2 * (xorBit + 1));
5232+
auto width = pData->GetNumberElement() / stride;
5233+
auto currentSimdMode = lanesToSIMDMode(width);
5234+
5235+
for (uint i = 0; i < xorBit + 1; i++) {
5236+
m_encoder->SetSimdSize(currentSimdMode);
5237+
m_encoder->SetSrcRegion(0, stride, 1, 0);
5238+
m_encoder->SetSrcSubReg(0, i + xorBit + 1);
5239+
m_encoder->SetDstRegion(stride);
5240+
m_encoder->SetDstSubReg(i);
5241+
m_encoder->SetNoMask();
5242+
m_encoder->Copy(pResult, pData);
5243+
m_encoder->Push();
5244+
5245+
m_encoder->SetSimdSize(currentSimdMode);
5246+
m_encoder->SetSrcRegion(0, stride, 1, 0);
5247+
m_encoder->SetSrcSubReg(0, i);
5248+
m_encoder->SetDstRegion(stride);
5249+
m_encoder->SetDstSubReg(i + xorBit + 1);
5250+
m_encoder->SetNoMask();
5251+
m_encoder->Copy(pResult, pData);
5252+
m_encoder->Push();
5253+
}
5254+
}
5255+
else if ((xorBit >= 1) && (xorBit <= 3)) {
5256+
// Use subsequent accesses to copy all subsequent chunks
5257+
// for xorBit == 2
5258+
// mov (M1_NM, 4) simdShuffleXorTmp(0,0)<1> V0043(0,4)<1;1,0> /// $13
5259+
// mov (M1_NM, 4) simdShuffleXorTmp(0,4)<1> V0043(0,0)<1;1,0> /// $14
5260+
// mov (M1_NM, 4) simdShuffleXorTmp(1,0)<1> V0043(1,4)<1;1,0> /// $15
5261+
// mov (M1_NM, 4) simdShuffleXorTmp(1,4)<1> V0043(1,0)<1;1,0> /// $16
5262+
// for 64-bit types the accesses will be 2x widened in finalizer
5263+
// (W) mov (8|M0) r12.0<1>:ud r10.8<1;1,0>:ud {Compacted,$4.dst} // $13
5264+
// (W) mov (8|M0) r12.8<1>:ud r10.0<1;1,0>:ud {Compacted} // $14
5265+
// (W) mov (8|M0) r13.0<1>:ud r11.8<1;1,0>:ud {Compacted} // $15
5266+
// (W) mov (8|M0) r13.8<1>:ud r11.0<1;1,0>:ud {Compacted} // $16
5267+
// The number of chunks is larger on the larger SIMD
5268+
5269+
auto width = static_cast<int>(std::pow(2, xorBit));
5270+
auto currentSimdMode = lanesToSIMDMode(width);
5271+
5272+
for (uint i = 0; i < pData->GetNumberElement(); i += width * 2) {
5273+
m_encoder->SetSimdSize(currentSimdMode);
5274+
m_encoder->SetSrcRegion(0, 1, 1, 0);
5275+
m_encoder->SetSrcSubReg(0, i + width);
5276+
m_encoder->SetDstRegion(1);
5277+
m_encoder->SetDstSubReg(i);
5278+
m_encoder->SetNoMask();
5279+
m_encoder->Copy(pResult, pData);
5280+
m_encoder->Push();
5281+
5282+
m_encoder->SetSimdSize(currentSimdMode);
5283+
m_encoder->SetSrcRegion(0, 1, 1, 0);
5284+
m_encoder->SetSrcSubReg(0, i);
5285+
m_encoder->SetDstRegion(1);
5286+
m_encoder->SetDstSubReg(i + width);
5287+
m_encoder->SetNoMask();
5288+
m_encoder->Copy(pResult, pData);
5289+
m_encoder->Push();
5290+
}
5291+
}
5292+
else {
5293+
IGC_ASSERT_MESSAGE(false, "simdShuffleXor is only implemented for 0 <= xor_value <= 15");
5294+
};
5295+
5296+
return pResult;
5297+
};
5298+
5299+
// just broadcast the value if the value is uniform
5300+
if (pData->IsUniform()) {
5301+
m_encoder->SetSrcRegion(0, 0, 1, 0);
5302+
m_encoder->SetSrcSubReg(0, 0);
5303+
m_encoder->SetDstRegion(1);
5304+
m_encoder->SetDstSubReg(0);
5305+
m_encoder->Copy(m_destination, pData);
5306+
m_encoder->Push();
5307+
return;
5308+
}
5309+
5310+
// emit moves for every non-zero bit subsequently
5311+
const auto xorValue = pXorValue->GetImmediateValue();
5312+
CVariable* tempValue = pData;
5313+
for (uint i = 0; i < 5; i++)
5314+
{
5315+
if (((xorValue >> i) & 0x1) == 0x1)
5316+
{
5317+
tempValue = emitShuffleXor1Bit(tempValue, i);
5318+
}
5319+
}
5320+
5321+
// final copy, respecting the execution mask if in divergent CF
5322+
if (!m_currShader->InsideDivergentCF(inst))
5323+
{
5324+
m_encoder->SetNoMask();
5325+
}
5326+
m_encoder->SetSrcRegion(0, 1, 1, 0);
5327+
m_encoder->SetSrcSubReg(0, 0);
5328+
m_encoder->SetDstRegion(1);
5329+
m_encoder->SetDstSubReg(0);
5330+
m_encoder->Copy(m_destination, tempValue);
5331+
m_encoder->Push();
5332+
}
5333+
51785334
static uint32_t getBlockMsgSize(uint32_t bytesRemaining, uint32_t maxSize)
51795335
{
51805336
uint32_t size = 0;
@@ -7235,6 +7391,9 @@ void EmitPass::EmitGenIntrinsicMessage(llvm::GenIntrinsicInst* inst)
72357391
case GenISAIntrinsic::GenISA_simdShuffleDown:
72367392
emitSimdShuffleDown(inst);
72377393
break;
7394+
case GenISAIntrinsic::GenISA_simdShuffleXor:
7395+
emitSimdShuffleXor(inst);
7396+
break;
72387397
case GenISAIntrinsic::GenISA_simdBlockRead:
72397398
emitSimdBlockRead(inst);
72407399
break;

IGC/Compiler/CISACodeGen/EmitVISAPass.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,7 @@ class EmitPass : public llvm::FunctionPass
207207
void emitSimdShuffle(llvm::Instruction* inst);
208208
void emitCrossInstanceMov(const SSource& source, const DstModifier& modifier);
209209
void emitSimdShuffleDown(llvm::Instruction* inst);
210+
void emitSimdShuffleXor(llvm::Instruction* inst);
210211
void emitSimdBlockRead(llvm::Instruction* inst, llvm::Value* ptrVal = nullptr);
211212
void emitSimdBlockWrite(llvm::Instruction* inst, llvm::Value* ptrVal = nullptr);
212213
void emitLegacySimdBlockWrite(llvm::Instruction* inst, llvm::Value* ptrVal = nullptr);

IGC/Compiler/CISACodeGen/helper.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1395,6 +1395,7 @@ namespace IGC
13951395
{
13961396
case GenISAIntrinsic::GenISA_WaveShuffleIndex:
13971397
case GenISAIntrinsic::GenISA_simdShuffleDown:
1398+
case GenISAIntrinsic::GenISA_simdShuffleXor:
13981399
case GenISAIntrinsic::GenISA_simdBlockRead:
13991400
case GenISAIntrinsic::GenISA_simdBlockWrite:
14001401
case GenISAIntrinsic::GenISA_simdMediaBlockRead:

IGC/Compiler/CISACodeGen/opCode.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,7 @@ DECLARE_OPCODE(GenISA_RTDualBlendSource, GenISAIntrinsic, llvm_dualRTWrite, fals
178178
DECLARE_OPCODE(GenISA_simdLaneId, GenISAIntrinsic, llvm_simdLaneId, false, false, false, false, false, false, false)
179179
DECLARE_OPCODE(GenISA_simdSize, GenISAIntrinsic, llvm_simdSize, false, false, false, false, false, false, false)
180180
DECLARE_OPCODE(GenISA_simdShuffleDown, GenISAIntrinsic, llvm_simdShuffleDown, false, false, false, false, false, false, false)
181+
DECLARE_OPCODE(GenISA_simdShuffleXor, GenISAIntrinsic, llvm_simdShuffleXor, false, false, false, false, false, false, false)
181182
DECLARE_OPCODE(GenISA_simdBlockRead, GenISAIntrinsic, llvm_simdBlockRead, false, false, false, false, false, false, false)
182183
DECLARE_OPCODE(GenISA_simdBlockReadBindless, GenISAIntrinsic, llvm_simdBlockReadBindless, false, false, false, false, false, false, false)
183184
DECLARE_OPCODE(GenISA_simdBlockWrite, GenISAIntrinsic, llvm_simdBlockWrite, false, false, false, false, false, false, false)

IGC/Compiler/CustomSafeOptPass.cpp

Lines changed: 70 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,69 @@ void CustomSafeOptPass::visitAnd(BinaryOperator& I) {
280280
I.eraseFromParent();
281281
}
282282

283+
// Replace sub_group shuffle with index = sub_group_id ^ xor_value,
284+
// where xor_value is a compile-time constant to intrinsic,
285+
// which will produce sequence of movs instead of using indirect access
286+
// This pattern comes from permute_group_by_xor, but can
287+
// also be written manually as
288+
// uint32_t other_id = sg.get_local_id() ^ XOR_VALUE;
289+
// r = select_from_group(sg, x, other_id);
290+
void CustomSafeOptPass::visitShuffleIndex(llvm::CallInst* I)
291+
{
292+
using namespace llvm::PatternMatch;
293+
294+
bool patternFound = false;
295+
Value* simdLaneId = nullptr;
296+
ConstantInt* xorValueConstant = nullptr;
297+
/*
298+
Pattern match
299+
%simdLaneId = call i16 @llvm.genx.GenISA.simdLaneId()
300+
%xor = xor i16 %simdLaneId, 1
301+
%xor.i = zext i16 %xor to i32
302+
%simdShuffle = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %x, i32 %xor.i, i32 0)
303+
*/
304+
if (match(I->getOperand(1),
305+
m_ZExt(m_c_Xor(m_Value(simdLaneId), m_ConstantInt(xorValueConstant)))))
306+
{
307+
if (CallInst* CI = dyn_cast<CallInst>(simdLaneId))
308+
{
309+
Function* simdIdF = CI->getCalledFunction();
310+
if (!simdIdF) return;
311+
patternFound =
312+
GenISAIntrinsic::getIntrinsicID(simdIdF) == GenISAIntrinsic::GenISA_simdLaneId;
313+
}
314+
}
315+
316+
auto insertShuffleXor = [](IRBuilder<>& builder,
317+
Value* value,
318+
uint32_t xorValue)->Value*
319+
{
320+
Function* simdShuffleXorFunc = GenISAIntrinsic::getDeclaration(
321+
builder.GetInsertBlock()->getParent()->getParent(),
322+
GenISAIntrinsic::GenISA_simdShuffleXor,
323+
value->getType());
324+
325+
return builder.CreateCall(simdShuffleXorFunc,
326+
{ value, builder.getInt32(xorValue) }, "simdShuffleXor");
327+
};
328+
329+
if (patternFound)
330+
{
331+
uint64_t xorValue = xorValueConstant->getValue().getZExtValue();
332+
333+
if (xorValue >= 16) {
334+
// currently not supported in the emitter
335+
return;
336+
}
337+
338+
Value* value = I->getOperand(0);
339+
IRBuilder<> builder(I);
340+
Value* result = insertShuffleXor(builder, value, static_cast<uint32_t>(xorValue));
341+
I->replaceAllUsesWith(result);
342+
I->eraseFromParent();
343+
}
344+
}
345+
283346
// Check if Lower 64b to 32b transformation is applicable for binary operator
284347
// i.e. trunc(a op b) == trunc(a) op trunc(b)
285348
static bool isTruncInvariant(unsigned Opcode) {
@@ -729,14 +792,19 @@ void CustomSafeOptPass::visitCallInst(CallInst& C)
729792
visitLdRawVec(inst);
730793
break;
731794
}
795+
case GenISAIntrinsic::GenISA_WaveShuffleIndex:
796+
{
797+
visitShuffleIndex(inst);
798+
break;
799+
}
732800
case GenISAIntrinsic::GenISA_OUTPUT:
733801
{
734802
if (pContext->m_ForceEarlyZMathCheck)
735803
{
736804
earlyZDepthDetection(C);
737805
}
738-
break;
739-
}
806+
break;
807+
}
740808
default:
741809
break;
742810
}

IGC/Compiler/CustomSafeOptPass.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ namespace IGC
7474
bool isIdentityMatrix(llvm::ExtractElementInst& I);
7575
void visitAnd(llvm::BinaryOperator& I);
7676
void visitXor(llvm::Instruction& XorInstr);
77+
void visitShuffleIndex(llvm::CallInst* I);
7778
//
7879
// IEEE Floating point arithmetic is not associative. Any pattern
7980
// match that changes the order or paramters is unsafe.
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2017-2022 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
9+
; RUN: igc_opt -igc-custom-safe-opt -S %s -o %t.ll
10+
; RUN: FileCheck %s --input-file=%t.ll
11+
12+
declare i16 @llvm.genx.GenISA.simdLaneId()
13+
declare i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32, i32, i32)
14+
declare double @llvm.genx.GenISA.WaveShuffleIndex.f64(double, i32, i32)
15+
16+
; Change the call in simple case
17+
define void @test_transformation_simple(i32 %x) nounwind {
18+
entry:
19+
%simdLaneId = call i16 @llvm.genx.GenISA.simdLaneId()
20+
%xor = xor i16 %simdLaneId, 1
21+
%xor.i = zext i16 %xor to i32
22+
%simdShuffle = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %x, i32 %xor.i, i32 0)
23+
ret void
24+
}
25+
; CHECK-LABEL: @test_transformation_simple
26+
; CHECK: call i32 @llvm.genx.GenISA.simdShuffleXor{{.*}}(i32 %x, i32 1)
27+
28+
29+
; Change the call in double case too
30+
define void @test_transformation_double(double %x) nounwind {
31+
entry:
32+
%simdLaneId = call i16 @llvm.genx.GenISA.simdLaneId()
33+
%xor = xor i16 %simdLaneId, 15
34+
%xor.i = zext i16 %xor to i32
35+
%simdShuffle = call double @llvm.genx.GenISA.WaveShuffleIndex.f64(double %x, i32 %xor.i, i32 0)
36+
ret void
37+
}
38+
; CHECK-LABEL: @test_transformation_double
39+
; CHECK: call double @llvm.genx.GenISA.simdShuffleXor{{.*}}(double %x, i32 15)
40+
41+
42+
; Change both calls when the value is splitted into high and low parts
43+
define void @test_transformation_splitted(i64 %x) nounwind {
44+
entry:
45+
%vec = bitcast i64 %x to <2 x i32>
46+
%scalar1 = extractelement <2 x i32> %vec, i32 0
47+
%scalar2 = extractelement <2 x i32> %vec, i32 1
48+
%simdLaneId16 = call i16 @llvm.genx.GenISA.simdLaneId()
49+
%xor = xor i16 %simdLaneId16, 8
50+
%xor.i = zext i16 %xor to i32
51+
%simdShuffle = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %scalar1, i32 %xor.i, i32 0)
52+
%simdShuffle2 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %scalar2, i32 %xor.i, i32 0)
53+
%assembled.vect = insertelement <2 x i32> undef, i32 %simdShuffle, i32 0
54+
%assembled.vect2 = insertelement <2 x i32> %assembled.vect, i32 %simdShuffle2, i32 1
55+
ret void
56+
}
57+
; CHECK-LABEL: @test_transformation_splitted
58+
; CHECK: [[I1:%[a-zA-Z0-9.]+]] = call i32 @llvm.genx.GenISA.simdShuffleXor{{.*}}(i32 %scalar1, i32 8)
59+
; CHECK: [[I2:%[a-zA-Z0-9.]+]] = call i32 @llvm.genx.GenISA.simdShuffleXor{{.*}}(i32 %scalar2, i32 8)
60+
; CHECK: [[RES:%[a-zA-Z0-9.]+]] = insertelement <2 x i32> undef, i32 [[I1]], i32 0
61+
; CHECK: insertelement <2 x i32> [[RES]], i32 [[I2]], i32 1
62+
63+
64+
; Do not change the call if xor is not constant
65+
define void @test_no_constant(i32 %x, i16 %xor_value) nounwind {
66+
entry:
67+
%simdLaneId = call i16 @llvm.genx.GenISA.simdLaneId()
68+
%xor = xor i16 %simdLaneId, %xor_value
69+
%xor.i = zext i16 %xor to i32
70+
%simdShuffle = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %x, i32 %xor.i, i32 0)
71+
ret void
72+
}
73+
; CHECK-LABEL: @test_no_constant
74+
; CHECK: call i32 @llvm.genx.GenISA.WaveShuffleIndex.{{.*}}(i32 %x, i32 %xor.i, i32 0)

IGC/GenISAIntrinsics/Intrinsic_definitions.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1787,6 +1787,12 @@
17871787
("int", "offset")],
17881788
"Convergent,NoMem"]],
17891789
####################################################################################################
1790+
"GenISA_simdShuffleXor": ["",
1791+
[("anyint", "result"),
1792+
[(0, "value"),
1793+
("int", "xor value")],
1794+
"Convergent,NoMem"]],
1795+
####################################################################################################
17901796
"GenISA_simdSize": ["",
17911797
[("int", "result"),
17921798
[],

0 commit comments

Comments
 (0)