Optimize sub-group shuffle xor built-in

admitric · pszymich · commit 0882965cd06d · 2022-11-04T14:01:36.000+01:00
Replace a sub_group shuffle whose index is sub_group_id ^ xor_value,
where xor_value is a compile-time constant, with a dedicated intrinsic,
which will produce a sequence of movs instead of using indirect access.
diff --git a/IGC/Compiler/CISACodeGen/CheckInstrTypes.cpp b/IGC/Compiler/CISACodeGen/CheckInstrTypes.cpp
@@ -278,6 +278,7 @@ void CheckInstrTypes::visitCallInst(CallInst& C)
         case GenISAIntrinsic::GenISA_WaveClustered:
         case GenISAIntrinsic::GenISA_QuadPrefix:
         case GenISAIntrinsic::GenISA_simdShuffleDown:
+        case GenISAIntrinsic::GenISA_simdShuffleXor:
             g_InstrTypes->numWaveIntrinsics++;
             break;
         case GenISAIntrinsic::GenISA_DCL_inputVec:
diff --git a/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp b/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp
@@ -5175,6 +5175,162 @@ void EmitPass::emitSimdShuffleDown(llvm::Instruction* inst)
     }
 }
 
+// Emits the mov sequence for GenISA_simdShuffleXor: operand 0 is the data, operand 1 must be an immediate xor value.
+void EmitPass::emitSimdShuffleXor(llvm::Instruction* inst)
+{
+    CVariable* pData = m_currShader->GetSymbol(inst->getOperand(0));
+    CVariable* pXorValue = m_currShader->GetSymbol(inst->getOperand(1));
+
+    IGC_ASSERT_MESSAGE(pXorValue->IsImmediate(), "simdShuffleXor must have "
+        "constant xorValue parameter");
+
+    // emit move sequence for 1 bit (xorBit is the index of the bit that is set in the xor value)
+    // xorBit 0:  1 2 3 4 5 6 7 8     => 2 1 4 3 6 5 8 7
+    // xorBit 1:  1 2 3 4 5 6 7 8     => 3 4 1 2 7 8 5 6
+    // xorBit 2:  1 2 3 4 5 6 7 8     => 5 6 7 8 1 2 3 4
+    // xorBit 3:  1 2 .. 8 9 .. 15 16 => 9 10 .. 15 16 1 2 .. 7 8
+    auto emitShuffleXor1Bit = [&](CVariable* pData, uint xorBit) -> CVariable*
+    {
+        VISA_Type type = pData->GetType();
+        bool is64bitType = type == ISA_TYPE_Q || type == ISA_TYPE_UQ || type == ISA_TYPE_DF;
+
+        CVariable* pResult = m_currShader->GetNewVariable(
+            pData->GetNumberElement(),
+            pData->GetType(),
+            pData->GetAlign(),
+            false,
+            1,
+            "simdShuffleXorTmp");
+
+        if (xorBit == 0 || (xorBit == 1 && !is64bitType)) {
+            // Use strided access of max possible length
+            // For simd16 and xorBit == 0
+            //     mov (M1_NM, 8) simdShuffleXorTmp(0,0)<2> V0040(0,1)<2;1,0>                   /// $11
+            //     mov (M1_NM, 8) simdShuffleXorTmp(0,1)<2> V0040(0,0)<2;1,0>                   /// $12
+            // For 32-bit it will be just 2 movs, for 64-bit double type let the finalizer split the vars:
+            // r10 is the source
+            // (W)     mov (16|M0)              r19.0<1>:ud   r10.2<4;2,1>:ud                  {$4.dst}             // $13
+            // (W)     mov (8|M0)               r18.0<1>:df   r19.0<1;1,0>:df                  {I@1}                // $13
+            // (W)     mov (8|M0)               r12.0<4>:ud   r18.0<2;1,0>:ud                  {Compacted,L@1}      // $13
+            // (W)     mov (8|M0)               r12.1<4>:ud   r18.1<2;1,0>:ud                                       // $13
+            // (W)     mov (16|M0)              r21.0<1>:ud   r10.0<4;2,1>:ud                                       // $14
+            // (W)     mov (8|M0)               r20.0<1>:df   r21.0<1;1,0>:df                  {I@1}                // $14
+            // (W)     mov (8|M0)               r12.2<4>:ud   r20.0<2;1,0>:ud                  {Compacted,L@1}      // $14
+            // (W)     mov (8|M0)               r12.3<4>:ud   r20.1<2;1,0>:ud                                       // $14
+
+            // For int32 and xorBit == 1
+            //     mov (M1_NM, 4) simdShuffleXorTmp(0,0)<4> V0040(0,2)<4;1,0>                   /// $11
+            //     mov (M1_NM, 4) simdShuffleXorTmp(0,2)<4> V0040(0,0)<4;1,0>                   /// $12
+            //     mov (M1_NM, 4) simdShuffleXorTmp(0,1)<4> V0040(0,3)<4;1,0>                   /// $13
+            //     mov (M1_NM, 4) simdShuffleXorTmp(0,3)<4> V0040(0,1)<4;1,0>                   /// $14
+            // for xorBit == 1 strided moves are beneficial only if the type is less than 64-bit
+            // (fewer moves will be generated)
+
+            // for xorBit > 1 it is always more beneficial to copy with subsequent chunks
+
+            auto stride = (2 * (xorBit + 1));
+            auto width = pData->GetNumberElement() / stride;
+            auto currentSimdMode = lanesToSIMDMode(width);
+
+            for (uint i = 0; i < xorBit + 1; i++) {
+                m_encoder->SetSimdSize(currentSimdMode);
+                m_encoder->SetSrcRegion(0, stride, 1, 0);
+                m_encoder->SetSrcSubReg(0, i + xorBit + 1);
+                m_encoder->SetDstRegion(stride);
+                m_encoder->SetDstSubReg(i);
+                m_encoder->SetNoMask();
+                m_encoder->Copy(pResult, pData);
+                m_encoder->Push();
+
+                m_encoder->SetSimdSize(currentSimdMode);
+                m_encoder->SetSrcRegion(0, stride, 1, 0);
+                m_encoder->SetSrcSubReg(0, i);
+                m_encoder->SetDstRegion(stride);
+                m_encoder->SetDstSubReg(i + xorBit + 1);
+                m_encoder->SetNoMask();
+                m_encoder->Copy(pResult, pData);
+                m_encoder->Push();
+            }
+        }
+        else if ((xorBit >= 1) && (xorBit <= 3)) {
+            // Use subsequent accesses to copy all subsequent chunks
+            // for xorBit == 2
+            //     mov (M1_NM, 4) simdShuffleXorTmp(0,0)<1> V0043(0,4)<1;1,0>                   /// $13
+            //     mov (M1_NM, 4) simdShuffleXorTmp(0,4)<1> V0043(0,0)<1;1,0>                   /// $14
+            //     mov (M1_NM, 4) simdShuffleXorTmp(1,0)<1> V0043(1,4)<1;1,0>                   /// $15
+            //     mov (M1_NM, 4) simdShuffleXorTmp(1,4)<1> V0043(1,0)<1;1,0>                   /// $16
+            // for 64-bit types the accesses will be 2x widened in finalizer
+            // (W)     mov (8|M0)               r12.0<1>:ud   r10.8<1;1,0>:ud                  {Compacted,$4.dst}   // $13
+            // (W)     mov (8|M0)               r12.8<1>:ud   r10.0<1;1,0>:ud                  {Compacted}          // $14
+            // (W)     mov (8|M0)               r13.0<1>:ud   r11.8<1;1,0>:ud                  {Compacted}          // $15
+            // (W)     mov (8|M0)               r13.8<1>:ud   r11.0<1;1,0>:ud                  {Compacted}          // $16
+            // The number of chunks is larger on the larger SIMD
+
+            auto width = 1 << xorBit; // chunk length in lanes (2^xorBit), computed without floating point
+            auto currentSimdMode = lanesToSIMDMode(width);
+
+            for (uint i = 0; i < pData->GetNumberElement(); i += width * 2) {
+                m_encoder->SetSimdSize(currentSimdMode);
+                m_encoder->SetSrcRegion(0, 1, 1, 0);
+                m_encoder->SetSrcSubReg(0, i + width);
+                m_encoder->SetDstRegion(1);
+                m_encoder->SetDstSubReg(i);
+                m_encoder->SetNoMask();
+                m_encoder->Copy(pResult, pData);
+                m_encoder->Push();
+
+                m_encoder->SetSimdSize(currentSimdMode);
+                m_encoder->SetSrcRegion(0, 1, 1, 0);
+                m_encoder->SetSrcSubReg(0, i);
+                m_encoder->SetDstRegion(1);
+                m_encoder->SetDstSubReg(i + width);
+                m_encoder->SetNoMask();
+                m_encoder->Copy(pResult, pData);
+                m_encoder->Push();
+            }
+        }
+        else {
+            IGC_ASSERT_MESSAGE(false, "simdShuffleXor is only implemented for 0 <= xor_value <= 15");
+        }
+
+        return pResult;
+    };
+
+    // just broadcast the value if the value is uniform
+    if (pData->IsUniform()) {
+        m_encoder->SetSrcRegion(0, 0, 1, 0);
+        m_encoder->SetSrcSubReg(0, 0);
+        m_encoder->SetDstRegion(1);
+        m_encoder->SetDstSubReg(0);
+        m_encoder->Copy(m_destination, pData);
+        m_encoder->Push();
+        return;
+    }
+
+    // emit moves for every set bit in turn; a set bit 4 reaches the assert in emitShuffleXor1Bit
+    const auto xorValue = pXorValue->GetImmediateValue();
+    CVariable* tempValue = pData;
+    for (uint i = 0; i < 5; i++)
+    {
+        if (((xorValue >> i) & 0x1) == 0x1)
+        {
+            tempValue = emitShuffleXor1Bit(tempValue, i);
+        }
+    }
+
+    // final copy, respecting the execution mask if in divergent CF
+    if (!m_currShader->InsideDivergentCF(inst))
+    {
+        m_encoder->SetNoMask();
+    }
+    m_encoder->SetSrcRegion(0, 1, 1, 0);
+    m_encoder->SetSrcSubReg(0, 0);
+    m_encoder->SetDstRegion(1);
+    m_encoder->SetDstSubReg(0);
+    m_encoder->Copy(m_destination, tempValue);
+    m_encoder->Push();
+}
+
 static uint32_t getBlockMsgSize(uint32_t bytesRemaining, uint32_t maxSize)
 {
     uint32_t size = 0;
@@ -7235,6 +7391,9 @@ void EmitPass::EmitGenIntrinsicMessage(llvm::GenIntrinsicInst* inst)
     case GenISAIntrinsic::GenISA_simdShuffleDown:
         emitSimdShuffleDown(inst);
         break;
+    case GenISAIntrinsic::GenISA_simdShuffleXor:
+        emitSimdShuffleXor(inst);
+        break;
     case GenISAIntrinsic::GenISA_simdBlockRead:
         emitSimdBlockRead(inst);
         break;
diff --git a/IGC/Compiler/CISACodeGen/EmitVISAPass.hpp b/IGC/Compiler/CISACodeGen/EmitVISAPass.hpp
@@ -207,6 +207,7 @@ class EmitPass : public llvm::FunctionPass
     void emitSimdShuffle(llvm::Instruction* inst);
     void emitCrossInstanceMov(const SSource& source, const DstModifier& modifier);
     void emitSimdShuffleDown(llvm::Instruction* inst);
+    void emitSimdShuffleXor(llvm::Instruction* inst);
     void emitSimdBlockRead(llvm::Instruction* inst, llvm::Value* ptrVal = nullptr);
     void emitSimdBlockWrite(llvm::Instruction* inst, llvm::Value* ptrVal = nullptr);
     void emitLegacySimdBlockWrite(llvm::Instruction* inst, llvm::Value* ptrVal = nullptr);
diff --git a/IGC/Compiler/CISACodeGen/helper.cpp b/IGC/Compiler/CISACodeGen/helper.cpp
@@ -1395,6 +1395,7 @@ namespace IGC
         {
         case GenISAIntrinsic::GenISA_WaveShuffleIndex:
         case GenISAIntrinsic::GenISA_simdShuffleDown:
+        case GenISAIntrinsic::GenISA_simdShuffleXor:
         case GenISAIntrinsic::GenISA_simdBlockRead:
         case GenISAIntrinsic::GenISA_simdBlockWrite:
         case GenISAIntrinsic::GenISA_simdMediaBlockRead:
diff --git a/IGC/Compiler/CISACodeGen/opCode.h b/IGC/Compiler/CISACodeGen/opCode.h
@@ -178,6 +178,7 @@ DECLARE_OPCODE(GenISA_RTDualBlendSource, GenISAIntrinsic, llvm_dualRTWrite, fals
 DECLARE_OPCODE(GenISA_simdLaneId, GenISAIntrinsic, llvm_simdLaneId, false, false, false, false, false, false, false)
 DECLARE_OPCODE(GenISA_simdSize, GenISAIntrinsic, llvm_simdSize, false, false, false, false, false, false, false)
 DECLARE_OPCODE(GenISA_simdShuffleDown, GenISAIntrinsic, llvm_simdShuffleDown, false, false, false, false, false, false, false)
+DECLARE_OPCODE(GenISA_simdShuffleXor, GenISAIntrinsic, llvm_simdShuffleXor, false, false, false, false, false, false, false)
 DECLARE_OPCODE(GenISA_simdBlockRead, GenISAIntrinsic, llvm_simdBlockRead, false, false, false, false, false, false, false)
 DECLARE_OPCODE(GenISA_simdBlockReadBindless, GenISAIntrinsic, llvm_simdBlockReadBindless, false, false, false, false, false, false, false)
 DECLARE_OPCODE(GenISA_simdBlockWrite, GenISAIntrinsic, llvm_simdBlockWrite, false, false, false, false, false, false, false)
diff --git a/IGC/Compiler/CustomSafeOptPass.cpp b/IGC/Compiler/CustomSafeOptPass.cpp
@@ -280,6 +280,69 @@ void CustomSafeOptPass::visitAnd(BinaryOperator& I) {
     I.eraseFromParent();
 }
 
+// Replace a sub_group shuffle whose index is simdLaneId ^ xor_value,
+// where xor_value is a compile-time constant, with the GenISA_simdShuffleXor intrinsic,
+// which will produce a sequence of movs instead of using indirect access.
+// This pattern comes from permute_group_by_xor, but can
+// also be written manually as
+//   uint32_t other_id = sg.get_local_id() ^ XOR_VALUE;
+//   r = select_from_group(sg, x, other_id);
+void CustomSafeOptPass::visitShuffleIndex(llvm::CallInst* I)
+{
+    using namespace llvm::PatternMatch;
+
+    bool patternFound = false;
+    Value* simdLaneId = nullptr;
+    ConstantInt* xorValueConstant = nullptr;
+    /*
+    Pattern to match (NOTE(review): the third WaveShuffleIndex operand is neither checked nor forwarded - confirm this is intended):
+    %simdLaneId = call i16 @llvm.genx.GenISA.simdLaneId()
+    %xor = xor i16 %simdLaneId, 1
+    %xor.i = zext i16 %xor to i32
+    %simdShuffle = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %x, i32 %xor.i, i32 0)
+    */
+    if (match(I->getOperand(1),
+              m_ZExt(m_c_Xor(m_Value(simdLaneId), m_ConstantInt(xorValueConstant)))))
+    {
+        if (CallInst* CI = dyn_cast<CallInst>(simdLaneId))
+        {
+            Function* simdIdF = CI->getCalledFunction();
+            if (!simdIdF) return;
+            patternFound =
+                GenISAIntrinsic::getIntrinsicID(simdIdF) == GenISAIntrinsic::GenISA_simdLaneId;
+        }
+    }
+
+    auto insertShuffleXor = [](IRBuilder<>& builder,
+                                    Value* value,
+                                    uint32_t xorValue)->Value*
+    {
+        Function* simdShuffleXorFunc = GenISAIntrinsic::getDeclaration(
+            builder.GetInsertBlock()->getParent()->getParent(),
+            GenISAIntrinsic::GenISA_simdShuffleXor,
+            value->getType());
+
+        return builder.CreateCall(simdShuffleXorFunc,
+            { value, builder.getInt32(xorValue) }, "simdShuffleXor");
+    };
+
+    if (patternFound)
+    {
+        uint64_t xorValue = xorValueConstant->getValue().getZExtValue();
+
+        if (xorValue >= 16) {
+            // xor values of 16 and above are currently not supported in the emitter; keep the original shuffle
+            return;
+        }
+
+        Value* value = I->getOperand(0);
+        IRBuilder<> builder(I);
+        Value* result = insertShuffleXor(builder, value, static_cast<uint32_t>(xorValue));
+        I->replaceAllUsesWith(result);
+        I->eraseFromParent();
+    }
+}
+
 // Check if Lower 64b to 32b transformation is applicable for binary operator
 // i.e. trunc(a op b) == trunc(a) op trunc(b)
 static bool isTruncInvariant(unsigned Opcode) {
@@ -729,14 +792,19 @@ void CustomSafeOptPass::visitCallInst(CallInst& C)
             visitLdRawVec(inst);
             break;
         }
+        case GenISAIntrinsic::GenISA_WaveShuffleIndex:
+        {
+            visitShuffleIndex(inst);
+            break;
+        }
         case GenISAIntrinsic::GenISA_OUTPUT:
         {
             if (pContext->m_ForceEarlyZMathCheck)
             {
                 earlyZDepthDetection(C);
             }
-            break;
-        }
+	    break;
+	}
         default:
             break;
         }
diff --git a/IGC/Compiler/CustomSafeOptPass.hpp b/IGC/Compiler/CustomSafeOptPass.hpp
@@ -74,6 +74,7 @@ namespace IGC
         bool isIdentityMatrix(llvm::ExtractElementInst& I);
         void visitAnd(llvm::BinaryOperator& I);
         void visitXor(llvm::Instruction& XorInstr);
+        void visitShuffleIndex(llvm::CallInst* I);
         //
         // IEEE Floating point arithmetic is not associative.  Any pattern
         // match that changes the order or paramters is unsafe.
diff --git a/IGC/Compiler/tests/CustomSafeOptPass/simd_shuffle_xor.ll b/IGC/Compiler/tests/CustomSafeOptPass/simd_shuffle_xor.ll
@@ -0,0 +1,74 @@
+;=========================== begin_copyright_notice ============================
+;
+; Copyright (C) 2017-2022 Intel Corporation
+;
+; SPDX-License-Identifier: MIT
+;
+;============================ end_copyright_notice =============================
+
+; RUN: igc_opt -igc-custom-safe-opt -S %s -o %t.ll
+; RUN: FileCheck %s --input-file=%t.ll
+
+declare i16 @llvm.genx.GenISA.simdLaneId()
+declare i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32, i32, i32)
+declare double @llvm.genx.GenISA.WaveShuffleIndex.f64(double, i32, i32)
+
+; Change the call in simple case
+define void @test_transformation_simple(i32 %x) nounwind {
+entry:
+  %simdLaneId = call i16 @llvm.genx.GenISA.simdLaneId()
+  %xor = xor i16 %simdLaneId, 1
+  %xor.i = zext i16 %xor to i32
+  %simdShuffle = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %x, i32 %xor.i, i32 0)
+  ret void
+}
+; CHECK-LABEL: @test_transformation_simple
+; CHECK: call i32 @llvm.genx.GenISA.simdShuffleXor{{.*}}(i32 %x, i32 1)
+
+
+; Change the call in double case too
+define void @test_transformation_double(double %x) nounwind {
+entry:
+  %simdLaneId = call i16 @llvm.genx.GenISA.simdLaneId()
+  %xor = xor i16 %simdLaneId, 15
+  %xor.i = zext i16 %xor to i32
+  %simdShuffle = call double @llvm.genx.GenISA.WaveShuffleIndex.f64(double %x, i32 %xor.i, i32 0)
+  ret void
+}
+; CHECK-LABEL: @test_transformation_double
+; CHECK: call double @llvm.genx.GenISA.simdShuffleXor{{.*}}(double %x, i32 15)
+
+
+; Change both calls when the value is split into high and low parts
+define void @test_transformation_splitted(i64 %x) nounwind {
+entry:
+  %vec = bitcast i64 %x to <2 x i32>
+  %scalar1 = extractelement <2 x i32> %vec, i32 0
+  %scalar2 = extractelement <2 x i32> %vec, i32 1
+  %simdLaneId16 = call i16 @llvm.genx.GenISA.simdLaneId()
+  %xor = xor i16 %simdLaneId16, 8
+  %xor.i = zext i16 %xor to i32
+  %simdShuffle = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %scalar1, i32 %xor.i, i32 0)
+  %simdShuffle2 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %scalar2, i32 %xor.i, i32 0)
+  %assembled.vect = insertelement <2 x i32> undef, i32 %simdShuffle, i32 0
+  %assembled.vect2 = insertelement <2 x i32> %assembled.vect, i32 %simdShuffle2, i32 1
+  ret void
+}
+; CHECK-LABEL: @test_transformation_splitted
+; CHECK: [[I1:%[a-zA-Z0-9.]+]] = call i32 @llvm.genx.GenISA.simdShuffleXor{{.*}}(i32 %scalar1, i32 8)
+; CHECK: [[I2:%[a-zA-Z0-9.]+]] = call i32 @llvm.genx.GenISA.simdShuffleXor{{.*}}(i32 %scalar2, i32 8)
+; CHECK: [[RES:%[a-zA-Z0-9.]+]] = insertelement <2 x i32> undef, i32 [[I1]], i32 0
+; CHECK: insertelement <2 x i32> [[RES]], i32 [[I2]], i32 1
+
+
+; Do not change the call if xor is not constant
+define void @test_no_constant(i32 %x, i16 %xor_value) nounwind {
+entry:
+  %simdLaneId = call i16 @llvm.genx.GenISA.simdLaneId()
+  %xor = xor i16 %simdLaneId, %xor_value
+  %xor.i = zext i16 %xor to i32
+  %simdShuffle = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %x, i32 %xor.i, i32 0)
+  ret void
+}
+; CHECK-LABEL: @test_no_constant
+; CHECK: call i32 @llvm.genx.GenISA.WaveShuffleIndex.{{.*}}(i32 %x, i32 %xor.i, i32 0)
diff --git a/IGC/GenISAIntrinsics/Intrinsic_definitions.py b/IGC/GenISAIntrinsics/Intrinsic_definitions.py
@@ -1787,6 +1787,12 @@
      ("int",                           "offset")],
     "Convergent,NoMem"]],
 ####################################################################################################
+"GenISA_simdShuffleXor": ["",
+    [("anyint",                        "result"),
+    [(0,                               "value"),
+     ("int",                           "xor value")],
+    "Convergent,NoMem"]],
+####################################################################################################
 "GenISA_simdSize": ["",
     [("int",                           "result"),
     [],

Original file line number	Diff line number	Diff line change
`@@ -1395,6 +1395,7 @@ namespace IGC`
`1395`	`1395`	`{`
`1396`	`1396`	`case GenISAIntrinsic::GenISA_WaveShuffleIndex:`
`1397`	`1397`	`case GenISAIntrinsic::GenISA_simdShuffleDown:`
	`1398`	`+ case GenISAIntrinsic::GenISA_simdShuffleXor:`
`1398`	`1399`	`case GenISAIntrinsic::GenISA_simdBlockRead:`
`1399`	`1400`	`case GenISAIntrinsic::GenISA_simdBlockWrite:`
`1400`	`1401`	`case GenISAIntrinsic::GenISA_simdMediaBlockRead:`