Optimize SYCL joint_matrix_apply lowering for accumulator 32x64

YuriPlyakhin · igcbot · commit cb5ea201b8c9 · 2024-06-27T10:58:22.000+02:00
1. Optimize the resolution of slice extract and insert for the
32x64 accumulator: use GEP/Load/Store to access and update
matrix elements instead of extracting vectors from arrays and
composing new arrays.
2. Make sure the loop used inside the joint_matrix_apply
implementation is always fully unrolled.
diff --git a/IGC/Compiler/GenTTI.cpp b/IGC/Compiler/GenTTI.cpp
@@ -159,6 +159,21 @@ namespace llvm {
 #endif
         )
     {
+        // Always unroll joint_matrix_apply loop
+        for (auto BB : L->blocks())
+        {
+            for (auto &I : *BB)
+            {
+                if (auto *MD = I.getMetadata("joint_matrix_apply"))
+                {
+                    UP.Threshold = UINT_MAX;
+                    UP.UpperBound = true;
+                    UP.Force = true;
+                    return;
+                }
+            }
+        }
+
         unsigned LoopUnrollThreshold = ctx->m_DriverInfo.GetLoopUnrollThreshold();
 
         // override the LoopUnrollThreshold if the registry key is set
diff --git a/IGC/Compiler/Optimizer/OpenCLPasses/JointMatrixFuncsResolutionPass/JointMatrixFuncsResolutionPass.cpp b/IGC/Compiler/Optimizer/OpenCLPasses/JointMatrixFuncsResolutionPass/JointMatrixFuncsResolutionPass.cpp
@@ -299,6 +299,7 @@ bool JointMatrixFuncsResolutionPass::runOnFunction(Function& F)
     ResolvedValues.clear();
     ResolvedTypes.clear();
     InstsToErase.clear();
+    MatrixAllocas.clear();
     m_SIMDSize = 0;
 
     // Use reverse post order traversal to reduce level or recursion
@@ -1876,6 +1877,30 @@ static Value *mergeComponentToPackedValue(BuilderT *builder, Value *value, Value
     return builder->CreateOr(value, component);
 }
 
+// Returns a float pointer to element 'index' of an Accumulator 32x64 matrix, built as a GEP over the matrix storage bitcast to float*.
+// Out-param *MatPtr receives the storage pointer: the resolved pointer operand when 'matrix' is a load, otherwise an alloca of matrix's type ([2 x <64 x float>]) cached in MatrixAllocas.
+Value *JointMatrixFuncsResolutionPass::getAcc32x64ElementPtr(CallInst *CI, Value *matrix, Value *index, IRBuilder<> *builder, Value **MatPtr) {
+    if (LoadInst *loadInst = dyn_cast<LoadInst>(matrix)) {
+        *MatPtr = Resolve(loadInst->getPointerOperand());
+    } else {
+        // Reuse the alloca cached for this matrix value, or create one at the first insertion point of the function's entry block
+        *MatPtr = MatrixAllocas[matrix];
+        if (!*MatPtr) {
+            builder->SetInsertPoint(&*CI->getFunction()->getEntryBlock().getFirstInsertionPt());
+            builder->SetCurrentDebugLocation(CI->getDebugLoc());
+            *MatPtr = builder->CreateAlloca(matrix->getType(), ADDRESS_SPACE_PRIVATE);
+            MatrixAllocas[matrix] = *MatPtr;
+            builder->SetInsertPoint(CI);
+        }
+        builder->CreateStore(matrix, *MatPtr);
+    }
+
+    Value *FloatPtr = builder->CreateBitCast(*MatPtr, builder->getFloatTy()->getPointerTo((*MatPtr)->getType()->getPointerAddressSpace()));
+
+    // Create a GEP that addresses element 'index' of the matrix through the float pointer
+    return builder->CreateGEP(builder->getFloatTy(), FloatPtr, index);
+}
+
 Value *JointMatrixFuncsResolutionPass::ResolveSliceInsert(CallInst *CI) {
     Value *matrix = Resolve(CI->getArgOperand(0));
     Value *component = CI->getArgOperand(1);
@@ -1906,18 +1931,10 @@ Value *JointMatrixFuncsResolutionPass::ResolveSliceInsert(CallInst *CI) {
     // Special case Accumulator 32x64 is represented as [2 x <float x 64>].
     if (isAccumulator32x64(desc))
     {
-        // extract first or second half of array
-        Value *indexArray = builder.CreateICmpUGT(index, ConstantInt::get(index->getType(), 63)); // i1 0 or 1
-        Value *half0 = builder.CreateExtractValue(matrix, {0}, "matrix.slice.half0");
-        Value *half1 = builder.CreateExtractValue(matrix, {1}, "matrix.slice.half1");
-        Value *halfMatrix = builder.CreateSelect(indexArray, half1, half0, "matrix.slice.selected.half"); // <64 x float>
-
-        // insert new component to vector <64 x float> and then insert new vector to array of 2 vectors
-        Value* indexVec = builder.CreateURem(index, ConstantInt::get(index->getType(), 64)); // 0..63
-        slice = builder.CreateInsertElement(halfMatrix, component, indexVec);
-        Value *newHalf0 = builder.CreateSelect(indexArray, half0, slice);
-        Value *newHalf1 = builder.CreateSelect(indexArray, slice, half1);
-        slice = createPair(&builder, getAcc32x64HalfType(builder.getContext()), newHalf0, newHalf1);
+        Value *MatPtr = nullptr;
+        Value *ptrToElem = getAcc32x64ElementPtr(CI, matrix, index, &builder, &MatPtr);
+        builder.CreateStore(component, ptrToElem);
+        slice = builder.CreateLoad(matTy, MatPtr);
     }
     else if (dyn_cast<IGCLLVM::FixedVectorType>(matTy))
         slice = builder.CreateInsertElement(matrix, component, index);
@@ -1942,17 +1959,9 @@ Value *JointMatrixFuncsResolutionPass::ResolveSliceExtract(CallInst *CI) {
         Value *indexVec = index;
         element = updateIndexAndCreateSliceExtract(&builder, matrix, &indexVec, desc.contribBitWidth, desc.bitWidth);
     } else if (isAccumulator32x64(desc)) {
-        // Get index of which element of array to use: 0 or 1
-        Value* indexArray = builder.CreateICmpUGT(index, ConstantInt::get(index->getType(), 63));
-
-        // Select half that we need:
-        Value* half0 = builder.CreateExtractValue(matrix, {0}, "matrix.slice.half0");
-        Value* half1 = builder.CreateExtractValue(matrix, {1}, "matrix.slice.half1");
-        Value* halfMatrix = builder.CreateSelect(indexArray, half1, half0, "matrix.slice.selected.half");
-
-        // get index of element inside vector of 64 elements
-        Value* indexVec = builder.CreateURem(index, ConstantInt::get(index->getType(), 64)); // 0..63
-        element = updateIndexAndCreateSliceExtract(&builder, halfMatrix, &indexVec, desc.contribBitWidth, desc.bitWidth);
+        Value *MatPtr = nullptr;
+        Value *ptrToElem = getAcc32x64ElementPtr(CI, matrix, index, &builder, &MatPtr);
+        element = builder.CreateLoad(builder.getFloatTy(), ptrToElem);
     }
 
     // unpack element we need from packed value
@@ -1964,6 +1973,12 @@ Value *JointMatrixFuncsResolutionPass::ResolveSliceExtract(CallInst *CI) {
     // being replaced has a half return type and the vectorElementType is i16
     element = builder.CreateBitCast(element, CI->getType());
 
+    // Add metadata to mark this value as part of joint_matrix_apply loop
+    // It will be used in getUnrollingPreferences to make sure this loop is fully unrolled
+    Instruction* elementInst = cast<Instruction>(element);
+    MDNode* node = MDNode::get(CI->getContext(), ConstantAsMetadata::get(builder.getInt1(true)));
+    elementInst->setMetadata("joint_matrix_apply", node);
+
     InstsToErase.insert(CI);
     return element;
 }
diff --git a/IGC/Compiler/Optimizer/OpenCLPasses/JointMatrixFuncsResolutionPass/JointMatrixFuncsResolutionPass.h b/IGC/Compiler/Optimizer/OpenCLPasses/JointMatrixFuncsResolutionPass/JointMatrixFuncsResolutionPass.h
@@ -62,6 +62,7 @@ namespace IGC
         llvm::Value *ResolveFill(llvm::CallInst *CI);
         llvm::Instruction *ResolveFillChecked(llvm::CallInst *CI);
         llvm::Value *ResolveWILength(llvm::CallInst *CI);
+        llvm::Value *getAcc32x64ElementPtr(llvm::CallInst *CI, llvm::Value *matrix, llvm::Value *index, llvm::IRBuilder<> *builder, llvm::Value **MatPtr);
         llvm::Value *ResolveSliceInsert(llvm::CallInst *CI);
         llvm::Value *ResolveSliceExtract(llvm::CallInst *CI);
         llvm::Instruction *ResolveGetCoord(llvm::CallInst *CI);
@@ -113,6 +114,7 @@ namespace IGC
 
         llvm::ValueMap<llvm::Value *, llvm::Instruction *> PlaceholderInstructions;
         llvm::ValueMap<llvm::Value *, llvm::Value *> ResolvedValues;
+        llvm::ValueMap<llvm::Value *, llvm::Value *> MatrixAllocas;
         std::unordered_map<llvm::Type *, llvm::Type *> ResolvedTypes;
         llvm::SmallPtrSet<llvm::Instruction *, 8> InstsToErase;
         // Maps function to it's kernel entry function
diff --git a/IGC/Compiler/tests/JointMatrixFuncsResolutionPass/extract_insert.ll b/IGC/Compiler/tests/JointMatrixFuncsResolutionPass/extract_insert.ll
@@ -9,6 +9,9 @@
 ; RUN: igc_opt -platformpvc -igc-joint-matrix-resolution -S 2>&1 < %s | FileCheck %s
 ; ------------------------------------------------
 ; JointMatrixFuncsResolutionPass
+;
+; Test verifies resolution of joint matrix extract and insert functions,
+; including adding of joint_matrix_apply metadata.
 ; ------------------------------------------------
 
 %spirv.JointMatrixINTEL._float_16_16_3_3_2 = type opaque
@@ -18,53 +21,47 @@
 ; CHECK-SAME: float addrspace(1)* [[PTR1:%.*]], i64 [[IND1:%.*]], float addrspace(1)* [[PTR2:%.*]], i64 [[IND2:%.*]]) {
 define spir_kernel void @test(float addrspace(1)* %ptr1, i64 %ind1, float addrspace(1)* %ptr2, i64 %ind2) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = alloca [2 x <64 x float>]
-; CHECK-NEXT:    [[TMP2:%.*]] = alloca <16 x float>
+; CHECK-NEXT:    [[TMP2:%.*]] = alloca [2 x <64 x float>]
+; CHECK-NEXT:    [[TMP3:%.*]] = alloca <16 x float>
 
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x float>* [[TMP2]] to i8*
-; CHECK-NEXT:    call void @__builtin_spriv_OpJointMatrixLoadINTEL_Accumulator_RowMajor_SG16_16x16_i32_16_global_v8i8_pi32_i32(i8* [[TMP3]], float addrspace(1)* [[PTR1]], i64 32, i32 0)
-; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x float>, <16 x float>* [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x float>* [[TMP3]] to i8*
+; CHECK-NEXT:    call void @__builtin_spriv_OpJointMatrixLoadINTEL_Accumulator_RowMajor_SG16_16x16_i32_16_global_v8i8_pi32_i32(i8* [[TMP4]], float addrspace(1)* [[PTR1]], i64 32, i32 0)
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x float>, <16 x float>* [[TMP3]]
   %C1 = call spir_func %spirv.JointMatrixINTEL._float_16_16_3_3_2 addrspace(1)* @_Z81__spirv_JointMatrixLoadINTEL_RPU3AS143__spirv_JointMatrixINTEL__float_16_16_3_3_2PU3AS1fliii(float addrspace(1)* %ptr1, i64 32, i32 0, i32 3, i32 0)
 
-; CHECK-NEXT:    [[MATRIX_ELEMENT:%.*]] = extractelement <16 x float> [[TMP4]], i64 [[IND1]]
+; CHECK-NEXT:    [[MATRIX_ELEMENT:%.*]] = extractelement <16 x float> [[TMP5]], i64 [[IND1]]
   %1 = call spir_func float @_Z28__spirv_VectorExtractDynamicPU3AS143__spirv_JointMatrixINTEL__float_16_16_3_3_2l(%spirv.JointMatrixINTEL._float_16_16_3_3_2 addrspace(1)* %C1, i64 %ind1)
 
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd float [[MATRIX_ELEMENT]], 5.000000e+00
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd float [[MATRIX_ELEMENT]], 5.000000e+00
   %2 = fadd float %1, 5.0
 
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <16 x float> [[TMP4]], float [[TMP5]], i64 [[IND1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <16 x float> [[TMP5]], float [[TMP6]], i64 [[IND1]]
   %3 = call spir_func %spirv.JointMatrixINTEL._float_16_16_3_3_2 addrspace(1)* @_Z27__spirv_VectorInsertDynamicPU3AS143__spirv_JointMatrixINTEL__float_16_16_3_3_2fl(%spirv.JointMatrixINTEL._float_16_16_3_3_2 addrspace(1)* %C1, float %2, i64 %ind1)
 
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast [2 x <64 x float>]* [[TMP1]] to i8*
-; CHECK-NEXT:    call void @__builtin_spriv_OpJointMatrixLoadINTEL_Accumulator_RowMajor_SG16_32x64_i32_128_global_v8i8_pi32_i32(i8* [[TMP7]], float addrspace(1)* [[PTR2]], i64 128, i32 0)
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast [2 x <64 x float>]* [[TMP1]] to <64 x float>*
-; CHECK-NEXT:    [[TMP9:%.*]] = load <64 x float>, <64 x float>* [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr <64 x float>, <64 x float>* [[TMP8]], i32 1
-; CHECK-NEXT:    [[TMP11:%.*]] = load <64 x float>, <64 x float>* [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue [2 x <64 x float>] undef, <64 x float> [[TMP9]], 0
-; CHECK-NEXT:    [[TMP13:%.*]] = insertvalue [2 x <64 x float>] [[TMP12]], <64 x float> [[TMP11]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast [2 x <64 x float>]* [[TMP2]] to i8*
+; CHECK-NEXT:    call void @__builtin_spriv_OpJointMatrixLoadINTEL_Accumulator_RowMajor_SG16_32x64_i32_128_global_v8i8_pi32_i32(i8* [[TMP8]], float addrspace(1)* [[PTR2]], i64 128, i32 0)
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast [2 x <64 x float>]* [[TMP2]] to <64 x float>*
+; CHECK-NEXT:    [[TMP10:%.*]] = load <64 x float>, <64 x float>* [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr <64 x float>, <64 x float>* [[TMP9]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load <64 x float>, <64 x float>* [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = insertvalue [2 x <64 x float>] undef, <64 x float> [[TMP10]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = insertvalue [2 x <64 x float>] [[TMP13]], <64 x float> [[TMP12]], 1
   %C2 = call spir_func %spirv.JointMatrixINTEL._float_32_64_3_3_2 addrspace(1)* @_Z81__spirv_JointMatrixLoadINTEL_RPU3AS143__spirv_JointMatrixINTEL__float_32_64_3_3_2PU3AS1fliii(float addrspace(1)* %ptr2, i64 128, i32 0, i32 3, i32 0)
 
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp ugt i64 [[IND2]], 63
-; CHECK-NEXT:    [[MATRIX_SLICE_HALF0:%.*]] = extractvalue [2 x <64 x float>] [[TMP13]], 0
-; CHECK-NEXT:    [[MATRIX_SLICE_HALF1:%.*]] = extractvalue [2 x <64 x float>] [[TMP13]], 1
-; CHECK-NEXT:    [[MATRIX_SLICE_SELECTED_HALF:%.*]] = select i1 [[TMP14]], <64 x float> [[MATRIX_SLICE_HALF1]], <64 x float> [[MATRIX_SLICE_HALF0]]
-; CHECK-NEXT:    [[TMP15:%.*]] = urem i64 [[IND2]], 64
-; CHECK-NEXT:    [[MATRIX_ELEMENT5:%.*]] = extractelement <64 x float> [[MATRIX_SLICE_SELECTED_HALF]], i64 [[TMP15]]
+; CHECK-NEXT:    store [2 x <64 x float>] [[TMP14]], [2 x <64 x float>]* [[TMP1]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast [2 x <64 x float>]* [[TMP1]] to float*
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr float, float* [[TMP15]], i64 [[IND2]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load float, float* [[TMP16]],{{.*}} !joint_matrix_apply [[MD:![0-9]+]]
   %4 = call spir_func float @_Z28__spirv_VectorExtractDynamicPU3AS143__spirv_JointMatrixINTEL__float_32_64_3_3_2l(%spirv.JointMatrixINTEL._float_32_64_3_3_2 addrspace(1)* %C2, i64 %ind2)
 
-; CHECK-NEXT:    [[TMP16:%.*]] = fadd float [[MATRIX_ELEMENT5]], 5.000000e+00
+; CHECK-NEXT:    [[TMP18:%.*]] = fadd float [[TMP17]], 5.000000e+00
   %5 = fadd float %4, 5.0
 
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp ugt i64 [[IND2]], 63
-; CHECK-NEXT:    [[MATRIX_SLICE_HALF07:%.*]] = extractvalue [2 x <64 x float>] [[TMP13]], 0
-; CHECK-NEXT:    [[MATRIX_SLICE_HALF18:%.*]] = extractvalue [2 x <64 x float>] [[TMP13]], 1
-; CHECK-NEXT:    [[MATRIX_SLICE_SELECTED_HALF9:%.*]] = select i1 [[TMP17]], <64 x float> [[MATRIX_SLICE_HALF18]], <64 x float> [[MATRIX_SLICE_HALF07]]
-; CHECK-NEXT:    [[TMP18:%.*]] = urem i64 [[IND2]], 64
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <64 x float> [[MATRIX_SLICE_SELECTED_HALF9]], float [[TMP16]], i64 [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP17]], <64 x float> [[MATRIX_SLICE_HALF07]], <64 x float> [[TMP19]]
-; CHECK-NEXT:    [[TMP21:%.*]] = select i1 [[TMP17]], <64 x float> [[TMP19]], <64 x float> [[MATRIX_SLICE_HALF18]]
-; CHECK-NEXT:    [[TMP22:%.*]] = insertvalue [2 x <64 x float>] undef, <64 x float> [[TMP20]], 0
-; CHECK-NEXT:    [[TMP23:%.*]] = insertvalue [2 x <64 x float>] [[TMP22]], <64 x float> [[TMP21]], 1
+; CHECK-NEXT:    store [2 x <64 x float>] [[TMP14]], [2 x <64 x float>]* [[TMP1]]
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast [2 x <64 x float>]* [[TMP1]] to float*
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr float, float* [[TMP19]], i64 [[IND2]]
+; CHECK-NEXT:    store float [[TMP18]], float* [[TMP20]]
+; CHECK-NEXT:    [[TMP21:%.*]] = load [2 x <64 x float>], [2 x <64 x float>]* [[TMP1]]
   %6 = call spir_func %spirv.JointMatrixINTEL._float_32_64_3_3_2 addrspace(1)* @_Z27__spirv_VectorInsertDynamicPU3AS143__spirv_JointMatrixINTEL__float_32_64_3_3_2fl(%spirv.JointMatrixINTEL._float_32_64_3_3_2 addrspace(1)* %C2, float %5, i64 %ind2)
 
 ; CHECK-NEXT:    ret void
@@ -79,6 +76,7 @@ declare spir_func %spirv.JointMatrixINTEL._float_32_64_3_3_2 addrspace(1)* @_Z27
 declare spir_func %spirv.JointMatrixINTEL._float_32_64_3_3_2 addrspace(1)* @_Z81__spirv_JointMatrixLoadINTEL_RPU3AS143__spirv_JointMatrixINTEL__float_32_64_3_3_2PU3AS1fliii(float addrspace(1)*, i64, i32, i32, i32)
 declare spir_func %spirv.JointMatrixINTEL._float_16_16_3_3_2 addrspace(1)* @_Z81__spirv_JointMatrixLoadINTEL_RPU3AS143__spirv_JointMatrixINTEL__float_16_16_3_3_2PU3AS1fliii(float addrspace(1)*, i64, i32, i32, i32)
 
+; CHECK: [[MD]] = !{i1 true}
 !igc.functions = !{!0}
 !0 = !{void (float addrspace(1)*, i64, float addrspace(1)*, i64)* @test, !1}
 !1 = !{!2, !3}