Skip to content

Commit ef12a59

Browse files
vmustyaigcbot
authored and committed
Fold immediate offsets for 2d block load/store operations
The hardware supports immediate offsets for 2d block load, store and prefetch operations. So, VC can eliminate the add and sub instructions which apply the immediate offset to the X and Y indices of the block.
1 parent 281e286 commit ef12a59

File tree

3 files changed

+230
-0
lines changed

3 files changed

+230
-0
lines changed

IGC/VectorCompiler/lib/GenXCodeGen/GenXLscAddrCalcFolding.cpp

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ SPDX-License-Identifier: MIT
1616

1717
#include "llvm/CodeGen/TargetPassConfig.h"
1818
#include "llvm/IR/InstVisitor.h"
19+
#include "llvm/InitializePasses.h"
1920
#include "llvm/Pass.h"
2021

2122
#define DEBUG_TYPE "genx-lsc-addr-calc-folding"
@@ -47,8 +48,17 @@ class GenXLscAddrCalcFolding : public FunctionPass,
4748

4849
Value *applyLscAddrFolding(Value *Offsets, APInt &Scale, APInt &Offset);
4950

51+
static constexpr unsigned Block2DIndexX = 10;
52+
static constexpr unsigned Block2DIndexY = 11;
53+
static constexpr unsigned Block2DOffsetX = 12;
54+
static constexpr unsigned Block2DOffsetY = 13;
55+
56+
bool foldLscBlock2DAddrCalculation(CallInst &CI, unsigned IndexArg,
57+
unsigned OffsetArg);
58+
5059
const GenXSubtarget *ST = nullptr;
5160

61+
unsigned Supported2DOffsetBits = 0;
5262
bool Changed = false;
5363
};
5464

@@ -62,6 +72,7 @@ void initializeGenXLscAddrCalcFoldingPass(PassRegistry &);
6272

6373
INITIALIZE_PASS_BEGIN(GenXLscAddrCalcFolding, "GenXLscAddrCalcFolding",
6474
"GenXLscAddrCalcFolding", false, false)
75+
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
6576
INITIALIZE_PASS_END(GenXLscAddrCalcFolding, "GenXLscAddrCalcFolding",
6677
"GenXLscAddrCalcFolding", false, false)
6778

@@ -112,9 +123,83 @@ void GenXLscAddrCalcFolding::visitCallInst(CallInst &CI) {
112123
case vc::InternalIntrinsic::lsc_store_quad_bti:
113124
Changed |= foldLscAddrCalculation(CI);
114125
break;
126+
case vc::InternalIntrinsic::lsc_load_block_2d_ugm:
127+
case vc::InternalIntrinsic::lsc_load_block_2d_ugm_transposed:
128+
case vc::InternalIntrinsic::lsc_load_block_2d_ugm_vnni:
129+
case vc::InternalIntrinsic::lsc_prefetch_block_2d_ugm:
130+
case vc::InternalIntrinsic::lsc_store_block_2d_ugm:
131+
Changed |= foldLscBlock2DAddrCalculation(CI, Block2DIndexX, Block2DOffsetX);
132+
Changed |= foldLscBlock2DAddrCalculation(CI, Block2DIndexY, Block2DOffsetY);
133+
break;
115134
}
116135
}
117136

137+
// Folds constant add/sub chains feeding a 2d block X/Y index operand into
// the matching immediate offset operand of the intrinsic call.
//
// Walks the chain of (index +/- constant) binary operators reaching the
// IndexArg operand of CI and accumulates the constants into the immediate
// offset already held in OffsetArg. Folding stops at the first non-add/sub
// operation, non-constant right operand, or signed overflow. The resulting
// offset is only committed if, scaled by the memory element size, it stays
// dword-aligned (presumably a hardware encoding restriction — see the
// alignment check below).
//
// \param CI        2d block load/store/prefetch internal intrinsic call.
// \param IndexArg  operand index of the block X or Y index.
// \param OffsetArg operand index of the corresponding immediate offset.
// \return true if CI was modified, false otherwise.
bool GenXLscAddrCalcFolding::foldLscBlock2DAddrCalculation(CallInst &CI,
                                                           unsigned IndexArg,
                                                           unsigned OffsetArg) {
  IGC_ASSERT(ST->hasLSCMessages() && ST->hasLSCOffset());

  auto *CurIndex = CI.getArgOperand(IndexArg);
  auto *const OrigIndex = CurIndex;
  auto AccOffset = cast<ConstantInt>(CI.getArgOperand(OffsetArg))->getValue();

  // Peel add/sub-by-constant operations off the index, folding each
  // constant into the accumulated immediate offset.
  for (;;) {
    auto *BinOp = dyn_cast<BinaryOperator>(CurIndex);
    if (!BinOp)
      break;

    const auto Opcode = BinOp->getOpcode();
    if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
      break;

    auto *Addend = dyn_cast<ConstantInt>(BinOp->getOperand(1));
    if (!Addend)
      break;

    // Stop folding as soon as the signed accumulation would overflow.
    bool Overflow = false;
    APInt Folded = Opcode == Instruction::Add
                       ? AccOffset.sadd_ov(Addend->getValue(), Overflow)
                       : AccOffset.ssub_ov(Addend->getValue(), Overflow);
    if (Overflow)
      break;

    AccOffset = std::move(Folded);
    CurIndex = BinOp->getOperand(0);

    LLVM_DEBUG(dbgs() << "LSC address folding found, index: " << *CurIndex
                      << ", offset: " << AccOffset.getSExtValue() << "\n");
  }

  // Nothing could be folded.
  if (CurIndex == OrigIndex)
    return false;

  const auto OffsetV = AccOffset.getSExtValue();
  const auto ElementSizeBits =
      vc::InternalIntrinsic::getMemoryRegisterElementSize(&CI);
  // The folded offset, in bits of memory data, must be dword-aligned;
  // otherwise keep the original index/offset operands untouched.
  if (OffsetV * ElementSizeBits % genx::DWordBits != 0) {
    LLVM_DEBUG(dbgs() << "Offset is not dword-aligned\n");
    return false;
  }

  IRBuilder<> Builder(&CI);

  LLVM_DEBUG(dbgs() << "Folding LSC address calculation for instruction: " << CI
                    << "\n");
  CI.setArgOperand(IndexArg, CurIndex);
  CI.setArgOperand(OffsetArg, Builder.getInt32(OffsetV));
  LLVM_DEBUG(dbgs() << "Updated instruction: " << CI << "\n");

  return true;
}
202+
118203
bool GenXLscAddrCalcFolding::foldLscAddrCalculation(CallInst &Inst) {
119204
constexpr unsigned AddrIndex = 6, ScaleIndex = 7, OffsetIndex = 8;
120205

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
;=========================== begin_copyright_notice ============================
;
; Copyright (C) 2024 Intel Corporation
;
; SPDX-License-Identifier: MIT
;
;============================ end_copyright_notice =============================

; End-to-end test: the 2d block load/store/prefetch intrinsics below carry
; constant immediate X/Y offsets, and the generated visa asm must encode them
; directly in the block address operand (e.g. [[X]]+16, [[Y]]-32).

; RUN: %opt %use_old_pass_manager% -GenXModule -GenXCategoryWrapper -GenXCisaBuilderPass -GenXFinalizer \
; RUN: -march=genx64 -mtriple=spir64-unknown-unknown -finalizer-opts="-dumpcommonisa -isaasmToConsole" \
; RUN: -mcpu=Xe2 -o /dev/null < %s | FileCheck %s

declare <16 x i32> @llvm.vc.internal.lsc.load.block.2d.ugm.v16i32.v2i8(i1, i8, <2 x i8>, i8, i16, i16, i64, i32, i32, i32, i32, i32, i32, i32, <16 x i32>)
declare <32 x i16> @llvm.vc.internal.lsc.load.block.2d.ugm.v32i16.v2i8(i1, i8, <2 x i8>, i8, i16, i16, i64, i32, i32, i32, i32, i32, i32, i32, <32 x i16>)
declare <16 x i32> @llvm.vc.internal.lsc.load.block.2d.ugm.transposed.v16i32.v2i8(i1, i8, <2 x i8>, i8, i16, i16, i64, i32, i32, i32, i32, i32, i32, i32, <16 x i32>)
declare <64 x i8> @llvm.vc.internal.lsc.load.block.2d.ugm.vnni.v64i8.v2i8(i1, i8, <2 x i8>, i8, i16, i16, i64, i32, i32, i32, i32, i32, i32, i32, <64 x i8>)

declare void @llvm.vc.internal.lsc.prefetch.block.2d.ugm.v2i8(i1, i8, <2 x i8>, i8, i16, i16, i64, i32, i32, i32, i32, i32, i32, i32)

declare void @llvm.vc.internal.lsc.store.block.2d.ugm.v2i8.v16i32(i1, i8, <2 x i8>, i8, i16, i16, i64, i32, i32, i32, i32, i32, i32, i32, <16 x i32>)

; CHECK-LABEL: .kernel "test"

; Kernel argument declarations: base pointer, surface width/height/pitch and
; the X/Y block indices, all passed as kernel inputs.
; CHECK: .decl [[BASE:V[0-9]+]] v_type=G type=uq num_elts=1 alias=<[[IBASE:V[0-9]+]], 0>
; CHECK: .decl [[WIDTH:V[0-9]+]] v_type=G type=ud num_elts=1 alias=<[[IWIDTH:V[0-9]+]], 0>
; CHECK: .decl [[HEIGHT:V[0-9]+]] v_type=G type=ud num_elts=1 alias=<[[IHEIGHT:V[0-9]+]], 0>
; CHECK: .decl [[PITCH:V[0-9]+]] v_type=G type=ud num_elts=1 alias=<[[IPITCH:V[0-9]+]], 0>
; CHECK: .decl [[X:V[0-9]+]] v_type=G type=d num_elts=1 alias=<[[IX:V[0-9]+]], 0>
; CHECK: .decl [[Y:V[0-9]+]] v_type=G type=d num_elts=1 alias=<[[IY:V[0-9]+]], 0>
; CHECK: .input [[IBASE]] offset=64 size=8
; CHECK: .input [[IWIDTH]] offset=72 size=4
; CHECK: .input [[IHEIGHT]] offset=76 size=4
; CHECK: .input [[IPITCH]] offset=80 size=4
; CHECK: .input [[IX]] offset=84 size=4
; CHECK: .input [[IY]] offset=88 size=4

define dllexport spir_kernel void @test(i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y) {
; Plain 2d block load with offsets (+16, +32).
; CHECK: lsc_load_block2d.ugm.uc.ca (M1, 1) [[LOAD:V[0-9]+]]:d32.8x2nn flat[[[BASE]],[[WIDTH]],[[HEIGHT]],[[PITCH]],[[X]]+16,[[Y]]+32]
  %load = call <16 x i32> @llvm.vc.internal.lsc.load.block.2d.ugm.v16i32.v2i8(i1 true, i8 3, <2 x i8> <i8 1, i8 2>, i8 1, i16 8, i16 2, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y, i32 16, i32 32, <16 x i32> undef)
; Multi-block (2x) load with a negative Y offset (+128, -32).
; CHECK: lsc_load_block2d.ugm.ca.uc (M1, 1) [[LOAD2:V[0-9]+]]:d16.2x8x2nn flat[[[BASE]],[[WIDTH]],[[HEIGHT]],[[PITCH]],[[X]]+128,[[Y]]-32]
  %load.a2 = call <32 x i16> @llvm.vc.internal.lsc.load.block.2d.ugm.v32i16.v2i8(i1 true, i8 2, <2 x i8> <i8 2, i8 1>, i8 2, i16 8, i16 2, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y, i32 128, i32 -32, <32 x i16> undef)
; Transposed load with a negative X offset (-192, +64).
; CHECK: lsc_load_block2d.ugm.st.uc (M1, 1) [[LOADT:V[0-9]+]]:d32.2x8tn flat[[[BASE]],[[WIDTH]],[[HEIGHT]],[[PITCH]],[[X]]-192,[[Y]]+64]
  %load.t = call <16 x i32> @llvm.vc.internal.lsc.load.block.2d.ugm.transposed.v16i32.v2i8(i1 true, i8 3, <2 x i8> <i8 5, i8 1>, i8 1, i16 2, i16 8, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y, i32 -192, i32 64, <16 x i32> undef)
; VNNI load with a zero X offset (0, +128).
; CHECK: lsc_load_block2d.ugm.st.ca (M1, 1) [[LOADV:V[0-9]+]]:d8.4x16nt flat[[[BASE]],[[WIDTH]],[[HEIGHT]],[[PITCH]],[[X]],[[Y]]+128]
  %load.v = call <64 x i8> @llvm.vc.internal.lsc.load.block.2d.ugm.vnni.v64i8.v2i8(i1 true, i8 1, <2 x i8> <i8 5, i8 2>, i8 1, i16 4, i16 16, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y, i32 0, i32 128, <64 x i8> undef)

; Prefetch with offsets (+256, 0).
; CHECK: lsc_load_block2d.ugm.uc.ca (M1, 1) %null:d64.8x2nn flat[[[BASE]],[[WIDTH]],[[HEIGHT]],[[PITCH]],[[X]]+256,[[Y]]]
  call void @llvm.vc.internal.lsc.prefetch.block.2d.ugm.v2i8(i1 true, i8 4, <2 x i8> <i8 1, i8 2>, i8 1, i16 8, i16 2, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y, i32 256, i32 0)

; Store with both offsets negative (-256, -512).
; CHECK: lsc_store_block2d.ugm.wt.wb (M1, 1) flat[[[BASE]],[[WIDTH]],[[HEIGHT]],[[PITCH]],[[X]]-256,[[Y]]-512] [[LOAD:V[0-9]+]]:d32.8x2nn
  call void @llvm.vc.internal.lsc.store.block.2d.ugm.v2i8.v16i32(i1 true, i8 3, <2 x i8> <i8 4, i8 3>, i8 1, i16 8, i16 2, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y, i32 -256, i32 -512, <16 x i32> %load)
  ret void
}

attributes #1 = { noinline nounwind "CMGenxMain" "VC.Stack.Amount"="0" "target-cpu"="XeHPC" }

!spirv.Source = !{!0}
!opencl.spir.version = !{!1}
!opencl.ocl.version = !{!0}
!opencl.used.extensions = !{!2}
!opencl.used.optional.core.features = !{!2}
!spirv.Generator = !{!3}
!genx.kernels = !{!4}
!genx.kernel.internal = !{!8}

!0 = !{i32 0, i32 0}
!1 = !{i32 1, i32 2}
!2 = !{}
!3 = !{i16 6, i16 14}
!4 = !{void (i64, i32, i32, i32, i32, i32)* @test, !"test", !5, i32 0, !6, !0, !7, i32 0}
!5 = !{i32 0, i32 0, i32 0, i32 0, i32 0, i32 0}
!6 = !{i32 64, i32 72, i32 76, i32 80, i32 84, i32 88}
!7 = !{!"svmptr_t"}
!8 = !{void (i64, i32, i32, i32, i32, i32)* @test, null, null, null, null}
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
;=========================== begin_copyright_notice ============================
;
; Copyright (C) 2024 Intel Corporation
;
; SPDX-License-Identifier: MIT
;
;============================ end_copyright_notice =============================

; Tests that the GenXLscAddrCalcFolding pass folds constant add/sub operations
; applied to the X/Y block indices of the 2d block load/store/prefetch
; intrinsics into their immediate offset operands.

; RUN: %opt %use_old_pass_manager% -GenXLscAddrCalcFolding -march=genx64 -mcpu=Xe2 -mtriple=spir64-unknown-unknown -S < %s | FileCheck %s

declare <16 x i32> @llvm.vc.internal.lsc.load.block.2d.ugm.v16i32.v2i8(i1, i8, <2 x i8>, i8, i16, i16, i64, i32, i32, i32, i32, i32, i32, i32, <16 x i32>)
declare <16 x i32> @llvm.vc.internal.lsc.load.block.2d.ugm.transposed.v16i32.v2i8(i1, i8, <2 x i8>, i8, i16, i16, i64, i32, i32, i32, i32, i32, i32, i32, <16 x i32>)
declare <64 x i8> @llvm.vc.internal.lsc.load.block.2d.ugm.vnni.v64i8.v2i8(i1, i8, <2 x i8>, i8, i16, i16, i64, i32, i32, i32, i32, i32, i32, i32, <64 x i8>)

declare void @llvm.vc.internal.lsc.prefetch.block.2d.ugm.v2i8(i1, i8, <2 x i8>, i8, i16, i16, i64, i32, i32, i32, i32, i32, i32, i32)

declare void @llvm.vc.internal.lsc.store.block.2d.ugm.v2i8.v16i32(i1, i8, <2 x i8>, i8, i16, i16, i64, i32, i32, i32, i32, i32, i32, i32, <16 x i32>)

; Plain load: %x+16 and %y-32 become immediate offsets 16 and -32.
; CHECK-LABEL: @test1(
define <16 x i32> @test1(i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y) {
  %xoff = add i32 %x, 16
  %yoff = sub i32 %y, 32
; CHECK: %load = call <16 x i32> @llvm.vc.internal.lsc.load.block.2d.ugm.v16i32.v2i8(i1 true, i8 3, <2 x i8> <i8 1, i8 2>, i8 1, i16 8, i16 2, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y, i32 16, i32 -32, <16 x i32> undef)
  %load = call <16 x i32> @llvm.vc.internal.lsc.load.block.2d.ugm.v16i32.v2i8(i1 true, i8 3, <2 x i8> <i8 1, i8 2>, i8 1, i16 8, i16 2, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %xoff, i32 %yoff, i32 0, i32 0, <16 x i32> undef)
  ret <16 x i32> %load
}

; Transposed load: same folding applies.
; CHECK-LABEL: @test2(
define <16 x i32> @test2(i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y) {
  %xoff = add i32 %x, 16
  %yoff = sub i32 %y, 32
; CHECK: %load = call <16 x i32> @llvm.vc.internal.lsc.load.block.2d.ugm.transposed.v16i32.v2i8(i1 true, i8 3, <2 x i8> <i8 1, i8 2>, i8 1, i16 2, i16 8, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y, i32 16, i32 -32, <16 x i32> undef)
  %load = call <16 x i32> @llvm.vc.internal.lsc.load.block.2d.ugm.transposed.v16i32.v2i8(i1 true, i8 3, <2 x i8> <i8 1, i8 2>, i8 1, i16 2, i16 8, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %xoff, i32 %yoff, i32 0, i32 0, <16 x i32> undef)
  ret <16 x i32> %load
}

; VNNI load: same folding applies.
; CHECK-LABEL: @test3(
define <64 x i8> @test3(i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y) {
  %xoff = add i32 %x, 16
  %yoff = sub i32 %y, 32
; CHECK: %load = call <64 x i8> @llvm.vc.internal.lsc.load.block.2d.ugm.vnni.v64i8.v2i8(i1 true, i8 1, <2 x i8> <i8 1, i8 2>, i8 1, i16 4, i16 16, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y, i32 16, i32 -32, <64 x i8> undef)
  %load = call <64 x i8> @llvm.vc.internal.lsc.load.block.2d.ugm.vnni.v64i8.v2i8(i1 true, i8 1, <2 x i8> <i8 1, i8 2>, i8 1, i16 4, i16 16, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %xoff, i32 %yoff, i32 0, i32 0, <64 x i8> undef)
  ret <64 x i8> %load
}

; Store: same folding applies.
; CHECK-LABEL: @test4(
define void @test4(i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y, <16 x i32> %data) {
  %xoff = add i32 %x, 16
  %yoff = sub i32 %y, 32
; CHECK: call void @llvm.vc.internal.lsc.store.block.2d.ugm.v2i8.v16i32(i1 true, i8 3, <2 x i8> <i8 1, i8 2>, i8 1, i16 8, i16 2, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y, i32 16, i32 -32, <16 x i32> %data)
  call void @llvm.vc.internal.lsc.store.block.2d.ugm.v2i8.v16i32(i1 true, i8 3, <2 x i8> <i8 1, i8 2>, i8 1, i16 8, i16 2, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %xoff, i32 %yoff, i32 0, i32 0, <16 x i32> %data)
  ret void
}

; Prefetch: same folding applies.
; CHECK-LABEL: @test5(
define void @test5(i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y) {
  %xoff = add i32 %x, 16
  %yoff = sub i32 %y, 32
; CHECK: call void @llvm.vc.internal.lsc.prefetch.block.2d.ugm.v2i8(i1 true, i8 3, <2 x i8> <i8 1, i8 2>, i8 1, i16 8, i16 2, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y, i32 16, i32 -32)
  call void @llvm.vc.internal.lsc.prefetch.block.2d.ugm.v2i8(i1 true, i8 3, <2 x i8> <i8 1, i8 2>, i8 1, i16 8, i16 2, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %xoff, i32 %yoff, i32 0, i32 0)
  ret void
}

; Negative case: the X offset of 17 d8 elements (17 * 8 = 136 bits) is not
; dword-aligned, so %xoff must stay unfolded; the Y offset (-32) still folds.
; CHECK-LABEL: @test6(
define <64 x i8> @test6(i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %x, i32 %y) {
  %xoff = add i32 %x, 17
  %yoff = sub i32 %y, 32
; CHECK: %load = call <64 x i8> @llvm.vc.internal.lsc.load.block.2d.ugm.vnni.v64i8.v2i8(i1 true, i8 1, <2 x i8> <i8 1, i8 2>, i8 1, i16 4, i16 16, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %xoff, i32 %y, i32 0, i32 -32, <64 x i8> undef)
  %load = call <64 x i8> @llvm.vc.internal.lsc.load.block.2d.ugm.vnni.v64i8.v2i8(i1 true, i8 1, <2 x i8> <i8 1, i8 2>, i8 1, i16 4, i16 16, i64 %base, i32 %width, i32 %height, i32 %pitch, i32 %xoff, i32 %yoff, i32 0, i32 0, <64 x i8> undef)
  ret <64 x i8> %load
}

0 commit comments

Comments
 (0)