Make the LowerBitCast pass support opaque pointers.

jcranmer-intel · svenvh · commit 83b2dfd317d9 · 2022-05-18T09:59:06.000+01:00
This is a relatively large change, as the original pass relied on being able to
track the initial bitcast from <3 x i64>* to <6 x i32>* to know where to start
rewriting. Instead, this patch starts at the final invalid extractelement
instruction and works its way backwards as far as necessary to generate correct
code.
diff --git a/lib/SPIRV/SPIRVLowerBitCastToNonStandardType.cpp b/lib/SPIRV/SPIRVLowerBitCastToNonStandardType.cpp
@@ -40,104 +40,83 @@
 // point types, 2/3/4/8/16-element vector of scalar types").
 //
 //===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "spv-lower-bitcast-to-nonstandard-type"
-
 #include "SPIRVInternal.h"
 
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/NoFolder.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/Local.h"
 
 #include <utility>
 
+#define DEBUG_TYPE "spv-lower-bitcast-to-nonstandard-type"
+
 using namespace llvm;
 
 namespace SPIRV {
 
-static VectorType *getVectorType(Type *Ty) {
-  assert(Ty != nullptr && "Expected non-null type");
-  if (auto *ElemTy = dyn_cast<PointerType>(Ty))
-    Ty = ElemTy->getPointerElementType();
-  return dyn_cast<VectorType>(Ty);
-}
+using NFIRBuilder = IRBuilder<NoFolder>;
 
-/// Since SPIR-V does not support non-standard vector types, instructions using
-/// these types should be replaced in a special way to avoid using of
-/// unsupported types.
-/// lowerBitCastToNonStdVec function is designed to avoid using of bitcast to
-/// unsupported vector types instructions and should be called if similar
-/// instructions have been encountered in input LLVM IR.
-bool lowerBitCastToNonStdVec(Instruction *OldInst, Value *NewInst,
-                             const VectorType *OldVecTy,
-                             std::vector<Instruction *> &InstsToErase,
-                             IRBuilder<> &Builder,
-                             unsigned RecursionDepth = 0) {
-  static constexpr unsigned MaxRecursionDepth = 16;
-  if (RecursionDepth++ > MaxRecursionDepth)
-    report_fatal_error(
-        llvm::Twine(
-            "The depth of recursion exceeds the maximum possible depth"),
-        false);
-
-  bool Changed = false;
-  VectorType *NewVecTy = getVectorType(NewInst->getType());
-  if (NewVecTy) {
-    Builder.SetInsertPoint(OldInst);
-    for (auto *U : OldInst->users()) {
-      // Handle addrspacecast instruction after bitcast if present
-      if (auto *ASCastInst = dyn_cast<AddrSpaceCastInst>(U)) {
-        unsigned DestAS = ASCastInst->getDestAddressSpace();
-        auto *NewVecPtrTy = NewVecTy->getPointerTo(DestAS);
-        // AddrSpaceCast is created explicitly instead of using method
-        // IRBuilder<>.CreateAddrSpaceCast because IRBuilder doesn't create
-        // separate instruction for constant values. Whereas SPIR-V translator
-        // doesn't like several nested instructions in one.
-        Value *LocalValue = new AddrSpaceCastInst(NewInst, NewVecPtrTy);
-        Builder.Insert(LocalValue);
-        Changed |=
-            lowerBitCastToNonStdVec(ASCastInst, LocalValue, OldVecTy,
-                                    InstsToErase, Builder, RecursionDepth);
-      }
-      // Handle load instruction which is following the bitcast in the pattern
-      else if (auto *LI = dyn_cast<LoadInst>(U)) {
-        Value *LocalValue = Builder.CreateLoad(NewVecTy, NewInst);
-        Changed |= lowerBitCastToNonStdVec(
-            LI, LocalValue, OldVecTy, InstsToErase, Builder, RecursionDepth);
-      }
-      // Handle extractelement instruction which is following the load
-      else if (auto *EEI = dyn_cast<ExtractElementInst>(U)) {
-        uint64_t NumElemsInOldVec = OldVecTy->getElementCount().getFixedValue();
-        uint64_t NumElemsInNewVec = NewVecTy->getElementCount().getFixedValue();
-        uint64_t OldElemIdx =
-            cast<ConstantInt>(EEI->getIndexOperand())->getZExtValue();
-        uint64_t NewElemIdx =
-            OldElemIdx / (NumElemsInOldVec / NumElemsInNewVec);
-        Value *LocalValue = Builder.CreateExtractElement(NewInst, NewElemIdx);
-        // The trunc instruction truncates the high order bits in value, so it
-        // may be necessary to shift right high order bits, if required bits are
-        // not at the end of extracted value
-        unsigned OldVecElemBitWidth =
-            cast<IntegerType>(OldVecTy->getElementType())->getBitWidth();
-        unsigned NewVecElemBitWidth =
-            cast<IntegerType>(NewVecTy->getElementType())->getBitWidth();
-        unsigned BitWidthRatio = NewVecElemBitWidth / OldVecElemBitWidth;
-        if (auto RequiredBitsIdx =
-                OldElemIdx % BitWidthRatio != BitWidthRatio - 1) {
-          uint64_t Shift =
-              OldVecElemBitWidth * (BitWidthRatio - RequiredBitsIdx);
-          LocalValue = Builder.CreateLShr(LocalValue, Shift);
-        }
-        LocalValue =
-            Builder.CreateTrunc(LocalValue, OldVecTy->getElementType());
-        Changed |= lowerBitCastToNonStdVec(
-            EEI, LocalValue, OldVecTy, InstsToErase, Builder, RecursionDepth);
+static Value *removeBitCasts(Value *OldValue, Type *NewTy, NFIRBuilder &Builder,
+                             std::vector<Instruction *> &InstsToErase) {
+  IRBuilderBase::InsertPointGuard Guard(Builder);
+  auto RauwBitcasts = [&](Instruction *OldValue, Value *NewValue) {
+    // If there's only one use, don't create a bitcast for any uses, since it
+    // will be immediately replaced anyways.
+    if (OldValue->hasOneUse()) {
+      OldValue->replaceAllUsesWith(UndefValue::get(OldValue->getType()));
+    } else {
+      OldValue->replaceAllUsesWith(
+          Builder.CreateBitCast(NewValue, OldValue->getType()));
+    }
+    InstsToErase.push_back(OldValue);
+    return NewValue;
+  };
+
+  if (auto *LI = dyn_cast<LoadInst>(OldValue)) {
+    Builder.SetInsertPoint(LI);
+    Value *Pointer = LI->getPointerOperand();
+    if (!Pointer->getType()->isOpaquePointerTy()) {
+      Type *NewPointerTy =
+          PointerType::get(NewTy, LI->getPointerAddressSpace());
+      Pointer = removeBitCasts(Pointer, NewPointerTy, Builder, InstsToErase);
+    }
+    LoadInst *NewLI = Builder.CreateAlignedLoad(NewTy, Pointer, LI->getAlign(),
+                                                LI->isVolatile());
+    NewLI->setOrdering(LI->getOrdering());
+    NewLI->setSyncScopeID(LI->getSyncScopeID());
+    return RauwBitcasts(LI, NewLI);
+  }
+
+  if (auto *ASCI = dyn_cast<AddrSpaceCastInst>(OldValue)) {
+    Builder.SetInsertPoint(ASCI);
+    Type *NewSrcTy = PointerType::getWithSamePointeeType(
+        cast<PointerType>(NewTy), ASCI->getSrcAddressSpace());
+    Value *Pointer = removeBitCasts(ASCI->getPointerOperand(), NewSrcTy,
+                                    Builder, InstsToErase);
+    return RauwBitcasts(ASCI, Builder.CreateAddrSpaceCast(Pointer, NewTy));
+  }
+
+  if (auto *BC = dyn_cast<BitCastInst>(OldValue)) {
+    if (BC->getSrcTy() == NewTy) {
+      if (BC->hasOneUse()) {
+        BC->replaceAllUsesWith(UndefValue::get(BC->getType()));
+        InstsToErase.push_back(BC);
       }
+      return BC->getOperand(0);
     }
+    Builder.SetInsertPoint(BC);
+    return RauwBitcasts(BC, Builder.CreateBitCast(BC->getOperand(0), NewTy));
   }
-  InstsToErase.push_back(OldInst);
-  if (!Changed)
-    OldInst->replaceAllUsesWith(NewInst);
-  return true;
+
+  report_fatal_error("Cannot translate source of bitcast instruction.");
+  return nullptr;
+}
+
+static bool isNonStdVecType(VectorType *VecTy) {
+  uint64_t NumElems = VecTy->getElementCount().getFixedValue();
+  return !isValidVectorSize(NumElems);
 }
 
 class SPIRVLowerBitCastToNonStandardTypePass
@@ -160,41 +139,82 @@ class SPIRVLowerBitCastToNonStandardTypePass
     if (Opts.isAllowedToUseExtension(ExtensionID::SPV_INTEL_vector_compute))
       return PreservedAnalyses::all();
 
-    std::vector<Instruction *> BCastsToNonStdVec;
-    std::vector<Instruction *> InstsToErase;
+    // The basic pattern we're trying to fix is this InstCombine pattern:
+    // trunc (extractelement) -> extractelement (bitcast)
+    // (Note that the bitcast itself can get propagated back to change the type
+    // of load instructions, and even through those to pointer casts, if typed
+    // pointers are enabled.)
+    std::vector<ExtractElementInst *> NonStdVecInsts;
+    SmallVector<WeakTrackingVH, 4> MaybeDeletedInsts;
     for (auto &BB : F)
       for (auto &I : BB) {
-        auto *BC = dyn_cast<BitCastInst>(&I);
-        if (!BC)
-          continue;
-        VectorType *SrcVecTy = getVectorType(BC->getSrcTy());
-        if (SrcVecTy) {
-          uint64_t NumElemsInSrcVec =
-              SrcVecTy->getElementCount().getFixedValue();
-          if (!isValidVectorSize(NumElemsInSrcVec))
-            report_fatal_error(
-                llvm::Twine("Unsupported vector type with the size of: " +
-                            std::to_string(NumElemsInSrcVec)),
-                false);
-        }
-        VectorType *DestVecTy = getVectorType(BC->getDestTy());
-        if (DestVecTy) {
-          uint64_t NumElemsInDestVec =
-              DestVecTy->getElementCount().getFixedValue();
-          if (!isValidVectorSize(NumElemsInDestVec))
-            BCastsToNonStdVec.push_back(&I);
+        if (auto *EI = dyn_cast<ExtractElementInst>(&I)) {
+          if (isNonStdVecType(EI->getVectorOperandType()))
+            NonStdVecInsts.push_back(EI);
+        } else if (auto *VT = dyn_cast<VectorType>(I.getType())) {
+          if (isNonStdVecType(VT)) {
+            MaybeDeletedInsts.push_back(&I);
+          }
         }
       }
-    IRBuilder<> Builder(F.getContext());
-    for (auto &I : BCastsToNonStdVec) {
-      Value *NewValue = I->getOperand(0);
-      VectorType *OldVecTy = getVectorType(I->getType());
-      Changed |=
-          lowerBitCastToNonStdVec(I, NewValue, OldVecTy, InstsToErase, Builder);
+
+    std::vector<Instruction *> InstsToErase;
+    NFIRBuilder Builder(F.getContext());
+    for (auto &I : NonStdVecInsts) {
+      VectorType *OldVecTy = I->getVectorOperandType();
+      unsigned OldVecSize = OldVecTy->getElementCount().getFixedValue();
+
+      // Compute the adjustment factor for the new vector size.
+      unsigned VecFactor = 2;
+      while (OldVecSize % VecFactor == 0 &&
+             !isValidVectorSize(OldVecSize / VecFactor))
+        VecFactor *= 2;
+      if (OldVecSize % VecFactor != 0) {
+        report_fatal_error(Twine("Invalid vector size for fixup: ") +
+                           Twine(OldVecSize));
+        return PreservedAnalyses::none();
+      }
+      unsigned NewElemSize = OldVecTy->getScalarSizeInBits() * VecFactor;
+      VectorType *NewVecTy =
+          VectorType::get(Type::getIntNTy(F.getContext(), NewElemSize),
+                          OldVecSize / VecFactor, false);
+
+      // Adjust the element index as appropriate.
+      uint64_t OldElemIdx =
+          cast<ConstantInt>(I->getIndexOperand())->getZExtValue();
+      uint64_t NewElemIdx = OldElemIdx / VecFactor;
+      uint64_t ShiftCount = OldElemIdx % VecFactor;
+      Builder.SetInsertPoint(I);
+      Value *NewVecOp = removeBitCasts(I->getVectorOperand(), NewVecTy, Builder,
+                                       InstsToErase);
+      Value *NewExtracted = Builder.CreateExtractElement(NewVecOp, NewElemIdx);
+
+      // If the extract does higher-order bits of the value, shift as necessary.
+      if (ShiftCount > 0)
+        NewExtracted = Builder.CreateLShr(
+            NewExtracted, ShiftCount * OldVecTy->getScalarSizeInBits());
+
+      Value *NewValue = Builder.CreateTrunc(NewExtracted, I->getType());
+      I->replaceAllUsesWith(NewValue);
+      I->eraseFromParent();
+      Changed = true;
     }
 
     for (auto *I : InstsToErase)
-      I->eraseFromParent();
+      RecursivelyDeleteTriviallyDeadInstructions(I);
+
+    // Check if there are any residual unsupported vector types.
+    for (auto &VH : MaybeDeletedInsts) {
+      // Some vector-valued instructions were replaced with undef values, so if
+      // that's what we got, it's still a dead instruction.
+      if (VH.pointsToAliveValue() && !isa<UndefValue>(VH)) {
+        auto *VT = dyn_cast<VectorType>(VH->getType());
+        report_fatal_error(Twine("Unsupported vector type with ") +
+                               Twine(VT->getElementCount().getFixedValue()) +
+                               Twine(" elements"),
+                           false);
+      }
+    }
 
     return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
   }
diff --git a/test/lower-non-standard-types-opaque.ll b/test/lower-non-standard-types-opaque.ll
@@ -0,0 +1,50 @@
+; RUN: llvm-as %s -o %t.bc
+; RUN: llvm-spirv -s %t.bc -o - | llvm-dis -o - | FileCheck %s --implicit-check-not="<6 x i32>"
+
+; CHECK: [[ASCastInst:%.*]] = addrspacecast ptr addrspace(1) @Id to ptr addrspace(4)
+; CHECK: [[LoadInst1:%.*]] = load <3 x i64>, ptr addrspace(4) [[ASCastInst]], align 32
+; CHECK: [[LoadInst2:%.*]] = load <3 x i64>, ptr addrspace(4) [[ASCastInst]], align 32
+; CHECK: [[ExtrElInst1:%.*]] = extractelement <3 x i64> [[LoadInst1]], i64 0
+; CHECK: [[TruncInst1:%.*]] = trunc i64 [[ExtrElInst1]] to i32
+; CHECK: [[ExtrElInst2:%.*]] = extractelement <3 x i64> [[LoadInst2]], i64 2
+; CHECK: [[LShrInst:%.*]] = lshr i64 [[ExtrElInst2]], 32
+; CHECK: [[TruncInst2:%.*]] = trunc i64 [[LShrInst]] to i32
+; CHECK: %conv1 = sitofp i32 [[TruncInst1]] to float
+; CHECK: %conv2 = sitofp i32 [[TruncInst2]] to float
+
+; ModuleID = 'lower-non-standard-types'
+source_filename = "lower-non-standard-types.cpp"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
+target triple = "spir64-unknown-unknown"
+
+@Id = external dso_local local_unnamed_addr addrspace(1) constant <3 x i64>, align 32
+
+; Function Attrs: convergent norecurse
+define dso_local spir_func void @vmult2() local_unnamed_addr #0 !sycl_explicit_simd !4 !intel_reqd_sub_group_size !6 {
+entry:
+  %0 = load <6 x i32>, ptr addrspace(4) addrspacecast (ptr addrspace(1) @Id to ptr addrspace(4)), align 32
+  %1 = load <6 x i32>, ptr addrspace(4) addrspacecast (ptr addrspace(1) @Id to ptr addrspace(4)), align 32
+  %2 = extractelement <6 x i32> %0, i32 0
+  %3 = extractelement <6 x i32> %1, i32 5
+  %conv1 = sitofp i32 %2 to float
+  %conv2 = sitofp i32 %3 to float
+  ret void
+}
+
+attributes #0 = { convergent norecurse "frame-pointer"="all" "min-legal-vector-width"="256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "sycl-module-id"="lower-external-funcs-with-z.cpp" }
+
+!llvm.module.flags = !{!0, !1}
+!opencl.spir.version = !{!2}
+!spirv.Source = !{!3}
+!opencl.used.extensions = !{!4}
+!opencl.used.optional.core.features = !{!4}
+!opencl.compiler.options = !{!4}
+!llvm.ident = !{!5}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"frame-pointer", i32 2}
+!2 = !{i32 1, i32 2}
+!3 = !{i32 0, i32 100000}
+!4 = !{}
+!5 = !{!"Compiler"}
+!6 = !{i32 1}
diff --git a/test/lower-non-standard-types.ll b/test/lower-non-standard-types.ll
@@ -3,9 +3,9 @@
 
 ; CHECK: [[ASCastInst:%.*]] = addrspacecast <3 x i64> addrspace(1)* @Id to <3 x i64> addrspace(4)*
 ; CHECK: [[LoadInst1:%.*]] = load <3 x i64>, <3 x i64> addrspace(4)* [[ASCastInst]], align 32
+; CHECK: [[LoadInst2:%.*]] = load <3 x i64>, <3 x i64> addrspace(4)* [[ASCastInst]], align 32
 ; CHECK: [[ExtrElInst1:%.*]] = extractelement <3 x i64> [[LoadInst1]], i64 0
 ; CHECK: [[TruncInst1:%.*]] = trunc i64 [[ExtrElInst1]] to i32
-; CHECK: [[LoadInst2:%.*]] = load <3 x i64>, <3 x i64> addrspace(4)* [[ASCastInst]], align 32
 ; CHECK: [[ExtrElInst2:%.*]] = extractelement <3 x i64> [[LoadInst2]], i64 2
 ; CHECK: [[LShrInst:%.*]] = lshr i64 [[ExtrElInst2]], 32
 ; CHECK: [[TruncInst2:%.*]] = trunc i64 [[LShrInst]] to i32
@@ -24,8 +24,8 @@ define dso_local spir_func void @vmult2() local_unnamed_addr #0 !sycl_explicit_s
 entry:
   %0 = load <6 x i32>, <6 x i32> addrspace(4)* addrspacecast (<6 x i32> addrspace(1)* bitcast (<3 x i64> addrspace(1)* @Id to <6 x i32> addrspace(1)*) to <6 x i32> addrspace(4)*), align 32
   %1 = load <6 x i32>, <6 x i32> addrspace(4)* addrspacecast (<6 x i32> addrspace(1)* bitcast (<3 x i64> addrspace(1)* @Id to <6 x i32> addrspace(1)*) to <6 x i32> addrspace(4)*), align 32
-  %2 = extractelement <6 x i32> %0, i32 1
-  %3 = extractelement <6 x i32> %1, i32 4
+  %2 = extractelement <6 x i32> %0, i32 0
+  %3 = extractelement <6 x i32> %1, i32 5
   %conv1 = sitofp i32 %2 to float
   %conv2 = sitofp i32 %3 to float
   ret void