Commit d9cc5d8
[AArch64][SVE] Combine bitcasts of predicate types with vector inserts/extracts of loads/stores
An insert subvector that inserts the result of a predicate-vector-sized load into undef at index 0, and whose result is cast to a predicate type, can be combined into a direct predicate load. The same applies to extract subvector, but in reverse.

The purpose of this optimization is to clean up cases that will be introduced in a later patch, where casts to/from predicate types via i8 types will use insert/extract subvector rather than going through memory early.

This optimization is done in SVEIntrinsicOpts rather than InstCombine so that scalable loads and stores are reintroduced as late as possible, giving other optimizations the best chance to do a good job.

Differential Revision: https://reviews.llvm.org/D106549
1 parent 478c71b commit d9cc5d8
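To make the intent of the combine concrete, here is a minimal before/after sketch of the load direction, distilled from the @pred_load_v2i8 test added below (vscale_range(1,1), so a <vscale x 16 x i1> predicate is exactly 2 bytes). The value names in the "after" form are illustrative, since the test's CHECK lines only match patterns:

; Before: fixed-width load, inserted into undef at index 0, then bitcast to a predicate type.
  %load   = load <2 x i8>, <2 x i8>* %addr, align 4
  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8> undef, <2 x i8> %load, i64 0)
  %ret    = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>

; After: a single load of the predicate type through a bitcast pointer.
  %1 = bitcast <2 x i8>* %addr to <vscale x 16 x i1>*
  %2 = load <vscale x 16 x i1>, <vscale x 16 x i1>* %1

The store direction is the mirror image: an extract from a bitcast of a predicate, stored to memory, becomes a direct <vscale x 16 x i1> store. Both combines only fire when the vscale_range attribute pins vscale to a single non-zero value, because only then is the fixed-width vector known to cover exactly one predicate register.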

3 files changed: +361 additions, -0 deletions

llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp

Lines changed: 161 additions & 0 deletions
@@ -59,6 +59,10 @@ struct SVEIntrinsicOpts : public ModulePass {
   bool coalescePTrueIntrinsicCalls(BasicBlock &BB,
                                    SmallSetVector<IntrinsicInst *, 4> &PTrues);
   bool optimizePTrueIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);
+  bool optimizePredicateStore(Instruction *I);
+  bool optimizePredicateLoad(Instruction *I);
+
+  bool optimizeInstructions(SmallSetVector<Function *, 4> &Functions);

   /// Operates at the function-scope. I.e., optimizations are applied local to
   /// the functions themselves.
@@ -276,11 +280,166 @@ bool SVEIntrinsicOpts::optimizePTrueIntrinsicCalls(
   return Changed;
 }

+// This is done in SVEIntrinsicOpts rather than InstCombine so that we introduce
+// scalable stores as late as possible
+bool SVEIntrinsicOpts::optimizePredicateStore(Instruction *I) {
+  auto *F = I->getFunction();
+  auto Attr = F->getFnAttribute(Attribute::VScaleRange);
+  if (!Attr.isValid())
+    return false;
+
+  unsigned MinVScale, MaxVScale;
+  std::tie(MinVScale, MaxVScale) = Attr.getVScaleRangeArgs();
+  // The transform needs to know the exact runtime length of scalable vectors
+  if (MinVScale != MaxVScale || MinVScale == 0)
+    return false;
+
+  auto *PredType =
+      ScalableVectorType::get(Type::getInt1Ty(I->getContext()), 16);
+  auto *FixedPredType =
+      FixedVectorType::get(Type::getInt8Ty(I->getContext()), MinVScale * 2);
+
+  // If we have a store..
+  auto *Store = dyn_cast<StoreInst>(I);
+  if (!Store || !Store->isSimple())
+    return false;
+
+  // ..that is storing a predicate vector sized worth of bits..
+  if (Store->getOperand(0)->getType() != FixedPredType)
+    return false;
+
+  // ..where the value stored comes from a vector extract..
+  auto *IntrI = dyn_cast<IntrinsicInst>(Store->getOperand(0));
+  if (!IntrI ||
+      IntrI->getIntrinsicID() != Intrinsic::experimental_vector_extract)
+    return false;
+
+  // ..that is extracting from index 0..
+  if (!cast<ConstantInt>(IntrI->getOperand(1))->isZero())
+    return false;
+
+  // ..where the value being extract from comes from a bitcast
+  auto *BitCast = dyn_cast<BitCastInst>(IntrI->getOperand(0));
+  if (!BitCast)
+    return false;
+
+  // ..and the bitcast is casting from predicate type
+  if (BitCast->getOperand(0)->getType() != PredType)
+    return false;
+
+  IRBuilder<> Builder(I->getContext());
+  Builder.SetInsertPoint(I);
+
+  auto *PtrBitCast = Builder.CreateBitCast(
+      Store->getPointerOperand(),
+      PredType->getPointerTo(Store->getPointerAddressSpace()));
+  Builder.CreateStore(BitCast->getOperand(0), PtrBitCast);
+
+  Store->eraseFromParent();
+  if (IntrI->getNumUses() == 0)
+    IntrI->eraseFromParent();
+  if (BitCast->getNumUses() == 0)
+    BitCast->eraseFromParent();
+
+  return true;
+}
+
+// This is done in SVEIntrinsicOpts rather than InstCombine so that we introduce
+// scalable loads as late as possible
+bool SVEIntrinsicOpts::optimizePredicateLoad(Instruction *I) {
+  auto *F = I->getFunction();
+  auto Attr = F->getFnAttribute(Attribute::VScaleRange);
+  if (!Attr.isValid())
+    return false;
+
+  unsigned MinVScale, MaxVScale;
+  std::tie(MinVScale, MaxVScale) = Attr.getVScaleRangeArgs();
+  // The transform needs to know the exact runtime length of scalable vectors
+  if (MinVScale != MaxVScale || MinVScale == 0)
+    return false;
+
+  auto *PredType =
+      ScalableVectorType::get(Type::getInt1Ty(I->getContext()), 16);
+  auto *FixedPredType =
+      FixedVectorType::get(Type::getInt8Ty(I->getContext()), MinVScale * 2);
+
+  // If we have a bitcast..
+  auto *BitCast = dyn_cast<BitCastInst>(I);
+  if (!BitCast || BitCast->getType() != PredType)
+    return false;
+
+  // ..whose operand is a vector_insert..
+  auto *IntrI = dyn_cast<IntrinsicInst>(BitCast->getOperand(0));
+  if (!IntrI ||
+      IntrI->getIntrinsicID() != Intrinsic::experimental_vector_insert)
+    return false;
+
+  // ..that is inserting into index zero of an undef vector..
+  if (!isa<UndefValue>(IntrI->getOperand(0)) ||
+      !cast<ConstantInt>(IntrI->getOperand(2))->isZero())
+    return false;
+
+  // ..where the value inserted comes from a load..
+  auto *Load = dyn_cast<LoadInst>(IntrI->getOperand(1));
+  if (!Load || !Load->isSimple())
+    return false;
+
+  // ..that is loading a predicate vector sized worth of bits..
+  if (Load->getType() != FixedPredType)
+    return false;
+
+  IRBuilder<> Builder(I->getContext());
+  Builder.SetInsertPoint(Load);
+
+  auto *PtrBitCast = Builder.CreateBitCast(
+      Load->getPointerOperand(),
+      PredType->getPointerTo(Load->getPointerAddressSpace()));
+  auto *LoadPred = Builder.CreateLoad(PredType, PtrBitCast);
+
+  BitCast->replaceAllUsesWith(LoadPred);
+  BitCast->eraseFromParent();
+  if (IntrI->getNumUses() == 0)
+    IntrI->eraseFromParent();
+  if (Load->getNumUses() == 0)
+    Load->eraseFromParent();
+
+  return true;
+}
+
+bool SVEIntrinsicOpts::optimizeInstructions(
+    SmallSetVector<Function *, 4> &Functions) {
+  bool Changed = false;
+
+  for (auto *F : Functions) {
+    DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>(*F).getDomTree();
+
+    // Traverse the DT with an rpo walk so we see defs before uses, allowing
+    // simplification to be done incrementally.
+    BasicBlock *Root = DT->getRoot();
+    ReversePostOrderTraversal<BasicBlock *> RPOT(Root);
+    for (auto *BB : RPOT) {
+      for (Instruction &I : make_early_inc_range(*BB)) {
+        switch (I.getOpcode()) {
+        case Instruction::Store:
+          Changed |= optimizePredicateStore(&I);
+          break;
+        case Instruction::BitCast:
+          Changed |= optimizePredicateLoad(&I);
+          break;
+        }
+      }
+    }
+  }
+
+  return Changed;
+}
+
 bool SVEIntrinsicOpts::optimizeFunctions(
     SmallSetVector<Function *, 4> &Functions) {
   bool Changed = false;

   Changed |= optimizePTrueIntrinsicCalls(Functions);
+  Changed |= optimizeInstructions(Functions);

   return Changed;
 }
@@ -297,6 +456,8 @@ bool SVEIntrinsicOpts::runOnModule(Module &M) {
       continue;

     switch (F.getIntrinsicID()) {
+    case Intrinsic::experimental_vector_extract:
+    case Intrinsic::experimental_vector_insert:
     case Intrinsic::aarch64_sve_ptrue:
       for (User *U : F.users())
         Functions.insert(cast<Instruction>(U)->getFunction());
Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@ (new test file)
; RUN: opt -S -aarch64-sve-intrinsic-opts < %s | FileCheck %s

target triple = "aarch64-unknown-linux-gnu"

define void @pred_store_v2i8(<vscale x 16 x i1> %pred, <2 x i8>* %addr) #0 {
; CHECK-LABEL: @pred_store_v2i8(
; CHECK-NEXT:  [[TMP1:%.*]] = bitcast <2 x i8>* %addr to <vscale x 16 x i1>*
; CHECK-NEXT:  store <vscale x 16 x i1> %pred, <vscale x 16 x i1>* [[TMP1]]
; CHECK-NEXT:  ret void
  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
  %extract = tail call <2 x i8> @llvm.experimental.vector.extract.v2i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
  store <2 x i8> %extract, <2 x i8>* %addr, align 4
  ret void
}

define void @pred_store_v4i8(<vscale x 16 x i1> %pred, <4 x i8>* %addr) #1 {
; CHECK-LABEL: @pred_store_v4i8(
; CHECK-NEXT:  [[TMP1:%.*]] = bitcast <4 x i8>* %addr to <vscale x 16 x i1>*
; CHECK-NEXT:  store <vscale x 16 x i1> %pred, <vscale x 16 x i1>* [[TMP1]]
; CHECK-NEXT:  ret void
  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
  %extract = tail call <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
  store <4 x i8> %extract, <4 x i8>* %addr, align 4
  ret void
}

define void @pred_store_v8i8(<vscale x 16 x i1> %pred, <8 x i8>* %addr) #2 {
; CHECK-LABEL: @pred_store_v8i8(
; CHECK-NEXT:  [[TMP1:%.*]] = bitcast <8 x i8>* %addr to <vscale x 16 x i1>*
; CHECK-NEXT:  store <vscale x 16 x i1> %pred, <vscale x 16 x i1>* [[TMP1]]
; CHECK-NEXT:  ret void
  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
  %extract = tail call <8 x i8> @llvm.experimental.vector.extract.v8i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
  store <8 x i8> %extract, <8 x i8>* %addr, align 4
  ret void
}


; Check that too small of a vscale prevents optimization
define void @pred_store_neg1(<vscale x 16 x i1> %pred, <4 x i8>* %addr) #0 {
; CHECK-LABEL: @pred_store_neg1(
; CHECK: call <4 x i8> @llvm.experimental.vector.extract
  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
  %extract = tail call <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
  store <4 x i8> %extract, <4 x i8>* %addr, align 4
  ret void
}

; Check that too large of a vscale prevents optimization
define void @pred_store_neg2(<vscale x 16 x i1> %pred, <4 x i8>* %addr) #2 {
; CHECK-LABEL: @pred_store_neg2(
; CHECK: call <4 x i8> @llvm.experimental.vector.extract
  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
  %extract = tail call <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
  store <4 x i8> %extract, <4 x i8>* %addr, align 4
  ret void
}

; Check that a non-zero index prevents optimization
define void @pred_store_neg3(<vscale x 16 x i1> %pred, <4 x i8>* %addr) #1 {
; CHECK-LABEL: @pred_store_neg3(
; CHECK: call <4 x i8> @llvm.experimental.vector.extract
  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
  %extract = tail call <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 4)
  store <4 x i8> %extract, <4 x i8>* %addr, align 4
  ret void
}

; Check that differing vscale min/max prevents optimization
define void @pred_store_neg4(<vscale x 16 x i1> %pred, <4 x i8>* %addr) #3 {
; CHECK-LABEL: @pred_store_neg4(
; CHECK: call <4 x i8> @llvm.experimental.vector.extract
  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
  %extract = tail call <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
  store <4 x i8> %extract, <4 x i8>* %addr, align 4
  ret void
}

declare <2 x i8> @llvm.experimental.vector.extract.v2i8.nxv2i8(<vscale x 2 x i8>, i64)
declare <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8>, i64)
declare <8 x i8> @llvm.experimental.vector.extract.v8i8.nxv2i8(<vscale x 2 x i8>, i64)

attributes #0 = { "target-features"="+sve" vscale_range(1,1) }
attributes #1 = { "target-features"="+sve" vscale_range(2,2) }
attributes #2 = { "target-features"="+sve" vscale_range(4,4) }
attributes #3 = { "target-features"="+sve" vscale_range(2,4) }
Lines changed: 114 additions & 0 deletions
@@ -0,0 +1,114 @@ (new test file)
; RUN: opt -S -aarch64-sve-intrinsic-opts < %s | FileCheck %s

target triple = "aarch64-unknown-linux-gnu"

define <vscale x 16 x i1> @pred_load_v2i8(<2 x i8>* %addr) #0 {
; CHECK-LABEL: @pred_load_v2i8(
; CHECK-NEXT:  [[TMP1:%.*]] = bitcast <2 x i8>* %addr to <vscale x 16 x i1>*
; CHECK-NEXT:  [[TMP2:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP1]]
; CHECK-NEXT:  ret <vscale x 16 x i1> [[TMP2]]
  %load = load <2 x i8>, <2 x i8>* %addr, align 4
  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8> undef, <2 x i8> %load, i64 0)
  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
  ret <vscale x 16 x i1> %ret
}

define <vscale x 16 x i1> @pred_load_v4i8(<4 x i8>* %addr) #1 {
; CHECK-LABEL: @pred_load_v4i8(
; CHECK-NEXT:  [[TMP1:%.*]] = bitcast <4 x i8>* %addr to <vscale x 16 x i1>*
; CHECK-NEXT:  [[TMP2:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP1]]
; CHECK-NEXT:  ret <vscale x 16 x i1> [[TMP2]]
  %load = load <4 x i8>, <4 x i8>* %addr, align 4
  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
  ret <vscale x 16 x i1> %ret
}

define <vscale x 16 x i1> @pred_load_v8i8(<8 x i8>* %addr) #2 {
; CHECK-LABEL: @pred_load_v8i8(
; CHECK-NEXT:  [[TMP1:%.*]] = bitcast <8 x i8>* %addr to <vscale x 16 x i1>*
; CHECK-NEXT:  [[TMP2:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP1]]
; CHECK-NEXT:  ret <vscale x 16 x i1> [[TMP2]]
  %load = load <8 x i8>, <8 x i8>* %addr, align 4
  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8> undef, <8 x i8> %load, i64 0)
  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
  ret <vscale x 16 x i1> %ret
}

; Ensure the insertion point is at the load
define <vscale x 16 x i1> @pred_load_insertion_point(<2 x i8>* %addr) #0 {
; CHECK-LABEL: @pred_load_insertion_point(
; CHECK-NEXT:  entry:
; CHECK-NEXT:  [[TMP1:%.*]] = bitcast <2 x i8>* %addr to <vscale x 16 x i1>*
; CHECK-NEXT:  [[TMP2:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP1]]
; CHECK-NEXT:  br label %bb1
; CHECK:       bb1:
; CHECK-NEXT:  ret <vscale x 16 x i1> [[TMP2]]
entry:
  %load = load <2 x i8>, <2 x i8>* %addr, align 4
  br label %bb1

bb1:
  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8> undef, <2 x i8> %load, i64 0)
  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
  ret <vscale x 16 x i1> %ret
}

; Check that too small of a vscale prevents optimization
define <vscale x 16 x i1> @pred_load_neg1(<4 x i8>* %addr) #0 {
; CHECK-LABEL: @pred_load_neg1(
; CHECK: call <vscale x 2 x i8> @llvm.experimental.vector.insert
  %load = load <4 x i8>, <4 x i8>* %addr, align 4
  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
  ret <vscale x 16 x i1> %ret
}

; Check that too large of a vscale prevents optimization
define <vscale x 16 x i1> @pred_load_neg2(<4 x i8>* %addr) #2 {
; CHECK-LABEL: @pred_load_neg2(
; CHECK: call <vscale x 2 x i8> @llvm.experimental.vector.insert
  %load = load <4 x i8>, <4 x i8>* %addr, align 4
  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
  ret <vscale x 16 x i1> %ret
}

; Check that a non-zero index prevents optimization
define <vscale x 16 x i1> @pred_load_neg3(<4 x i8>* %addr) #1 {
; CHECK-LABEL: @pred_load_neg3(
; CHECK: call <vscale x 2 x i8> @llvm.experimental.vector.insert
  %load = load <4 x i8>, <4 x i8>* %addr, align 4
  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 4)
  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
  ret <vscale x 16 x i1> %ret
}

; Check that differing vscale min/max prevents optimization
define <vscale x 16 x i1> @pred_load_neg4(<4 x i8>* %addr) #3 {
; CHECK-LABEL: @pred_load_neg4(
; CHECK: call <vscale x 2 x i8> @llvm.experimental.vector.insert
  %load = load <4 x i8>, <4 x i8>* %addr, align 4
  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
  ret <vscale x 16 x i1> %ret
}

; Check that insertion into a non-undef vector prevents optimization
define <vscale x 16 x i1> @pred_load_neg5(<4 x i8>* %addr, <vscale x 2 x i8> %passthru) #1 {
; CHECK-LABEL: @pred_load_neg5(
; CHECK: call <vscale x 2 x i8> @llvm.experimental.vector.insert
  %load = load <4 x i8>, <4 x i8>* %addr, align 4
  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> %passthru, <4 x i8> %load, i64 0)
  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
  ret <vscale x 16 x i1> %ret
}

declare <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8>, <2 x i8>, i64)
declare <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8>, <4 x i8>, i64)
declare <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8>, <8 x i8>, i64)

attributes #0 = { "target-features"="+sve" vscale_range(1,1) }
attributes #1 = { "target-features"="+sve" vscale_range(2,2) }
attributes #2 = { "target-features"="+sve" vscale_range(4,4) }
attributes #3 = { "target-features"="+sve" vscale_range(2,4) }
