Revert "[InstCombine] Remove PromoteCastOfAllocation() fold (NFC)"

againull · againull · commit 16592a3eb9d0 · 2023-04-13T16:38:40.000-04:00
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -25,6 +25,168 @@ using namespace PatternMatch;
 
 #define DEBUG_TYPE "instcombine"
 
+#ifndef INTEL_SYCL_OPAQUEPOINTER_READY
+/// Analyze 'Val', seeing if it is a simple linear expression.
+/// If so, decompose it, returning some value X, such that Val is
+/// X*Scale+Offset.
+///
+static Value *decomposeSimpleLinearExpr(Value *Val, unsigned &Scale,
+                                        uint64_t &Offset) {
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
+    Offset = CI->getZExtValue();
+    Scale  = 0;
+    return ConstantInt::get(Val->getType(), 0);
+  }
+
+  if (BinaryOperator *I = dyn_cast<BinaryOperator>(Val)) {
+    // Cannot look past anything that might overflow.
+    // We specifically require nuw because we store the Scale in an unsigned
+    // and perform an unsigned divide on it.
+    OverflowingBinaryOperator *OBI = dyn_cast<OverflowingBinaryOperator>(Val);
+    if (OBI && !OBI->hasNoUnsignedWrap()) {
+      Scale = 1;
+      Offset = 0;
+      return Val;
+    }
+
+    if (ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      if (I->getOpcode() == Instruction::Shl) {
+        // This is a value scaled by '1 << the shift amt'.
+        Scale = UINT64_C(1) << RHS->getZExtValue();
+        Offset = 0;
+        return I->getOperand(0);
+      }
+
+      if (I->getOpcode() == Instruction::Mul) {
+        // This value is scaled by 'RHS'.
+        Scale = RHS->getZExtValue();
+        Offset = 0;
+        return I->getOperand(0);
+      }
+
+      if (I->getOpcode() == Instruction::Add) {
+        // We have X+C.  Check to see if we really have (X*C2)+C1,
+        // where C1 is divisible by C2.
+        unsigned SubScale;
+        Value *SubVal =
+          decomposeSimpleLinearExpr(I->getOperand(0), SubScale, Offset);
+        Offset += RHS->getZExtValue();
+        Scale = SubScale;
+        return SubVal;
+      }
+    }
+  }
+
+  // Otherwise, we can't look past this.
+  Scale = 1;
+  Offset = 0;
+  return Val;
+}
+
+/// If we find a cast of an allocation instruction, try to eliminate the cast by
+/// moving the type information into the alloc.
+Instruction *InstCombinerImpl::PromoteCastOfAllocation(BitCastInst &CI,
+                                                       AllocaInst &AI) {
+  PointerType *PTy = cast<PointerType>(CI.getType());
+  // Opaque pointers don't have an element type we could replace with.
+  if (PTy->isOpaque())
+    return nullptr;
+
+  IRBuilderBase::InsertPointGuard Guard(Builder);
+  Builder.SetInsertPoint(&AI);
+
+  // Get the type really allocated and the type casted to.
+  Type *AllocElTy = AI.getAllocatedType();
+  Type *CastElTy = PTy->getNonOpaquePointerElementType();
+  if (!AllocElTy->isSized() || !CastElTy->isSized()) return nullptr;
+
+  // This optimisation does not work for cases where the cast type
+  // is scalable and the allocated type is not. This because we need to
+  // know how many times the casted type fits into the allocated type.
+  // For the opposite case where the allocated type is scalable and the
+  // cast type is not this leads to poor code quality due to the
+  // introduction of 'vscale' into the calculations. It seems better to
+  // bail out for this case too until we've done a proper cost-benefit
+  // analysis.
+  bool AllocIsScalable = isa<ScalableVectorType>(AllocElTy);
+  bool CastIsScalable = isa<ScalableVectorType>(CastElTy);
+  if (AllocIsScalable != CastIsScalable) return nullptr;
+
+  Align AllocElTyAlign = DL.getABITypeAlign(AllocElTy);
+  Align CastElTyAlign = DL.getABITypeAlign(CastElTy);
+  if (CastElTyAlign < AllocElTyAlign) return nullptr;
+
+  // If the allocation has multiple uses, only promote it if we are strictly
+  // increasing the alignment of the resultant allocation.  If we keep it the
+  // same, we open the door to infinite loops of various kinds.
+  if (!AI.hasOneUse() && CastElTyAlign == AllocElTyAlign) return nullptr;
+
+  // The alloc and cast types should be either both fixed or both scalable.
+  uint64_t AllocElTySize = DL.getTypeAllocSize(AllocElTy).getKnownMinValue();
+  uint64_t CastElTySize = DL.getTypeAllocSize(CastElTy).getKnownMinValue();
+  if (CastElTySize == 0 || AllocElTySize == 0) return nullptr;
+
+  // If the allocation has multiple uses, only promote it if we're not
+  // shrinking the amount of memory being allocated.
+  uint64_t AllocElTyStoreSize =
+      DL.getTypeStoreSize(AllocElTy).getKnownMinValue();
+  uint64_t CastElTyStoreSize = DL.getTypeStoreSize(CastElTy).getKnownMinValue();
+  if (!AI.hasOneUse() && CastElTyStoreSize < AllocElTyStoreSize) return nullptr;
+
+  // See if we can satisfy the modulus by pulling a scale out of the array
+  // size argument.
+  unsigned ArraySizeScale;
+  uint64_t ArrayOffset;
+  Value *NumElements = // See if the array size is a decomposable linear expr.
+    decomposeSimpleLinearExpr(AI.getOperand(0), ArraySizeScale, ArrayOffset);
+
+  // If we can now satisfy the modulus, by using a non-1 scale, we really can
+  // do the xform.
+  if ((AllocElTySize*ArraySizeScale) % CastElTySize != 0 ||
+      (AllocElTySize*ArrayOffset   ) % CastElTySize != 0) return nullptr;
+
+  // We don't currently support arrays of scalable types.
+  assert(!AllocIsScalable || (ArrayOffset == 1 && ArraySizeScale == 0));
+
+  unsigned Scale = (AllocElTySize*ArraySizeScale)/CastElTySize;
+  Value *Amt = nullptr;
+  if (Scale == 1) {
+    Amt = NumElements;
+  } else {
+    Amt = ConstantInt::get(AI.getArraySize()->getType(), Scale);
+    // Insert before the alloca, not before the cast.
+    Amt = Builder.CreateMul(Amt, NumElements);
+  }
+
+  if (uint64_t Offset = (AllocElTySize*ArrayOffset)/CastElTySize) {
+    Value *Off = ConstantInt::get(AI.getArraySize()->getType(),
+                                  Offset, true);
+    Amt = Builder.CreateAdd(Amt, Off);
+  }
+
+  AllocaInst *New = Builder.CreateAlloca(CastElTy, AI.getAddressSpace(), Amt);
+  New->setAlignment(AI.getAlign());
+  New->takeName(&AI);
+  New->setUsedWithInAlloca(AI.isUsedWithInAlloca());
+  New->setMetadata(LLVMContext::MD_DIAssignID,
+                   AI.getMetadata(LLVMContext::MD_DIAssignID));
+
+  replaceAllDbgUsesWith(AI, *New, *New, DT);
+
+  // If the allocation has multiple real uses, insert a cast and change all
+  // things that used it to use the new cast.  This will also hack on CI, but it
+  // will die soon.
+  if (!AI.hasOneUse()) {
+    // New is the allocation instruction, pointer typed. AI is the original
+    // allocation instruction, also pointer typed. Thus, cast to use is BitCast.
+    Value *NewCast = Builder.CreateBitCast(New, AI.getType(), "tmpcast");
+    replaceInstUsesWith(AI, NewCast);
+    eraseInstFromFunction(AI);
+  }
+  return replaceInstUsesWith(CI, New);
+}
+#endif // INTEL_SYCL_OPAQUEPOINTER_READY
+
 /// Given an expression that CanEvaluateTruncated or CanEvaluateSExtd returns
 /// true for, actually insert the code to evaluate the expression.
 Value *InstCombinerImpl::EvaluateInDifferentType(Value *V, Type *Ty,
@@ -2634,9 +2796,18 @@ Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) {
     return replaceInstUsesWith(CI, Src);
 
 #ifndef INTEL_SYCL_OPAQUEPOINTER_READY
-  if (isa<PointerType>(SrcTy) && isa<PointerType>(DestTy))
+  if (isa<PointerType>(SrcTy) && isa<PointerType>(DestTy)) {
+    // If we are casting a alloca to a pointer to a type of the same
+    // size, rewrite the allocation instruction to allocate the "right" type.
+    // There is no need to modify malloc calls because it is their bitcast that
+    // needs to be cleaned up.
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(Src))
+      if (Instruction *V = PromoteCastOfAllocation(CI, *AI))
+        return V;
+
     if (Instruction *I = convertBitCastToGEP(CI, Builder, DL))
       return I;
+  }
 #endif // INTEL_SYCL_OPAQUEPOINTER_READY
 
   if (FixedVectorType *DestVTy = dyn_cast<FixedVectorType>(DestTy)) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -642,6 +642,9 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
 
   Value *insertRangeTest(Value *V, const APInt &Lo, const APInt &Hi,
                          bool isSigned, bool Inside);
+#ifndef INTEL_SYCL_OPAQUEPOINTER_READY
+  Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI);
+#endif // INTEL_SYCL_OPAQUEPOINTER_READY
   bool mergeStoreIntoSuccessor(StoreInst &SI);
 
   /// Given an initial instruction, check to see if it is the root of a
diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/intel-alloca-bitcast.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/intel-alloca-bitcast.ll
@@ -0,0 +1,75 @@
+; RUN: opt -opaque-pointers=0 -passes=instcombine -S %s -o - \
+; RUN: | FileCheck %s
+
+;; NOTE: This test uses typed pointers because it is testing a code path that
+;; doesn't get exercised with opaque pointers. If/when PromoteCastOfAllocation
+;; is removed from visitBitCast this test should just be deleted.
+
+;; Check that allocas generated in InstCombine's PromoteCastOfAllocation
+;; have DIAssignID copied from the original alloca.
+;;
+;; $ cat reduce.cpp
+;; struct c {
+;;   c(int);
+;;   int a, b;
+;; };
+;; c d() {
+;;   c e(1);
+;;   return e;
+;; }
+;; $ clang -O2 -c -g reduce.cpp -fno-inline -Xclang -disable-llvm-passes -emit-llvm -S \
+;;   | opt -opaque-pointers=0 -passes=declare-to-assign -S
+
+; CHECK: entry:
+; CHECK-NEXT: %retval = alloca i64, align 8, !DIAssignID ![[ID:[0-9]+]]
+; CHECK-NEXT: %tmpcast = bitcast i64* %retval to %struct.c*
+; CHECK-NEXT: call void @llvm.dbg.assign(metadata i1 undef, metadata ![[e:[0-9]+]], metadata !DIExpression(), metadata ![[ID]], metadata i64* %retval, metadata !DIExpression()), !dbg
+; CHECK: ![[e]] = !DILocalVariable(name: "e",
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
+%struct.c = type { i32, i32 }
+
+define dso_local i64 @_Z1dv() !dbg !7 {
+entry:
+  %retval = alloca %struct.c, align 4, !DIAssignID !21
+  call void @llvm.dbg.assign(metadata i1 undef, metadata !20, metadata !DIExpression(), metadata !21, metadata %struct.c* %retval, metadata !DIExpression()), !dbg !22
+  call void @_ZN1cC1Ei(%struct.c* %retval, i32 1), !dbg !23
+  %0 = bitcast %struct.c* %retval to i64*, !dbg !24
+  %1 = load i64, i64* %0, align 4, !dbg !24
+  ret i64 %1, !dbg !24
+}
+
+declare dso_local void @_ZN1cC1Ei(%struct.c*, i32) unnamed_addr
+declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !1000}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 12.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "reduce.cpp", directory: "/")
+!2 = !{}
+!3 = !{i32 7, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 12.0.0"}
+!7 = distinct !DISubprogram(name: "d", linkageName: "_Z1dv", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !19)
+!8 = !DISubroutineType(types: !9)
+!9 = !{!10}
+!10 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "c", file: !1, line: 1, size: 64, flags: DIFlagTypePassByValue | DIFlagNonTrivial, elements: !11, identifier: "_ZTS1c")
+!11 = !{!12, !14, !15}
+!12 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !10, file: !1, line: 3, baseType: !13, size: 32)
+!13 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!14 = !DIDerivedType(tag: DW_TAG_member, name: "b", scope: !10, file: !1, line: 3, baseType: !13, size: 32, offset: 32)
+!15 = !DISubprogram(name: "c", scope: !10, file: !1, line: 2, type: !16, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!16 = !DISubroutineType(types: !17)
+!17 = !{null, !18, !13}
+!18 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
+!19 = !{!20}
+!20 = !DILocalVariable(name: "e", scope: !7, file: !1, line: 6, type: !10)
+!21 = distinct !DIAssignID()
+!22 = !DILocation(line: 0, scope: !7)
+!23 = !DILocation(line: 6, column: 5, scope: !7)
+!24 = !DILocation(line: 7, column: 3, scope: !7)
+!1000 = !{i32 7, !"debug-info-assignment-tracking", i1 true}