[SVE] Fix isLoadInvariantInLoop for scalable vectors

david-arm · david-arm · commit 69cccb3189d6 · 2020-09-15T08:30:19.000+01:00
I've amended the isLoadInvariantInLoop function to bail out for scalable vectors for now since the invariant.start intrinsic is only ever generated by the clang frontend for thread locals or struct and class constructors, neither of which support sizeless types. In addition, the intrinsic itself does not currently support the concept of a scaled size, which makes it impossible to compare the sizes of different scalable objects, e.g. <vscale x 32 x i8> and <vscale x 16 x i8>. Added new tests here: Transforms/LICM/AArch64/sve-load-hoist.ll Transforms/LICM/hoisting.ll Differential Revision: https://reviews.llvm.org/D87227
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
@@ -5010,6 +5010,14 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
     Assert(Size % 16 == 0, "bswap must be an even number of bytes", &Call);
     break;
   }
+  case Intrinsic::invariant_start: {
+    ConstantInt *InvariantSize = dyn_cast<ConstantInt>(Call.getArgOperand(0));
+    Assert(InvariantSize &&
+               (!InvariantSize->isNegative() || InvariantSize->isMinusOne()),
+           "invariant_start parameter must be -1, 0 or a positive number",
+           &Call);
+    break;
+  }
   case Intrinsic::matrix_multiply:
   case Intrinsic::matrix_transpose:
   case Intrinsic::matrix_column_major_load:
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -940,7 +940,19 @@ static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
                                   Loop *CurLoop) {
   Value *Addr = LI->getOperand(0);
   const DataLayout &DL = LI->getModule()->getDataLayout();
-  const uint32_t LocSizeInBits = DL.getTypeSizeInBits(LI->getType());
+  const TypeSize LocSizeInBits = DL.getTypeSizeInBits(LI->getType());
+
+  // It is not currently possible for clang to generate an invariant.start
+  // intrinsic with scalable vector types because we don't support thread local
+  // sizeless types and we don't permit sizeless types in structs or classes.
+  // Furthermore, even if support is added for this in future the intrinsic
+  // itself is defined to have a size of -1 for variable sized objects. This
+  // makes it impossible to verify if the intrinsic envelops our region of
+  // interest. For example, both <vscale x 32 x i8> and <vscale x 16 x i8>
+  // types would have a -1 parameter, but the former is clearly double the size
+  // of the latter.
+  if (LocSizeInBits.isScalable())
+    return false;
 
   // if the type is i8 addrspace(x)*, we know this is the type of
   // llvm.invariant.start operand
@@ -970,13 +982,17 @@ static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
     if (!II || II->getIntrinsicID() != Intrinsic::invariant_start ||
         !II->use_empty())
       continue;
-    unsigned InvariantSizeInBits =
-        cast<ConstantInt>(II->getArgOperand(0))->getSExtValue() * 8;
+    ConstantInt *InvariantSize = cast<ConstantInt>(II->getArgOperand(0));
+    // The intrinsic supports having a -1 argument for variable sized objects
+    // so we should check for that here.
+    if (InvariantSize->isNegative())
+      continue;
+    uint64_t InvariantSizeInBits = InvariantSize->getSExtValue() * 8;
     // Confirm the invariant.start location size contains the load operand size
     // in bits. Also, the invariant.start should dominate the load, and we
     // should not hoist the load out of a loop that contains this dominating
     // invariant.start.
-    if (LocSizeInBits <= InvariantSizeInBits &&
+    if (LocSizeInBits.getFixedSize() <= InvariantSizeInBits &&
         DT->properlyDominates(II->getParent(), CurLoop->getHeader()))
       return true;
   }
diff --git a/llvm/test/Transforms/LICM/AArch64/lit.local.cfg b/llvm/test/Transforms/LICM/AArch64/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'AArch64' in config.root.targets:
+    config.unsupported = True
diff --git a/llvm/test/Transforms/LICM/AArch64/sve-load-hoist.ll b/llvm/test/Transforms/LICM/AArch64/sve-load-hoist.ll
@@ -0,0 +1,30 @@
+; RUN: opt -licm -mtriple aarch64-linux-gnu -mattr=+sve -S < %s | FileCheck %s
+
+define void @no_hoist_load1_nxv2i64(<vscale x 2 x i64>* %out, i8* %in8, i32 %n) {
+; CHECK-LABEL: @no_hoist_load1_nxv2i64(
+; CHECK: entry:
+; CHECK-NOT: load
+; CHECK: for.body:
+; CHECK: load
+entry:
+  %cmp0 = icmp ugt i32 %n, 0
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 16, i8* %in8)
+  %in = bitcast i8* %in8 to <vscale x 2 x i64>*
+  br i1 %cmp0, label %for.body, label %for.end
+
+for.body:
+  %i = phi i32 [0, %entry], [%inc, %for.body]
+  %i2 = zext i32 %i to i64
+  %ptr = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %out, i64 %i2
+  %val = load <vscale x 2 x i64>, <vscale x 2 x i64>* %in, align 16
+  store <vscale x 2 x i64> %val, <vscale x 2 x i64>* %ptr, align 16
+  %inc = add nuw nsw i32 %i, 1
+  %cmp = icmp ult i32 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) nounwind readonly
+
diff --git a/llvm/test/Transforms/LICM/hoisting.ll b/llvm/test/Transforms/LICM/hoisting.ll
@@ -360,3 +360,36 @@ loop:
 loopexit:
   ret i32 %sum
 }
+
+; We can't hoist the invariant load out of the loop because
+; the marker is given a variable size (-1).
+define i32 @test_fence5(i8* %addr, i32 %n, i8* %volatile) {
+; CHECK-LABEL: @test_fence5
+; CHECK-LABEL: entry
+; CHECK: invariant.start
+; CHECK-NOT: %addrld = load atomic i32, i32* %addr.i unordered, align 8
+; CHECK: br label %loop
+entry:
+  %gep = getelementptr inbounds i8, i8* %addr, i64 8
+  %addr.i = bitcast i8* %gep to i32 *
+  store atomic i32 5, i32 * %addr.i unordered, align 8
+  fence release
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 -1, i8* %gep)
+  br label %loop
+
+loop:
+  %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
+  %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
+  %volload = load atomic i8, i8* %volatile unordered, align 8
+  fence acquire
+  %volchk = icmp eq i8 %volload, 0
+  %addrld = load atomic i32, i32* %addr.i unordered, align 8
+  %sel = select i1 %volchk, i32 0, i32 %addrld
+  %sum.next = add i32 %sel, %sum
+  %indvar.next = add i32 %indvar, 1
+  %cond = icmp slt i32 %indvar.next, %n
+  br i1 %cond, label %loop, label %loopexit
+
+loopexit:
+  ret i32 %sum
+}

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+if not 'AArch64' in config.root.targets:`
	`2`	`+ config.unsupported = True`