Skip to content

Commit 69cccb3

Browse files
committed
[SVE] Fix isLoadInvariantInLoop for scalable vectors
I've amended the isLoadInvariantInLoop function to bail out for scalable vectors for now, since the invariant.start intrinsic is only ever generated by the clang frontend for thread locals or struct and class constructors, neither of which supports sizeless types. In addition, the intrinsic itself does not currently support the concept of a scaled size, which makes it impossible to compare the sizes of different scalable objects, e.g. <vscale x 32 x i8> and <vscale x 16 x i8>.

Added new tests here:
  Transforms/LICM/AArch64/sve-load-hoist.ll
  Transforms/LICM/hoisting.ll

Differential Revision: https://reviews.llvm.org/D87227
1 parent 943b0c8 commit 69cccb3

File tree

5 files changed

+93
-4
lines changed

5 files changed

+93
-4
lines changed

llvm/lib/IR/Verifier.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5010,6 +5010,14 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
50105010
Assert(Size % 16 == 0, "bswap must be an even number of bytes", &Call);
50115011
break;
50125012
}
5013+
case Intrinsic::invariant_start: {
5014+
ConstantInt *InvariantSize = dyn_cast<ConstantInt>(Call.getArgOperand(0));
5015+
Assert(InvariantSize &&
5016+
(!InvariantSize->isNegative() || InvariantSize->isMinusOne()),
5017+
"invariant_start parameter must be -1, 0 or a positive number",
5018+
&Call);
5019+
break;
5020+
}
50135021
case Intrinsic::matrix_multiply:
50145022
case Intrinsic::matrix_transpose:
50155023
case Intrinsic::matrix_column_major_load:

llvm/lib/Transforms/Scalar/LICM.cpp

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -940,7 +940,19 @@ static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
940940
Loop *CurLoop) {
941941
Value *Addr = LI->getOperand(0);
942942
const DataLayout &DL = LI->getModule()->getDataLayout();
943-
const uint32_t LocSizeInBits = DL.getTypeSizeInBits(LI->getType());
943+
const TypeSize LocSizeInBits = DL.getTypeSizeInBits(LI->getType());
944+
945+
// It is not currently possible for clang to generate an invariant.start
946+
// intrinsic with scalable vector types because we don't support thread local
947+
// sizeless types and we don't permit sizeless types in structs or classes.
948+
// Furthermore, even if support is added for this in future the intrinsic
949+
// itself is defined to have a size of -1 for variable sized objects. This
950+
// makes it impossible to verify if the intrinsic envelops our region of
951+
// interest. For example, both <vscale x 32 x i8> and <vscale x 16 x i8>
952+
// types would have a -1 parameter, but the former is clearly double the size
953+
// of the latter.
954+
if (LocSizeInBits.isScalable())
955+
return false;
944956

945957
// if the type is i8 addrspace(x)*, we know this is the type of
946958
// llvm.invariant.start operand
@@ -970,13 +982,17 @@ static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
970982
if (!II || II->getIntrinsicID() != Intrinsic::invariant_start ||
971983
!II->use_empty())
972984
continue;
973-
unsigned InvariantSizeInBits =
974-
cast<ConstantInt>(II->getArgOperand(0))->getSExtValue() * 8;
985+
ConstantInt *InvariantSize = cast<ConstantInt>(II->getArgOperand(0));
986+
// The intrinsic supports having a -1 argument for variable sized objects
987+
// so we should check for that here.
988+
if (InvariantSize->isNegative())
989+
continue;
990+
uint64_t InvariantSizeInBits = InvariantSize->getSExtValue() * 8;
975991
// Confirm the invariant.start location size contains the load operand size
976992
// in bits. Also, the invariant.start should dominate the load, and we
977993
// should not hoist the load out of a loop that contains this dominating
978994
// invariant.start.
979-
if (LocSizeInBits <= InvariantSizeInBits &&
995+
if (LocSizeInBits.getFixedSize() <= InvariantSizeInBits &&
980996
DT->properlyDominates(II->getParent(), CurLoop->getHeader()))
981997
return true;
982998
}
llvm/test/Transforms/LICM/AArch64/lit.local.cfg

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
if not 'AArch64' in config.root.targets:
2+
config.unsupported = True
llvm/test/Transforms/LICM/AArch64/sve-load-hoist.ll

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
; RUN: opt -licm -mtriple aarch64-linux-gnu -mattr=+sve -S < %s | FileCheck %s
2+
3+
define void @no_hoist_load1_nxv2i64(<vscale x 2 x i64>* %out, i8* %in8, i32 %n) {
4+
; CHECK-LABEL: @no_hoist_load1_nxv2i64(
5+
; CHECK: entry:
6+
; CHECK-NOT: load
7+
; CHECK: for.body:
8+
; CHECK: load
9+
entry:
10+
%cmp0 = icmp ugt i32 %n, 0
11+
%invst = call {}* @llvm.invariant.start.p0i8(i64 16, i8* %in8)
12+
%in = bitcast i8* %in8 to <vscale x 2 x i64>*
13+
br i1 %cmp0, label %for.body, label %for.end
14+
15+
for.body:
16+
%i = phi i32 [0, %entry], [%inc, %for.body]
17+
%i2 = zext i32 %i to i64
18+
%ptr = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %out, i64 %i2
19+
%val = load <vscale x 2 x i64>, <vscale x 2 x i64>* %in, align 16
20+
store <vscale x 2 x i64> %val, <vscale x 2 x i64>* %ptr, align 16
21+
%inc = add nuw nsw i32 %i, 1
22+
%cmp = icmp ult i32 %inc, %n
23+
br i1 %cmp, label %for.body, label %for.end
24+
25+
for.end:
26+
ret void
27+
}
28+
29+
declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) nounwind readonly
30+

llvm/test/Transforms/LICM/hoisting.ll

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -360,3 +360,36 @@ loop:
360360
loopexit:
361361
ret i32 %sum
362362
}
363+
364+
; We can't hoist the invariant load out of the loop because
365+
; the marker is given a variable size (-1).
366+
define i32 @test_fence5(i8* %addr, i32 %n, i8* %volatile) {
367+
; CHECK-LABEL: @test_fence5
368+
; CHECK-LABEL: entry
369+
; CHECK: invariant.start
370+
; CHECK-NOT: %addrld = load atomic i32, i32* %addr.i unordered, align 8
371+
; CHECK: br label %loop
372+
entry:
373+
%gep = getelementptr inbounds i8, i8* %addr, i64 8
374+
%addr.i = bitcast i8* %gep to i32 *
375+
store atomic i32 5, i32 * %addr.i unordered, align 8
376+
fence release
377+
%invst = call {}* @llvm.invariant.start.p0i8(i64 -1, i8* %gep)
378+
br label %loop
379+
380+
loop:
381+
%indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
382+
%sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
383+
%volload = load atomic i8, i8* %volatile unordered, align 8
384+
fence acquire
385+
%volchk = icmp eq i8 %volload, 0
386+
%addrld = load atomic i32, i32* %addr.i unordered, align 8
387+
%sel = select i1 %volchk, i32 0, i32 %addrld
388+
%sum.next = add i32 %sel, %sum
389+
%indvar.next = add i32 %indvar, 1
390+
%cond = icmp slt i32 %indvar.next, %n
391+
br i1 %cond, label %loop, label %loopexit
392+
393+
loopexit:
394+
ret i32 %sum
395+
}

0 commit comments

Comments
 (0)