Commit 5afaab4 (parent: 806a936)

[LV]: Teach LV to recursively (de)interleave.

Currently the only available (de)interleave intrinsics are the factor-2 forms (lowered to ld2/st2 on AArch64 SVE), which do not support an interleaving factor greater than 2. This patch teaches the loop vectorizer (LV) to apply them recursively, so that higher power-of-2 interleaving factors are supported for scalable vectors.

Change-Id: I96af28dc6aeca0c6929d604176cc9ba29fca17df
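Since each intrinsic only combines two vectors at a time, a factor-4 interleave is emitted as a small tree of `llvm.vector.interleave2` calls. The following standalone sketch (plain C++ with illustrative names, not LLVM code) simulates the lane semantics of interleave2 to show why pairing (A, C) and (B, D) first, then combining the two results, yields the `a0 b0 c0 d0 a1 b1 c1 d1` layout that an st4 store expects:

```cpp
#include <cassert>
#include <string>
#include <vector>

// Lane semantics of llvm.vector.interleave2: [x0, y0, x1, y1, ...].
static std::vector<std::string> interleave2(const std::vector<std::string> &X,
                                            const std::vector<std::string> &Y) {
  std::vector<std::string> R;
  for (size_t I = 0; I < X.size(); ++I) {
    R.push_back(X[I]);
    R.push_back(Y[I]);
  }
  return R;
}

int main() {
  // Four member "vectors" A..D with two lanes each.
  std::vector<std::string> A = {"a0", "a1"}, B = {"b0", "b1"},
                           C = {"c0", "c1"}, D = {"d0", "d1"};
  // Recursive factor-4 interleave: two leaf rounds, then one combining round.
  auto AC = interleave2(A, C); // a0 c0 a1 c1
  auto BD = interleave2(B, D); // b0 d0 b1 d1
  auto R = interleave2(AC, BD);
  // Result is the interleaved factor-4 layout: a0 b0 c0 d0 a1 b1 c1 d1.
  std::vector<std::string> Expected = {"a0", "b0", "c0", "d0",
                                       "a1", "b1", "c1", "d1"};
  assert(R == Expected);
  (void)R; (void)Expected;
  return 0;
}
```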

4 files changed (+249, -39)


llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 3 additions & 3 deletions
```diff
@@ -9159,9 +9159,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
              CM.getWideningDecision(IG->getInsertPos(), VF) ==
                  LoopVectorizationCostModel::CM_Interleave);
       // For scalable vectors, the only interleave factor currently supported
-      // is 2 since we require the (de)interleave2 intrinsics instead of
-      // shufflevectors.
-      assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
+      // must be power of 2 since we require the (de)interleave2 intrinsics
+      // instead of shufflevectors.
+      assert((!Result || !VF.isScalable() || isPowerOf2_32(IG->getFactor())) &&
              "Unsupported interleave factor for scalable vectors");
       return Result;
     };
```
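The relaxed assert uses `llvm::isPowerOf2_32` from `llvm/Support/MathExtras.h`. For reference, a minimal standalone equivalent of that predicate (illustrative only, not the LLVM implementation):

```cpp
#include <cassert>
#include <cstdint>

// A value is a power of 2 iff it is nonzero and clearing its lowest set bit
// leaves zero.
static bool isPow2(uint32_t V) { return V != 0 && (V & (V - 1)) == 0; }

int main() {
  assert(isPow2(2) && isPow2(4) && isPow2(8));    // factors the recursion handles
  assert(!isPow2(0) && !isPow2(3) && !isPow2(6)); // still rejected for scalable VFs
  return 0;
}
```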

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 102 additions & 34 deletions
```diff
@@ -35,6 +35,7 @@
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include <cassert>
+#include <queue>
 
 using namespace llvm;
 
```
```diff
@@ -2779,10 +2780,39 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
   // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
   // must use intrinsics to interleave.
   if (VecTy->isScalableTy()) {
-    VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
-    return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
-                                   Vals,
-                                   /*FMFSource=*/nullptr, Name);
+    unsigned InterleaveFactor = Vals.size();
+    SmallVector<Value *> InterleavingValues;
+    unsigned InterleavingValuesCount =
+        InterleaveFactor + (InterleaveFactor - 2);
+    InterleavingValues.resize(InterleaveFactor);
+    // Place the values to be interleaved in the correct order for the
+    // interleaving.
+    for (unsigned I = 0, J = InterleaveFactor / 2, K = 0; K < InterleaveFactor;
+         K++) {
+      if (K % 2 == 0) {
+        InterleavingValues[K] = Vals[I];
+        I++;
+      } else {
+        InterleavingValues[K] = Vals[J];
+        J++;
+      }
+    }
+#ifndef NDEBUG
+    for (Value *Val : InterleavingValues)
+      assert(Val && "NULL Interleaving Value");
+#endif
+    for (unsigned I = 1; I < InterleavingValuesCount; I += 2) {
+      VectorType *InterleaveTy =
+          cast<VectorType>(InterleavingValues[I]->getType());
+      VectorType *WideVecTy =
+          VectorType::getDoubleElementsVectorType(InterleaveTy);
+      auto *InterleaveRes = Builder.CreateIntrinsic(
+          WideVecTy, Intrinsic::vector_interleave2,
+          {InterleavingValues[I - 1], InterleavingValues[I]},
+          /*FMFSource=*/nullptr, Name);
+      InterleavingValues.push_back(InterleaveRes);
+    }
+    return InterleavingValues[InterleavingValuesCount];
   }
 
   // Fixed length. Start by concatenating all vectors into a wide vector.
```
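To see what the reordering loop buys, take a hypothetical factor-4 call with `Vals = [A, B, C, D]`. The loop yields `[A, C, B, D]`, so the pairwise rounds compute `interleave2(A, C)` and `interleave2(B, D)`, and the final round interleaves those two intermediate results (verified lane-by-lane in the sketch after the commit message above). Just the permutation, extracted as standalone C++ (illustrative, not part of the patch):

```cpp
#include <cassert>
#include <vector>

// The reorder loop above, in isolation: even slots of the work list take the
// first half of Vals, odd slots take the second half.
static std::vector<int> reorderForInterleave(const std::vector<int> &Vals) {
  std::vector<int> Out(Vals.size());
  for (unsigned I = 0, J = Vals.size() / 2, K = 0; K < Vals.size(); ++K)
    Out[K] = (K % 2 == 0) ? Vals[I++] : Vals[J++];
  return Out;
}

int main() {
  // Factor 4: [A,B,C,D] as [0,1,2,3] becomes [0,2,1,3], so the tree pairs
  // (A,C) and (B,D) before the final combining interleave2.
  assert(reorderForInterleave({0, 1, 2, 3}) == std::vector<int>({0, 2, 1, 3}));
  return 0;
}
```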
```diff
@@ -2868,15 +2898,12 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
                               &InterleaveFactor](Value *MaskForGaps) -> Value * {
     if (State.VF.isScalable()) {
       assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
-      assert(InterleaveFactor == 2 &&
+      assert(isPowerOf2_32(InterleaveFactor) &&
              "Unsupported deinterleave factor for scalable vectors");
       auto *ResBlockInMask = State.get(BlockInMask);
-      SmallVector<Value *, 2> Ops = {ResBlockInMask, ResBlockInMask};
-      auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(),
-                                     State.VF.getKnownMinValue() * 2, true);
-      return State.Builder.CreateIntrinsic(
-          MaskTy, Intrinsic::vector_interleave2, Ops,
-          /*FMFSource=*/nullptr, "interleaved.mask");
+      SmallVector<Value *> Ops;
+      Ops.resize(InterleaveFactor, ResBlockInMask);
+      return interleaveVectors(State.Builder, Ops, "interleaved.mask");
     }
 
     if (!BlockInMask)
```
```diff
@@ -2916,35 +2943,76 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
   ArrayRef<VPValue *> VPDefs = definedValues();
   const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
   if (VecTy->isScalableTy()) {
-    assert(InterleaveFactor == 2 &&
+    assert(isPowerOf2_32(InterleaveFactor) &&
            "Unsupported deinterleave factor for scalable vectors");
 
     // Scalable vectors cannot use arbitrary shufflevectors (only splats),
     // so must use intrinsics to deinterleave.
-    Value *DI = State.Builder.CreateIntrinsic(
-        Intrinsic::vector_deinterleave2, VecTy, NewLoad,
-        /*FMFSource=*/nullptr, "strided.vec");
-    unsigned J = 0;
-    for (unsigned I = 0; I < InterleaveFactor; ++I) {
-      Instruction *Member = Group->getMember(I);
-
-      if (!Member)
-        continue;
-
-      Value *StridedVec = State.Builder.CreateExtractValue(DI, I);
-      // If this member has different type, cast the result type.
-      if (Member->getType() != ScalarTy) {
-        VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
-        StridedVec =
-            createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
-      }
 
-      if (Group->isReverse())
-        StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");
+    SmallVector<Value *> DeinterleavedValues;
+    // If the InterleaveFactor is > 2, we have to deinterleave recursively,
+    // because the currently available deinterleave intrinsic supports only
+    // a factor of 2. DeinterleaveCount is the number of deinterleave
+    // operations performed: we deinterleave at every non-leaf node of the
+    // deinterleave tree.
+    unsigned DeinterleaveCount = InterleaveFactor - 1;
+    std::queue<Value *> TempDeinterleavedValues;
+    TempDeinterleavedValues.push(NewLoad);
+    for (unsigned I = 0; I < DeinterleaveCount; ++I) {
+      Value *ValueToDeinterleave = TempDeinterleavedValues.front();
+      auto *DiTy = ValueToDeinterleave->getType();
+      TempDeinterleavedValues.pop();
+      Value *DI = State.Builder.CreateIntrinsic(
+          Intrinsic::vector_deinterleave2, DiTy, ValueToDeinterleave,
+          /*FMFSource=*/nullptr, "strided.vec");
+      Value *StridedVec = State.Builder.CreateExtractValue(DI, 0);
+      TempDeinterleavedValues.push(StridedVec);
+      StridedVec = State.Builder.CreateExtractValue(DI, 1);
+      TempDeinterleavedValues.push(StridedVec);
+    }
 
-      State.set(VPDefs[J], StridedVec);
-      ++J;
-    }
+    assert(TempDeinterleavedValues.size() == InterleaveFactor &&
+           "Number of deinterleaved values must equal InterleaveFactor");
+    // Restore the original member order of the deinterleaved values.
+    DeinterleavedValues.resize(InterleaveFactor);
+    for (unsigned I = 0, J = InterleaveFactor / 2, K = 0;
+         K < InterleaveFactor; K++) {
+      auto *DeinterleavedValue = TempDeinterleavedValues.front();
+      TempDeinterleavedValues.pop();
+      if (K % 2 == 0) {
+        DeinterleavedValues[I] = DeinterleavedValue;
+        I++;
+      } else {
+        DeinterleavedValues[J] = DeinterleavedValue;
+        J++;
+      }
+    }
+#ifndef NDEBUG
+    for (Value *Val : DeinterleavedValues)
+      assert(Val && "NULL Deinterleaved Value");
+#endif
+    for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
+      Instruction *Member = Group->getMember(I);
+      Value *StridedVec = DeinterleavedValues[I];
+      if (!Member) {
+        // This value is dead: the group has no member at this index.
+        static_cast<Instruction *>(StridedVec)->eraseFromParent();
+        continue;
+      }
+      // If this member has a different type, cast the result.
+      if (Member->getType() != ScalarTy) {
+        VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
+        StridedVec =
+            createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
+      }
+
+      if (Group->isReverse())
+        StridedVec =
+            State.Builder.CreateVectorReverse(StridedVec, "reverse");
+
+      State.set(VPDefs[J], StridedVec);
+      ++J;
+    }
 
     return;
   }
```
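The deinterleave side mirrors the interleave tree: each round pops one value off the work queue and splits it into even and odd lanes, and the reordering loop afterwards maps the resulting leaf order [A, C, B, D] back to member order [A, B, C, D]. A standalone simulation of the queue walk for a factor-4 load (plain C++17 with illustrative names, not LLVM code):

```cpp
#include <cassert>
#include <queue>
#include <string>
#include <utility>
#include <vector>

using Vec = std::vector<std::string>;

// Lane semantics of llvm.vector.deinterleave2: split into even and odd lanes.
static std::pair<Vec, Vec> deinterleave2(const Vec &V) {
  Vec Even, Odd;
  for (size_t I = 0; I < V.size(); ++I)
    (I % 2 == 0 ? Even : Odd).push_back(V[I]);
  return {Even, Odd};
}

int main() {
  // A factor-4 interleaved wide load: a0 b0 c0 d0 a1 b1 c1 d1.
  Vec WideLoad = {"a0", "b0", "c0", "d0", "a1", "b1", "c1", "d1"};
  // InterleaveFactor - 1 = 3 rounds, one per non-leaf node of the tree.
  std::queue<Vec> Work;
  Work.push(WideLoad);
  for (int Round = 0; Round < 3; ++Round) {
    auto [Even, Odd] = deinterleave2(Work.front());
    Work.pop();
    Work.push(Even);
    Work.push(Odd);
  }
  // The queue now holds the leaves in order A, C, B, D.
  Vec A = Work.front(); Work.pop();
  Vec C = Work.front(); Work.pop();
  Vec B = Work.front(); Work.pop();
  Vec D = Work.front(); Work.pop();
  assert((A == Vec{"a0", "a1"} && B == Vec{"b0", "b1"}));
  assert((C == Vec{"c0", "c1"} && D == Vec{"d0", "d1"}));
  return 0;
}
```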

llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll

Lines changed: 143 additions & 1 deletion
```diff
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: opt < %s -passes=interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -S | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize,interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -S | FileCheck %s
 
 
 define void @deinterleave4(ptr %src) {
```
```diff
@@ -136,3 +136,145 @@ define void @negative_deinterleave4_test(ptr %src) {
 
   ret void
 }
+
+%struct.xyzt = type { i32, i32, i32, i32 }
+
+define void @interleave_deinterleave(ptr writeonly %dst, ptr readonly %a, ptr readonly %b) {
+; CHECK-LABEL: define void @interleave_deinterleave
+; CHECK-SAME: (ptr writeonly [[DST:%.*]], ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP1]])
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP2]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 16384
+; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 16384
+; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B]], i64 16384
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]]
+; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
+; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
+; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP9]])
+; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
+; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 3
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; CHECK-NEXT: [[LDN14:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP15]])
+; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN14]], 0
+; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN14]], 1
+; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN14]], 2
+; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN14]], 3
+; CHECK-NEXT: [[TMP20:%.*]] = add nsw <vscale x 4 x i32> [[TMP16]], [[TMP10]]
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP22:%.*]] = sub nsw <vscale x 4 x i32> [[TMP11]], [[TMP17]]
+; CHECK-NEXT: [[TMP23:%.*]] = shl <vscale x 4 x i32> [[TMP12]], [[TMP18]]
+; CHECK-NEXT: [[TMP24:%.*]] = ashr <vscale x 4 x i32> [[TMP13]], [[TMP19]]
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP21]], i64 12
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 -3
+; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x i32> [[TMP22]], <vscale x 4 x i32> [[TMP23]], <vscale x 4 x i32> [[TMP24]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP26]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP29]], [[TMP28]]
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4
+; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[Y]], align 4
+; CHECK-NEXT: [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4
+; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[Y11]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP30]], [[TMP31]]
+; CHECK-NEXT: [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4
+; CHECK-NEXT: store i32 [[SUB]], ptr [[Y14]], align 4
+; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8
+; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[Z]], align 4
+; CHECK-NEXT: [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8
+; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[Z19]], align 4
+; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP32]], [[TMP33]]
+; CHECK-NEXT: [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8
+; CHECK-NEXT: store i32 [[SHL]], ptr [[Z22]], align 4
+; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12
+; CHECK-NEXT: [[TMP34:%.*]] = load i32, ptr [[T]], align 4
+; CHECK-NEXT: [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12
+; CHECK-NEXT: [[TMP35:%.*]] = load i32, ptr [[T27]], align 4
+; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP34]], [[TMP35]]
+; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12
+; CHECK-NEXT: store i32 [[SHR]], ptr [[T30]], align 4
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds %struct.xyzt, ptr %a, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %arrayidx5 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %indvars.iv
+  store i32 %add, ptr %arrayidx5, align 4
+  %y = getelementptr inbounds nuw i8, ptr %arrayidx, i64 4
+  %2 = load i32, ptr %y, align 4
+  %y11 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 4
+  %3 = load i32, ptr %y11, align 4
+  %sub = sub nsw i32 %2, %3
+  %y14 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 4
+  store i32 %sub, ptr %y14, align 4
+  %z = getelementptr inbounds nuw i8, ptr %arrayidx, i64 8
+  %4 = load i32, ptr %z, align 4
+  %z19 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 8
+  %5 = load i32, ptr %z19, align 4
+  %shl = shl i32 %4, %5
+  %z22 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 8
+  store i32 %shl, ptr %z22, align 4
+  %t = getelementptr inbounds nuw i8, ptr %arrayidx, i64 12
+  %6 = load i32, ptr %t, align 4
+  %t27 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 12
+  %7 = load i32, ptr %t27, align 4
+  %shr = ashr i32 %6, %7
+  %t30 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 12
+  store i32 %shr, ptr %t30, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
```

llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll

Lines changed: 1 addition & 1 deletion
```diff
@@ -396,8 +396,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP9]], align 4
 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
-; CHECK-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP10]])
 ; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; CHECK-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP10]])
 ; CHECK-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP11]])
 ; CHECK-NEXT: [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
 ; CHECK-NEXT: [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE1]], [[VEC_IND]]
```