Skip to content

Commit aec4b95

Browse files
committed
[resolve comments]: add tests for masked interleaved accesses
1 parent 155cc1e commit aec4b95

File tree

3 files changed

+376
-9
lines changed

3 files changed

+376
-9
lines changed

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2779,11 +2779,10 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
27792779
// Scalable vectors cannot use arbitrary shufflevectors (only splats), so
27802780
// must use intrinsics to interleave.
27812781
if (VecTy->isScalableTy()) {
2782-
unsigned InterleaveFactor = Vals.size();
27832782
SmallVector<Value *> InterleavingValues(Vals);
27842783
// When interleaving, the number of values will be shrunk until we have the
27852784
// single final interleaved value.
2786-
VectorType *InterleaveTy =
2785+
auto *InterleaveTy =
27872786
cast<VectorType>(InterleavingValues[0]->getType());
27882787
for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2) {
27892788
InterleaveTy = VectorType::getDoubleElementsVectorType(InterleaveTy);
@@ -2937,18 +2936,18 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
29372936
// iteration.
29382937
// When deinterleaving, the number of values will double until we
29392938
// have "InterleaveFactor".
2940-
for (int NumVectors = 1; NumVectors < InterleaveFactor; NumVectors *= 2) {
2941-
// deinterleave the elements within the vector
2942-
std::vector<Value *> TempDeinterleavedValues(NumVectors);
2943-
for (int I = 0; I < NumVectors; ++I) {
2939+
for (unsigned NumVectors = 1; NumVectors < InterleaveFactor; NumVectors *= 2) {
2940+
// Deinterleave the elements within the vector.
2941+
SmallVector<Value *> TempDeinterleavedValues(NumVectors);
2942+
for (unsigned I = 0; I < NumVectors; ++I) {
29442943
auto *DiTy = DeinterleavedValues[I]->getType();
29452944
TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic(
29462945
Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I],
29472946
/*FMFSource=*/nullptr, "strided.vec");
29482947
}
29492948
// Extract the deinterleaved values:
2950-
for (int I = 0; I < 2; ++I)
2951-
for (int J = 0; J < NumVectors; ++J)
2949+
for (unsigned I = 0; I < 2; ++I)
2950+
for (unsigned J = 0; J < NumVectors; ++J)
29522951
DeinterleavedValues[NumVectors * I + J] =
29532952
State.Builder.CreateExtractValue(TempDeinterleavedValues[J], I);
29542953
}

llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll

Lines changed: 117 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1548,13 +1548,15 @@ end:
15481548
ret void
15491549
}
15501550

1551-
%struct.xyzt = type { i32, i32, i32, i32 }
1551+
; Check vectorization on interleaved load/store groups of factor 4
1552+
15521553
; for (int i = 0; i < 1024; ++i) {
15531554
; dst[i].x = a[i].x + b[i].x;
15541555
; dst[i].y = a[i].y - b[i].y;
15551556
; dst[i].z = a[i].z << b[i].z;
15561557
; dst[i].t = a[i].t >> b[i].t;
15571558
; }
1559+
%struct.xyzt = type { i32, i32, i32, i32 }
15581560

15591561
define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a, ptr readonly %b) {
15601562
; CHECK-LABEL: @interleave_deinterleave(
@@ -1690,5 +1692,119 @@ for.end:
16901692
ret void
16911693
}
16921694

1695+
; Check vectorization on reverse interleaved load/store groups of factor 4
1696+
1697+
; for (int i = 1023; i >= 0; i--) {
1698+
; int a = A[i].x + i;
1699+
; int b = A[i].y - i;
1700+
; int c = A[i].z * i;
1701+
; int d = A[i].t << i;
1702+
; B[i].x = a;
1703+
; B[i].y = b;
1704+
; B[i].z = c;
1705+
; B[i].t = d;
1706+
; }
1707+
1708+
define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A, ptr noalias nocapture %B) #1{
1709+
; CHECK-LABEL: @interleave_deinterleave_reverse(
1710+
; CHECK-NEXT: entry:
1711+
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1712+
; CHECK: vector.ph:
1713+
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
1714+
; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
1715+
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
1716+
; CHECK-NEXT: [[INDUCTION:%.*]] = sub <vscale x 4 x i32> splat (i32 1023), [[TMP2]]
1717+
; CHECK-NEXT: [[TMP3:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
1718+
; CHECK-NEXT: [[TMP4:%.*]] = sub nsw i32 0, [[TMP3]]
1719+
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP4]], i64 0
1720+
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
1721+
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
1722+
; CHECK: vector.body:
1723+
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1724+
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
1725+
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
1726+
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
1727+
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
1728+
; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 4
1729+
; CHECK-NEXT: [[TMP8:%.*]] = sub nsw i32 4, [[TMP7]]
1730+
; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
1731+
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]]
1732+
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 16 x i32>, ptr [[TMP10]], align 4
1733+
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC]])
1734+
; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 0
1735+
; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 1
1736+
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP11]])
1737+
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP12]])
1738+
; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC1]], 0
1739+
; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
1740+
; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC1]], 1
1741+
; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 1
1742+
; CHECK-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP13]])
1743+
; CHECK-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP14]])
1744+
; CHECK-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP15]])
1745+
; CHECK-NEXT: [[REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP16]])
1746+
; CHECK-NEXT: [[TMP17:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
1747+
; CHECK-NEXT: [[TMP18:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE3]], [[VEC_IND]]
1748+
; CHECK-NEXT: [[TMP19:%.*]] = mul nsw <vscale x 4 x i32> [[REVERSE4]], [[VEC_IND]]
1749+
; CHECK-NEXT: [[TMP20:%.*]] = shl nuw nsw <vscale x 4 x i32> [[REVERSE5]], [[VEC_IND]]
1750+
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
1751+
; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
1752+
; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 4
1753+
; CHECK-NEXT: [[TMP24:%.*]] = sub nsw i32 4, [[TMP23]]
1754+
; CHECK-NEXT: [[TMP25:%.*]] = sext i32 [[TMP24]] to i64
1755+
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]]
1756+
; CHECK-NEXT: [[REVERSE6:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP17]])
1757+
; CHECK-NEXT: [[REVERSE7:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP18]])
1758+
; CHECK-NEXT: [[REVERSE8:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP19]])
1759+
; CHECK-NEXT: [[REVERSE9:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP20]])
1760+
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[REVERSE6]], <vscale x 4 x i32> [[REVERSE8]])
1761+
; CHECK-NEXT: [[INTERLEAVED_VEC10:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[REVERSE7]], <vscale x 4 x i32> [[REVERSE9]])
1762+
; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> [[INTERLEAVED_VEC]], <vscale x 8 x i32> [[INTERLEAVED_VEC10]])
1763+
; CHECK-NEXT: store <vscale x 16 x i32> [[INTERLEAVED_VEC11]], ptr [[TMP26]], align 4
1764+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
1765+
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
1766+
; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
1767+
; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
1768+
; CHECK: middle.block:
1769+
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
1770+
; CHECK: scalar.ph:
1771+
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
1772+
; CHECK: for.cond.cleanup:
1773+
; CHECK-NEXT: ret void
1774+
; CHECK: for.body:
1775+
; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP44:![0-9]+]]
1776+
;
1777+
entry:
1778+
br label %for.body
1779+
for.cond.cleanup: ; preds = %for.body
1780+
ret void
1781+
for.body: ; preds = %for.body, %entry
1782+
%indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ]
1783+
%x = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 0
1784+
%load1 = load i32, ptr %x, align 4
1785+
%trunc = trunc i64 %indvars.iv to i32
1786+
%add = add nsw i32 %load1, %trunc
1787+
%y = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 1
1788+
%load2 = load i32, ptr %y, align 4
1789+
%sub = sub nsw i32 %load2, %trunc
1790+
%z = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 2
1791+
%load3 = load i32, ptr %z, align 4
1792+
%mul = mul nsw i32 %load3, %trunc
1793+
%t = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 3
1794+
%load4 = load i32, ptr %t, align 4
1795+
%shl = shl nuw nsw i32 %load4, %trunc
1796+
%x5 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 0
1797+
store i32 %add, ptr %x5, align 4
1798+
%y8 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 1
1799+
store i32 %sub, ptr %y8, align 4
1800+
%z5 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 2
1801+
store i32 %mul, ptr %z5, align 4
1802+
%t8 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 3
1803+
store i32 %shl, ptr %t8, align 4
1804+
%indvars.iv.next = add nsw i64 %indvars.iv, -1
1805+
%cmp = icmp sgt i64 %indvars.iv, 0
1806+
br i1 %cmp, label %for.body, label %for.cond.cleanup
1807+
1808+
}
16931809
attributes #1 = { "target-features"="+sve" vscale_range(1, 16) }
16941810
attributes #0 = { "unsafe-fp-math"="true" "target-features"="+sve" vscale_range(1, 16) }

0 commit comments

Comments
 (0)