Skip to content

Commit 2dd4167

Browse files
authored
[LoopVectorize][AArch64] Add limited support for scalable vectorisation of i1 types (#95920)
Previously isElementTypeLegalForScalableVector returned false for i1 types, which also prevented vectorisation of loops with i1 reductions. This is overkill - we only need to disable vectorisation for loads and/or stores of i1 types. I've added i1 as a legal type, but changed the cost model to return an invalid cost for i1 masked loads/stores and gathers/scatters, and for plain i1 loads and stores that do not cover whole predicate registers (i.e. whose element count is not a multiple of vscale x 16).
1 parent 1462605 commit 2dd4167

File tree

9 files changed

+102
-33
lines changed

9 files changed

+102
-33
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3259,11 +3259,16 @@ AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
32593259
if (!LT.first.isValid())
32603260
return InstructionCost::getInvalid();
32613261

3262+
// Return an invalid cost for element types that we are unable to lower.
3263+
auto *VT = cast<VectorType>(Src);
3264+
if (VT->getElementType()->isIntegerTy(1))
3265+
return InstructionCost::getInvalid();
3266+
32623267
// The code-generator is currently not able to handle scalable vectors
32633268
// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
32643269
// it. This change will be removed when code-generation for these types is
32653270
// sufficiently reliable.
3266-
if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
3271+
if (VT->getElementCount() == ElementCount::getScalable(1))
32673272
return InstructionCost::getInvalid();
32683273

32693274
return LT.first;
@@ -3284,16 +3289,17 @@ InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
32843289
if (!LT.first.isValid())
32853290
return InstructionCost::getInvalid();
32863291

3292+
// Return an invalid cost for element types that we are unable to lower.
32873293
if (!LT.second.isVector() ||
3288-
!isElementTypeLegalForScalableVector(VT->getElementType()))
3294+
!isElementTypeLegalForScalableVector(VT->getElementType()) ||
3295+
VT->getElementType()->isIntegerTy(1))
32893296
return InstructionCost::getInvalid();
32903297

32913298
// The code-generator is currently not able to handle scalable vectors
32923299
// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
32933300
// it. This change will be removed when code-generation for these types is
32943301
// sufficiently reliable.
3295-
if (cast<VectorType>(DataTy)->getElementCount() ==
3296-
ElementCount::getScalable(1))
3302+
if (VT->getElementCount() == ElementCount::getScalable(1))
32973303
return InstructionCost::getInvalid();
32983304

32993305
ElementCount LegalVF = LT.second.getVectorElementCount();
@@ -3331,8 +3337,12 @@ InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
33313337
// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
33323338
// it. This change will be removed when code-generation for these types is
33333339
// sufficiently reliable.
3340+
// We also only support full-register predicate (i1) loads and stores, i.e. element counts that are a multiple of vscale x 16.
33343341
if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3335-
if (VTy->getElementCount() == ElementCount::getScalable(1))
3342+
if (VTy->getElementCount() == ElementCount::getScalable(1) ||
3343+
(VTy->getElementType()->isIntegerTy(1) &&
3344+
!VTy->getElementCount().isKnownMultipleOf(
3345+
ElementCount::getScalable(16))))
33363346
return InstructionCost::getInvalid();
33373347

33383348
// TODO: consider latency as well for TCK_SizeAndLatency.

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
248248
if (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy())
249249
return true;
250250

251-
if (Ty->isIntegerTy(8) || Ty->isIntegerTy(16) ||
251+
if (Ty->isIntegerTy(1) || Ty->isIntegerTy(8) || Ty->isIntegerTy(16) ||
252252
Ty->isIntegerTy(32) || Ty->isIntegerTy(64))
253253
return true;
254254

llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ define void @scalable() {
7575
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv1i64 = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i64> undef)
7676
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef)
7777
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32f16 = call <vscale x 32 x half> @llvm.masked.load.nxv32f16.p0(ptr undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x half> undef)
78+
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv4i1 = call <vscale x 4 x i1> @llvm.masked.load.nxv4i1.p0(ptr undef, i32 16, <vscale x 4 x i1> undef, <vscale x 4 x i1> undef)
7879
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
7980
;
8081
entry:
@@ -103,6 +104,9 @@ entry:
103104
%nxv4i64 = call <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef)
104105
%nxv32f16 = call <vscale x 32 x half> @llvm.masked.load.nxv32f16.p0(ptr undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x half> undef)
105106

107+
; Types that are legal, but for which we have no masked load/store lowering
108+
%nxv4i1 = call <vscale x 4 x i1> @llvm.masked.load.nxv4i1.p0(ptr undef, i32 16, <vscale x 4 x i1> undef, <vscale x 4 x i1> undef)
109+
106110
ret void
107111
}
108112

@@ -265,6 +269,7 @@ declare <32 x half> @llvm.masked.load.v32f16.p0(ptr, i32, <32 x i1>, <32 x half>
265269
declare <2 x float> @llvm.masked.load.v2f32.p0(ptr, i32, <2 x i1>, <2 x float>)
266270
declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32, <4 x i1>, <4 x float>)
267271
declare <2 x double> @llvm.masked.load.v2f64.p0(ptr, i32, <2 x i1>, <2 x double>)
272+
declare <vscale x 4 x i1> @llvm.masked.load.nxv4i1.p0(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i1>)
268273

269274

270275
declare <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)

llvm/test/Analysis/CostModel/AArch64/sve-gather.ll

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,23 +12,27 @@ define void @masked_gathers(<vscale x 4 x i1> %nxv4i1mask, <vscale x 8 x i1> %nx
1212
; CHECK-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)
1313
; CHECK-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)
1414
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
15+
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %res.nxv4i1 = call <vscale x 4 x i1> @llvm.masked.gather.nxv4i1.nxv4p0(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i1> zeroinitializer)
1516
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
1617
;
1718
; CHECK-VSCALE-2-LABEL: 'masked_gathers'
1819
; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)
1920
; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)
2021
; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
22+
; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: %res.nxv4i1 = call <vscale x 4 x i1> @llvm.masked.gather.nxv4i1.nxv4p0(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i1> zeroinitializer)
2123
; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
2224
;
2325
; CHECK-VSCALE-1-LABEL: 'masked_gathers'
2426
; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)
2527
; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)
2628
; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
29+
; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: %res.nxv4i1 = call <vscale x 4 x i1> @llvm.masked.gather.nxv4i1.nxv4p0(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i1> zeroinitializer)
2730
; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
2831
;
2932
%res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)
3033
%res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32(<vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)
3134
%res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64(<vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
35+
%res.nxv4i1 = call <vscale x 4 x i1> @llvm.masked.gather.nxv4i1(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i1> zeroinitializer)
3236
ret void
3337
}
3438

@@ -130,6 +134,7 @@ attributes #3 = { "target-features"="+sve" vscale_range(2, 2) }
130134
declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x ptr>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
131135
declare <vscale x 8 x i32> @llvm.masked.gather.nxv8i32(<vscale x 8 x ptr>, i32, <vscale x 8 x i1>, <vscale x 8 x i32>)
132136
declare <vscale x 1 x i64> @llvm.masked.gather.nxv1i64(<vscale x 1 x ptr>, i32, <vscale x 1 x i1>, <vscale x 1 x i64>)
137+
declare <vscale x 4 x i1> @llvm.masked.gather.nxv4i1(<vscale x 4 x ptr>, i32, <vscale x 4 x i1>, <vscale x 4 x i1>)
133138
declare <vscale x 4 x double> @llvm.masked.gather.nxv4f64(<vscale x 4 x ptr>, i32, <vscale x 4 x i1>, <vscale x 4 x double>)
134139
declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x ptr>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
135140
declare <vscale x 8 x float> @llvm.masked.gather.nxv8f32(<vscale x 8 x ptr>, i32, <vscale x 8 x i1>, <vscale x 8 x float>)

llvm/test/Analysis/CostModel/AArch64/sve-ldst.ll

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,18 @@ define void @scalable_loads() {
99
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.nxv16i8 = load <vscale x 16 x i8>, ptr undef, align 16
1010
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res.nxv32i8 = load <vscale x 32 x i8>, ptr undef, align 32
1111
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = load <vscale x 1 x i64>, ptr undef, align 8
12+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res.nxv32i1 = load <vscale x 32 x i1>, ptr undef, align 4
13+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.nxv16i1 = load <vscale x 16 x i1>, ptr undef, align 2
14+
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %res.nxv4i1 = load <vscale x 4 x i1>, ptr undef, align 1
1215
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
1316
;
1417
%res.nxv8i8 = load <vscale x 8 x i8>, ptr undef
1518
%res.nxv16i8 = load <vscale x 16 x i8>, ptr undef
1619
%res.nxv32i8 = load <vscale x 32 x i8>, ptr undef
1720
%res.nxv1i64 = load <vscale x 1 x i64>, ptr undef
21+
%res.nxv32i1 = load <vscale x 32 x i1>, ptr undef
22+
%res.nxv16i1 = load <vscale x 16 x i1>, ptr undef
23+
%res.nxv4i1 = load <vscale x 4 x i1>, ptr undef
1824
ret void
1925
}
2026

@@ -24,12 +30,18 @@ define void @scalable_stores() {
2430
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 16 x i8> undef, ptr undef, align 16
2531
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <vscale x 32 x i8> undef, ptr undef, align 32
2632
; CHECK-NEXT: Cost Model: Invalid cost for instruction: store <vscale x 1 x i64> undef, ptr undef, align 8
33+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <vscale x 32 x i1> undef, ptr undef, align 4
34+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 16 x i1> undef, ptr undef, align 2
35+
; CHECK-NEXT: Cost Model: Invalid cost for instruction: store <vscale x 4 x i1> undef, ptr undef, align 1
2736
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
2837
;
2938
store <vscale x 8 x i8> undef, ptr undef
3039
store <vscale x 16 x i8> undef, ptr undef
3140
store <vscale x 32 x i8> undef, ptr undef
3241
store <vscale x 1 x i64> undef, ptr undef
42+
store <vscale x 32 x i1> undef, ptr undef
43+
store <vscale x 16 x i1> undef, ptr undef
44+
store <vscale x 4 x i1> undef, ptr undef
3345
ret void
3446
}
3547

llvm/test/Analysis/CostModel/AArch64/sve-scatter.ll

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,23 +12,27 @@ define void @masked_scatters(<vscale x 4 x i1> %nxv4i1mask, <vscale x 8 x i1> %n
1212
; CHECK-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
1313
; CHECK-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> undef, <vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)
1414
; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> undef, <vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
15+
; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i1.nxv4p0(<vscale x 4 x i1> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
1516
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
1617
;
1718
; CHECK-VSCALE-2-LABEL: 'masked_scatters'
1819
; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
1920
; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> undef, <vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)
2021
; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> undef, <vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
22+
; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i1.nxv4p0(<vscale x 4 x i1> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
2123
; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
2224
;
2325
; CHECK-VSCALE-1-LABEL: 'masked_scatters'
2426
; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
2527
; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> undef, <vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)
2628
; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> undef, <vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
29+
; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i1.nxv4p0(<vscale x 4 x i1> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
2730
; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
2831
;
2932
call void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
3033
call void @llvm.masked.scatter.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)
3134
call void @llvm.masked.scatter.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
35+
call void @llvm.masked.scatter.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
3236
ret void
3337
}
3438

@@ -112,6 +116,7 @@ attributes #2 = { "target-features"="+sve" }
112116
declare void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x ptr>, i32, <vscale x 4 x i1>)
113117
declare void @llvm.masked.scatter.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x ptr>, i32, <vscale x 8 x i1>)
114118
declare void @llvm.masked.scatter.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x ptr>, i32, <vscale x 1 x i1>)
119+
declare void @llvm.masked.scatter.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x ptr>, i32, <vscale x 4 x i1>)
115120
declare void @llvm.masked.scatter.nxv4f64(<vscale x 4 x double>, <vscale x 4 x ptr>, i32, <vscale x 4 x i1>)
116121
declare void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double>, <vscale x 2 x ptr>, i32, <vscale x 2 x i1>)
117122
declare void @llvm.masked.scatter.nxv8f32(<vscale x 8 x float>, <vscale x 8 x ptr>, i32, <vscale x 8 x i1>)

llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,38 @@ for.cond.cleanup:
350350
ret void
351351
}
352352

353+
354+
; ADD (with reduction of i1; written as xor, which is equivalent to add for i1)
355+
356+
; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
357+
define i1 @add_trunc_i32_i1(ptr nocapture %src, i64 %N) {
358+
; CHECK-LABEL: @add_trunc_i32_i1
359+
; CHECK: vector.body:
360+
; CHECK: %[[PHI1:.*]] = phi <vscale x 8 x i1> [ zeroinitializer, %{{.*}} ], [ %20, %vector.body ]
361+
; CHECK: %[[PHI2:.*]] = phi <vscale x 8 x i1> [ zeroinitializer, %{{.*}} ], [ %21, %vector.body ]
362+
; CHECK: %[[TRUNC1:.*]] = trunc <vscale x 8 x i32> %{{.*}} to <vscale x 8 x i1>
363+
; CHECK: %[[TRUNC2:.*]] = trunc <vscale x 8 x i32> %{{.*}} to <vscale x 8 x i1>
364+
; CHECK: %{{.*}} = xor <vscale x 8 x i1> %[[PHI1]], %[[TRUNC1]]
365+
; CHECK: %{{.*}} = xor <vscale x 8 x i1> %[[PHI2]], %[[TRUNC2]]
366+
entry:
367+
br label %for.body
368+
369+
for.body:
370+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
371+
%red = phi i1 [ 0, %entry ], [ %red.next, %for.body ]
372+
%arrayidx = getelementptr inbounds i32, ptr %src, i64 %iv
373+
%load32 = load i32, ptr %arrayidx, align 4
374+
%trunc = trunc i32 %load32 to i1
375+
%red.next = xor i1 %red, %trunc
376+
%iv.next = add i64 %iv, 1
377+
%exitcond.not = icmp eq i64 %iv.next, %N
378+
br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
379+
380+
for.end:
381+
ret i1 %red.next
382+
}
383+
384+
353385
; Reduction cannot be vectorized
354386

355387
; MUL

llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll

Lines changed: 0 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -80,33 +80,6 @@ for.end:
8080
ret void
8181
}
8282

83-
; CHECK-REMARKS: Scalable vectorization is not supported for all element types found in this loop
84-
define void @uniform_store_i1(ptr noalias %dst, ptr noalias %start, i64 %N) {
85-
; CHECK-LABEL: @uniform_store_i1
86-
; CHECK: vector.body
87-
; CHECK: %[[GEP:.*]] = getelementptr inbounds i64, <64 x ptr> {{.*}}, i64 1
88-
; CHECK: %[[ICMP:.*]] = icmp eq <64 x ptr> %[[GEP]], %[[SPLAT:.*]]
89-
; CHECK: %[[EXTRACT1:.*]] = extractelement <64 x i1> %[[ICMP]], i32 63
90-
; CHECK: store i1 %[[EXTRACT1]], ptr %dst
91-
; CHECK-NOT: vscale
92-
entry:
93-
br label %for.body
94-
95-
for.body:
96-
%first.sroa = phi ptr [ %incdec.ptr, %for.body ], [ %start, %entry ]
97-
%iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
98-
%iv.next = add i64 %iv, 1
99-
%0 = load i64, ptr %first.sroa
100-
%incdec.ptr = getelementptr inbounds i64, ptr %first.sroa, i64 1
101-
%cmp.not = icmp eq ptr %incdec.ptr, %start
102-
store i1 %cmp.not, ptr %dst
103-
%cmp = icmp ult i64 %iv, %N
104-
br i1 %cmp, label %for.body, label %end, !llvm.loop !0
105-
106-
end:
107-
ret void
108-
}
109-
11083
define dso_local void @loop_fixed_width_i128(ptr nocapture %ptr, i64 %N) {
11184
; CHECK-LABEL: @loop_fixed_width_i128
11285
; CHECK: load <4 x i128>, ptr

0 commit comments

Comments
 (0)