Skip to content

[LoopVectorize][AArch64] Add limited support for scalable vectorisation of i1 types #95920

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 16 additions & 6 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3187,11 +3187,16 @@ AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
if (!LT.first.isValid())
return InstructionCost::getInvalid();

// Return an invalid cost for element types that we are unable to lower.
auto *VT = cast<VectorType>(Src);
if (VT->getElementType()->isIntegerTy(1))
return InstructionCost::getInvalid();

// The code-generator is currently not able to handle scalable vectors
// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
// it. This change will be removed when code-generation for these types is
// sufficiently reliable.
if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
if (VT->getElementCount() == ElementCount::getScalable(1))
return InstructionCost::getInvalid();

return LT.first;
Expand All @@ -3212,16 +3217,17 @@ InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
if (!LT.first.isValid())
return InstructionCost::getInvalid();

// Return an invalid cost for element types that we are unable to lower.
if (!LT.second.isVector() ||
!isElementTypeLegalForScalableVector(VT->getElementType()))
!isElementTypeLegalForScalableVector(VT->getElementType()) ||
VT->getElementType()->isIntegerTy(1))
return InstructionCost::getInvalid();

// The code-generator is currently not able to handle scalable vectors
// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
// it. This change will be removed when code-generation for these types is
// sufficiently reliable.
if (cast<VectorType>(DataTy)->getElementCount() ==
ElementCount::getScalable(1))
if (VT->getElementCount() == ElementCount::getScalable(1))
return InstructionCost::getInvalid();

ElementCount LegalVF = LT.second.getVectorElementCount();
Expand Down Expand Up @@ -3259,8 +3265,12 @@ InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
// it. This change will be removed when code-generation for these types is
// sufficiently reliable.
// We also only support full register predicate loads and stores.
if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
if (VTy->getElementCount() == ElementCount::getScalable(1))
if (VTy->getElementCount() == ElementCount::getScalable(1) ||
(VTy->getElementType()->isIntegerTy(1) &&
!VTy->getElementCount().isKnownMultipleOf(
ElementCount::getScalable(16))))
return InstructionCost::getInvalid();

// TODO: consider latency as well for TCK_SizeAndLatency.
Expand Down Expand Up @@ -4234,4 +4244,4 @@ bool AArch64TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);

return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
}
}
2 changes: 1 addition & 1 deletion llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
if (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy())
return true;

if (Ty->isIntegerTy(8) || Ty->isIntegerTy(16) ||
if (Ty->isIntegerTy(1) || Ty->isIntegerTy(8) || Ty->isIntegerTy(16) ||
Ty->isIntegerTy(32) || Ty->isIntegerTy(64))
return true;

Expand Down
5 changes: 5 additions & 0 deletions llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ define void @scalable() {
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv1i64 = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i64> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32f16 = call <vscale x 32 x half> @llvm.masked.load.nxv32f16.p0(ptr undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x half> undef)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv4i1 = call <vscale x 4 x i1> @llvm.masked.load.nxv4i1.p0(ptr undef, i32 16, <vscale x 4 x i1> undef, <vscale x 4 x i1> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
entry:
Expand Down Expand Up @@ -103,6 +104,9 @@ entry:
%nxv4i64 = call <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef)
%nxv32f16 = call <vscale x 32 x half> @llvm.masked.load.nxv32f16.p0(ptr undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x half> undef)

; Types that are legal, but for which we have no masked load/store lowering
%nxv4i1 = call <vscale x 4 x i1> @llvm.masked.load.nxv4i1.p0(ptr undef, i32 16, <vscale x 4 x i1> undef, <vscale x 4 x i1> undef)

ret void
}

Expand Down Expand Up @@ -265,6 +269,7 @@ declare <32 x half> @llvm.masked.load.v32f16.p0(ptr, i32, <32 x i1>, <32 x half>
declare <2 x float> @llvm.masked.load.v2f32.p0(ptr, i32, <2 x i1>, <2 x float>)
declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32, <4 x i1>, <4 x float>)
declare <2 x double> @llvm.masked.load.v2f64.p0(ptr, i32, <2 x i1>, <2 x double>)
declare <vscale x 4 x i1> @llvm.masked.load.nxv4i1.p0(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i1>)


declare <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
Expand Down
5 changes: 5 additions & 0 deletions llvm/test/Analysis/CostModel/AArch64/sve-gather.ll
Original file line number Diff line number Diff line change
Expand Up @@ -12,23 +12,27 @@ define void @masked_gathers(<vscale x 4 x i1> %nxv4i1mask, <vscale x 8 x i1> %nx
; CHECK-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %res.nxv4i1 = call <vscale x 4 x i1> @llvm.masked.gather.nxv4i1.nxv4p0(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i1> zeroinitializer)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; CHECK-VSCALE-2-LABEL: 'masked_gathers'
; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)
; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)
; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: %res.nxv4i1 = call <vscale x 4 x i1> @llvm.masked.gather.nxv4i1.nxv4p0(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i1> zeroinitializer)
; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; CHECK-VSCALE-1-LABEL: 'masked_gathers'
; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)
; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)
; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: %res.nxv4i1 = call <vscale x 4 x i1> @llvm.masked.gather.nxv4i1.nxv4p0(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i1> zeroinitializer)
; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer)
%res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32(<vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer)
%res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64(<vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer)
%res.nxv4i1 = call <vscale x 4 x i1> @llvm.masked.gather.nxv4i1(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i1> zeroinitializer)
ret void
}

Expand Down Expand Up @@ -130,6 +134,7 @@ attributes #3 = { "target-features"="+sve" vscale_range(2, 2) }
declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x ptr>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
declare <vscale x 8 x i32> @llvm.masked.gather.nxv8i32(<vscale x 8 x ptr>, i32, <vscale x 8 x i1>, <vscale x 8 x i32>)
declare <vscale x 1 x i64> @llvm.masked.gather.nxv1i64(<vscale x 1 x ptr>, i32, <vscale x 1 x i1>, <vscale x 1 x i64>)
declare <vscale x 4 x i1> @llvm.masked.gather.nxv4i1(<vscale x 4 x ptr>, i32, <vscale x 4 x i1>, <vscale x 4 x i1>)
declare <vscale x 4 x double> @llvm.masked.gather.nxv4f64(<vscale x 4 x ptr>, i32, <vscale x 4 x i1>, <vscale x 4 x double>)
declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x ptr>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
declare <vscale x 8 x float> @llvm.masked.gather.nxv8f32(<vscale x 8 x ptr>, i32, <vscale x 8 x i1>, <vscale x 8 x float>)
Expand Down
12 changes: 12 additions & 0 deletions llvm/test/Analysis/CostModel/AArch64/sve-ldst.ll
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,18 @@ define void @scalable_loads() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.nxv16i8 = load <vscale x 16 x i8>, ptr undef, align 16
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res.nxv32i8 = load <vscale x 32 x i8>, ptr undef, align 32
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = load <vscale x 1 x i64>, ptr undef, align 8
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res.nxv32i1 = load <vscale x 32 x i1>, ptr undef, align 4
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.nxv16i1 = load <vscale x 16 x i1>, ptr undef, align 2
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %res.nxv4i1 = load <vscale x 4 x i1>, ptr undef, align 1
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%res.nxv8i8 = load <vscale x 8 x i8>, ptr undef
%res.nxv16i8 = load <vscale x 16 x i8>, ptr undef
%res.nxv32i8 = load <vscale x 32 x i8>, ptr undef
%res.nxv1i64 = load <vscale x 1 x i64>, ptr undef
%res.nxv32i1 = load <vscale x 32 x i1>, ptr undef
%res.nxv16i1 = load <vscale x 16 x i1>, ptr undef
%res.nxv4i1 = load <vscale x 4 x i1>, ptr undef
ret void
}

Expand All @@ -24,12 +30,18 @@ define void @scalable_stores() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 16 x i8> undef, ptr undef, align 16
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <vscale x 32 x i8> undef, ptr undef, align 32
; CHECK-NEXT: Cost Model: Invalid cost for instruction: store <vscale x 1 x i64> undef, ptr undef, align 8
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <vscale x 32 x i1> undef, ptr undef, align 4
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 16 x i1> undef, ptr undef, align 2
; CHECK-NEXT: Cost Model: Invalid cost for instruction: store <vscale x 4 x i1> undef, ptr undef, align 1
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
store <vscale x 8 x i8> undef, ptr undef
store <vscale x 16 x i8> undef, ptr undef
store <vscale x 32 x i8> undef, ptr undef
store <vscale x 1 x i64> undef, ptr undef
store <vscale x 32 x i1> undef, ptr undef
store <vscale x 16 x i1> undef, ptr undef
store <vscale x 4 x i1> undef, ptr undef
ret void
}

Expand Down
5 changes: 5 additions & 0 deletions llvm/test/Analysis/CostModel/AArch64/sve-scatter.ll
Original file line number Diff line number Diff line change
Expand Up @@ -12,23 +12,27 @@ define void @masked_scatters(<vscale x 4 x i1> %nxv4i1mask, <vscale x 8 x i1> %n
; CHECK-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
; CHECK-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> undef, <vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> undef, <vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i1.nxv4p0(<vscale x 4 x i1> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; CHECK-VSCALE-2-LABEL: 'masked_scatters'
; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> undef, <vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)
; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> undef, <vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i1.nxv4p0(<vscale x 4 x i1> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; CHECK-VSCALE-1-LABEL: 'masked_scatters'
; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> undef, <vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)
; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> undef, <vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i1.nxv4p0(<vscale x 4 x i1> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
call void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
call void @llvm.masked.scatter.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask)
call void @llvm.masked.scatter.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask)
call void @llvm.masked.scatter.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask)
ret void
}

Expand Down Expand Up @@ -112,6 +116,7 @@ attributes #2 = { "target-features"="+sve" }
declare void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x ptr>, i32, <vscale x 4 x i1>)
declare void @llvm.masked.scatter.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x ptr>, i32, <vscale x 8 x i1>)
declare void @llvm.masked.scatter.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x ptr>, i32, <vscale x 1 x i1>)
declare void @llvm.masked.scatter.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x ptr>, i32, <vscale x 4 x i1>)
declare void @llvm.masked.scatter.nxv4f64(<vscale x 4 x double>, <vscale x 4 x ptr>, i32, <vscale x 4 x i1>)
declare void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double>, <vscale x 2 x ptr>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv8f32(<vscale x 8 x float>, <vscale x 8 x ptr>, i32, <vscale x 8 x i1>)
Expand Down
32 changes: 32 additions & 0 deletions llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,38 @@ for.cond.cleanup:
ret void
}


; ADD (with reduction of i1)

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define i1 @add_trunc_i32_i1(ptr nocapture %src, i64 %N) {
; CHECK-LABEL: @add_trunc_i32_i1
; CHECK: vector.body:
; CHECK: %[[PHI1:.*]] = phi <vscale x 8 x i1> [ zeroinitializer, %{{.*}} ], [ %20, %vector.body ]
; CHECK: %[[PHI2:.*]] = phi <vscale x 8 x i1> [ zeroinitializer, %{{.*}} ], [ %21, %vector.body ]
; CHECK: %[[TRUNC1:.*]] = trunc <vscale x 8 x i32> %{{.*}} to <vscale x 8 x i1>
; CHECK: %[[TRUNC2:.*]] = trunc <vscale x 8 x i32> %{{.*}} to <vscale x 8 x i1>
; CHECK: %{{.*}} = xor <vscale x 8 x i1> %[[PHI1]], %[[TRUNC1]]
; CHECK: %{{.*}} = xor <vscale x 8 x i1> %[[PHI2]], %[[TRUNC2]]
entry:
br label %for.body

for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%red = phi i1 [ 0, %entry ], [ %red.next, %for.body ]
%arrayidx = getelementptr inbounds i32, ptr %src, i64 %iv
%load32 = load i32, ptr %arrayidx, align 4
%trunc = trunc i32 %load32 to i1
%red.next = xor i1 %red, %trunc
%iv.next = add i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, %N
br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:
ret i1 %red.next
}


; Reduction cannot be vectorized

; MUL
Expand Down
27 changes: 0 additions & 27 deletions llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll
Original file line number Diff line number Diff line change
Expand Up @@ -80,33 +80,6 @@ for.end:
ret void
}

; CHECK-REMARKS: Scalable vectorization is not supported for all element types found in this loop
define void @uniform_store_i1(ptr noalias %dst, ptr noalias %start, i64 %N) {
; CHECK-LABEL: @uniform_store_i1
; CHECK: vector.body
; CHECK: %[[GEP:.*]] = getelementptr inbounds i64, <64 x ptr> {{.*}}, i64 1
; CHECK: %[[ICMP:.*]] = icmp eq <64 x ptr> %[[GEP]], %[[SPLAT:.*]]
; CHECK: %[[EXTRACT1:.*]] = extractelement <64 x i1> %[[ICMP]], i32 63
; CHECK: store i1 %[[EXTRACT1]], ptr %dst
; CHECK-NOT: vscale
entry:
br label %for.body

for.body:
%first.sroa = phi ptr [ %incdec.ptr, %for.body ], [ %start, %entry ]
%iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
%iv.next = add i64 %iv, 1
%0 = load i64, ptr %first.sroa
%incdec.ptr = getelementptr inbounds i64, ptr %first.sroa, i64 1
%cmp.not = icmp eq ptr %incdec.ptr, %start
store i1 %cmp.not, ptr %dst
%cmp = icmp ult i64 %iv, %N
br i1 %cmp, label %for.body, label %end, !llvm.loop !0

end:
ret void
}

define dso_local void @loop_fixed_width_i128(ptr nocapture %ptr, i64 %N) {
; CHECK-LABEL: @loop_fixed_width_i128
; CHECK: load <4 x i128>, ptr
Expand Down
Loading
Loading