[LLVM][SVE] Honour calling convention when using SVE for fixed length vectors. #70847

Merged 2 commits on Nov 30, 2023
96 changes: 96 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26718,3 +26718,99 @@ bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
  return Subtarget->getMinimumJumpTableEntries();
}

MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                         CallingConv::ID CC,
                                                         EVT VT) const {
  bool NonUnitFixedLengthVector =
      VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
  if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
    return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);

  EVT VT1;
  MVT RegisterVT;
  unsigned NumIntermediates;
  getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
                                       RegisterVT);
  return RegisterVT;
}

unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
  bool NonUnitFixedLengthVector =
      VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
  if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
    return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);

  EVT VT1;
  MVT VT2;
  unsigned NumIntermediates;
  return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
                                              NumIntermediates, VT2);
}

unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {
  int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
  if (!RegisterVT.isFixedLengthVector() ||
      RegisterVT.getFixedSizeInBits() <= 128)
    return NumRegs;

  assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
  assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
  assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");

  // A size mismatch here implies either type promotion or widening and would
  // have resulted in scalarisation if larger vectors had not been available.
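  // For example, with a 256-bit SVE vector length a <3 x i64> argument is
  // widened to a 256-bit fixed length type and a <32 x i1> argument is
  // promoted to one; in both cases we fall back to one element per register
  // below, preserving the NEON/AAPCS64 behaviour (see the v3i64 and v32i1
  // tests in the new test file).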
  if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
    EVT EltTy = VT.getVectorElementType();
    EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
    if (!isTypeLegal(NewVT))
      NewVT = EltTy;

    IntermediateVT = NewVT;
    NumIntermediates = VT.getVectorNumElements();
    RegisterVT = getRegisterType(Context, NewVT);
    return NumIntermediates;
  }

  // SVE VLS support does not introduce a new ABI so we should use NEON sized
  // types for vector arguments and returns.
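  //
  // For example, with a 256-bit SVE vector length a <8 x i32> argument
  // arrives here as a single 256-bit register; NumSubRegs is then 256/128 = 2
  // and the i32 case below selects v4i32, i.e. the value is passed as the two
  // q registers the NEON ABI expects (see the v8i32 test in the new test
  // file).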

  unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
  NumIntermediates *= NumSubRegs;
  NumRegs *= NumSubRegs;

  switch (RegisterVT.getVectorElementType().SimpleTy) {
  default:
    llvm_unreachable("unexpected element type for vector");
  case MVT::i8:
    IntermediateVT = RegisterVT = MVT::v16i8;
    break;
  case MVT::i16:
    IntermediateVT = RegisterVT = MVT::v8i16;
    break;
  case MVT::i32:
    IntermediateVT = RegisterVT = MVT::v4i32;
    break;
  case MVT::i64:
    IntermediateVT = RegisterVT = MVT::v2i64;
    break;
  case MVT::f16:
    IntermediateVT = RegisterVT = MVT::v8f16;
    break;
  case MVT::f32:
    IntermediateVT = RegisterVT = MVT::v4f32;
    break;
  case MVT::f64:
    IntermediateVT = RegisterVT = MVT::v2f64;
    break;
  case MVT::bf16:
    IntermediateVT = RegisterVT = MVT::v8bf16;
    break;
  }

  return NumRegs;
}
12 changes: 12 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -954,6 +954,18 @@ class AArch64TargetLowering : public TargetLowering {
  // used for 64bit and 128bit vectors as well.
  bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON = false) const;

  // Follow NEON ABI rules even when using SVE for fixed length vectors.
  MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
                                    EVT VT) const override;
  unsigned getNumRegistersForCallingConv(LLVMContext &Context,
                                         CallingConv::ID CC,
                                         EVT VT) const override;
  unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context,
                                                CallingConv::ID CC, EVT VT,
                                                EVT &IntermediateVT,
                                                unsigned &NumIntermediates,
                                                MVT &RegisterVT) const override;

private:
  /// Keep a pointer to the AArch64Subtarget around so that we can
  /// make the right decision when generating code for different targets.
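For context, an illustrative sketch (not part of the patch; the function names are hypothetical): because SVE VLS does not define a new ABI, code built with SVE fixed-length lowering must stay link-compatible with code built without it, which requires both sides to pass wide fixed-length vectors as NEON-sized pieces.

; Hypothetical IR: the caller is compiled with SVE available, the callee may
; not be. Honouring the NEON calling convention means the <8 x i32> argument
; is passed as two 128-bit parts (q0/q1), matching what a NEON-only build
; expects.
declare void @callee_built_without_sve(<8 x i32>)

define void @caller_built_with_sve(<8 x i32> %v) "target-features"="+sve" {
  call void @callee_built_without_sve(<8 x i32> %v)
  ret void
}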
245 changes: 245 additions & 0 deletions llvm/test/CodeGen/AArch64/sve-fixed-length-function-calls.ll
@@ -0,0 +1,245 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s | FileCheck %s
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s
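; All three RUN lines share the same CHECK lines: the calling convention must
; not change when SVE is used for fixed length vectors, so the generated
; argument moves are identical whether the minimum SVE vector length is
; unset, 256 bits, or 512 bits.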

target triple = "aarch64-unknown-linux-gnu"

declare void @foo_v32i8(<32 x i8>)
define void @test_v32i8(<32 x i8> %unused, <32 x i8> %a) #0 {
; CHECK-LABEL: test_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v1.16b, v3.16b
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: b foo_v32i8
tail call void @foo_v32i8(<32 x i8> %a)
ret void
}

declare void @foo_v16i16(<16 x i16>)
define void @test_v16i16(<16 x i16> %unused, <16 x i16> %a) #0 {
; CHECK-LABEL: test_v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v1.16b, v3.16b
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: b foo_v16i16
tail call void @foo_v16i16(<16 x i16> %a)
ret void
}

declare void @foo_v8i32(<8 x i32>)
define void @test_v8i32(<8 x i32> %unused, <8 x i32> %a) #0 {
; CHECK-LABEL: test_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v1.16b, v3.16b
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: b foo_v8i32
tail call void @foo_v8i32(<8 x i32> %a)
ret void
}

declare void @foo_v4i64(<4 x i64>)
define void @test_v4i64(<4 x i64> %unused, <4 x i64> %a) #0 {
; CHECK-LABEL: test_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v1.16b, v3.16b
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: b foo_v4i64
tail call void @foo_v4i64(<4 x i64> %a)
ret void
}

declare void @foo_v16f16(<16 x half>)
define void @test_v16f16(<16 x half> %unused, <16 x half> %a) #0 {
; CHECK-LABEL: test_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v1.16b, v3.16b
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: b foo_v16f16
tail call void @foo_v16f16(<16 x half> %a)
ret void
}

declare void @foo_v8f32(<8 x float>)
define void @test_v8f32(<8 x float> %unused, <8 x float> %a) #0 {
; CHECK-LABEL: test_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v1.16b, v3.16b
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: b foo_v8f32
tail call void @foo_v8f32(<8 x float> %a)
ret void
}

declare void @foo_v4f64(<4 x double>)
define void @test_v4f64(<4 x double> %unused, <4 x double> %a) #0 {
; CHECK-LABEL: test_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v1.16b, v3.16b
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: b foo_v4f64
tail call void @foo_v4f64(<4 x double> %a)
ret void
}

declare void @foo_v16bf16(<16 x bfloat>)
define void @test_v16bf16(<16 x bfloat> %unused, <16 x bfloat> %a) #0 {
; CHECK-LABEL: test_v16bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v1.16b, v3.16b
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: b foo_v16bf16
tail call void @foo_v16bf16(<16 x bfloat> %a)
ret void
}

declare void @foo_v3i64(<3 x i64>)
define void @test_v3i64(<3 x i64> %unused, <3 x i64> %a) #0 {
; CHECK-LABEL: test_v3i64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d2, d5
; CHECK-NEXT: fmov d1, d4
; CHECK-NEXT: fmov d0, d3
; CHECK-NEXT: b foo_v3i64
tail call void @foo_v3i64(<3 x i64> %a)
ret void
}

declare void @foo_v5i64(<5 x i64>)
define void @test_v5i64(<5 x i64> %unused, <5 x i64> %a) #0 {
; CHECK-LABEL: test_v5i64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d1, d6
; CHECK-NEXT: fmov d0, d5
; CHECK-NEXT: fmov d2, d7
; CHECK-NEXT: ldp d3, d4, [sp]
; CHECK-NEXT: b foo_v5i64
tail call void @foo_v5i64(<5 x i64> %a)
ret void
}

declare void @foo_v1i16(<1 x i16>)
define void @test_v1i16(<1 x i16> %unused, <1 x i16> %a) #0 {
; CHECK-LABEL: test_v1i16:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d0, d1
; CHECK-NEXT: b foo_v1i16
tail call void @foo_v1i16(<1 x i16> %a)
ret void
}

declare void @foo_v9i16(<9 x i16>)
define void @test_v9i16(<9 x i16> %unused, <9 x i16> %a) #0 {
; CHECK-LABEL: test_v9i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr w0, [sp, #8]
; CHECK-NEXT: ldr w1, [sp, #16]
; CHECK-NEXT: ldr w2, [sp, #24]
; CHECK-NEXT: ldr w3, [sp, #32]
; CHECK-NEXT: ldr w4, [sp, #40]
; CHECK-NEXT: ldr w5, [sp, #48]
; CHECK-NEXT: ldr w6, [sp, #56]
; CHECK-NEXT: ldr w7, [sp, #64]
; CHECK-NEXT: ldr w8, [sp, #72]
; CHECK-NEXT: str w8, [sp]
; CHECK-NEXT: b foo_v9i16
tail call void @foo_v9i16(<9 x i16> %a)
ret void
}

declare void @foo_v16i1(<16 x i1>)
define void @test_v16i1(<16 x i1> %unused, <16 x i1> %a) #0 {
; CHECK-LABEL: test_v16i1:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: b foo_v16i1
tail call void @foo_v16i1(<16 x i1> %a)
ret void
}

; UTC_ARGS: --disable
; The output from this test is large and generally not useful; what matters is
; that no vector registers are used.
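; The <32 x i1> argument is broken down element by element, so it is passed in
; general purpose registers and on the stack rather than in vector registers.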
declare void @foo_v32i1(<32 x i1>)
define void @test_v32i1(<32 x i1> %unused, <32 x i1> %a) #0 {
; CHECK-LABEL: test_v32i1:
; CHECK: // %bb.0:
; CHECK-NOT: [q,v,z][0-9]+
; CHECK: b foo_v32i1
tail call void @foo_v32i1(<32 x i1> %a)
ret void
}
; UTC_ARGS: --enable

declare void @foo_v1i128(<1 x i128>)
define void @test_v1i128(<1 x i128> %unused, <1 x i128> %a) #0 {
; CHECK-LABEL: test_v1i128:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x1, x3
; CHECK-NEXT: mov x0, x2
; CHECK-NEXT: b foo_v1i128
tail call void @foo_v1i128(<1 x i128> %a)
ret void
}

declare void @foo_v2i128(<2 x i128>)
define void @test_v2i128(<2 x i128> %unused, <2 x i128> %a) #0 {
; CHECK-LABEL: test_v2i128:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x3, x7
; CHECK-NEXT: mov x2, x6
; CHECK-NEXT: mov x0, x4
; CHECK-NEXT: mov x1, x5
; CHECK-NEXT: b foo_v2i128
tail call void @foo_v2i128(<2 x i128> %a)
ret void
}

declare void @foo_v1i256(<1 x i256>)
define void @test_v1i256(<1 x i256> %unused, <1 x i256> %a) #0 {
; CHECK-LABEL: test_v1i256:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x3, x7
; CHECK-NEXT: mov x2, x6
; CHECK-NEXT: mov x0, x4
; CHECK-NEXT: mov x1, x5
; CHECK-NEXT: b foo_v1i256
tail call void @foo_v1i256(<1 x i256> %a)
ret void
}

declare void @foo_v2i256(<2 x i256>)
define void @test_v2i256(<2 x i256> %unused, <2 x i256> %a) #0 {
; CHECK-LABEL: test_v2i256:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp x0, x1, [sp]
; CHECK-NEXT: ldp x2, x3, [sp, #16]
; CHECK-NEXT: ldp x4, x5, [sp, #32]
; CHECK-NEXT: ldp x6, x7, [sp, #48]
; CHECK-NEXT: b foo_v2i256
tail call void @foo_v2i256(<2 x i256> %a)
ret void
}

declare void @foo_v1f128(<1 x fp128>)
define void @test_v1f128(<1 x fp128> %unused, <1 x fp128> %a) #0 {
; CHECK-LABEL: test_v1f128:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: b foo_v1f128
tail call void @foo_v1f128(<1 x fp128> %a)
ret void
}

declare void @foo_v2f128(<2 x fp128>)
define void @test_v2f128(<2 x fp128> %unused, <2 x fp128> %a) #0 {
; CHECK-LABEL: test_v2f128:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v1.16b, v3.16b
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: b foo_v2f128
tail call void @foo_v2f128(<2 x fp128> %a)
ret void
}

attributes #0 = { "target-features"="+sve,+bf16" nounwind }