[AArch64][SME] Remove immediate argument restriction for svldr and svstr #68565
The svldr_vnum_za and svstr_vnum_za builtins/intrinsics currently require that the vnum argument be an immediate, since the instructions take an immediate vector number. However, we emit 0 as the immediate for the instruction no matter what, and instead modify the base register. This patch removes that restriction on the argument, so that the argument can be a non-immediate. If an appropriate immediate was passed to the builtin, CGBuiltin passes it directly to the LLVM intrinsic; otherwise it modifies the base register, as in the existing behaviour.

Full diff: https://github.com/llvm/llvm-project/pull/68565.diff

9 Files Affected:
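To make the user-facing effect concrete, here is a minimal sketch (not taken from the patch) of code that was rejected before this change and is accepted after it. The header name follows the draft ACLE spelling clang used at the time and may differ in newer toolchains; streaming/ZA state management is elided for brevity.

#include <stdint.h>
#include <arm_sme_draft_spec_subject_to_change.h> // assumed header name; varies by toolchain version

void copy_rows(uint32_t slice_base, const void *src, void *dst, uint64_t vnum) {
  // Non-immediate vnum: previously a compile-time error ("must be a constant
  // integer"); now lowered via cntsb + getelementptr on the base pointer.
  svldr_vnum_za(slice_base, src, vnum);
  svstr_vnum_za(slice_base, dst, vnum);
  // In-range immediate: forwarded directly as the intrinsic's immediate operand.
  svstr_vnum_za(slice_base, dst, 3);
}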
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index d014900d719c338..49ef6b6b3fc4359 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -44,10 +44,9 @@ defm SVLD1_ZA32 : ZALoad<"za32", "i", "aarch64_sme_ld1w", [ImmCheck<0, ImmCheck0
defm SVLD1_ZA64 : ZALoad<"za64", "l", "aarch64_sme_ld1d", [ImmCheck<0, ImmCheck0_7>]>;
defm SVLD1_ZA128 : ZALoad<"za128", "q", "aarch64_sme_ld1q", [ImmCheck<0, ImmCheck0_15>]>;
-def SVLDR_VNUM_ZA : MInst<"svldr_vnum_za", "vmQi", "",
+def SVLDR_VNUM_ZA : MInst<"svldr_vnum_za", "vmQn", "",
[IsOverloadNone, IsStreamingCompatible, IsSharedZA],
- MemEltTyDefault, "aarch64_sme_ldr",
- [ImmCheck<2, ImmCheck0_15>]>;
+ MemEltTyDefault, "aarch64_sme_ldr">;
def SVLDR_ZA : MInst<"svldr_za", "vmQ", "",
[IsOverloadNone, IsStreamingCompatible, IsSharedZA],
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index d14cf0dccb09982..ca4bf498cab9535 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -9606,7 +9606,7 @@ Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E,
}
Value *CodeGenFunction::EmitTileslice(Value *Offset, Value *Base) {
- llvm::Value *CastOffset = Builder.CreateIntCast(Offset, Int32Ty, false);
+ llvm::Value *CastOffset = Builder.CreateIntCast(Offset, Int64Ty, false);
return Builder.CreateAdd(Base, CastOffset, "tileslice");
}
@@ -9665,18 +9665,34 @@ Value *CodeGenFunction::EmitSMEZero(const SVETypeFlags &TypeFlags,
Value *CodeGenFunction::EmitSMELdrStr(const SVETypeFlags &TypeFlags,
SmallVectorImpl<Value *> &Ops,
unsigned IntID) {
- if (Ops.size() == 3) {
- Function *Cntsb = CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb);
- llvm::Value *CntsbCall = Builder.CreateCall(Cntsb, {}, "svlb");
- llvm::Value *MulVL = Builder.CreateMul(
- CntsbCall,
- Builder.getInt64(cast<llvm::ConstantInt>(Ops[2])->getZExtValue()),
- "mulvl");
-
- Ops[1] = Builder.CreateGEP(Int8Ty, Ops[1], MulVL);
- Ops[0] = EmitTileslice(Ops[0], Ops[2]);
- Ops.erase(&Ops[2]);
- }
+ if (Ops.size() == 2) {
+ // Intrinsics without a vecnum also use this function, so just provide 0
+ Ops.push_back(Ops[1]);
+ Ops[1] = Builder.getInt32(0);
+ } else {
+ int Imm = -1;
+ if (ConstantInt* C = dyn_cast<ConstantInt>(Ops[2]))
+ if (C->getZExtValue() <= 15)
+ Imm = C->getZExtValue();
+
+ if (Imm != -1) {
+ Ops[2] = Ops[1];
+ Ops[1] = Builder.getInt32(Imm);
+ } else {
+ Function *Cntsb = CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb);
+ llvm::Value *CntsbCall = Builder.CreateCall(Cntsb, {}, "svlb");
+
+ llvm::Value *VecNum = Ops[2];
+ llvm::Value *MulVL = Builder.CreateMul(
+ CntsbCall,
+ VecNum,
+ "mulvl");
+
+ Ops[2] = Builder.CreateGEP(Int8Ty, Ops[1], MulVL);
+ Ops[1] = Builder.getInt32(0);
+ Ops[0] = Builder.CreateIntCast(EmitTileslice(Ops[0], VecNum), Int32Ty, false);
+ }
+ }
Function *F = CGM.getIntrinsic(IntID, {});
return Builder.CreateCall(F, Ops);
}
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c
index acddc2ef50a3ddf..df7ff4ca995b544 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c
@@ -8,7 +8,7 @@
// CHECK-C-LABEL: @test_svldr_vnum_za(
// CHECK-CXX-LABEL: @_Z18test_svldr_vnum_zajPKv(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE]], i32 0, ptr [[PTR]])
// CHECK-NEXT: ret void
//
void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) {
@@ -18,22 +18,34 @@ void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) {
// CHECK-C-LABEL: @test_svldr_vnum_za_1(
// CHECK-CXX-LABEL: @_Z20test_svldr_vnum_za_1jPKv(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
-// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], 15
-// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[MULVL]]
-// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 15
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[TILESLICE]], ptr [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE]], i32 15, ptr [[PTR]])
// CHECK-NEXT: ret void
//
void test_svldr_vnum_za_1(uint32_t slice_base, const void *ptr) {
svldr_vnum_za(slice_base, ptr, 15);
}
+// CHECK-C-LABEL: @test_svldr_vnum_za_var(
+// CHECK-CXX-LABEL: @_Z22test_svldr_vnum_za_varjPKvm(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], [[VNUM]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], [[SLICE_BASE]]
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[TMP2]], i32 0, ptr [[TMP0]])
+// CHECK-NEXT: ret void
+//
+void test_svldr_vnum_za_var(uint32_t slice_base, const void *ptr, uint64_t vnum) {
+ svldr_vnum_za(slice_base, ptr, vnum);
+}
+
// CHECK-C-LABEL: @test_svldr_za(
// CHECK-CXX-LABEL: @_Z13test_svldr_zajPKv(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE]], i32 0, ptr [[PTR]])
// CHECK-NEXT: ret void
+//
void test_svldr_za(uint32_t slice_base, const void *ptr) {
svldr_za(slice_base, ptr);
}
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c
index 2728f9ac0cd12d3..f384bd76899b0fd 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c
@@ -8,31 +8,28 @@
// CHECK-C-LABEL: @test_svstr_vnum_za(
// CHECK-CXX-LABEL: @_Z18test_svstr_vnum_zajPv(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE]], i32 0, ptr [[PTR]])
// CHECK-NEXT: ret void
//
void test_svstr_vnum_za(uint32_t slice_base, void *ptr) {
svstr_vnum_za(slice_base, ptr, 0);
}
-// CHECK-C-LABEL: @test_svstr_vnum_za_1(
-// CHECK-CXX-LABEL: @_Z20test_svstr_vnum_za_1jPv(
+// CHECK-C-LABEL: define dso_local void @test_svstr_vnum_za_1(
+// CHECK-CXX-LABEL: define dso_local void @_Z20test_svstr_vnum_za_1jPv(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
-// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], 15
-// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[MULVL]]
-// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 15
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[TILESLICE]], ptr [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE]], i32 15, ptr [[PTR]])
// CHECK-NEXT: ret void
//
void test_svstr_vnum_za_1(uint32_t slice_base, void *ptr) {
svstr_vnum_za(slice_base, ptr, 15);
}
-// CHECK-C-LABEL: @test_svstr_za(
-// CHECK-CXX-LABEL: @_Z13test_svstr_zajPv(
+// CHECK-C-LABEL: define dso_local void @test_svstr_za(
+// CHECK-CXX-LABEL: define dso_local void @_Z13test_svstr_zajPv(
+// CHECK-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE]], i32 0, ptr [[PTR]])
// CHECK-NEXT: ret void
//
void test_svstr_za(uint32_t slice_base, void *ptr) {
diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp
index 7475fd53b80ba2b..1faa5638c801c2d 100644
--- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp
+++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp
@@ -143,11 +143,6 @@ void test_range_0_15(uint32_t slice, svbool_t pg, void *ptr) {
// expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}}
SVE_ACLE_FUNC(svst1_ver_vnum_za128,,,)(16, slice, pg, ptr, 1);
- // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}}
- SVE_ACLE_FUNC(svldr_vnum_za,,,)(-1, ptr, 16);
- // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}}
- SVE_ACLE_FUNC(svstr_vnum_za,,,)(-1, ptr, -1);
-
// expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}}
SVE_ACLE_FUNC(svread_hor_za128, _s8, _m,)(svundef_s8(), pg, -1, slice);
// expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}}
@@ -171,9 +166,6 @@ void test_constant(uint64_t u64, svbool_t pg, void *ptr) {
SVE_ACLE_FUNC(svld1_hor_vnum_za8,,,)(u64, 0, pg, ptr, u64); // expected-error {{argument to 'svld1_hor_vnum_za8' must be a constant integer}}
SVE_ACLE_FUNC(svst1_hor_vnum_za32,,,)(u64, 0, pg, ptr, u64); // expected-error {{argument to 'svst1_hor_vnum_za32' must be a constant integer}}
- SVE_ACLE_FUNC(svldr_vnum_za,,,)(u64, ptr, u64); // expected-error {{argument to 'svldr_vnum_za' must be a constant integer}}
- SVE_ACLE_FUNC(svstr_vnum_za,,,)(u64, ptr, u64); // expected-error {{argument to 'svstr_vnum_za' must be a constant integer}}
-
SVE_ACLE_FUNC(svread_ver_za16, _s16, _m,)(svundef_s16(), pg, u64, 0); // expected-error-re {{argument to 'svread_ver_za16{{.*}}_m' must be a constant integer}}
SVE_ACLE_FUNC(svwrite_ver_za64, _s64, _m,)(u64, 0, pg, svundef_s64()); // expected-error-re {{argument to 'svwrite_ver_za64{{.*}}_m' must be a constant integer}}
}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 557063c8813268e..26827cf6110d497 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2680,9 +2680,9 @@ let TargetPrefix = "aarch64" in {
// Spill + fill
def int_aarch64_sme_ldr : DefaultAttrsIntrinsic<
- [], [llvm_i32_ty, llvm_ptr_ty]>;
+ [], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<ArgIndex<1>>]>;
def int_aarch64_sme_str : DefaultAttrsIntrinsic<
- [], [llvm_i32_ty, llvm_ptr_ty]>;
+ [], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<ArgIndex<1>>]>;
class SME_TileToVector_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index edd24b4a849b547..5b5b6a31705df33 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -794,8 +794,8 @@ multiclass sme_spill<string opcodestr> {
(!cast<Instruction>(NAME) MatrixOp:$ZAt,
MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, 0), 1>;
// base
- def : Pat<(int_aarch64_sme_str MatrixIndexGPR32Op12_15:$idx, GPR64sp:$base),
- (!cast<Instruction>(NAME) ZA, $idx, 0, $base, 0)>;
+ def : Pat<(int_aarch64_sme_str MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_15:$imm, GPR64sp:$base),
+ (!cast<Instruction>(NAME) ZA, $idx, $imm, $base, 0)>;
}
multiclass sme_fill<string opcodestr> {
@@ -805,7 +805,7 @@ multiclass sme_fill<string opcodestr> {
MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, 0), 1>;
def NAME # _PSEUDO
: Pseudo<(outs),
- (ins MatrixIndexGPR32Op12_15:$idx, imm0_15:$imm4,
+ (ins MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_15:$imm4,
GPR64sp:$base), []>,
Sched<[]> {
// Translated to actual instruction in AArch64ISelLowering.cpp
@@ -813,8 +813,8 @@ multiclass sme_fill<string opcodestr> {
let mayLoad = 1;
}
// base
- def : Pat<(int_aarch64_sme_ldr MatrixIndexGPR32Op12_15:$idx, GPR64sp:$base),
- (!cast<Instruction>(NAME # _PSEUDO) $idx, 0, $base)>;
+ def : Pat<(int_aarch64_sme_ldr MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_15:$imm, GPR64sp:$base),
+ (!cast<Instruction>(NAME # _PSEUDO) $idx, $imm, $base)>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/Analysis/CostModel/ARM/unaligned_double_load.ll b/llvm/test/Analysis/CostModel/ARM/unaligned_double_load.ll
new file mode 100644
index 000000000000000..8d457220ea9c5ae
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/ARM/unaligned_double_load.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv6m-none-eabi < %s | FileCheck %s --check-prefix=CHECK-NOVEC
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7m-none-eabi -mcpu=cortex-m4 < %s | FileCheck %s --check-prefix=CHECK-FP
+
+define float @f(ptr %x) {
+; CHECK-NOVEC-LABEL: 'f'
+; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a.0.copyload = load float, ptr %x, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret float %a.0.copyload
+;
+; CHECK-FP-LABEL: 'f'
+; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a.0.copyload = load float, ptr %x, align 1
+; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret float %a.0.copyload
+;
+entry:
+ %a.0.copyload = load float, ptr %x, align 1
+ ret float %a.0.copyload
+}
+
+define float @ff(ptr %x, float %f) {
+; CHECK-NOVEC-LABEL: 'ff'
+; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store float %f, ptr %x, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret float undef
+;
+; CHECK-FP-LABEL: 'ff'
+; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store float %f, ptr %x, align 1
+; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret float undef
+;
+entry:
+ store float %f, ptr %x, align 1
+ ret float undef
+}
+
+define double @d(ptr %x) {
+; CHECK-NOVEC-LABEL: 'd'
+; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a.0.copyload = load double, ptr %x, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret double %a.0.copyload
+;
+; CHECK-FP-LABEL: 'd'
+; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a.0.copyload = load double, ptr %x, align 1
+; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret double %a.0.copyload
+;
+entry:
+ %a.0.copyload = load double, ptr %x, align 1
+ ret double %a.0.copyload
+}
+
+define double @dd(ptr %x, double %f) {
+; CHECK-NOVEC-LABEL: 'dd'
+; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store double %f, ptr %x, align 1
+; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret double undef
+;
+; CHECK-FP-LABEL: 'dd'
+; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store double %f, ptr %x, align 1
+; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret double undef
+;
+entry:
+ store double %f, ptr %x, align 1
+ ret double undef
+}
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll
index c96aca366ed43f2..f5d25a3229a7f82 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll
@@ -252,10 +252,28 @@ define void @ldr(ptr %ptr) {
; CHECK-NEXT: mov w12, wzr
; CHECK-NEXT: ldr za[w12, 0], [x0]
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.ldr(i32 0, ptr %ptr)
+ call void @llvm.aarch64.sme.ldr(i32 0, i32 0, ptr %ptr)
ret void;
}
+define void @ldr_vnum(i32 %tile_slice, ptr %ptr, i64 %vnum) {
+; CHECK-LABEL: ldr_vnum:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: add w12, w2, w0
+; CHECK-NEXT: madd x8, x8, x2, x1
+; CHECK-NEXT: ldr za[w12, 0], [x8]
+; CHECK-NEXT: ret
+entry:
+ %svlb = tail call i64 @llvm.aarch64.sme.cntsb()
+ %mulvl = mul i64 %svlb, %vnum
+ %0 = getelementptr i8, ptr %ptr, i64 %mulvl
+ %1 = trunc i64 %vnum to i32
+ %2 = add i32 %1, %tile_slice
+ tail call void @llvm.aarch64.sme.ldr(i32 %2, i32 0, ptr %0)
+ ret void
+}
+
define void @ldr_with_off_15(ptr %ptr) {
; CHECK-LABEL: ldr_with_off_15:
; CHECK: // %bb.0:
@@ -264,7 +282,7 @@ define void @ldr_with_off_15(ptr %ptr) {
; CHECK-NEXT: ldr za[w12, 0], [x8]
; CHECK-NEXT: ret
%base = getelementptr i8, ptr %ptr, i64 15
- call void @llvm.aarch64.sme.ldr(i32 15, ptr %base)
+ call void @llvm.aarch64.sme.ldr(i32 15, i32 0, ptr %base)
ret void;
}
@@ -278,7 +296,7 @@ define void @ldr_with_off_15mulvl(ptr %ptr) {
%vscale = call i64 @llvm.vscale.i64()
%mulvl = mul i64 %vscale, 240
%base = getelementptr i8, ptr %ptr, i64 %mulvl
- call void @llvm.aarch64.sme.ldr(i32 15, ptr %base)
+ call void @llvm.aarch64.sme.ldr(i32 15, i32 0, ptr %base)
ret void;
}
@@ -292,7 +310,7 @@ define void @ldr_with_off_16mulvl(ptr %ptr) {
%vscale = call i64 @llvm.vscale.i64()
%mulvl = mul i64 %vscale, 256
%base = getelementptr i8, ptr %ptr, i64 %mulvl
- call void @llvm.aarch64.sme.ldr(i32 16, ptr %base)
+ call void @llvm.aarch64.sme.ldr(i32 16, i32 0, ptr %base)
ret void;
}
@@ -302,13 +320,13 @@ define void @test_ld1_sink_tile0_offset_operand(<vscale x 4 x i1> %pg, ptr %src,
; CHECK-LABEL: test_ld1_sink_tile0_offset_operand:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov w12, w1
-; CHECK-NEXT: .LBB14_1: // %for.body
+; CHECK-NEXT: .LBB15_1: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ld1w {za0h.s[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: subs w2, w2, #1
; CHECK-NEXT: ld1w {za0h.s[w12, 1]}, p0/z, [x0]
; CHECK-NEXT: ld1w {za0h.s[w12, 2]}, p0/z, [x0]
-; CHECK-NEXT: b.ne .LBB14_1
+; CHECK-NEXT: b.ne .LBB15_1
; CHECK-NEXT: // %bb.2: // %exit
; CHECK-NEXT: ret
entry:
@@ -341,5 +359,6 @@ declare void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1>, ptr, i32, i32)
declare void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1>, ptr, i32, i32)
declare void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1>, ptr, i32, i32)
-declare void @llvm.aarch64.sme.ldr(i32, ptr)
+declare void @llvm.aarch64.sme.ldr(i32, i32, ptr)
declare i64 @llvm.vscale.i64()
+declare i64 @llvm.aarch64.sme.cntsb()
This patch seems sort of contradictory to me. You're encouraging users to pass a non-constant slice_offset, presumably because we expect the backend to optimize it... but at the same time, you're special-casing constant slice_offsets because you don't expect the backend to optimize it. What do you actually expect the backend to do in practice? I don't see any backend tests involving a non-zero slice_offset.

We currently have some code in getMemVTFromNode() in AArch64ISelDAGToDAG.cpp that handles sme_ldr/sme_str. Is that code currently reachable? If it is reachable, does it need to be adjusted for this change?
The changes to the LLVM IR intrinsics are to better utilise the reg+imm form of the instructions, but they aren't required to remove the immediate-argument restriction for svldr/svstr. I think it would be better to split the PR into two PRs:
- One patch that fixes vnum (uint64_t -> int64_t) and removes the 'must be immediate argument' restriction (sketched below).
- Another patch that tries to better use the immediate in the instruction by changing the LLVM IR intrinsic with an extra parameter.
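For concreteness, a hedged sketch of the prototype change the first patch implies; the real declarations are generated into the arm_sme header, so the exact spelling here is an assumption:

// Before: vnum is unsigned and must be an immediate in [0, 15].
void svldr_vnum_za(uint32_t slice_base, const void *ptr, uint64_t vnum);
// After: vnum is signed and may be a runtime value.
void svldr_vnum_za(uint32_t slice_base, const void *ptr, int64_t vnum);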
I've decided to keep this PR up for the patch that modifies the codegen and IR intrinsic since there's a lot of good relevant review of that here. I've made a new PR with the separated changes that just modify the clang builtin. This PR will later be rebased on top of that PR after it's merged.
I've updated this patch with my progress in lowering the intrinsics in DAGToDAG, hopefully capturing your feedback in the process.
Could you briefly comment on the tradeoff of adding an argument to the intrinsic, vs. pattern-matching constant offsets to the existing intrinsic?
Also, you might want to consider marking the offset immarg, instead of trying to handle variable offsets in isel.
I've changed the approach to consider immediates outside of 0-15 and fixed the issue of the tile slice not being updated. Please let me know what you think.
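For example (a hedged sketch, not a test from this PR), an immediate outside the instruction's 0-15 range now takes the same fallback path as a variable vnum, adjusting both the base pointer and the tile slice:

void spill_row_16(uint32_t slice_base, void *ptr) {
  // 16 does not fit the instruction's 4-bit vector-number field, so expect
  // cntsb/mul/getelementptr on ptr and slice_base + 16 for the tile slice.
  svstr_vnum_za(slice_base, ptr, 16);
}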
The resulting code here still isn't great quality for the case where the constant isn't folded. Maybe worth considering doing the lowering earlier (DAGCombine? Maybe even a late IR optimization?), so the lowered arithmetic can be optimized.
✅ With the latest revision this PR passed the C/C++ code formatter.
Thanks for all the changes! LGTM with a few little nits addressed.
Thanks!
The svldr_vnum and svstr_vnum builtins always modify the base register and tile slice and provide immediate offsets of zero, even when the offset provided to the builtin is an immediate. This patch optimises the output of the builtins when the offset is an immediate, to pass it directly to the instruction and to not need the base register and tile slice updates.