-
Notifications
You must be signed in to change notification settings - Fork 14.4k
[AArch64] Enable UseFixedOverScalableIfEqualCost for more Cortex-x cpus. #122807
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-aarch64 Author: David Green (davemgreen) Changes: For the same reasons that fixed-width vectorization is preferred over scalable for Neoverse V2, this patch enables the UseFixedOverScalableIfEqualCost feature when using -mcpu=cortex-x2, x3, x4 and x925, which are similar to Neoverse V2. Full diff: https://github.com/llvm/llvm-project/pull/122807.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 364ab0d82bf888..f6df56ca1f2b33 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -229,6 +229,7 @@ def TuneX2 : SubtargetFeature<"cortex-x2", "ARMProcFamily", "CortexX2",
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
+ FeatureUseFixedOverScalableIfEqualCost,
FeaturePredictableSelectIsExpensive]>;
def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3",
@@ -238,6 +239,7 @@ def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3",
FeatureFuseAES,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
+ FeatureUseFixedOverScalableIfEqualCost,
FeaturePredictableSelectIsExpensive]>;
def TuneX4 : SubtargetFeature<"cortex-x4", "ARMProcFamily", "CortexX4",
@@ -247,6 +249,7 @@ def TuneX4 : SubtargetFeature<"cortex-x4", "ARMProcFamily", "CortexX4",
FeatureFuseAES,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
+ FeatureUseFixedOverScalableIfEqualCost,
FeaturePredictableSelectIsExpensive]>;
def TuneX925 : SubtargetFeature<"cortex-x925", "ARMProcFamily",
@@ -256,6 +259,7 @@ def TuneX925 : SubtargetFeature<"cortex-x925", "ARMProcFamily",
FeatureFuseAES,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
+ FeatureUseFixedOverScalableIfEqualCost,
FeaturePredictableSelectIsExpensive]>;
def TuneA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX",
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/prefer-fixed-if-equal-to-scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/prefer-fixed-if-equal-to-scalable.ll
index 41595cc7d8996b..c39c6e8fbcea47 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/prefer-fixed-if-equal-to-scalable.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/prefer-fixed-if-equal-to-scalable.ll
@@ -1,4 +1,9 @@
-; RUN: opt -S < %s -passes=loop-vectorize -force-target-instruction-cost=1 | FileCheck %s
+; RUN: opt -S < %s -passes=loop-vectorize -mcpu=generic | FileCheck %s --check-prefix=CHECK-GENERIC
+; RUN: opt -S < %s -passes=loop-vectorize -mcpu=neoverse-v2 | FileCheck %s --check-prefix=CHECK-PREFFIXED
+; RUN: opt -S < %s -passes=loop-vectorize -mcpu=cortex-x2 | FileCheck %s --check-prefix=CHECK-PREFFIXED
+; RUN: opt -S < %s -passes=loop-vectorize -mcpu=cortex-x3 | FileCheck %s --check-prefix=CHECK-PREFFIXED
+; RUN: opt -S < %s -passes=loop-vectorize -mcpu=cortex-x4 | FileCheck %s --check-prefix=CHECK-PREFFIXED
+; RUN: opt -S < %s -passes=loop-vectorize -mcpu=cortex-x925 | FileCheck %s --check-prefix=CHECK-PREFFIXED
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "aarch64-unknown-linux-gnu"
@@ -6,9 +11,11 @@ target triple = "aarch64-unknown-linux-gnu"
@a = dso_local local_unnamed_addr global [32000 x float] zeroinitializer, align 64
@b = dso_local local_unnamed_addr global [32000 x float] zeroinitializer, align 64
-define void @NeoverseV2() #0 {
-; CHECK-LABEL: define void @NeoverseV2(
-; CHECK: store <4 x float>
+define void @test() #0 {
+; CHECK-GENERIC-LABEL: define void @test(
+; CHECK-GENERIC: store <vscale x 4 x float>
+; CHECK-PREFFIXED-LABEL: define void @test(
+; CHECK-PREFFIXED: store <4 x float>
;
entry:
br label %for.body
@@ -31,30 +38,4 @@ for.body:
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
-define void @GenericCPU() #1 {
-; CHECK-LABEL: define void @GenericCPU(
-; CHECK: store <vscale x 4 x float>
-;
-entry:
- br label %for.body
-
-for.cond.cleanup:
- ret void
-
-for.body:
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %arrayidx = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv
- %0 = load float, ptr %arrayidx, align 4
- %arrayidx2 = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 %indvars.iv
- %1 = load float, ptr %arrayidx2, align 4
- %add = fadd fast float %1, %0
- %2 = add nuw nsw i64 %indvars.iv, 16000
- %arrayidx5 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %2
- store float %add, ptr %arrayidx5, align 4
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, 16000
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
-}
-
-attributes #0 = { vscale_range(1,16) "target-cpu"="neoverse-v2" "target-features"="+sve,+sve2,+v9a" }
-attributes #1 = { vscale_range(1,16) "target-cpu"="generic" "target-features"="+sve,+v9a" }
+attributes #0 = { vscale_range(1,16) "target-features"="+sve,+sve2,+v9a" }
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. If you're happy then I am happy too. :)
But more seriously, I think this makes a lot of sense. It's still on my list to look into this further, as I think there's more room for improvement, but I am happy to see more adoption. If this gets more adoption, we could maybe later discuss what the default should be.
Maybe give it a day before merging to give @david-arm a chance to express his happiness or unhappiness.
llvm/test/Transforms/LoopVectorize/AArch64/prefer-fixed-if-equal-to-scalable.ll
Outdated
Show resolved
Hide resolved
llvm/test/Transforms/LoopVectorize/AArch64/prefer-fixed-if-equal-to-scalable.ll
Outdated
Show resolved
Hide resolved
For the same reasons that fixed-width vectorization is preferred over scalable for Neoverse V2, this patch enables the UseFixedOverScalableIfEqualCost feature when using -mcpu=cortex-x2, x3, x4 and x925, which are similar to Neoverse V2.
5cdece4
to
c9009fd
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM!
For the same reasons that fixed-width vectorization is preferred over scalable for Neoverse V2, this patch enables the UseFixedOverScalableIfEqualCost feature when using -mcpu=cortex-x2, x3, x4 and x925, which are similar to Neoverse V2.