Skip to content

Commit 5cdece4

Browse files
committed
[AArch64] Enable UseFixedOverScalableIfEqualCost for more Cortex-X CPUs.
For the same reasons that fixed-width vectorization is preferred over scalable vectorization on Neoverse V2, this patch enables the UseFixedOverScalableIfEqualCost feature when using -mcpu=cortex-x2, cortex-x3, cortex-x4 and cortex-x925, which are similar to Neoverse V2.
1 parent a10ce71 commit 5cdece4

File tree

2 files changed

+16
-31
lines changed

2 files changed

+16
-31
lines changed

llvm/lib/Target/AArch64/AArch64Processors.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,7 @@ def TuneX2 : SubtargetFeature<"cortex-x2", "ARMProcFamily", "CortexX2",
229229
FeatureALULSLFast,
230230
FeaturePostRAScheduler,
231231
FeatureEnableSelectOptimize,
232+
FeatureUseFixedOverScalableIfEqualCost,
232233
FeaturePredictableSelectIsExpensive]>;
233234

234235
def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3",
@@ -238,6 +239,7 @@ def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3",
238239
FeatureFuseAES,
239240
FeaturePostRAScheduler,
240241
FeatureEnableSelectOptimize,
242+
FeatureUseFixedOverScalableIfEqualCost,
241243
FeaturePredictableSelectIsExpensive]>;
242244

243245
def TuneX4 : SubtargetFeature<"cortex-x4", "ARMProcFamily", "CortexX4",
@@ -247,6 +249,7 @@ def TuneX4 : SubtargetFeature<"cortex-x4", "ARMProcFamily", "CortexX4",
247249
FeatureFuseAES,
248250
FeaturePostRAScheduler,
249251
FeatureEnableSelectOptimize,
252+
FeatureUseFixedOverScalableIfEqualCost,
250253
FeaturePredictableSelectIsExpensive]>;
251254

252255
def TuneX925 : SubtargetFeature<"cortex-x925", "ARMProcFamily",
@@ -256,6 +259,7 @@ def TuneX925 : SubtargetFeature<"cortex-x925", "ARMProcFamily",
256259
FeatureFuseAES,
257260
FeaturePostRAScheduler,
258261
FeatureEnableSelectOptimize,
262+
FeatureUseFixedOverScalableIfEqualCost,
259263
FeaturePredictableSelectIsExpensive]>;
260264

261265
def TuneA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX",
Lines changed: 12 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,21 @@
1-
; RUN: opt -S < %s -passes=loop-vectorize -force-target-instruction-cost=1 | FileCheck %s
1+
; RUN: opt -S < %s -passes=loop-vectorize -mcpu=generic | FileCheck %s --check-prefix=CHECK-GENERIC
2+
; RUN: opt -S < %s -passes=loop-vectorize -mcpu=neoverse-v2 | FileCheck %s --check-prefix=CHECK-PREFFIXED
3+
; RUN: opt -S < %s -passes=loop-vectorize -mcpu=cortex-x2 | FileCheck %s --check-prefix=CHECK-PREFFIXED
4+
; RUN: opt -S < %s -passes=loop-vectorize -mcpu=cortex-x3 | FileCheck %s --check-prefix=CHECK-PREFFIXED
5+
; RUN: opt -S < %s -passes=loop-vectorize -mcpu=cortex-x4 | FileCheck %s --check-prefix=CHECK-PREFFIXED
6+
; RUN: opt -S < %s -passes=loop-vectorize -mcpu=cortex-x925 | FileCheck %s --check-prefix=CHECK-PREFFIXED
27

38
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
49
target triple = "aarch64-unknown-linux-gnu"
510

611
@a = dso_local local_unnamed_addr global [32000 x float] zeroinitializer, align 64
712
@b = dso_local local_unnamed_addr global [32000 x float] zeroinitializer, align 64
813

9-
define void @NeoverseV2() #0 {
10-
; CHECK-LABEL: define void @NeoverseV2(
11-
; CHECK: store <4 x float>
14+
define void @test() #0 {
15+
; CHECK-GENERIC-LABEL: define void @test(
16+
; CHECK-GENERIC: store <vscale x 4 x float>
17+
; CHECK-PREFFIXED-LABEL: define void @test(
18+
; CHECK-PREFFIXED: store <4 x float>
1219
;
1320
entry:
1421
br label %for.body
@@ -31,30 +38,4 @@ for.body:
3138
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
3239
}
3340

34-
define void @GenericCPU() #1 {
35-
; CHECK-LABEL: define void @GenericCPU(
36-
; CHECK: store <vscale x 4 x float>
37-
;
38-
entry:
39-
br label %for.body
40-
41-
for.cond.cleanup:
42-
ret void
43-
44-
for.body:
45-
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
46-
%arrayidx = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv
47-
%0 = load float, ptr %arrayidx, align 4
48-
%arrayidx2 = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 %indvars.iv
49-
%1 = load float, ptr %arrayidx2, align 4
50-
%add = fadd fast float %1, %0
51-
%2 = add nuw nsw i64 %indvars.iv, 16000
52-
%arrayidx5 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %2
53-
store float %add, ptr %arrayidx5, align 4
54-
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
55-
%exitcond.not = icmp eq i64 %indvars.iv.next, 16000
56-
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
57-
}
58-
59-
attributes #0 = { vscale_range(1,16) "target-cpu"="neoverse-v2" "target-features"="+sve,+sve2,+v9a" }
60-
attributes #1 = { vscale_range(1,16) "target-cpu"="generic" "target-features"="+sve,+v9a" }
41+
attributes #0 = { vscale_range(1,16) "target-features"="+sve,+sve2,+v9a" }

0 commit comments

Comments
 (0)