
Commit 96d5326

[LV]: For memory interleaving factor > 2, skip comparing the
legacy-based cost model to the VPlan-based model.
1 parent 2f69919 commit 96d5326

2 files changed: +114 -11 lines

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 6 additions & 0 deletions
@@ -7434,6 +7434,12 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
     for (VPRecipeBase &R : *VPBB) {
       if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
         auto *IG = IR->getInterleaveGroup();
+        // The legacy-based cost model is more accurate for interleaving and
+        // comparing against the VPlan-based cost isn't desirable.
+        // At least skip interleaving with factor > 2 as higher factors
+        // cause higher cost difference.
+        if (IG->getFactor() > 2)
+          return true;
         unsigned NumMembers = IG->getNumMembers();
         for (unsigned I = 0; I != NumMembers; ++I) {
           if (Instruction *M = IG->getMember(I))
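The AArch64 test updated below exercises exactly this path: the scalar loop loads A at offsets iv, iv|1 and iv|2 while the induction variable steps by 4, so the vectorizer forms a masked interleave group with factor 4 (one gap at the fourth slot), which the new early return now excludes from the legacy-vs-VPlan cost comparison. As a rough, hypothetical C++ sketch only (the function name, signature and body are illustrative and are not the actual source the test was generated from), such a loop shape could look like:

// Hypothetical sketch: byte accesses at i, i+1 and i+2 while i advances by 4
// form a single stride-4 interleave group (factor 4, gap at offset 3). The
// enclosing condition makes the group masked, as in the test below. With this
// patch, IG->getFactor() == 4 > 2, so planContainsAdditionalSimplifications()
// returns true and the legacy/VPlan cost-model comparison is skipped.
void sketch(unsigned char *A, unsigned char *B) {
  for (long i = 0; i <= 1000; i += 4) {
    if (A[i + 1] == 0) {                        // conditional => masked group
      unsigned v = (unsigned)A[i] + A[i + 1] + A[i + 2];
      *B = (unsigned char)v;                    // keep the loads live
    }
  }
}

Factor-2 groups still go through the comparison; per the added comment, the divergence between the two cost models grows with the factor, hence the cut-off at 2.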

llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses-cost.ll

Lines changed: 108 additions & 11 deletions
@@ -10,32 +10,123 @@ define void @test_masked_interleave(ptr noalias %A, ptr noalias %B, ptr noalias
 ; CHECK-LABEL: define void @test_masked_interleave(
 ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 16
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 252, [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 252, [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 16
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[N_VEC]], 4
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 16 x i64> @llvm.stepvector.nxv16i64()
+; CHECK-NEXT: [[TMP10:%.*]] = mul <vscale x 16 x i64> [[TMP9]], splat (i64 4)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = mul i64 4, [[TMP7]]
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[TMP11]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; CHECK-NEXT: [[TMP13:%.*]] = mul <vscale x 16 x i32> [[TMP12]], splat (i32 4)
+; CHECK-NEXT: [[INDUCTION1:%.*]] = add <vscale x 16 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP7]] to i32
+; CHECK-NEXT: [[TMP15:%.*]] = mul i32 4, [[TMP14]]
+; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP15]], i64 0
+; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT2]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x ptr> poison, ptr [[C]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 16 x ptr> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <vscale x 16 x ptr> poison, ptr [[B]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector <vscale x 16 x ptr> [[BROADCAST_SPLATINSERT13]], <vscale x 16 x ptr> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND4:%.*]] = phi <vscale x 16 x i32> [ [[INDUCTION1]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
+; CHECK-NEXT: [[IV:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT: [[IV_1:%.*]] = or disjoint i64 [[IV]], 1
 ; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV_1]]
-; CHECK-NEXT: [[L_1:%.*]] = load i8, ptr [[GEP_A_1]], align 1
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 64 x i8>, ptr [[GEP_A_1]], align 1
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 32 x i8>, <vscale x 32 x i8> } @llvm.vector.deinterleave2.nxv64i8(<vscale x 64 x i8> [[WIDE_VEC]])
+; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } [[STRIDED_VEC]], 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } [[STRIDED_VEC]], 1
+; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[TMP19]])
+; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[TMP20]])
+; CHECK-NEXT: [[TMP21:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC6]], 0
+; CHECK-NEXT: [[TMP22:%.*]] = icmp eq <vscale x 16 x i8> [[TMP21]], zeroinitializer
+; CHECK-NEXT: [[TMP23:%.*]] = add <vscale x 16 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[A]], <vscale x 16 x i64> [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <vscale x 16 x ptr> [[TMP24]], i32 0
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[TMP25]], i32 -2
+; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP22]], <vscale x 16 x i1> [[TMP22]])
+; CHECK-NEXT: [[INTERLEAVED_MASK8:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP22]], <vscale x 16 x i1> [[TMP22]])
+; CHECK-NEXT: [[INTERLEAVED_MASK9:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave2.nxv64i1(<vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i1> [[INTERLEAVED_MASK8]])
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP26]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK9]], <vscale x 64 x i8> poison)
+; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = call { <vscale x 32 x i8>, <vscale x 32 x i8> } @llvm.vector.deinterleave2.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
+; CHECK-NEXT: [[TMP27:%.*]] = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } [[STRIDED_VEC10]], 0
+; CHECK-NEXT: [[TMP28:%.*]] = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } [[STRIDED_VEC10]], 1
+; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[TMP27]])
+; CHECK-NEXT: [[STRIDED_VEC12:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[TMP28]])
+; CHECK-NEXT: [[TMP29:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC11]], 0
+; CHECK-NEXT: [[TMP30:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC12]], 0
+; CHECK-NEXT: [[TMP31:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC11]], 1
+; CHECK-NEXT: [[TMP32:%.*]] = zext <vscale x 16 x i8> [[TMP31]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP33:%.*]] = shl <vscale x 16 x i32> [[TMP32]], splat (i32 2)
+; CHECK-NEXT: [[TMP34:%.*]] = zext <vscale x 16 x i8> [[TMP30]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP35:%.*]] = shl <vscale x 16 x i32> [[TMP34]], splat (i32 2)
+; CHECK-NEXT: [[TMP36:%.*]] = or <vscale x 16 x i32> [[TMP35]], [[TMP33]]
+; CHECK-NEXT: [[TMP37:%.*]] = zext <vscale x 16 x i8> [[TMP29]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP38:%.*]] = or <vscale x 16 x i32> [[TMP36]], [[TMP37]]
+; CHECK-NEXT: [[TMP39:%.*]] = shl <vscale x 16 x i32> [[TMP38]], splat (i32 2)
+; CHECK-NEXT: [[TMP40:%.*]] = or <vscale x 16 x i32> splat (i32 3), [[VEC_IND4]]
+; CHECK-NEXT: [[TMP41:%.*]] = or <vscale x 16 x i32> [[TMP39]], [[TMP40]]
+; CHECK-NEXT: [[TMP42:%.*]] = lshr <vscale x 16 x i32> [[TMP41]], splat (i32 2)
+; CHECK-NEXT: [[TMP43:%.*]] = lshr <vscale x 16 x i32> [[TMP32]], splat (i32 2)
+; CHECK-NEXT: [[TMP44:%.*]] = trunc <vscale x 16 x i32> [[TMP43]] to <vscale x 16 x i8>
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP44]], <vscale x 16 x ptr> [[TMP24]], i32 1, <vscale x 16 x i1> [[TMP22]])
+; CHECK-NEXT: [[TMP45:%.*]] = lshr <vscale x 16 x i32> [[TMP32]], splat (i32 5)
+; CHECK-NEXT: [[TMP46:%.*]] = trunc <vscale x 16 x i32> [[TMP45]] to <vscale x 16 x i8>
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP46]], <vscale x 16 x ptr> [[BROADCAST_SPLAT]], i32 1, <vscale x 16 x i1> [[TMP22]])
+; CHECK-NEXT: [[TMP47:%.*]] = trunc <vscale x 16 x i32> [[TMP42]] to <vscale x 16 x i8>
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP47]], <vscale x 16 x ptr> [[BROADCAST_SPLAT14]], i32 1, <vscale x 16 x i1> [[TMP22]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT: [[VEC_IND_NEXT5]] = add <vscale x 16 x i32> [[VEC_IND4]], [[DOTSPLAT3]]
+; CHECK-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
+; CHECK: [[LOOP_HEADER]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[IV_3:%.*]] = or disjoint i64 [[IV1]], 1
+; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV_3]]
+; CHECK-NEXT: [[L_1:%.*]] = load i8, ptr [[GEP_A_2]], align 1
 ; CHECK-NEXT: [[C_1:%.*]] = icmp eq i8 [[L_1]], 0
 ; CHECK-NEXT: br i1 [[C_1]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
 ; CHECK: [[THEN]]:
-; CHECK-NEXT: [[IV_2:%.*]] = or disjoint i64 [[IV]], 2
+; CHECK-NEXT: [[IV_2:%.*]] = or disjoint i64 [[IV1]], 2
 ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV_2]]
 ; CHECK-NEXT: [[L_2:%.*]] = load i8, ptr [[ARRAYIDX7]], align 1
 ; CHECK-NEXT: [[CONV8:%.*]] = zext i8 [[L_2]] to i32
 ; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[CONV8]], 2
-; CHECK-NEXT: [[ADD9:%.*]] = or disjoint i64 [[IV]], 1
+; CHECK-NEXT: [[ADD9:%.*]] = or disjoint i64 [[IV1]], 1
 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr i8, ptr [[A]], i64 [[ADD9]]
 ; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX10]], align 1
 ; CHECK-NEXT: [[CONV11:%.*]] = zext i8 [[TMP0]] to i32
 ; CHECK-NEXT: [[SHL12:%.*]] = shl i32 [[CONV11]], 2
 ; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHL12]], [[SHL]]
-; CHECK-NEXT: [[B2:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT: [[B2:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV1]]
 ; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[B2]], align 1
 ; CHECK-NEXT: [[CONV15:%.*]] = zext i8 [[TMP1]] to i32
 ; CHECK-NEXT: [[OR16:%.*]] = or i32 [[OR]], [[CONV15]]
 ; CHECK-NEXT: [[SHL17:%.*]] = shl i32 [[OR16]], 2
-; CHECK-NEXT: [[CONV19:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-NEXT: [[CONV19:%.*]] = trunc i64 [[IV1]] to i32
 ; CHECK-NEXT: [[ADD20:%.*]] = or i32 3, [[CONV19]]
 ; CHECK-NEXT: [[DEST_0:%.*]] = or i32 [[SHL17]], [[ADD20]]
 ; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[DEST_0]], 2
@@ -49,9 +140,9 @@ define void @test_masked_interleave(ptr noalias %A, ptr noalias %B, ptr noalias
 ; CHECK-NEXT: store i8 [[CONV34]], ptr [[B]], align 1
 ; CHECK-NEXT: br label %[[LOOP_LATCH]]
 ; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 4
-; CHECK-NEXT: [[EC:%.*]] = icmp ugt i64 [[IV]], 1000
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 4
+; CHECK-NEXT: [[EC:%.*]] = icmp ugt i64 [[IV1]], 1000
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -107,3 +198,9 @@ exit:
 }
 
 attributes #0 = { vscale_range(2,2) "target-cpu"="neoverse-512tvb" }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
