Commit 07507cb

[SLP]Fix shuffling of entries of the different sizes

Need to choose the vector factor for the mask based on the entries' vector factors, not the mask size, to generate correct code. Fixes #117170

1 parent ba668eb
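
In other words: when SLP combines two tree entries of different widths into one shuffle, the lanes of the second shuffle operand are numbered starting at the operands' vector factor, so mask elements taken from the second entry must be offset by max(VF(E1), VF(E2)). The old code offset them by the common mask's size, which is only coincidentally correct when the mask happens to be as wide as the operands. A minimal sketch of the corrected indexing, using a hypothetical combineMasks helper rather than the real BoUpSLP API:

#include <algorithm>
#include <vector>

constexpr int PoisonMaskElem = -1;

// Hypothetical helper, not the BoUpSLP API: builds the combined mask for
// shuffle(E1, E2) so that lanes taken from E2 are addressed correctly.
std::vector<int> combineMasks(const std::vector<int> &Mask, unsigned VF1,
                              unsigned VF2) {
  // In a two-operand shuffle, the second operand's lanes are numbered
  // starting at the operands' vector factor, so offset by max(VF1, VF2),
  // not by Mask.size(), which can differ from the operands' width.
  unsigned VF = std::max(VF1, VF2);
  std::vector<int> CommonMask(Mask.size(), PoisonMaskElem);
  for (unsigned Idx = 0, Sz = Mask.size(); Idx < Sz; ++Idx)
    if (Mask[Idx] != PoisonMaskElem)
      CommonMask[Idx] = Idx + VF; // the fix: previously Idx + mask size
  return CommonMask;
}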

File tree

3 files changed: +150 −4 lines changed


llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 12 additions & 2 deletions
@@ -10130,6 +10130,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
                                InVectors.size() == 1 ? nullptr : InVectors.back(),
                                CommonMask);
         transformMaskAfterShuffle(CommonMask, CommonMask);
+      } else if (InVectors.size() == 2) {
+        Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
+        transformMaskAfterShuffle(CommonMask, CommonMask);
       }
       SameNodesEstimated = false;
       if (!E2 && InVectors.size() == 1) {
@@ -10147,8 +10150,14 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
         Cost += createShuffle(InVectors.front(), &E1, CommonMask);
         transformMaskAfterShuffle(CommonMask, CommonMask);
       } else {
+        auto P = InVectors.front();
         Cost += createShuffle(&E1, E2, Mask);
-        transformMaskAfterShuffle(CommonMask, Mask);
+        unsigned VF = std::max(E1.getVectorFactor(), E2->getVectorFactor());
+        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+          if (Mask[Idx] != PoisonMaskElem)
+            CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
+        Cost += createShuffle(P, InVectors.front(), CommonMask);
+        transformMaskAfterShuffle(CommonMask, CommonMask);
       }
     }
 
@@ -14007,9 +14016,10 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
       transformMaskAfterShuffle(CommonMask, CommonMask);
     }
     V1 = createShuffle(V1, V2, Mask);
+    unsigned VF = std::max(getVF(V1), getVF(Vec));
     for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
       if (Mask[Idx] != PoisonMaskElem)
-        CommonMask[Idx] = Idx + Sz;
+        CommonMask[Idx] = Idx + VF;
     InVectors.front() = Vec;
     if (InVectors.size() == 2)
       InVectors.back() = V1;
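
To make the off-by-VF concrete, suppose (made-up shapes, not taken from this patch's test) two entries with vector factors 4 and 8 are merged under a 4-element common mask: the second operand's lanes start at 8, so the old Idx + Sz offset of 4 would address lanes 4..7 of the first operand instead. A tiny runnable check:

#include <cstdio>

int main() {
  constexpr unsigned MaskSize = 4; // CommonMask.size() in this scenario
  constexpr unsigned VF = 8;       // max of the two entries' vector factors
  for (unsigned Idx = 0; Idx < MaskSize; ++Idx)
    // The buggy offset (mask size) stays inside the first operand's lanes
    // 4..7; the fixed offset (VF) selects lane Idx of the second operand.
    std::printf("lane %u: buggy index %u, fixed index %u\n", Idx,
                Idx + MaskSize, Idx + VF);
  return 0;
}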
Lines changed: 136 additions & 0 deletions
@@ -0,0 +1,136 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux -mattr=+avx512vl < %s | FileCheck %s

@GLOB = external global [16000 x i8], align 32

define void @test() {
; CHECK-LABEL: define void @test(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ALLOCA_0:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1208), align 4
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1612), align 4
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[GEPLOAD1612:%.*]] = extractelement <16 x float> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <16 x float> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = fmul reassoc ninf nsz arcp contract afn float [[GEPLOAD1612]], [[TMP1]]
; CHECK-NEXT: [[TMP6:%.*]] = fmul reassoc ninf nsz arcp contract afn <16 x float> [[TMP4]], [[TMP0]]
; CHECK-NEXT: store <16 x float> [[TMP6]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2928), align 16
; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1272), align 16
; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1288), align 16
; CHECK-NEXT: [[TMP9:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1296), align 16
; CHECK-NEXT: [[TMP13:%.*]] = load <8 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1304), align 16
; CHECK-NEXT: [[TMP11:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1620), align 4
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> [[TMP8]], <16 x i32> <i32 poison, i32 0, i32 2, i32 1, i32 0, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP12]], <16 x i32> <i32 1, i32 1, i32 17, i32 17, i32 18, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 19, i32 19, i32 19, i32 19>
; CHECK-NEXT: [[TMP15:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP12]], <8 x float> [[TMP13]], i64 8)
; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP15]], <4 x float> [[TMP7]], i64 0)
; CHECK-NEXT: [[TMP17:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v2f32(<16 x float> [[TMP16]], <2 x float> [[TMP9]], i64 6)
; CHECK-NEXT: [[TMP18:%.*]] = fmul reassoc ninf nsz arcp contract afn <16 x float> [[TMP14]], [[TMP17]]
; CHECK-NEXT: store <16 x float> [[TMP18]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2992), align 16
; CHECK-NEXT: ret void
;
alloca_0:
  %gepload1208 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1208), align 4
  %gepload1212 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1212), align 4
  %gepload1216 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1216), align 4
  %gepload1220 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1220), align 4
  %gepload1224 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1224), align 4
  %gepload1228 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1228), align 4
  %gepload1232 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1232), align 4
  %gepload1236 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1236), align 4
  %gepload1612 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1612), align 4
  %0 = fmul reassoc ninf nsz arcp contract afn float %gepload1612, %gepload1208
  %1 = fmul reassoc ninf nsz arcp contract afn float %gepload1612, %gepload1208
  store float %1, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2928), align 16
  %2 = fmul reassoc ninf nsz arcp contract afn float %gepload1612, %gepload1212
  store float %2, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2932), align 4
  %3 = fmul reassoc ninf nsz arcp contract afn float %gepload1612, %gepload1216
  store float %3, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2936), align 8
  %4 = fmul reassoc ninf nsz arcp contract afn float %gepload1612, %gepload1220
  store float %4, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2940), align 4
  %5 = fmul reassoc ninf nsz arcp contract afn float %gepload1612, %gepload1224
  store float %5, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2944), align 32
  %6 = fmul reassoc ninf nsz arcp contract afn float %gepload1612, %gepload1228
  store float %6, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2948), align 4
  %7 = fmul reassoc ninf nsz arcp contract afn float %gepload1612, %gepload1232
  store float %7, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2952), align 8
  %8 = fmul reassoc ninf nsz arcp contract afn float %gepload1612, %gepload1236
  store float %8, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2956), align 4
  %gepload1240 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1240), align 16
  %gepload1244 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1244), align 16
  %gepload1248 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1248), align 16
  %gepload1252 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1252), align 16
  %gepload1256 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1256), align 16
  %gepload1260 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1260), align 16
  %gepload1264 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1264), align 16
  %gepload1268 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1268), align 16
  %gepload1272 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1272), align 16
  %gepload1276 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1276), align 16
  %gepload1616 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1616), align 16
  %9 = fmul reassoc ninf nsz arcp contract afn float %gepload1616, %gepload1240
  store float %9, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2960), align 16
  %10 = fmul reassoc ninf nsz arcp contract afn float %gepload1616, %gepload1244
  store float %10, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2964), align 4
  %11 = fmul reassoc ninf nsz arcp contract afn float %gepload1616, %gepload1248
  store float %11, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2968), align 8
  %12 = fmul reassoc ninf nsz arcp contract afn float %gepload1616, %gepload1252
  store float %12, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2972), align 4
  %13 = fmul reassoc ninf nsz arcp contract afn float %gepload1616, %gepload1256
  store float %13, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2976), align 32
  %14 = fmul reassoc ninf nsz arcp contract afn float %gepload1616, %gepload1260
  store float %14, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2980), align 4
  %15 = fmul reassoc ninf nsz arcp contract afn float %gepload1616, %gepload1264
  store float %15, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2984), align 8
  %16 = fmul reassoc ninf nsz arcp contract afn float %gepload1616, %gepload1268
  store float %16, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2988), align 4
  %17 = fmul reassoc ninf nsz arcp contract afn float %gepload1616, %gepload1272
  store float %17, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2992), align 16
  %18 = fmul reassoc ninf nsz arcp contract afn float %gepload1616, %gepload1276
  store float %18, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2996), align 4
  %gepload1280 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1280), align 16
  %gepload1284 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1284), align 16
  %gepload1288 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1288), align 16
  %gepload1292 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1292), align 16
  %gepload1296 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1296), align 16
  %gepload1300 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1300), align 16
  %gepload1304 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1304), align 16
  %gepload1308 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1308), align 16
  %gepload1312 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1312), align 16
  %gepload1316 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1316), align 16
  %gepload1620 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1620), align 4
  %19 = fmul reassoc ninf nsz arcp contract afn float %gepload1620, %gepload1280
  store float %19, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 3000), align 8
  %20 = fmul reassoc ninf nsz arcp contract afn float %gepload1620, %gepload1284
  store float %20, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 3004), align 4
  %21 = fmul reassoc ninf nsz arcp contract afn float %gepload1620, %gepload1288
  store float %21, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 3008), align 32
  %22 = fmul reassoc ninf nsz arcp contract afn float %gepload1620, %gepload1292
  store float %22, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 3012), align 4
  %23 = fmul reassoc ninf nsz arcp contract afn float %gepload1620, %gepload1296
  store float %23, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 3016), align 8
  %24 = fmul reassoc ninf nsz arcp contract afn float %gepload1620, %gepload1300
  store float %24, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 3020), align 4
  %25 = fmul reassoc ninf nsz arcp contract afn float %gepload1620, %gepload1304
  store float %25, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 3024), align 16
  %26 = fmul reassoc ninf nsz arcp contract afn float %gepload1620, %gepload1308
  store float %26, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 3028), align 4
  %27 = fmul reassoc ninf nsz arcp contract afn float %gepload1620, %gepload1312
  store float %27, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 3032), align 8
  %28 = fmul reassoc ninf nsz arcp contract afn float %gepload1620, %gepload1316
  store float %28, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 3036), align 4
  %gepload1320 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1320), align 16
  %gepload1324 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1324), align 16
  %gepload1328 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1328), align 16
  %gepload1332 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1332), align 16
  %gepload1624 = load float, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1624), align 8
  %29 = fmul reassoc ninf nsz arcp contract afn float %gepload1624, %gepload1320
  store float %29, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 3040), align 32
  %30 = fmul reassoc ninf nsz arcp contract afn float %gepload1624, %gepload1324
  store float %30, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 3044), align 4
  %31 = fmul reassoc ninf nsz arcp contract afn float %gepload1624, %gepload1328
  store float %31, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 3048), align 8
  %32 = fmul reassoc ninf nsz arcp contract afn float %gepload1624, %gepload1332
  store float %32, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 3052), align 4
  ret void
}

llvm/test/Transforms/SLPVectorizer/shuffle-multivector.ll

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-163 | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=aarch64-unknown-linux -slp-threshold=-163 | FileCheck %s %}
+; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-165 | FileCheck %s %}
+; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=aarch64-unknown-linux -slp-threshold=-165 | FileCheck %s %}
 
 define void @test1(i128 %p0, i128 %p1, i128 %p2, i128 %p3, <4 x i128> %vec) {
 ; CHECK-LABEL: @test1(
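
(The thresholds drop from -163 to -165 presumably because the cost estimator now also charges for the extra shuffle between differently sized entries, nudging this synthetic test's computed cost past the old cutoff.)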
