1
- ; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED
2
- ; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED
1
+ ; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED
2
+ ; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED
3
3
4
4
target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
5
5
target triple = "i386-unknown-linux-gnu"
@@ -9,9 +9,13 @@ target triple = "i386-unknown-linux-gnu"
9
9
; interleaved-group but rather as a scalarized accesses.
10
10
; (For SKX, Gather is not supported by the compiler for chars, therefore
11
11
; the only remaining alternative is to scalarize).
12
+ ; In this case a scalar epilogue is not needed.
13
+ ;
12
14
; When masked-interleave-group is enabled we expect to find the proper mask
13
15
; shuffling code, feeding the wide masked load for an interleave-group (with
14
16
; a single member).
17
+ ; Since the last (second) member of the load-group is a gap, peeling is used,
18
+ ; so we also expect to find a scalar epilogue loop.
15
19
;
16
20
; void masked_strided1(const unsigned char* restrict p,
17
21
; unsigned char* restrict q,
@@ -38,6 +42,8 @@ target triple = "i386-unknown-linux-gnu"
38
42
;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
39
43
;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.load.
40
44
;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
45
+ ;DISABLED_MASKED_STRIDED-NOT: for.body:
46
+ ;DISABLED_MASKED_STRIDED: for.end:
41
47
42
48
;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1(
43
49
;ENABLED_MASKED_STRIDED: vector.body:
@@ -47,6 +53,7 @@ target triple = "i386-unknown-linux-gnu"
47
53
;ENABLED_MASKED_STRIDED: %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
48
54
;ENABLED_MASKED_STRIDED-NEXT: %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
49
55
;ENABLED_MASKED_STRIDED-NEXT: %[[STRIDEDVEC:.+]] = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
56
+ ;ENABLED_MASKED_STRIDED: for.body:
50
57
51
58
define dso_local void @masked_strided1 (i8* noalias nocapture readonly %p , i8* noalias nocapture %q , i8 zeroext %guard ) local_unnamed_addr {
52
59
entry:
@@ -75,6 +82,109 @@ for.end:
75
82
ret void
76
83
}
77
84
85
+ ; Exactly the same scenario except we are now optimizing for size, therefore
86
+ ; we check that no scalar epilogue is created. Since we can't create an epilog
87
+ ; the interleave-group is invalidated because it has gaps, so we end up
88
+ ; scalarizing.
89
+ ; (Before the fix that this test checks, we used to create an epilogue despite
90
+ ; optsize, and vectorized the access as an interleaved-group. This is now fixed,
91
+ ; and we make sure that a scalar epilogue does not exist).
92
+
93
+ ;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize(
94
+ ;ENABLED_MASKED_STRIDED: vector.body:
95
+ ;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32
96
+ ;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
97
+ ;ENABLED_MASKED_STRIDED-NOT: %interleaved.mask =
98
+ ;ENABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
99
+ ;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
100
+ ;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
101
+ ;ENABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
102
+ ;ENABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue
103
+ ;ENABLED_MASKED_STRIDED-NOT: %interleaved.mask =
104
+ ;ENABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
105
+ ;ENABLED_MASKED_STRIDED-NOT: for.body:
106
+ ;ENABLED_MASKED_STRIDED: for.end:
107
+
108
; Predicated strided load under optsize: no scalar epilogue is allowed, so
; the gapped interleave-group must be invalidated and the access scalarized
; (this is what the CHECK lines above verify).
define dso_local void @masked_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
entry:
  %conv = zext i8 %guard to i32
  br label %for.body

for.body:                                         ; ix in [0, 1024)
  %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  %cmp1 = icmp ugt i32 %ix.09, %conv
  br i1 %cmp1, label %if.then, label %for.inc

if.then:                                          ; q[ix] = p[2*ix], only when ix > guard
  %mul = shl nuw nsw i32 %ix.09, 1
  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
  %0 = load i8, i8* %arrayidx, align 1
  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09
  store i8 %0, i8* %arrayidx3, align 1
  br label %for.inc

for.inc:
  %inc = add nuw nsw i32 %ix.09, 1
  %exitcond = icmp eq i32 %inc, 1024
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}
134
+
135
+ ; Same, but the load/store are not predicated. The interleave-group is
136
+ ; invalidated here as well because we have gaps and we can't create an epilog.
137
+ ; The access is thus scalarized.
138
+ ; (Before the fix that this test checks, we used to create an epilogue despite
139
+ ; optsize, and vectorized the access as an interleaved-group. This is now fixed,
140
+ ; and we make sure that a scalar epilogue does not exist).
141
+ ; Since enable-masked-interleaved-accesses currently only affects predicated
142
+ ; accesses, the behavior is the same with this switch set/unset.
143
+
144
+
145
+ ; void unconditional_strided1_optsize(const unsigned char* restrict p,
146
+ ; unsigned char* restrict q,
147
+ ; unsigned char guard) {
148
+ ; for(ix=0; ix < 1024; ++ix) {
149
+ ; char t = p[2*ix];
150
+ ; q[ix] = t;
151
+ ; }
152
+ ; }
153
+
154
+ ;DISABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
155
+ ;DISABLED_MASKED_STRIDED: vector.body:
156
+ ;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
157
+ ;DISABLED_MASKED_STRIDED: %{{.*}} = extractelement <8 x i32> %{{.*}}, i32 0
158
+ ;DISABLED_MASKED_STRIDED-NOT: for.body:
159
+ ;DISABLED_MASKED_STRIDED: for.end:
160
+
161
+ ;ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
162
+ ;ENABLED_MASKED_STRIDED: vector.body:
163
+ ;ENABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
164
+ ;ENABLED_MASKED_STRIDED: %{{.*}} = extractelement <8 x i32> %{{.*}}, i32 0
165
+ ;ENABLED_MASKED_STRIDED-NOT: for.body:
166
+ ;ENABLED_MASKED_STRIDED: for.end:
167
+
168
; Unpredicated strided load under optsize: the gapped interleave-group is
; again invalidated (no epilogue permitted), so the access is scalarized
; regardless of the masked-interleaved-accesses switch.
define dso_local void @unconditional_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
entry:
  br label %for.body

for.body:                                         ; q[ix] = p[2*ix] for ix in [0, 1024)
  %ix.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %mul = shl nuw nsw i32 %ix.06, 1
  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
  %0 = load i8, i8* %arrayidx, align 1
  %arrayidx1 = getelementptr inbounds i8, i8* %q, i32 %ix.06
  store i8 %0, i8* %arrayidx1, align 1
  %inc = add nuw nsw i32 %ix.06, 1
  %exitcond = icmp eq i32 %inc, 1024
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}
186
+
187
+
78
188
; Check also a scenario with full interleave-groups (no gaps) as well as both
79
189
; load and store groups. We check that when masked-interleave-group is disabled
80
190
; the predicated loads (and stores) are not vectorized as an
0 commit comments