Skip to content

Commit bef6e67

Browse files
committed
[VectorCombine] transform bitcasted shuffle to wider elements
bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'

This is the widen-shuffle-elements enhancement to D76727. It builds on the analysis and simplifications in D77881 and rG6a7e958a423e. The phase ordering tests show that we can now simplify inverse shuffles across a binop in both directions (widen/narrow or narrow/widen). There is another potential transform visible in some of the remaining TODOs: moving a bitcasted operand of a shuffle after the shuffle.

Differential Revision: https://reviews.llvm.org/D78371
1 parent 02b070e commit bef6e67

File tree

3 files changed

+52
-51
lines changed

3 files changed

+52
-51
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -259,42 +259,49 @@ static bool foldExtractExtract(Instruction &I, const TargetTransformInfo &TTI) {
259259
return true;
260260
}
261261

262-
/// If this is a bitcast to narrow elements from a shuffle of wider elements,
263-
/// try to bitcast the source vector to the narrow type followed by shuffle.
264-
/// This can enable further transforms by moving bitcasts or shuffles together.
262+
/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
263+
/// destination type followed by shuffle. This can enable further transforms by
264+
/// moving bitcasts or shuffles together.
265265
static bool foldBitcastShuf(Instruction &I, const TargetTransformInfo &TTI) {
266266
Value *V;
267267
ArrayRef<int> Mask;
268268
if (!match(&I, m_BitCast(m_OneUse(m_ShuffleVector(m_Value(V), m_Undef(),
269269
m_Mask(Mask))))))
270270
return false;
271271

272+
// Disallow non-vector casts and length-changing shuffles.
273+
// TODO: We could allow any shuffle.
272274
auto *DestTy = dyn_cast<VectorType>(I.getType());
273275
auto *SrcTy = cast<VectorType>(V->getType());
274276
if (!DestTy || I.getOperand(0)->getType() != SrcTy)
275277
return false;
276278

277-
// TODO: Handle bitcast from narrow element type to wide element type.
278-
unsigned DestNumElts = DestTy->getNumElements();
279-
unsigned SrcNumElts = SrcTy->getNumElements();
280-
if (SrcNumElts > DestNumElts)
281-
return false;
282-
283279
// The new shuffle must not cost more than the old shuffle. The bitcast is
284280
// moved ahead of the shuffle, so assume that it has the same cost as before.
285281
if (TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DestTy) >
286282
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy))
287283
return false;
288284

289-
// Bitcast the source vector and expand the shuffle mask to the equivalent for
290-
// narrow elements.
285+
unsigned DestNumElts = DestTy->getNumElements();
286+
unsigned SrcNumElts = SrcTy->getNumElements();
287+
SmallVector<int, 16> NewMask;
288+
if (SrcNumElts <= DestNumElts) {
289+
// The bitcast is from wide to narrow/equal elements. The shuffle mask can
290+
// always be expanded to the equivalent form choosing narrower elements.
291+
assert(DestNumElts % SrcNumElts == 0 && "Unexpected shuffle mask");
292+
unsigned ScaleFactor = DestNumElts / SrcNumElts;
293+
narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
294+
} else {
295+
// The bitcast is from narrow elements to wide elements. The shuffle mask
296+
// must choose consecutive elements to allow casting first.
297+
assert(SrcNumElts % DestNumElts == 0 && "Unexpected shuffle mask");
298+
unsigned ScaleFactor = SrcNumElts / DestNumElts;
299+
if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
300+
return false;
301+
}
291302
// bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'
292303
IRBuilder<> Builder(&I);
293304
Value *CastV = Builder.CreateBitCast(V, DestTy);
294-
SmallVector<int, 16> NewMask;
295-
assert(DestNumElts % SrcNumElts == 0 && "Unexpected shuffle mask");
296-
unsigned ScaleFactor = DestNumElts / SrcNumElts;
297-
narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
298305
Value *Shuf = Builder.CreateShuffleVector(CastV, UndefValue::get(DestTy),
299306
NewMask);
300307
I.replaceAllUsesWith(Shuf);

llvm/test/Transforms/PhaseOrdering/X86/shuffle.ll

Lines changed: 26 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -47,17 +47,13 @@ define <2 x i64> @shuffle_32_add_8_shuffle_32_masks_are_eq(<2 x i64> %v) {
4747
ret <2 x i64> %bc5
4848
}
4949

50-
; TODO: Eliminate redundant shuffles
50+
; Eliminate redundant shuffles
5151

5252
define <2 x i64> @shuffle_8_add_32_shuffle_8_masks_are_eq(<2 x i64> %v) {
5353
; CHECK-LABEL: @shuffle_8_add_32_shuffle_8_masks_are_eq(
54-
; CHECK-NEXT: [[BC0:%.*]] = bitcast <2 x i64> [[V:%.*]] to <16 x i8>
55-
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[BC0]], <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
56-
; CHECK-NEXT: [[BC2:%.*]] = bitcast <16 x i8> [[SHUFFLE]] to <4 x i32>
57-
; CHECK-NEXT: [[ADD_I:%.*]] = shl <4 x i32> [[BC2]], <i32 1, i32 1, i32 1, i32 1>
58-
; CHECK-NEXT: [[BC4:%.*]] = bitcast <4 x i32> [[ADD_I]] to <16 x i8>
59-
; CHECK-NEXT: [[SHUFFLE4:%.*]] = shufflevector <16 x i8> [[BC4]], <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
60-
; CHECK-NEXT: [[BC5:%.*]] = bitcast <16 x i8> [[SHUFFLE4]] to <2 x i64>
54+
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
55+
; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
56+
; CHECK-NEXT: [[BC5:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
6157
; CHECK-NEXT: ret <2 x i64> [[BC5]]
6258
;
6359
%bc0 = bitcast <2 x i64> %v to <16 x i8>
@@ -126,15 +122,14 @@ define <16 x i8> @shuffle_16_add_8_masks_are_eq(<8 x i16> %v1, <8 x i16> %v2) {
126122
ret <16 x i8> %add
127123
}
128124

129-
; TODO: Sink single shuffle.
125+
; Sink single shuffle.
130126

131127
define <4 x i32> @shuffle_16_add_32_masks_are_eq_and_can_be_converted_up(<8 x i16> %v1, <8 x i16> %v2) {
132128
; CHECK-LABEL: @shuffle_16_add_32_masks_are_eq_and_can_be_converted_up(
133-
; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x i16> [[V1:%.*]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
134-
; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <8 x i16> [[V2:%.*]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
135-
; CHECK-NEXT: [[BC1:%.*]] = bitcast <8 x i16> [[SHUFFLE1]] to <4 x i32>
136-
; CHECK-NEXT: [[BC2:%.*]] = bitcast <8 x i16> [[SHUFFLE2]] to <4 x i32>
137-
; CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[BC2]], [[BC1]]
129+
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V1:%.*]] to <4 x i32>
130+
; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[V2:%.*]] to <4 x i32>
131+
; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[TMP1]]
132+
; CHECK-NEXT: [[ADD:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
138133
; CHECK-NEXT: ret <4 x i32> [[ADD]]
139134
;
140135
%shuffle1 = shufflevector <8 x i16> %v1, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
@@ -145,15 +140,14 @@ define <4 x i32> @shuffle_16_add_32_masks_are_eq_and_can_be_converted_up(<8 x i1
145140
ret <4 x i32> %add
146141
}
147142

148-
; TODO: Sink single shuffle.
143+
; Sink single shuffle.
149144

150145
define <4 x i32> @shuffle_8_add_32_masks_are_eq_and_can_be_converted_up(<16 x i8> %v1, <16 x i8> %v2) {
151146
; CHECK-LABEL: @shuffle_8_add_32_masks_are_eq_and_can_be_converted_up(
152-
; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
153-
; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <16 x i8> [[V2:%.*]], <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
154-
; CHECK-NEXT: [[BC1:%.*]] = bitcast <16 x i8> [[SHUFFLE1]] to <4 x i32>
155-
; CHECK-NEXT: [[BC2:%.*]] = bitcast <16 x i8> [[SHUFFLE2]] to <4 x i32>
156-
; CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[BC2]], [[BC1]]
147+
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[V1:%.*]] to <4 x i32>
148+
; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[V2:%.*]] to <4 x i32>
149+
; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[TMP1]]
150+
; CHECK-NEXT: [[ADD:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
157151
; CHECK-NEXT: ret <4 x i32> [[ADD]]
158152
;
159153
%shuffle1 = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -229,13 +223,13 @@ define <16 x i8> @shuffle_32_bitcast_8_shuffle_8_can_not_be_converted_up(<4 x i3
229223
}
230224

231225
; shuffle<4 x i32>( bitcast<4 x i32>( shuffle<16 x i8>(v)))
232-
; TODO: Narrow, squash shuffles, and widen type?
226+
; TODO: squash shuffles?
233227

234228
define <4 x i32> @shuffle_8_bitcast_32_shuffle_32_can_be_converted_up(<16 x i8> %v1) {
235229
; CHECK-LABEL: @shuffle_8_bitcast_32_shuffle_32_can_be_converted_up(
236-
; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
237-
; CHECK-NEXT: [[BC1:%.*]] = bitcast <16 x i8> [[SHUFFLE1]] to <4 x i32>
238-
; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[BC1]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
230+
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[V1:%.*]] to <4 x i32>
231+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
232+
; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
239233
; CHECK-NEXT: ret <4 x i32> [[SHUFFLE2]]
240234
;
241235
%shuffle1 = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
@@ -245,13 +239,13 @@ define <4 x i32> @shuffle_8_bitcast_32_shuffle_32_can_be_converted_up(<16 x i8>
245239
}
246240

247241
; shuffle<4 x i32>( bitcast<4 x i32>( shuffle<8 x i16>(v)))
248-
; TODO: Narrow, squash shuffles, and widen type?
242+
; TODO: squash shuffles?
249243

250244
define <4 x i32> @shuffle_16_bitcast_32_shuffle_32_can_be_converted_up(<8 x i16> %v1) {
251245
; CHECK-LABEL: @shuffle_16_bitcast_32_shuffle_32_can_be_converted_up(
252-
; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x i16> [[V1:%.*]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
253-
; CHECK-NEXT: [[BC1:%.*]] = bitcast <8 x i16> [[SHUFFLE1]] to <4 x i32>
254-
; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[BC1]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
246+
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V1:%.*]] to <4 x i32>
247+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
248+
; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
255249
; CHECK-NEXT: ret <4 x i32> [[SHUFFLE2]]
256250
;
257251
%shuffle1 = shufflevector <8 x i16> %v1, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
@@ -293,13 +287,13 @@ define <4 x i32> @shuffle_16_bitcast_32_shuffle_32_can_not_be_converted_up(<8 x
293287
}
294288

295289
; shuffle<8 x i16>( bitcast<8 x i16>( shuffle<16 x i8>(v)))
296-
; TODO: Narrow, squash shuffles, and widen type?
290+
; TODO: squash shuffles and widen type?
297291

298292
define <8 x i16> @shuffle_8_bitcast_16_shuffle_16_can__be_converted_up(<16 x i8> %v1) {
299293
; CHECK-LABEL: @shuffle_8_bitcast_16_shuffle_16_can__be_converted_up(
300-
; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[V1:%.*]], <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
301-
; CHECK-NEXT: [[BC1:%.*]] = bitcast <16 x i8> [[SHUFFLE1]] to <8 x i16>
302-
; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <8 x i16> [[BC1]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
294+
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[V1:%.*]] to <8 x i16>
295+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
296+
; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
303297
; CHECK-NEXT: ret <8 x i16> [[SHUFFLE2]]
304298
;
305299
%shuffle1 = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>

llvm/test/Transforms/VectorCombine/X86/shuffle.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,13 +59,13 @@ define i128 @bitcast_shuf_narrow_element_wrong_type(<4 x i32> %v) {
5959
ret i128 %r
6060
}
6161

62-
; Negative test - but might want to try this
62+
; Widen shuffle elements
6363

6464
define <4 x i32> @bitcast_shuf_wide_element(<8 x i16> %v) {
6565
; CHECK-LABEL: @bitcast_shuf_wide_element(
66-
; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3>
67-
; CHECK-NEXT: [[R:%.*]] = bitcast <8 x i16> [[SHUF]] to <4 x i32>
68-
; CHECK-NEXT: ret <4 x i32> [[R]]
66+
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <4 x i32>
67+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
68+
; CHECK-NEXT: ret <4 x i32> [[TMP2]]
6969
;
7070
%shuf = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3>
7171
%r = bitcast <8 x i16> %shuf to <4 x i32>

0 commit comments

Comments
 (0)