Skip to content

Commit 9d438e1

Browse files
committed
[VectorCombine] Allow shuffling with bitcast when the offset is not a multiple of the load size
Previously, vectorization for load-insert failed when the Offset was not a multiple of the Load type size. This patch allows it in two steps: 1. Vectorize it using a common divisor of the Offset and the load size as the element size. 2. Bitcast the shuffled vector to the original insert type. Alive2: https://alive2.llvm.org/ce/z/Kgr9HQ
1 parent c8ebc7a commit 9d438e1

File tree

3 files changed

+204
-91
lines changed

3 files changed

+204
-91
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 65 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,15 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
192192
if (!canWidenLoad(Load, TTI))
193193
return false;
194194

195+
auto MaxCommonDivisor = [](int n) {
196+
if (n % 4 == 0)
197+
return 4;
198+
if (n % 2 == 0)
199+
return 2;
200+
else
201+
return 1;
202+
};
203+
195204
Type *ScalarTy = Scalar->getType();
196205
uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
197206
unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
@@ -206,6 +215,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
206215
unsigned MinVecNumElts = MinVectorSize / ScalarSize;
207216
auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
208217
unsigned OffsetEltIndex = 0;
218+
unsigned VectorRange = 0;
219+
bool NeedCast = false;
209220
Align Alignment = Load->getAlign();
210221
if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
211222
&DT)) {
@@ -222,15 +233,27 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
222233
if (Offset.isNegative())
223234
return false;
224235

225-
// The offset must be a multiple of the scalar element to shuffle cleanly
226-
// in the element's size.
236+
// If Offset is multiple of a Scalar element, it can be shuffled to the
237+
// element's size; otherwise, Offset and Scalar must be shuffled to the
238+
// appropriate element size for both.
227239
uint64_t ScalarSizeInBytes = ScalarSize / 8;
228-
if (Offset.urem(ScalarSizeInBytes) != 0)
229-
return false;
240+
if (auto UnalignedBytes = Offset.urem(ScalarSizeInBytes);
241+
UnalignedBytes != 0) {
242+
uint64_t OldScalarSizeInBytes = ScalarSizeInBytes;
243+
// Assign the greatest common divisor between UnalignedBytes and Offset to
244+
// ScalarSizeInBytes
245+
ScalarSizeInBytes = MaxCommonDivisor(UnalignedBytes);
246+
ScalarSize = ScalarSizeInBytes * 8;
247+
VectorRange = OldScalarSizeInBytes / ScalarSizeInBytes;
248+
MinVecNumElts = MinVectorSize / ScalarSize;
249+
ScalarTy = Type::getIntNTy(I.getContext(), ScalarSize);
250+
MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
251+
NeedCast = true;
252+
}
230253

231-
// If we load MinVecNumElts, will our target element still be loaded?
232254
OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
233-
if (OffsetEltIndex >= MinVecNumElts)
255+
// If we load MinVecNumElts, will our target element still be loaded?
256+
if (OffsetEltIndex + VectorRange >= MinVecNumElts)
234257
return false;
235258

236259
if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
@@ -248,11 +271,14 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
248271
Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
249272
Type *LoadTy = Load->getType();
250273
unsigned AS = Load->getPointerAddressSpace();
274+
auto VecTy = cast<InsertElementInst>(&I)->getType();
275+
251276
InstructionCost OldCost =
252277
TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
253-
APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
278+
APInt DemandedElts =
279+
APInt::getOneBitSet(VecTy->getElementCount().getFixedValue(), 0);
254280
OldCost +=
255-
TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
281+
TTI.getScalarizationOverhead(VecTy, DemandedElts,
256282
/* Insert */ true, HasExtract, CostKind);
257283

258284
// New pattern: load VecPtr
@@ -265,15 +291,34 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
265291
// We assume this operation has no cost in codegen if there was no offset.
266292
// Note that we could use freeze to avoid poison problems, but then we might
267293
// still need a shuffle to change the vector size.
268-
auto *Ty = cast<FixedVectorType>(I.getType());
269-
unsigned OutputNumElts = Ty->getNumElements();
270-
SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
271-
assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
272-
Mask[0] = OffsetEltIndex;
294+
SmallVector<int> Mask;
295+
assert(OffsetEltIndex + VectorRange < MinVecNumElts &&
296+
"Address offset too big");
297+
if (!NeedCast) {
298+
auto *Ty = cast<FixedVectorType>(I.getType());
299+
unsigned OutputNumElts = Ty->getNumElements();
300+
Mask.assign(OutputNumElts, PoisonMaskElem);
301+
Mask[0] = OffsetEltIndex;
302+
} else {
303+
Mask.assign(MinVecNumElts, PoisonMaskElem);
304+
for (unsigned InsertPos = 0; InsertPos < VectorRange; InsertPos++)
305+
Mask[InsertPos] = OffsetEltIndex++;
306+
}
307+
273308
if (OffsetEltIndex)
274309
NewCost +=
275310
TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy, Mask, CostKind);
276311

312+
if (NeedCast)
313+
NewCost += TTI.getCastInstrCost(Instruction::BitCast, I.getType(), MinVecTy,
314+
TargetTransformInfo::CastContextHint::None,
315+
TargetTransformInfo::TCK_RecipThroughput);
316+
317+
if (NeedCast)
318+
NewCost += TTI.getCastInstrCost(Instruction::BitCast, I.getType(), MinVecTy,
319+
TargetTransformInfo::CastContextHint::None,
320+
TargetTransformInfo::TCK_RecipThroughput);
321+
277322
// We can aggressively convert to the vector form because the backend can
278323
// invert this transform if it does not result in a performance win.
279324
if (OldCost < NewCost || !NewCost.isValid())
@@ -282,12 +327,16 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
282327
// It is safe and potentially profitable to load a vector directly:
283328
// inselt undef, load Scalar, 0 --> load VecPtr
284329
IRBuilder<> Builder(Load);
330+
Value *Result;
285331
Value *CastedPtr =
286332
Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
287-
Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
288-
VecLd = Builder.CreateShuffleVector(VecLd, Mask);
333+
Result = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
334+
Result = Builder.CreateShuffleVector(Result, Mask);
289335

290-
replaceValue(I, *VecLd);
336+
if (NeedCast)
337+
Result = Builder.CreateBitOrPointerCast(Result, I.getType());
338+
339+
replaceValue(I, *Result);
291340
++NumVecLoad;
292341
return true;
293342
}

llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll

Lines changed: 70 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -302,16 +302,18 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(ptr align 2 derefer
302302
ret <8 x i16> %r
303303
}
304304

305-
; Negative test - if we are shuffling a load from the base pointer, the address offset
306-
; must be a multiple of element size.
307-
; TODO: Could bitcast around this limitation.
308-
309305
define <4 x i32> @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
310-
; CHECK-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
311-
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
312-
; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
313-
; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
314-
; CHECK-NEXT: ret <4 x i32> [[R]]
306+
; SSE2-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
307+
; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
308+
; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
309+
; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
310+
; SSE2-NEXT: ret <4 x i32> [[R]]
311+
;
312+
; AVX2-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
313+
; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
314+
; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
315+
; AVX2-NEXT: [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
316+
; AVX2-NEXT: ret <4 x i32> [[R]]
315317
;
316318
%gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
317319
%s = load i32, ptr %gep, align 1
@@ -320,11 +322,17 @@ define <4 x i32> @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 der
320322
}
321323

322324
define <2 x i64> @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
323-
; CHECK-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
324-
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
325-
; CHECK-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
326-
; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
327-
; CHECK-NEXT: ret <2 x i64> [[R]]
325+
; SSE2-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
326+
; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
327+
; SSE2-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
328+
; SSE2-NEXT: [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
329+
; SSE2-NEXT: ret <2 x i64> [[R]]
330+
;
331+
; AVX2-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
332+
; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
333+
; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
334+
; AVX2-NEXT: [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
335+
; AVX2-NEXT: ret <2 x i64> [[R]]
328336
;
329337
%gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
330338
%s = load i64, ptr %gep, align 1
@@ -333,11 +341,17 @@ define <2 x i64> @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(ptr align 1 der
333341
}
334342

335343
define <4 x i32> @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
336-
; CHECK-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
337-
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 11
338-
; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
339-
; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
340-
; CHECK-NEXT: ret <4 x i32> [[R]]
344+
; SSE2-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
345+
; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 11
346+
; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
347+
; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
348+
; SSE2-NEXT: ret <4 x i32> [[R]]
349+
;
350+
; AVX2-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
351+
; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
352+
; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
353+
; AVX2-NEXT: [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
354+
; AVX2-NEXT: ret <4 x i32> [[R]]
341355
;
342356
%gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 11
343357
%s = load i32, ptr %gep, align 1
@@ -346,11 +360,17 @@ define <4 x i32> @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 der
346360
}
347361

348362
define <4 x i32> @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
349-
; CHECK-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
350-
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
351-
; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
352-
; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
353-
; CHECK-NEXT: ret <4 x i32> [[R]]
363+
; SSE2-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
364+
; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
365+
; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
366+
; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
367+
; SSE2-NEXT: ret <4 x i32> [[R]]
368+
;
369+
; AVX2-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
370+
; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
371+
; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
372+
; AVX2-NEXT: [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
373+
; AVX2-NEXT: ret <4 x i32> [[R]]
354374
;
355375
%gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
356376
%s = load i32, ptr %gep, align 1
@@ -359,11 +379,17 @@ define <4 x i32> @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 der
359379
}
360380

361381
define <2 x i64> @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
362-
; CHECK-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
363-
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
364-
; CHECK-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
365-
; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
366-
; CHECK-NEXT: ret <2 x i64> [[R]]
382+
; SSE2-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
383+
; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
384+
; SSE2-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
385+
; SSE2-NEXT: [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
386+
; SSE2-NEXT: ret <2 x i64> [[R]]
387+
;
388+
; AVX2-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
389+
; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
390+
; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison>
391+
; AVX2-NEXT: [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <2 x i64>
392+
; AVX2-NEXT: ret <2 x i64> [[R]]
367393
;
368394
%gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
369395
%s = load i64, ptr %gep, align 1
@@ -372,23 +398,29 @@ define <2 x i64> @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(ptr align 1 der
372398
}
373399

374400
define <4 x i32> @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
375-
; CHECK-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
376-
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 5
377-
; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
378-
; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
379-
; CHECK-NEXT: ret <4 x i32> [[R]]
401+
; SSE2-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
402+
; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 5
403+
; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
404+
; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
405+
; SSE2-NEXT: ret <4 x i32> [[R]]
406+
;
407+
; AVX2-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
408+
; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
409+
; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
410+
; AVX2-NEXT: [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
411+
; AVX2-NEXT: ret <4 x i32> [[R]]
380412
;
381413
%gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 5
382414
%s = load i32, ptr %gep, align 1
383415
%r = insertelement <4 x i32> poison, i32 %s, i64 0
384416
ret <4 x i32> %r
385417
}
386418

387-
define <2 x i64> @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(ptr align 1 dereferenceable(16) %p) nofree nosync {
419+
define <2 x i64> @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
388420
; CHECK-LABEL: @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(
389-
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <4 x i32>, ptr [[P:%.*]], i64 0, i64 1
390-
; CHECK-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
391-
; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
421+
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
422+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
423+
; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
392424
; CHECK-NEXT: ret <2 x i64> [[R]]
393425
;
394426
%gep = getelementptr inbounds <4 x i32>, ptr %p, i64 0, i64 1

0 commit comments

Comments
 (0)