Commit 33a7de8
[VectorCombine] Allow shuffling with bitcast when the offset is not a multiple of the load size

Previously, vectorization of a load-insert sequence failed when the offset was not a multiple of the load type's size. This patch allows it in two steps: 1. Vectorize using an element type whose size is a common divisor of the offset and the original load size. 2. Bitcast the shuffled result to the requested vector type. Alive2: https://alive2.llvm.org/ce/z/Kgr9HQ
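For illustration, this is the shape of the transform on the first updated test below (@gep01_bitcast_load_i32_from_v16i8_insert_v4i32, a 4-byte load at byte offset 1); the IR is taken from the AVX2 check lines in this commit, and the transform only fires where the cost model judges it profitable:

Before:

    %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
    %s = load i32, ptr %gep, align 1
    %r = insertelement <4 x i32> poison, i32 %s, i64 0

After:

    %tmp1 = load <16 x i8>, ptr %p, align 1
    %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
    %r = bitcast <16 x i8> %tmp2 to <4 x i32>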
1 parent 4cee120

2 files changed: +130 -54 lines changed
llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 60 additions & 16 deletions
@@ -205,6 +205,15 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   if (!canWidenLoad(Load, TTI))
     return false;
 
+  auto MaxCommonDivisor = [](int n) {
+    if (n % 4 == 0)
+      return 4;
+    if (n % 2 == 0)
+      return 2;
+    else
+      return 1;
+  };
+
   Type *ScalarTy = Scalar->getType();
   uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
   unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
@@ -219,6 +228,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   unsigned MinVecNumElts = MinVectorSize / ScalarSize;
   auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
   unsigned OffsetEltIndex = 0;
+  unsigned VectorRange = 0;
+  bool NeedCast = false;
   Align Alignment = Load->getAlign();
   if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
                                    &DT)) {
@@ -235,15 +246,27 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
     if (Offset.isNegative())
       return false;
 
-    // The offset must be a multiple of the scalar element to shuffle cleanly
-    // in the element's size.
+    // If Offset is a multiple of a Scalar element, it can be shuffled to the
+    // element's size; otherwise, Offset and Scalar must be shuffled to the
+    // appropriate element size for both.
     uint64_t ScalarSizeInBytes = ScalarSize / 8;
-    if (Offset.urem(ScalarSizeInBytes) != 0)
-      return false;
+    if (auto UnalignedBytes = Offset.urem(ScalarSizeInBytes);
+        UnalignedBytes != 0) {
+      uint64_t OldScalarSizeInBytes = ScalarSizeInBytes;
+      // Assign the greatest common divisor of UnalignedBytes and Offset to
+      // ScalarSizeInBytes.
+      ScalarSizeInBytes = MaxCommonDivisor(UnalignedBytes);
+      ScalarSize = ScalarSizeInBytes * 8;
+      VectorRange = OldScalarSizeInBytes / ScalarSizeInBytes;
+      MinVecNumElts = MinVectorSize / ScalarSize;
+      ScalarTy = Type::getIntNTy(I.getContext(), ScalarSize);
+      MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
+      NeedCast = true;
+    }
 
-    // If we load MinVecNumElts, will our target element still be loaded?
     OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
-    if (OffsetEltIndex >= MinVecNumElts)
+    // If we load MinVecNumElts, will our target element still be loaded?
+    if (OffsetEltIndex + VectorRange >= MinVecNumElts)
       return false;
 
     if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
@@ -261,11 +284,14 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
   Type *LoadTy = Load->getType();
   unsigned AS = Load->getPointerAddressSpace();
+  auto VecTy = cast<InsertElementInst>(&I)->getType();
+
   InstructionCost OldCost =
       TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
-  APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
+  APInt DemandedElts =
+      APInt::getOneBitSet(VecTy->getElementCount().getFixedValue(), 0);
   OldCost +=
-      TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
+      TTI.getScalarizationOverhead(VecTy, DemandedElts,
                                    /* Insert */ true, HasExtract, CostKind);
 
   // New pattern: load VecPtr
@@ -278,15 +304,29 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // We assume this operation has no cost in codegen if there was no offset.
   // Note that we could use freeze to avoid poison problems, but then we might
   // still need a shuffle to change the vector size.
-  auto *Ty = cast<FixedVectorType>(I.getType());
-  unsigned OutputNumElts = Ty->getNumElements();
-  SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
-  assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
-  Mask[0] = OffsetEltIndex;
+  SmallVector<int> Mask;
+  assert(OffsetEltIndex + VectorRange < MinVecNumElts &&
+         "Address offset too big");
+  if (!NeedCast) {
+    auto *Ty = cast<FixedVectorType>(I.getType());
+    unsigned OutputNumElts = Ty->getNumElements();
+    Mask.assign(OutputNumElts, PoisonMaskElem);
+    Mask[0] = OffsetEltIndex;
+  } else {
+    Mask.assign(MinVecNumElts, PoisonMaskElem);
+    for (unsigned InsertPos = 0; InsertPos < VectorRange; InsertPos++)
+      Mask[InsertPos] = OffsetEltIndex++;
+  }
+
   if (OffsetEltIndex)
     NewCost +=
         TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy, Mask, CostKind);
 
+  if (NeedCast)
+    NewCost += TTI.getCastInstrCost(Instruction::BitCast, I.getType(), MinVecTy,
+                                    TargetTransformInfo::CastContextHint::None,
+                                    CostKind);
+
   // We can aggressively convert to the vector form because the backend can
   // invert this transform if it does not result in a performance win.
   if (OldCost < NewCost || !NewCost.isValid())
@@ -295,12 +335,16 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // It is safe and potentially profitable to load a vector directly:
   // inselt undef, load Scalar, 0 --> load VecPtr
   IRBuilder<> Builder(Load);
+  Value *Result;
   Value *CastedPtr =
       Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
-  Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
-  VecLd = Builder.CreateShuffleVector(VecLd, Mask);
+  Result = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
+  Result = Builder.CreateShuffleVector(Result, Mask);
 
-  replaceValue(I, *VecLd);
+  if (NeedCast)
+    Result = Builder.CreateBitOrPointerCast(Result, I.getType());
+
+  replaceValue(I, *Result);
   ++NumVecLoad;
   return true;
 }
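To make the new arithmetic concrete, here is a walkthrough of the gep05 test from the diff below. The intermediate values are worked out by hand from the code above, assuming a 128-bit minimum vector register width (the x86 value), and are illustrative only:

    ; @gep05_bitcast_load_i32_from_v8i16_insert_v4i32: load i32 at byte offset 10.
    ;   ScalarSizeInBytes = 4, UnalignedBytes = 10 urem 4 = 2  -> offset is not element-aligned
    ;   MaxCommonDivisor(2) = 2                                -> shuffle in i16 units (ScalarSize = 16)
    ;   VectorRange = 4 / 2 = 2                                -> one i32 spans two i16 lanes
    ;   MinVecNumElts = 128 / 16 = 8                           -> widened load type is <8 x i16>
    ;   OffsetEltIndex = 10 udiv 2 = 5                         -> Mask = {5, 6, poison, ...}
    %v = load <8 x i16>, ptr %p, align 1
    %shuf = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
    %r = bitcast <8 x i16> %shuf to <4 x i32>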

llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll

Lines changed: 70 additions & 38 deletions
@@ -290,16 +290,18 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(ptr align 2 derefer
   ret <8 x i16> %r
 }
 
-; Negative test - if we are shuffling a load from the base pointer, the address offset
-; must be a multiple of element size.
-; TODO: Could bitcast around this limitation.
-
 define <4 x i32> @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
-; CHECK-NEXT:    ret <4 x i32> [[R]]
+; SSE2-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; SSE2-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+; AVX2-NEXT:    ret <4 x i32> [[R]]
 ;
   %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
   %s = load i32, ptr %gep, align 1
@@ -308,11 +310,17 @@ define <4 x i32> @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 der
 }
 
 define <2 x i64> @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i64, ptr [[GEP]], align 1
-; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
-; CHECK-NEXT:    ret <2 x i64> [[R]]
+; SSE2-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT:    [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; SSE2-NEXT:    [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
+; SSE2-NEXT:    ret <2 x i64> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+; AVX2-NEXT:    ret <2 x i64> [[R]]
 ;
   %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
   %s = load i64, ptr %gep, align 1
@@ -321,11 +329,17 @@ define <2 x i64> @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(ptr align 1 der
 }
 
 define <4 x i32> @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 11
-; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
-; CHECK-NEXT:    ret <4 x i32> [[R]]
+; SSE2-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 11
+; SSE2-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; SSE2-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+; AVX2-NEXT:    ret <4 x i32> [[R]]
 ;
   %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 11
   %s = load i32, ptr %gep, align 1
@@ -334,11 +348,17 @@ define <4 x i32> @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 der
 }
 
 define <4 x i32> @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
-; CHECK-NEXT:    ret <4 x i32> [[R]]
+; SSE2-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; SSE2-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
+; AVX2-NEXT:    ret <4 x i32> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
   %s = load i32, ptr %gep, align 1
@@ -347,11 +367,17 @@ define <4 x i32> @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 der
 }
 
 define <2 x i64> @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i64, ptr [[GEP]], align 1
-; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
-; CHECK-NEXT:    ret <2 x i64> [[R]]
+; SSE2-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT:    [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; SSE2-NEXT:    [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
+; SSE2-NEXT:    ret <2 x i64> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <2 x i64>
+; AVX2-NEXT:    ret <2 x i64> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
   %s = load i64, ptr %gep, align 1
@@ -360,23 +386,29 @@ define <2 x i64> @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(ptr align 1 der
 }
 
 define <4 x i32> @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 5
-; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
-; CHECK-NEXT:    ret <4 x i32> [[R]]
+; SSE2-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 5
+; SSE2-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; SSE2-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
+; AVX2-NEXT:    ret <4 x i32> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 5
   %s = load i32, ptr %gep, align 1
   %r = insertelement <4 x i32> poison, i32 %s, i64 0
   ret <4 x i32> %r
 }
 
-define <2 x i64> @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(ptr align 1 dereferenceable(16) %p) nofree nosync {
+define <2 x i64> @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
 ; CHECK-LABEL: @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x i32>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i64, ptr [[GEP]], align 1
-; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
 ; CHECK-NEXT:    ret <2 x i64> [[R]]
 ;
   %gep = getelementptr inbounds <4 x i32>, ptr %p, i64 0, i64 1