Skip to content

Commit a5a9c4b

Browse files
committed
[AMDGPU] Consider i8 vectorizable chains
Change-Id: I5de8a20c835f7d7050b217e14faf65f9b93ced5e
1 parent 1aac9b9 commit a5a9c4b

File tree

3 files changed

+38
-68
lines changed

3 files changed

+38
-68
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,18 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
306306
return !F || !ST->isSingleLaneExecution(*F);
307307
}
308308

309+
/// Returns the number of parts \p Tp is split into for costing purposes.
///
/// Fixed vectors with 8-bit elements are counted as four elements per part
/// (32 bits per part). The count is rounded up: truncating division would
/// report 0 parts for vectors with fewer than four elements (e.g. <2 x i8>),
/// which is indistinguishable from the "invalid cost" result below, and would
/// under-count vectors whose length is not a multiple of four (e.g. <6 x i8>).
///
/// Every other type is answered from the type-legalization cost; 0 is returned
/// when that cost is invalid.
unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
  if (auto *VTy = dyn_cast<FixedVectorType>(Tp)) {
    if (VTy->getScalarSizeInBits() == 8) {
      auto ElCount = VTy->getElementCount().getFixedValue();
      // Ceiling division: a partially filled part still counts as one part.
      return (ElCount + 3) / 4;
    }
  }

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
  return LT.first.isValid() ? *LT.first.getValue() : 0;
}
320+
309321
unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
310322
// NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
311323
// registers. See getRegisterClassForType for the implementation.
@@ -337,9 +349,11 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
337349
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
338350
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
339351
return 32 * 4 / ElemWidth;
340-
return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
341-
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
342-
: 1;
352+
353+
return (ElemWidth == 8) ? 4
354+
: (ElemWidth == 16 && ST->has16BitInsts()) ? 2
355+
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
356+
: 1;
343357
}
344358

345359
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
117117
return TTI::PSK_FastHardware;
118118
}
119119

120+
unsigned getNumberOfParts(Type *Tp) const;
120121
unsigned getNumberOfRegisters(unsigned RCID) const;
121122
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const;
122123
unsigned getMinVectorRegisterBitWidth() const;

llvm/test/Transforms/SLPVectorizer/AMDGPU/vectorize-i8.ll

Lines changed: 20 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -362,71 +362,26 @@ define protected amdgpu_kernel void @vectorizeShuffle(<16 x i8> %invec, ptr %out
362362
;
363363
; GFX8PLUS-LABEL: @vectorizeShuffle(
364364
; GFX8PLUS-NEXT: entry:
365-
; GFX8PLUS-NEXT: [[EL0:%.*]] = extractelement <16 x i8> [[INVEC:%.*]], i64 0
366-
; GFX8PLUS-NEXT: [[EL1:%.*]] = extractelement <16 x i8> [[INVEC]], i64 1
367-
; GFX8PLUS-NEXT: [[EL2:%.*]] = extractelement <16 x i8> [[INVEC]], i64 2
368-
; GFX8PLUS-NEXT: [[EL3:%.*]] = extractelement <16 x i8> [[INVEC]], i64 3
369-
; GFX8PLUS-NEXT: [[EL4:%.*]] = extractelement <16 x i8> [[INVEC]], i64 4
370-
; GFX8PLUS-NEXT: [[EL5:%.*]] = extractelement <16 x i8> [[INVEC]], i64 5
371-
; GFX8PLUS-NEXT: [[EL6:%.*]] = extractelement <16 x i8> [[INVEC]], i64 6
372-
; GFX8PLUS-NEXT: [[EL7:%.*]] = extractelement <16 x i8> [[INVEC]], i64 7
373-
; GFX8PLUS-NEXT: [[EL8:%.*]] = extractelement <16 x i8> [[INVEC]], i64 8
374-
; GFX8PLUS-NEXT: [[EL9:%.*]] = extractelement <16 x i8> [[INVEC]], i64 9
375-
; GFX8PLUS-NEXT: [[EL10:%.*]] = extractelement <16 x i8> [[INVEC]], i64 10
376-
; GFX8PLUS-NEXT: [[EL11:%.*]] = extractelement <16 x i8> [[INVEC]], i64 11
377-
; GFX8PLUS-NEXT: [[EL12:%.*]] = extractelement <16 x i8> [[INVEC]], i64 12
378-
; GFX8PLUS-NEXT: [[EL13:%.*]] = extractelement <16 x i8> [[INVEC]], i64 13
379-
; GFX8PLUS-NEXT: [[EL14:%.*]] = extractelement <16 x i8> [[INVEC]], i64 14
380-
; GFX8PLUS-NEXT: [[EL15:%.*]] = extractelement <16 x i8> [[INVEC]], i64 15
381-
; GFX8PLUS-NEXT: [[MUL0:%.*]] = mul i8 [[EL0]], 1
382-
; GFX8PLUS-NEXT: [[MUL1:%.*]] = mul i8 [[EL1]], 1
383-
; GFX8PLUS-NEXT: [[MUL2:%.*]] = mul i8 [[EL2]], 1
384-
; GFX8PLUS-NEXT: [[MUL3:%.*]] = mul i8 [[EL3]], 1
385-
; GFX8PLUS-NEXT: [[MUL4:%.*]] = mul i8 [[EL4]], 1
386-
; GFX8PLUS-NEXT: [[MUL5:%.*]] = mul i8 [[EL5]], 1
387-
; GFX8PLUS-NEXT: [[MUL6:%.*]] = mul i8 [[EL6]], 1
388-
; GFX8PLUS-NEXT: [[MUL7:%.*]] = mul i8 [[EL7]], 1
389-
; GFX8PLUS-NEXT: [[MUL8:%.*]] = mul i8 [[EL8]], 1
390-
; GFX8PLUS-NEXT: [[MUL9:%.*]] = mul i8 [[EL9]], 1
391-
; GFX8PLUS-NEXT: [[MUL10:%.*]] = mul i8 [[EL10]], 1
392-
; GFX8PLUS-NEXT: [[MUL11:%.*]] = mul i8 [[EL11]], 1
393-
; GFX8PLUS-NEXT: [[MUL12:%.*]] = mul i8 [[EL12]], 1
394-
; GFX8PLUS-NEXT: [[MUL13:%.*]] = mul i8 [[EL13]], 1
395-
; GFX8PLUS-NEXT: [[MUL14:%.*]] = mul i8 [[EL14]], 1
396-
; GFX8PLUS-NEXT: [[MUL15:%.*]] = mul i8 [[EL15]], 1
397-
; GFX8PLUS-NEXT: [[ADD0:%.*]] = add i8 [[MUL0]], 1
398-
; GFX8PLUS-NEXT: [[ADD1:%.*]] = add i8 [[MUL1]], 1
399-
; GFX8PLUS-NEXT: [[ADD2:%.*]] = add i8 [[MUL2]], 1
400-
; GFX8PLUS-NEXT: [[ADD3:%.*]] = add i8 [[MUL3]], 1
401-
; GFX8PLUS-NEXT: [[ADD4:%.*]] = add i8 [[MUL4]], 1
402-
; GFX8PLUS-NEXT: [[ADD5:%.*]] = add i8 [[MUL5]], 1
403-
; GFX8PLUS-NEXT: [[ADD6:%.*]] = add i8 [[MUL6]], 1
404-
; GFX8PLUS-NEXT: [[ADD7:%.*]] = add i8 [[MUL7]], 1
405-
; GFX8PLUS-NEXT: [[ADD8:%.*]] = add i8 [[MUL8]], 1
406-
; GFX8PLUS-NEXT: [[ADD9:%.*]] = add i8 [[MUL9]], 1
407-
; GFX8PLUS-NEXT: [[ADD10:%.*]] = add i8 [[MUL10]], 1
408-
; GFX8PLUS-NEXT: [[ADD11:%.*]] = add i8 [[MUL11]], 1
409-
; GFX8PLUS-NEXT: [[ADD12:%.*]] = add i8 [[MUL12]], 1
410-
; GFX8PLUS-NEXT: [[ADD13:%.*]] = add i8 [[MUL13]], 1
411-
; GFX8PLUS-NEXT: [[ADD14:%.*]] = add i8 [[MUL14]], 1
412-
; GFX8PLUS-NEXT: [[ADD15:%.*]] = add i8 [[MUL15]], 1
413-
; GFX8PLUS-NEXT: [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD0]], i64 0
414-
; GFX8PLUS-NEXT: [[VECINS1:%.*]] = insertelement <16 x i8> [[VECINS0]], i8 [[ADD1]], i64 1
415-
; GFX8PLUS-NEXT: [[VECINS2:%.*]] = insertelement <16 x i8> [[VECINS1]], i8 [[ADD2]], i64 2
416-
; GFX8PLUS-NEXT: [[VECINS3:%.*]] = insertelement <16 x i8> [[VECINS2]], i8 [[ADD3]], i64 3
417-
; GFX8PLUS-NEXT: [[VECINS4:%.*]] = insertelement <16 x i8> [[VECINS3]], i8 [[ADD4]], i64 4
418-
; GFX8PLUS-NEXT: [[VECINS5:%.*]] = insertelement <16 x i8> [[VECINS4]], i8 [[ADD5]], i64 5
419-
; GFX8PLUS-NEXT: [[VECINS6:%.*]] = insertelement <16 x i8> [[VECINS5]], i8 [[ADD6]], i64 6
420-
; GFX8PLUS-NEXT: [[VECINS7:%.*]] = insertelement <16 x i8> [[VECINS6]], i8 [[ADD7]], i64 7
421-
; GFX8PLUS-NEXT: [[VECINS8:%.*]] = insertelement <16 x i8> [[VECINS7]], i8 [[ADD8]], i64 8
422-
; GFX8PLUS-NEXT: [[VECINS9:%.*]] = insertelement <16 x i8> [[VECINS8]], i8 [[ADD9]], i64 9
423-
; GFX8PLUS-NEXT: [[VECINS10:%.*]] = insertelement <16 x i8> [[VECINS9]], i8 [[ADD10]], i64 10
424-
; GFX8PLUS-NEXT: [[VECINS11:%.*]] = insertelement <16 x i8> [[VECINS10]], i8 [[ADD11]], i64 11
425-
; GFX8PLUS-NEXT: [[VECINS12:%.*]] = insertelement <16 x i8> [[VECINS11]], i8 [[ADD12]], i64 12
426-
; GFX8PLUS-NEXT: [[VECINS13:%.*]] = insertelement <16 x i8> [[VECINS12]], i8 [[ADD13]], i64 13
427-
; GFX8PLUS-NEXT: [[VECINS14:%.*]] = insertelement <16 x i8> [[VECINS13]], i8 [[ADD14]], i64 14
428-
; GFX8PLUS-NEXT: [[VECINS15:%.*]] = insertelement <16 x i8> [[VECINS14]], i8 [[ADD15]], i64 15
429-
; GFX8PLUS-NEXT: store <16 x i8> [[VECINS15]], ptr [[OUT:%.*]], align 16
365+
; GFX8PLUS-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC:%.*]], <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
366+
; GFX8PLUS-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[TMP0]], <i8 1, i8 1, i8 1, i8 1>
367+
; GFX8PLUS-NEXT: [[TMP2:%.*]] = add <4 x i8> [[TMP1]], <i8 1, i8 1, i8 1, i8 1>
368+
; GFX8PLUS-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
369+
; GFX8PLUS-NEXT: [[TMP4:%.*]] = mul <4 x i8> [[TMP3]], <i8 1, i8 1, i8 1, i8 1>
370+
; GFX8PLUS-NEXT: [[TMP5:%.*]] = add <4 x i8> [[TMP4]], <i8 1, i8 1, i8 1, i8 1>
371+
; GFX8PLUS-NEXT: [[TMP6:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
372+
; GFX8PLUS-NEXT: [[TMP7:%.*]] = mul <4 x i8> [[TMP6]], <i8 1, i8 1, i8 1, i8 1>
373+
; GFX8PLUS-NEXT: [[TMP8:%.*]] = add <4 x i8> [[TMP7]], <i8 1, i8 1, i8 1, i8 1>
374+
; GFX8PLUS-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
375+
; GFX8PLUS-NEXT: [[TMP10:%.*]] = mul <4 x i8> [[TMP9]], <i8 1, i8 1, i8 1, i8 1>
376+
; GFX8PLUS-NEXT: [[TMP11:%.*]] = add <4 x i8> [[TMP10]], <i8 1, i8 1, i8 1, i8 1>
377+
; GFX8PLUS-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
378+
; GFX8PLUS-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
379+
; GFX8PLUS-NEXT: [[VECINS71:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
380+
; GFX8PLUS-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
381+
; GFX8PLUS-NEXT: [[VECINS112:%.*]] = shufflevector <16 x i8> [[VECINS71]], <16 x i8> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
382+
; GFX8PLUS-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
383+
; GFX8PLUS-NEXT: [[VECINS153:%.*]] = shufflevector <16 x i8> [[VECINS112]], <16 x i8> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
384+
; GFX8PLUS-NEXT: store <16 x i8> [[VECINS153]], ptr [[OUT:%.*]], align 16
430385
; GFX8PLUS-NEXT: ret void
431386
;
432387
entry:

0 commit comments

Comments
 (0)