Skip to content

Commit 69ecffc

Browse files
committed
[AMDGPU] Allow SLP to analyze i8s
Change-Id: Ia995bc646e5f050083bd6277eeabe0b5ab410f47
1 parent 39f7846 commit 69ecffc

File tree

7 files changed

+943
-3
lines changed

7 files changed

+943
-3
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,18 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
306306
return !F || !ST->isSingleLaneExecution(*F);
307307
}
308308

309+
/// Returns the number of parts a value of type \p Tp is split into.
///
/// Fixed vectors whose elements are exactly 8 bits wide are counted as four
/// elements per part, rounded up, so that vectors with fewer than four
/// elements (or an element count that is not a multiple of four) never report
/// zero or too few parts. Every other type uses the first component of the
/// type-legalization cost; 0 is returned when that cost is invalid.
unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
  if (const auto *VTy = dyn_cast<FixedVectorType>(Tp)) {
    if (DL.getTypeSizeInBits(VTy->getElementType()) == 8) {
      // Presumably four 8-bit lanes share one 32-bit register -- TODO confirm.
      constexpr unsigned ElemsPerPart = 4;
      const auto ElCount = VTy->getElementCount().getFixedValue();
      // Round up: a plain ElCount / 4 truncates to 0 for e.g. <2 x i8>.
      return (ElCount + ElemsPerPart - 1) / ElemsPerPart;
    }
  }

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
  return LT.first.isValid() ? *LT.first.getValue() : 0;
}
320+
309321
unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
310322
// NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
311323
// registers. See getRegisterClassForType for the implementation.
@@ -337,9 +349,11 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
337349
// Returns the maximum vectorization factor for elements of ElemWidth bits
// used by an instruction with the given Opcode.
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
338350
// Loads and stores: as many elements as fit in 32 * 4 bits.
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
339351
return 32 * 4 / ElemWidth;
340-
// Lines marked '-' are the return expression this commit removes: it had no
// 8-bit case, so 8-bit elements fell through to 1.
return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
341-
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
342-
: 1;
352+
353+
// Lines marked '+' are the replacement: 8-bit elements always get a factor of
// 4; 16-bit and 32-bit elements get 2 only when the subtarget reports
// has16BitInsts() / hasPackedFP32Ops() respectively; everything else gets 1.
return (ElemWidth == 8) ? 4
354+
: (ElemWidth == 16 && ST->has16BitInsts()) ? 2
355+
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
356+
: 1;
343357
}
344358

345359
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
117117
return TTI::PSK_FastHardware;
118118
}
119119

120+
// Returns how many parts a value of type Tp is split into; the definition
// lives in AMDGPUTargetTransformInfo.cpp.
unsigned getNumberOfParts(Type *Tp) const;
120121
unsigned getNumberOfRegisters(unsigned RCID) const;
121122
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const;
122123
unsigned getMinVectorRegisterBitWidth() const;

llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,11 +363,66 @@ bb:
363363
ret <4 x i16> %ins.3
364364
}
365365

366+
; Extracts the four lanes of both <4 x i8> arguments, applies the scalar
; llvm.uadd.sat.i8 intrinsic lane by lane, and rebuilds the result with an
; insertelement chain that starts from poison. The GCN check lines expect the
; whole body to collapse into one llvm.uadd.sat.v4i8 call.
define <4 x i8> @uadd_sat_v4i8(<4 x i8> %arg0, <4 x i8> %arg1) {
367+
; GCN-LABEL: @uadd_sat_v4i8(
368+
; GCN-NEXT: bb:
369+
; GCN-NEXT: [[TMP0:%.*]] = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> [[ARG0:%.*]], <4 x i8> [[ARG1:%.*]])
370+
; GCN-NEXT: ret <4 x i8> [[TMP0]]
371+
;
372+
bb:
373+
%arg0.0 = extractelement <4 x i8> %arg0, i64 0
374+
%arg0.1 = extractelement <4 x i8> %arg0, i64 1
375+
%arg0.2 = extractelement <4 x i8> %arg0, i64 2
376+
%arg0.3 = extractelement <4 x i8> %arg0, i64 3
377+
%arg1.0 = extractelement <4 x i8> %arg1, i64 0
378+
%arg1.1 = extractelement <4 x i8> %arg1, i64 1
379+
%arg1.2 = extractelement <4 x i8> %arg1, i64 2
380+
%arg1.3 = extractelement <4 x i8> %arg1, i64 3
381+
%add.0 = call i8 @llvm.uadd.sat.i8(i8 %arg0.0, i8 %arg1.0)
382+
%add.1 = call i8 @llvm.uadd.sat.i8(i8 %arg0.1, i8 %arg1.1)
383+
%add.2 = call i8 @llvm.uadd.sat.i8(i8 %arg0.2, i8 %arg1.2)
384+
%add.3 = call i8 @llvm.uadd.sat.i8(i8 %arg0.3, i8 %arg1.3)
385+
%ins.0 = insertelement <4 x i8> poison, i8 %add.0, i64 0
386+
%ins.1 = insertelement <4 x i8> %ins.0, i8 %add.1, i64 1
387+
%ins.2 = insertelement <4 x i8> %ins.1, i8 %add.2, i64 2
388+
%ins.3 = insertelement <4 x i8> %ins.2, i8 %add.3, i64 3
389+
ret <4 x i8> %ins.3
390+
}
391+
392+
; Extracts the four lanes of both <4 x i8> arguments, applies the scalar
; llvm.usub.sat.i8 intrinsic lane by lane, and rebuilds the result with an
; insertelement chain that starts from poison. The GCN check lines expect the
; whole body to collapse into one llvm.usub.sat.v4i8 call.
; Note: the per-lane results are named %add.N even though they hold
; saturating-subtract values.
define <4 x i8> @usub_sat_v4i8(<4 x i8> %arg0, <4 x i8> %arg1) {
393+
; GCN-LABEL: @usub_sat_v4i8(
394+
; GCN-NEXT: bb:
395+
; GCN-NEXT: [[TMP0:%.*]] = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> [[ARG0:%.*]], <4 x i8> [[ARG1:%.*]])
396+
; GCN-NEXT: ret <4 x i8> [[TMP0]]
397+
;
398+
bb:
399+
%arg0.0 = extractelement <4 x i8> %arg0, i64 0
400+
%arg0.1 = extractelement <4 x i8> %arg0, i64 1
401+
%arg0.2 = extractelement <4 x i8> %arg0, i64 2
402+
%arg0.3 = extractelement <4 x i8> %arg0, i64 3
403+
%arg1.0 = extractelement <4 x i8> %arg1, i64 0
404+
%arg1.1 = extractelement <4 x i8> %arg1, i64 1
405+
%arg1.2 = extractelement <4 x i8> %arg1, i64 2
406+
%arg1.3 = extractelement <4 x i8> %arg1, i64 3
407+
%add.0 = call i8 @llvm.usub.sat.i8(i8 %arg0.0, i8 %arg1.0)
408+
%add.1 = call i8 @llvm.usub.sat.i8(i8 %arg0.1, i8 %arg1.1)
409+
%add.2 = call i8 @llvm.usub.sat.i8(i8 %arg0.2, i8 %arg1.2)
410+
%add.3 = call i8 @llvm.usub.sat.i8(i8 %arg0.3, i8 %arg1.3)
411+
%ins.0 = insertelement <4 x i8> poison, i8 %add.0, i64 0
412+
%ins.1 = insertelement <4 x i8> %ins.0, i8 %add.1, i64 1
413+
%ins.2 = insertelement <4 x i8> %ins.1, i8 %add.2, i64 2
414+
%ins.3 = insertelement <4 x i8> %ins.2, i8 %add.3, i64 3
415+
ret <4 x i8> %ins.3
416+
}
417+
366418
declare i16 @llvm.uadd.sat.i16(i16, i16) #0
367419
declare i16 @llvm.usub.sat.i16(i16, i16) #0
368420
declare i16 @llvm.sadd.sat.i16(i16, i16) #0
369421
declare i16 @llvm.ssub.sat.i16(i16, i16) #0
370422

423+
declare i8 @llvm.uadd.sat.i8(i8, i8) #0
424+
declare i8 @llvm.usub.sat.i8(i8, i8) #0
425+
371426
declare i32 @llvm.uadd.sat.i32(i32, i32) #0
372427
declare i32 @llvm.usub.sat.i32(i32, i32) #0
373428
declare i32 @llvm.sadd.sat.i32(i32, i32) #0

llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,11 +363,67 @@ bb:
363363
ret <4 x i16> %ins.3
364364
}
365365

366+
; Extracts the four lanes of both <4 x i8> arguments, applies the scalar
; llvm.uadd.sat.i8 intrinsic lane by lane, and rebuilds the result with an
; insertelement chain that starts from undef. The GCN check lines expect the
; whole body to collapse into one llvm.uadd.sat.v4i8 call.
; NOTE(review): the %dst parameter is never referenced in the body -- confirm
; whether it is intentional.
define <4 x i8> @uadd_sat_v4i8(<4 x i8> %arg0, <4 x i8> %arg1, ptr addrspace(1) %dst) {
367+
; GCN-LABEL: @uadd_sat_v4i8(
368+
; GCN-NEXT: bb:
369+
; GCN-NEXT: [[TMP0:%.*]] = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> [[ARG0:%.*]], <4 x i8> [[ARG1:%.*]])
370+
; GCN-NEXT: ret <4 x i8> [[TMP0]]
371+
;
372+
bb:
373+
%arg0.0 = extractelement <4 x i8> %arg0, i64 0
374+
%arg0.1 = extractelement <4 x i8> %arg0, i64 1
375+
%arg0.2 = extractelement <4 x i8> %arg0, i64 2
376+
%arg0.3 = extractelement <4 x i8> %arg0, i64 3
377+
%arg1.0 = extractelement <4 x i8> %arg1, i64 0
378+
%arg1.1 = extractelement <4 x i8> %arg1, i64 1
379+
%arg1.2 = extractelement <4 x i8> %arg1, i64 2
380+
%arg1.3 = extractelement <4 x i8> %arg1, i64 3
381+
%add.0 = call i8 @llvm.uadd.sat.i8(i8 %arg0.0, i8 %arg1.0)
382+
%add.1 = call i8 @llvm.uadd.sat.i8(i8 %arg0.1, i8 %arg1.1)
383+
%add.2 = call i8 @llvm.uadd.sat.i8(i8 %arg0.2, i8 %arg1.2)
384+
%add.3 = call i8 @llvm.uadd.sat.i8(i8 %arg0.3, i8 %arg1.3)
385+
%ins.0 = insertelement <4 x i8> undef, i8 %add.0, i64 0
386+
%ins.1 = insertelement <4 x i8> %ins.0, i8 %add.1, i64 1
387+
%ins.2 = insertelement <4 x i8> %ins.1, i8 %add.2, i64 2
388+
%ins.3 = insertelement <4 x i8> %ins.2, i8 %add.3, i64 3
389+
ret <4 x i8> %ins.3
390+
}
391+
; Extracts the four lanes of both <4 x i8> arguments, applies the scalar
; llvm.usub.sat.i8 intrinsic lane by lane, and rebuilds the result with an
; insertelement chain that starts from undef. The GCN check lines expect the
; whole body to collapse into one llvm.usub.sat.v4i8 call.
; Note: the per-lane results are named %add.N even though they hold
; saturating-subtract values.
define <4 x i8> @usub_sat_v4i8(<4 x i8> %arg0, <4 x i8> %arg1) {
392+
; GCN-LABEL: @usub_sat_v4i8(
393+
; GCN-NEXT: bb:
394+
; GCN-NEXT: [[TMP0:%.*]] = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> [[ARG0:%.*]], <4 x i8> [[ARG1:%.*]])
395+
; GCN-NEXT: ret <4 x i8> [[TMP0]]
396+
;
397+
bb:
398+
%arg0.0 = extractelement <4 x i8> %arg0, i64 0
399+
%arg0.1 = extractelement <4 x i8> %arg0, i64 1
400+
%arg0.2 = extractelement <4 x i8> %arg0, i64 2
401+
%arg0.3 = extractelement <4 x i8> %arg0, i64 3
402+
%arg1.0 = extractelement <4 x i8> %arg1, i64 0
403+
%arg1.1 = extractelement <4 x i8> %arg1, i64 1
404+
%arg1.2 = extractelement <4 x i8> %arg1, i64 2
405+
%arg1.3 = extractelement <4 x i8> %arg1, i64 3
406+
%add.0 = call i8 @llvm.usub.sat.i8(i8 %arg0.0, i8 %arg1.0)
407+
%add.1 = call i8 @llvm.usub.sat.i8(i8 %arg0.1, i8 %arg1.1)
408+
%add.2 = call i8 @llvm.usub.sat.i8(i8 %arg0.2, i8 %arg1.2)
409+
%add.3 = call i8 @llvm.usub.sat.i8(i8 %arg0.3, i8 %arg1.3)
410+
%ins.0 = insertelement <4 x i8> undef, i8 %add.0, i64 0
411+
%ins.1 = insertelement <4 x i8> %ins.0, i8 %add.1, i64 1
412+
%ins.2 = insertelement <4 x i8> %ins.1, i8 %add.2, i64 2
413+
%ins.3 = insertelement <4 x i8> %ins.2, i8 %add.3, i64 3
414+
ret <4 x i8> %ins.3
415+
416+
}
417+
418+
366419
declare i16 @llvm.uadd.sat.i16(i16, i16) #0
367420
declare i16 @llvm.usub.sat.i16(i16, i16) #0
368421
declare i16 @llvm.sadd.sat.i16(i16, i16) #0
369422
declare i16 @llvm.ssub.sat.i16(i16, i16) #0
370423

424+
declare i8 @llvm.uadd.sat.i8(i8, i8) #0
425+
declare i8 @llvm.usub.sat.i8(i8, i8) #0
426+
371427
declare i32 @llvm.uadd.sat.i32(i32, i32) #0
372428
declare i32 @llvm.usub.sat.i32(i32, i32) #0
373429
declare i32 @llvm.sadd.sat.i32(i32, i32) #0

0 commit comments

Comments
 (0)