Skip to content

Commit af755af

Browse files
authored
AMDGPU: Handle demanded subvectors for readfirstlane (#128648)
1 parent dc69eae commit af755af

File tree

2 files changed

+66
-29
lines changed

2 files changed

+66
-29
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

Lines changed: 34 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1574,35 +1574,59 @@ Value *GCNTTIImpl::simplifyAMDGCNLaneIntrinsicDemanded(
15741574
const unsigned LastElt = DemandedElts.getActiveBits() - 1;
15751575
const unsigned MaskLen = LastElt - FirstElt + 1;
15761576

1577-
// TODO: Handle general subvector extract.
1578-
if (MaskLen != 1)
1577+
unsigned OldNumElts = VT->getNumElements();
1578+
if (MaskLen == OldNumElts && MaskLen != 1)
15791579
return nullptr;
15801580

15811581
Type *EltTy = VT->getElementType();
1582-
if (!isTypeLegal(EltTy))
1582+
Type *NewVT = MaskLen == 1 ? EltTy : FixedVectorType::get(EltTy, MaskLen);
1583+
1584+
// Theoretically we should support these intrinsics for any legal type. Avoid
1585+
// introducing cases that aren't direct register types like v3i16.
1586+
if (!isTypeLegal(NewVT))
15831587
return nullptr;
15841588

15851589
Value *Src = II.getArgOperand(0);
15861590

1587-
assert(FirstElt == LastElt);
1588-
Value *Extract = IC.Builder.CreateExtractElement(Src, FirstElt);
1589-
15901591
// Make sure convergence tokens are preserved.
15911592
// TODO: CreateIntrinsic should allow directly copying bundles
15921593
SmallVector<OperandBundleDef, 2> OpBundles;
15931594
II.getOperandBundlesAsDefs(OpBundles);
15941595

15951596
Module *M = IC.Builder.GetInsertBlock()->getModule();
1596-
Function *Remangled = Intrinsic::getOrInsertDeclaration(
1597-
M, II.getIntrinsicID(), {Extract->getType()});
1597+
Function *Remangled =
1598+
Intrinsic::getOrInsertDeclaration(M, II.getIntrinsicID(), {NewVT});
1599+
1600+
if (MaskLen == 1) {
1601+
Value *Extract = IC.Builder.CreateExtractElement(Src, FirstElt);
1602+
1603+
// TODO: Preserve callsite attributes?
1604+
CallInst *NewCall = IC.Builder.CreateCall(Remangled, {Extract}, OpBundles);
1605+
1606+
return IC.Builder.CreateInsertElement(PoisonValue::get(II.getType()),
1607+
NewCall, FirstElt);
1608+
}
1609+
1610+
SmallVector<int> ExtractMask(MaskLen, -1);
1611+
for (unsigned I = 0; I != MaskLen; ++I) {
1612+
if (DemandedElts[FirstElt + I])
1613+
ExtractMask[I] = FirstElt + I;
1614+
}
1615+
1616+
Value *Extract = IC.Builder.CreateShuffleVector(Src, ExtractMask);
15981617

15991618
// TODO: Preserve callsite attributes?
16001619
CallInst *NewCall = IC.Builder.CreateCall(Remangled, {Extract}, OpBundles);
16011620

1621+
SmallVector<int> InsertMask(OldNumElts, -1);
1622+
for (unsigned I = 0; I != MaskLen; ++I) {
1623+
if (DemandedElts[FirstElt + I])
1624+
InsertMask[FirstElt + I] = I;
1625+
}
1626+
16021627
// FIXME: If the call has a convergence bundle, we end up leaving the dead
16031628
// call behind.
1604-
return IC.Builder.CreateInsertElement(PoisonValue::get(II.getType()), NewCall,
1605-
FirstElt);
1629+
return IC.Builder.CreateShuffleVector(NewCall, InsertMask);
16061630
}
16071631

16081632
std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(

llvm/test/Transforms/InstCombine/AMDGPU/simplify-demanded-vector-elts-lane-intrinsics.ll

Lines changed: 32 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -64,8 +64,8 @@ define i16 @extract_elt2_v4i16_readfirstlane(<4 x i16> %src) {
6464
define <2 x i16> @extract_elt01_v4i16_readfirstlane(<4 x i16> %src) {
6565
; CHECK-LABEL: define <2 x i16> @extract_elt01_v4i16_readfirstlane(
6666
; CHECK-SAME: <4 x i16> [[SRC:%.*]]) #[[ATTR0]] {
67-
; CHECK-NEXT: [[VEC:%.*]] = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> [[SRC]])
68-
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[VEC]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
67+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[SRC]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
68+
; CHECK-NEXT: [[SHUFFLE:%.*]] = call <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16> [[TMP1]])
6969
; CHECK-NEXT: ret <2 x i16> [[SHUFFLE]]
7070
;
7171
%vec = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> %src)
@@ -76,8 +76,8 @@ define <2 x i16> @extract_elt01_v4i16_readfirstlane(<4 x i16> %src) {
7676
define <2 x i16> @extract_elt12_v4i16_readfirstlane(<4 x i16> %src) {
7777
; CHECK-LABEL: define <2 x i16> @extract_elt12_v4i16_readfirstlane(
7878
; CHECK-SAME: <4 x i16> [[SRC:%.*]]) #[[ATTR0]] {
79-
; CHECK-NEXT: [[VEC:%.*]] = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> [[SRC]])
80-
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[VEC]], <4 x i16> poison, <2 x i32> <i32 1, i32 2>
79+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[SRC]], <4 x i16> poison, <2 x i32> <i32 1, i32 2>
80+
; CHECK-NEXT: [[SHUFFLE:%.*]] = call <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16> [[TMP1]])
8181
; CHECK-NEXT: ret <2 x i16> [[SHUFFLE]]
8282
;
8383
%vec = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> %src)
@@ -88,8 +88,8 @@ define <2 x i16> @extract_elt12_v4i16_readfirstlane(<4 x i16> %src) {
8888
define <2 x i16> @extract_elt23_v4i16_readfirstlane(<4 x i16> %src) {
8989
; CHECK-LABEL: define <2 x i16> @extract_elt23_v4i16_readfirstlane(
9090
; CHECK-SAME: <4 x i16> [[SRC:%.*]]) #[[ATTR0]] {
91-
; CHECK-NEXT: [[VEC:%.*]] = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> [[SRC]])
92-
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[VEC]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
91+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[SRC]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
92+
; CHECK-NEXT: [[SHUFFLE:%.*]] = call <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16> [[TMP1]])
9393
; CHECK-NEXT: ret <2 x i16> [[SHUFFLE]]
9494
;
9595
%vec = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> %src)
@@ -100,8 +100,9 @@ define <2 x i16> @extract_elt23_v4i16_readfirstlane(<4 x i16> %src) {
100100
define <2 x i16> @extract_elt10_v4i16_readfirstlane(<4 x i16> %src) {
101101
; CHECK-LABEL: define <2 x i16> @extract_elt10_v4i16_readfirstlane(
102102
; CHECK-SAME: <4 x i16> [[SRC:%.*]]) #[[ATTR0]] {
103-
; CHECK-NEXT: [[VEC:%.*]] = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> [[SRC]])
104-
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[VEC]], <4 x i16> poison, <2 x i32> <i32 1, i32 0>
103+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[SRC]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
104+
; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16> [[TMP1]])
105+
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> <i32 1, i32 0>
105106
; CHECK-NEXT: ret <2 x i16> [[SHUFFLE]]
106107
;
107108
%vec = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> %src)
@@ -112,7 +113,9 @@ define <2 x i16> @extract_elt10_v4i16_readfirstlane(<4 x i16> %src) {
112113
define <2 x i16> @extract_elt32_v4i16_readfirstlane(<4 x i16> %src) {
113114
; CHECK-LABEL: define <2 x i16> @extract_elt32_v4i16_readfirstlane(
114115
; CHECK-SAME: <4 x i16> [[SRC:%.*]]) #[[ATTR0]] {
115-
; CHECK-NEXT: [[VEC:%.*]] = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> [[SRC]])
116+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[SRC]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
117+
; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16> [[TMP1]])
118+
; CHECK-NEXT: [[VEC:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
116119
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[VEC]], <4 x i16> poison, <2 x i32> <i32 3, i32 2>
117120
; CHECK-NEXT: ret <2 x i16> [[SHUFFLE]]
118121
;
@@ -258,8 +261,8 @@ define <3 x i16> @extract_elt123_v4i16_readfirstlane(<4 x i16> %src) {
258261
define <3 x i32> @extract_elt012_v4i32_readfirstlane(<4 x i32> %src) {
259262
; CHECK-LABEL: define <3 x i32> @extract_elt012_v4i32_readfirstlane(
260263
; CHECK-SAME: <4 x i32> [[SRC:%.*]]) #[[ATTR0]] {
261-
; CHECK-NEXT: [[VEC:%.*]] = call <4 x i32> @llvm.amdgcn.readfirstlane.v4i32(<4 x i32> [[SRC]])
262-
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[VEC]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
264+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[SRC]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
265+
; CHECK-NEXT: [[SHUFFLE:%.*]] = call <3 x i32> @llvm.amdgcn.readfirstlane.v3i32(<3 x i32> [[TMP1]])
263266
; CHECK-NEXT: ret <3 x i32> [[SHUFFLE]]
264267
;
265268
%vec = call <4 x i32> @llvm.amdgcn.readfirstlane.v4i32(<4 x i32> %src)
@@ -270,8 +273,8 @@ define <3 x i32> @extract_elt012_v4i32_readfirstlane(<4 x i32> %src) {
270273
define <3 x i32> @extract_elt123_v4i32_readfirstlane(<4 x i32> %src) {
271274
; CHECK-LABEL: define <3 x i32> @extract_elt123_v4i32_readfirstlane(
272275
; CHECK-SAME: <4 x i32> [[SRC:%.*]]) #[[ATTR0]] {
273-
; CHECK-NEXT: [[VEC:%.*]] = call <4 x i32> @llvm.amdgcn.readfirstlane.v4i32(<4 x i32> [[SRC]])
274-
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[VEC]], <4 x i32> poison, <3 x i32> <i32 1, i32 2, i32 3>
276+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[SRC]], <4 x i32> poison, <3 x i32> <i32 1, i32 2, i32 3>
277+
; CHECK-NEXT: [[SHUFFLE:%.*]] = call <3 x i32> @llvm.amdgcn.readfirstlane.v3i32(<3 x i32> [[TMP1]])
275278
; CHECK-NEXT: ret <3 x i32> [[SHUFFLE]]
276279
;
277280
%vec = call <4 x i32> @llvm.amdgcn.readfirstlane.v4i32(<4 x i32> %src)
@@ -282,7 +285,9 @@ define <3 x i32> @extract_elt123_v4i32_readfirstlane(<4 x i32> %src) {
282285
define <2 x i32> @extract_elt13_v4i32_readfirstlane(<4 x i32> %src) {
283286
; CHECK-LABEL: define <2 x i32> @extract_elt13_v4i32_readfirstlane(
284287
; CHECK-SAME: <4 x i32> [[SRC:%.*]]) #[[ATTR0]] {
285-
; CHECK-NEXT: [[VEC:%.*]] = call <4 x i32> @llvm.amdgcn.readfirstlane.v4i32(<4 x i32> [[SRC]])
288+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[SRC]], <4 x i32> poison, <3 x i32> <i32 1, i32 poison, i32 3>
289+
; CHECK-NEXT: [[TMP2:%.*]] = call <3 x i32> @llvm.amdgcn.readfirstlane.v3i32(<3 x i32> [[TMP1]])
290+
; CHECK-NEXT: [[VEC:%.*]] = shufflevector <3 x i32> [[TMP2]], <3 x i32> poison, <4 x i32> <i32 poison, i32 0, i32 poison, i32 2>
286291
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[VEC]], <4 x i32> poison, <2 x i32> <i32 1, i32 3>
287292
; CHECK-NEXT: ret <2 x i32> [[SHUFFLE]]
288293
;
@@ -321,8 +326,9 @@ define < 2 x i32> @extract_elt13_v4i32_readfirstlane_source_simplify1(i32 %src0,
321326
; CHECK-LABEL: define <2 x i32> @extract_elt13_v4i32_readfirstlane_source_simplify1(
322327
; CHECK-SAME: i32 [[SRC0:%.*]], i32 [[SRC2:%.*]]) #[[ATTR0]] {
323328
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[SRC0]], i64 0
324-
; CHECK-NEXT: [[INS_1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 poison, i32 0, i32 poison, i32 0>
325-
; CHECK-NEXT: [[VEC:%.*]] = call <4 x i32> @llvm.amdgcn.readfirstlane.v4i32(<4 x i32> [[INS_1]])
329+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <3 x i32> <i32 0, i32 poison, i32 0>
330+
; CHECK-NEXT: [[TMP3:%.*]] = call <3 x i32> @llvm.amdgcn.readfirstlane.v3i32(<3 x i32> [[TMP2]])
331+
; CHECK-NEXT: [[VEC:%.*]] = shufflevector <3 x i32> [[TMP3]], <3 x i32> poison, <4 x i32> <i32 poison, i32 0, i32 poison, i32 2>
326332
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[VEC]], <4 x i32> poison, <2 x i32> <i32 1, i32 3>
327333
; CHECK-NEXT: ret <2 x i32> [[SHUFFLE]]
328334
;
@@ -365,7 +371,10 @@ define < 2 x i32> @extract_elt13_v4i32_readfirstlane_source_simplify1_convergenc
365371
; CHECK-NEXT: [[T:%.*]] = call token @llvm.experimental.convergence.entry()
366372
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[SRC0]], i64 0
367373
; CHECK-NEXT: [[INS_1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 poison, i32 0, i32 poison, i32 0>
368-
; CHECK-NEXT: [[VEC:%.*]] = call <4 x i32> @llvm.amdgcn.readfirstlane.v4i32(<4 x i32> [[INS_1]]) [ "convergencectrl"(token [[T]]) ]
374+
; CHECK-NEXT: [[VEC1:%.*]] = call <4 x i32> @llvm.amdgcn.readfirstlane.v4i32(<4 x i32> [[INS_1]]) [ "convergencectrl"(token [[T]]) ]
375+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <3 x i32> <i32 0, i32 poison, i32 0>
376+
; CHECK-NEXT: [[TMP3:%.*]] = call <3 x i32> @llvm.amdgcn.readfirstlane.v3i32(<3 x i32> [[TMP2]]) [ "convergencectrl"(token [[T]]) ]
377+
; CHECK-NEXT: [[VEC:%.*]] = shufflevector <3 x i32> [[TMP3]], <3 x i32> poison, <4 x i32> <i32 poison, i32 0, i32 poison, i32 2>
369378
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[VEC]], <4 x i32> poison, <2 x i32> <i32 1, i32 3>
370379
; CHECK-NEXT: ret <2 x i32> [[SHUFFLE]]
371380
;
@@ -404,7 +413,9 @@ define <2 x i1> @extract_elt01_v4i1_readfirstlane(<4 x i1> %src) {
404413
define <2 x i32> @extract_elt13_v8i32_readfirstlane(<8 x i32> %src) {
405414
; CHECK-LABEL: define <2 x i32> @extract_elt13_v8i32_readfirstlane(
406415
; CHECK-SAME: <8 x i32> [[SRC:%.*]]) #[[ATTR0]] {
407-
; CHECK-NEXT: [[VEC:%.*]] = call <8 x i32> @llvm.amdgcn.readfirstlane.v8i32(<8 x i32> [[SRC]])
416+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[SRC]], <8 x i32> poison, <3 x i32> <i32 1, i32 poison, i32 3>
417+
; CHECK-NEXT: [[TMP2:%.*]] = call <3 x i32> @llvm.amdgcn.readfirstlane.v3i32(<3 x i32> [[TMP1]])
418+
; CHECK-NEXT: [[VEC:%.*]] = shufflevector <3 x i32> [[TMP2]], <3 x i32> poison, <8 x i32> <i32 poison, i32 0, i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 poison>
408419
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> poison, <2 x i32> <i32 1, i32 3>
409420
; CHECK-NEXT: ret <2 x i32> [[SHUFFLE]]
410421
;
@@ -428,7 +439,9 @@ define <2 x i32> @extract_elt03_v4i32_readfirstlane(<4 x i32> %src) {
428439
define <3 x i32> @extract_elt124_v8i32_readfirstlane(<8 x i32> %src) {
429440
; CHECK-LABEL: define <3 x i32> @extract_elt124_v8i32_readfirstlane(
430441
; CHECK-SAME: <8 x i32> [[SRC:%.*]]) #[[ATTR0]] {
431-
; CHECK-NEXT: [[VEC:%.*]] = call <8 x i32> @llvm.amdgcn.readfirstlane.v8i32(<8 x i32> [[SRC]])
442+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[SRC]], <8 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 4>
443+
; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.amdgcn.readfirstlane.v4i32(<4 x i32> [[TMP1]])
444+
; CHECK-NEXT: [[VEC:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 poison, i32 0, i32 1, i32 poison, i32 3, i32 poison, i32 poison, i32 poison>
432445
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> poison, <3 x i32> <i32 1, i32 2, i32 4>
433446
; CHECK-NEXT: ret <3 x i32> [[SHUFFLE]]
434447
;

0 commit comments

Comments
 (0)