-
Notifications
You must be signed in to change notification settings - Fork 14.3k
AMDGPU: Handle demanded subvectors for readfirstlane #128648
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU: Handle demanded subvectors for readfirstlane #128648
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) ChangesFull diff: https://github.com/llvm/llvm-project/pull/128648.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 99016fdd0ff91..10deee1616a74 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1549,33 +1549,60 @@ Value *GCNTTIImpl::simplifyAMDGCNLaneIntrinsicDemanded(
const unsigned LastElt = DemandedElts.getActiveBits() - 1;
const unsigned MaskLen = LastElt - FirstElt + 1;
- // TODO: Handle general subvector extract.
- if (MaskLen != 1)
+ unsigned OldNumElts = VT->getNumElements();
+ if (MaskLen == OldNumElts && MaskLen != 1)
return nullptr;
Type *EltTy = VT->getElementType();
- if (!isTypeLegal(EltTy))
+ Type *NewVT = MaskLen == 1 ? EltTy : FixedVectorType::get(EltTy, MaskLen);
+
+ // Theoretically we should support these intrinsics for any legal type. Avoid
+ // introducing cases that aren't direct register types like v3i16.
+ if (!isTypeLegal(NewVT))
return nullptr;
Value *Src = II.getArgOperand(0);
- assert(FirstElt == LastElt);
- Value *Extract = IC.Builder.CreateExtractElement(Src, FirstElt);
-
// Make sure convergence tokens are preserved.
// TODO: CreateIntrinsic should allow directly copying bundles
SmallVector<OperandBundleDef, 2> OpBundles;
II.getOperandBundlesAsDefs(OpBundles);
Module *M = IC.Builder.GetInsertBlock()->getModule();
- Function *Remangled = Intrinsic::getOrInsertDeclaration(
- M, II.getIntrinsicID(), {Extract->getType()});
+ Function *Remangled =
+ Intrinsic::getOrInsertDeclaration(M, II.getIntrinsicID(), {NewVT});
+
+ if (MaskLen == 1) {
+ Value *Extract = IC.Builder.CreateExtractElement(Src, FirstElt);
+
+ // TODO: Preserve callsite attributes?
+ CallInst *NewCall = IC.Builder.CreateCall(Remangled, {Extract}, OpBundles);
+
+ Value *Result = IC.Builder.CreateInsertElement(
+ PoisonValue::get(II.getType()), NewCall, FirstElt);
+ IC.replaceInstUsesWith(II, Result);
+ IC.eraseInstFromFunction(II);
+ return Result;
+ }
+
+ SmallVector<int> ExtractMask(MaskLen, -1);
+ for (unsigned I = 0; I != MaskLen; ++I) {
+ if (DemandedElts[FirstElt + I])
+ ExtractMask[I] = FirstElt + I;
+ }
+
+ Value *Extract = IC.Builder.CreateShuffleVector(Src, ExtractMask);
// TODO: Preserve callsite attributes?
CallInst *NewCall = IC.Builder.CreateCall(Remangled, {Extract}, OpBundles);
- Value *Result = IC.Builder.CreateInsertElement(PoisonValue::get(II.getType()),
- NewCall, FirstElt);
+ SmallVector<int> InsertMask(OldNumElts, -1);
+ for (unsigned I = 0; I != MaskLen; ++I) {
+ if (DemandedElts[FirstElt + I])
+ InsertMask[FirstElt + I] = I;
+ }
+
+ Value *Result = IC.Builder.CreateShuffleVector(NewCall, InsertMask);
IC.replaceInstUsesWith(II, Result);
IC.eraseInstFromFunction(II);
return Result;
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/simplify-demanded-vector-elts-lane-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/simplify-demanded-vector-elts-lane-intrinsics.ll
index e9d3b5e963b35..056caabb6d60a 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/simplify-demanded-vector-elts-lane-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/simplify-demanded-vector-elts-lane-intrinsics.ll
@@ -64,8 +64,8 @@ define i16 @extract_elt2_v4i16_readfirstlane(<4 x i16> %src) {
define <2 x i16> @extract_elt01_v4i16_readfirstlane(<4 x i16> %src) {
; CHECK-LABEL: define <2 x i16> @extract_elt01_v4i16_readfirstlane(
; CHECK-SAME: <4 x i16> [[SRC:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[VEC:%.*]] = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> [[SRC]])
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[VEC]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[SRC]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = call <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16> [[TMP1]])
; CHECK-NEXT: ret <2 x i16> [[SHUFFLE]]
;
%vec = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> %src)
@@ -76,8 +76,8 @@ define <2 x i16> @extract_elt01_v4i16_readfirstlane(<4 x i16> %src) {
define <2 x i16> @extract_elt12_v4i16_readfirstlane(<4 x i16> %src) {
; CHECK-LABEL: define <2 x i16> @extract_elt12_v4i16_readfirstlane(
; CHECK-SAME: <4 x i16> [[SRC:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[VEC:%.*]] = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> [[SRC]])
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[VEC]], <4 x i16> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[SRC]], <4 x i16> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = call <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16> [[TMP1]])
; CHECK-NEXT: ret <2 x i16> [[SHUFFLE]]
;
%vec = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> %src)
@@ -88,8 +88,8 @@ define <2 x i16> @extract_elt12_v4i16_readfirstlane(<4 x i16> %src) {
define <2 x i16> @extract_elt23_v4i16_readfirstlane(<4 x i16> %src) {
; CHECK-LABEL: define <2 x i16> @extract_elt23_v4i16_readfirstlane(
; CHECK-SAME: <4 x i16> [[SRC:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[VEC:%.*]] = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> [[SRC]])
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[VEC]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[SRC]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = call <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16> [[TMP1]])
; CHECK-NEXT: ret <2 x i16> [[SHUFFLE]]
;
%vec = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> %src)
@@ -100,8 +100,9 @@ define <2 x i16> @extract_elt23_v4i16_readfirstlane(<4 x i16> %src) {
define <2 x i16> @extract_elt10_v4i16_readfirstlane(<4 x i16> %src) {
; CHECK-LABEL: define <2 x i16> @extract_elt10_v4i16_readfirstlane(
; CHECK-SAME: <4 x i16> [[SRC:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[VEC:%.*]] = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> [[SRC]])
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[VEC]], <4 x i16> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[SRC]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16> [[TMP1]])
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: ret <2 x i16> [[SHUFFLE]]
;
%vec = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> %src)
@@ -112,7 +113,9 @@ define <2 x i16> @extract_elt10_v4i16_readfirstlane(<4 x i16> %src) {
define <2 x i16> @extract_elt32_v4i16_readfirstlane(<4 x i16> %src) {
; CHECK-LABEL: define <2 x i16> @extract_elt32_v4i16_readfirstlane(
; CHECK-SAME: <4 x i16> [[SRC:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[VEC:%.*]] = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> [[SRC]])
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[SRC]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16> [[TMP1]])
+; CHECK-NEXT: [[VEC:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[VEC]], <4 x i16> poison, <2 x i32> <i32 3, i32 2>
; CHECK-NEXT: ret <2 x i16> [[SHUFFLE]]
;
@@ -258,8 +261,8 @@ define <3 x i16> @extract_elt123_v4i16_readfirstlane(<4 x i16> %src) {
define <3 x i32> @extract_elt012_v4i32_readfirstlane(<4 x i32> %src) {
; CHECK-LABEL: define <3 x i32> @extract_elt012_v4i32_readfirstlane(
; CHECK-SAME: <4 x i32> [[SRC:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[VEC:%.*]] = call <4 x i32> @llvm.amdgcn.readfirstlane.v4i32(<4 x i32> [[SRC]])
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[VEC]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[SRC]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = call <3 x i32> @llvm.amdgcn.readfirstlane.v3i32(<3 x i32> [[TMP1]])
; CHECK-NEXT: ret <3 x i32> [[SHUFFLE]]
;
%vec = call <4 x i32> @llvm.amdgcn.readfirstlane.v4i32(<4 x i32> %src)
@@ -270,8 +273,8 @@ define <3 x i32> @extract_elt012_v4i32_readfirstlane(<4 x i32> %src) {
define <3 x i32> @extract_elt123_v4i32_readfirstlane(<4 x i32> %src) {
; CHECK-LABEL: define <3 x i32> @extract_elt123_v4i32_readfirstlane(
; CHECK-SAME: <4 x i32> [[SRC:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[VEC:%.*]] = call <4 x i32> @llvm.amdgcn.readfirstlane.v4i32(<4 x i32> [[SRC]])
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[VEC]], <4 x i32> poison, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[SRC]], <4 x i32> poison, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = call <3 x i32> @llvm.amdgcn.readfirstlane.v3i32(<3 x i32> [[TMP1]])
; CHECK-NEXT: ret <3 x i32> [[SHUFFLE]]
;
%vec = call <4 x i32> @llvm.amdgcn.readfirstlane.v4i32(<4 x i32> %src)
@@ -282,7 +285,9 @@ define <3 x i32> @extract_elt123_v4i32_readfirstlane(<4 x i32> %src) {
define <2 x i32> @extract_elt13_v4i32_readfirstlane(<4 x i32> %src) {
; CHECK-LABEL: define <2 x i32> @extract_elt13_v4i32_readfirstlane(
; CHECK-SAME: <4 x i32> [[SRC:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[VEC:%.*]] = call <4 x i32> @llvm.amdgcn.readfirstlane.v4i32(<4 x i32> [[SRC]])
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[SRC]], <4 x i32> poison, <3 x i32> <i32 1, i32 poison, i32 3>
+; CHECK-NEXT: [[TMP2:%.*]] = call <3 x i32> @llvm.amdgcn.readfirstlane.v3i32(<3 x i32> [[TMP1]])
+; CHECK-NEXT: [[VEC:%.*]] = shufflevector <3 x i32> [[TMP2]], <3 x i32> poison, <4 x i32> <i32 poison, i32 0, i32 poison, i32 2>
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[VEC]], <4 x i32> poison, <2 x i32> <i32 1, i32 3>
; CHECK-NEXT: ret <2 x i32> [[SHUFFLE]]
;
@@ -321,8 +326,9 @@ define < 2 x i32> @extract_elt13_v4i32_readfirstlane_source_simplify1(i32 %src0,
; CHECK-LABEL: define <2 x i32> @extract_elt13_v4i32_readfirstlane_source_simplify1(
; CHECK-SAME: i32 [[SRC0:%.*]], i32 [[SRC2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[SRC0]], i64 0
-; CHECK-NEXT: [[INS_1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 poison, i32 0, i32 poison, i32 0>
-; CHECK-NEXT: [[VEC:%.*]] = call <4 x i32> @llvm.amdgcn.readfirstlane.v4i32(<4 x i32> [[INS_1]])
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <3 x i32> <i32 0, i32 poison, i32 0>
+; CHECK-NEXT: [[TMP3:%.*]] = call <3 x i32> @llvm.amdgcn.readfirstlane.v3i32(<3 x i32> [[TMP2]])
+; CHECK-NEXT: [[VEC:%.*]] = shufflevector <3 x i32> [[TMP3]], <3 x i32> poison, <4 x i32> <i32 poison, i32 0, i32 poison, i32 2>
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[VEC]], <4 x i32> poison, <2 x i32> <i32 1, i32 3>
; CHECK-NEXT: ret <2 x i32> [[SHUFFLE]]
;
@@ -364,8 +370,9 @@ define < 2 x i32> @extract_elt13_v4i32_readfirstlane_source_simplify1_convergenc
; CHECK-SAME: i32 [[SRC0:%.*]], i32 [[SRC2:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[T:%.*]] = call token @llvm.experimental.convergence.entry()
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[SRC0]], i64 0
-; CHECK-NEXT: [[INS_1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 poison, i32 0, i32 poison, i32 0>
-; CHECK-NEXT: [[VEC:%.*]] = call <4 x i32> @llvm.amdgcn.readfirstlane.v4i32(<4 x i32> [[INS_1]]) [ "convergencectrl"(token [[T]]) ]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <3 x i32> <i32 0, i32 poison, i32 0>
+; CHECK-NEXT: [[TMP3:%.*]] = call <3 x i32> @llvm.amdgcn.readfirstlane.v3i32(<3 x i32> [[TMP2]]) [ "convergencectrl"(token [[T]]) ]
+; CHECK-NEXT: [[VEC:%.*]] = shufflevector <3 x i32> [[TMP3]], <3 x i32> poison, <4 x i32> <i32 poison, i32 0, i32 poison, i32 2>
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[VEC]], <4 x i32> poison, <2 x i32> <i32 1, i32 3>
; CHECK-NEXT: ret <2 x i32> [[SHUFFLE]]
;
@@ -404,7 +411,9 @@ define <2 x i1> @extract_elt01_v4i1_readfirstlane(<4 x i1> %src) {
define <2 x i32> @extract_elt13_v8i32_readfirstlane(<8 x i32> %src) {
; CHECK-LABEL: define <2 x i32> @extract_elt13_v8i32_readfirstlane(
; CHECK-SAME: <8 x i32> [[SRC:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[VEC:%.*]] = call <8 x i32> @llvm.amdgcn.readfirstlane.v8i32(<8 x i32> [[SRC]])
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[SRC]], <8 x i32> poison, <3 x i32> <i32 1, i32 poison, i32 3>
+; CHECK-NEXT: [[TMP2:%.*]] = call <3 x i32> @llvm.amdgcn.readfirstlane.v3i32(<3 x i32> [[TMP1]])
+; CHECK-NEXT: [[VEC:%.*]] = shufflevector <3 x i32> [[TMP2]], <3 x i32> poison, <8 x i32> <i32 poison, i32 0, i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> poison, <2 x i32> <i32 1, i32 3>
; CHECK-NEXT: ret <2 x i32> [[SHUFFLE]]
;
@@ -428,7 +437,9 @@ define <2 x i32> @extract_elt03_v4i32_readfirstlane(<4 x i32> %src) {
define <3 x i32> @extract_elt124_v8i32_readfirstlane(<8 x i32> %src) {
; CHECK-LABEL: define <3 x i32> @extract_elt124_v8i32_readfirstlane(
; CHECK-SAME: <8 x i32> [[SRC:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[VEC:%.*]] = call <8 x i32> @llvm.amdgcn.readfirstlane.v8i32(<8 x i32> [[SRC]])
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[SRC]], <8 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 4>
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.amdgcn.readfirstlane.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT: [[VEC:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 poison, i32 0, i32 1, i32 poison, i32 3, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> poison, <3 x i32> <i32 1, i32 2, i32 4>
; CHECK-NEXT: ret <3 x i32> [[SHUFFLE]]
;
|
bfe67bc
to
c80695c
Compare
0c05270
to
ce66b73
Compare
ce66b73
to
2500163
Compare
2500163
to
632f45d
Compare
ping |
SmallVector<int> InsertMask(OldNumElts, -1); | ||
for (unsigned I = 0; I != MaskLen; ++I) { | ||
if (DemandedElts[FirstElt + I]) | ||
InsertMask[FirstElt + I] = I; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: Can we not fold this in upper loop ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/175/builds/14477 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/185/builds/14398 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/137/builds/14655 Here is the relevant piece of the build log for the reference
|
This turned some of our bots red, e.g., https://lab.llvm.org/buildbot/#/builders/140/builds/18458 |
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/33/builds/12608 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/153/builds/24990 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/60/builds/21386 Here is the relevant piece of the build log for the reference
|
No description provided.