-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AMDGPU] Split wide integer dpp8 intrinsic calls #113500
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
The int_amdgcn_mov_dpp8 is declared with llvm_anyint_ty, but we can only select i32. To allow a corresponding builtin to be overloaded the same way as int_amdgcn_mov_dpp we need it to be able to split unsupported i64 values.
@llvm/pr-subscribers-backend-amdgpu Author: Stanislav Mekhanoshin (rampitec) Changes: The int_amdgcn_mov_dpp8 is declared with llvm_anyint_ty, but we can only select i32. To allow a corresponding builtin to be overloaded the same way as int_amdgcn_mov_dpp we need it to be able to split unsupported i64 values. Full diff: https://github.com/llvm/llvm-project/pull/113500.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index c49aab823b44a4..4e25f8c9464918 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -317,6 +317,7 @@ class AMDGPUCodeGenPrepareImpl
bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
bool visitMinNum(IntrinsicInst &I);
bool visitSqrt(IntrinsicInst &I);
+ bool visitMovDppIntrinsic(IntrinsicInst &I);
bool run(Function &F);
};
@@ -2099,6 +2100,8 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
return visitMinNum(I);
case Intrinsic::sqrt:
return visitSqrt(I);
+ case Intrinsic::amdgcn_mov_dpp8:
+ return visitMovDppIntrinsic(I);
default:
return false;
}
@@ -2257,6 +2260,38 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
return true;
}
+// Split unsupported wide integer calls.
+bool AMDGPUCodeGenPrepareImpl::visitMovDppIntrinsic(IntrinsicInst &I) {
+ Type *SrcTy = I.getType();
+ assert(SrcTy->isIntegerTy());
+ unsigned Size = SrcTy->getPrimitiveSizeInBits();
+ assert(Size % 32 == 0);
+ if (Size <= 32)
+ return false;
+
+ IRBuilder<> Builder(&I);
+ Builder.SetCurrentDebugLocation(I.getDebugLoc());
+ unsigned NumElt = Size / 32;
+ IntegerType *EltTy = Builder.getInt32Ty();
+ Type *VecTy = VectorType::get(EltTy, NumElt, false);
+ Value *Vec = Builder.CreateBitCast(I.getArgOperand(0), VecTy);
+
+ unsigned IID = I.getIntrinsicID();
+ SmallVector<Value *, 6> Args(I.args());
+ SmallVector<Value *, 4> Elts;
+ for (unsigned N = 0; N != NumElt; ++N) {
+ Args[0] = Builder.CreateExtractElement(Vec, N);
+ Elts.push_back(Builder.CreateIntrinsic(EltTy, IID, Args));
+ }
+
+ Value *DppVec = insertValues(Builder, VecTy, Elts);
+ Value *NewVal = Builder.CreateBitCast(DppVec, SrcTy);
+ NewVal->takeName(&I);
+ I.replaceAllUsesWith(NewVal);
+ I.eraseFromParent();
+ return true;
+}
+
bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
Impl.Mod = &M;
Impl.DL = &Impl.Mod->getDataLayout();
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
index 8bff17b7299270..35aac8533aa153 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
@@ -24,6 +24,39 @@ define amdgpu_kernel void @dpp8_wait_states(ptr addrspace(1) %out, i32 %in) {
ret void
}
+; GFX10PLUS-LABEL: {{^}}dpp8_i64:
+; GFX10PLUS: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
+define amdgpu_ps void @dpp8_i64(i64 %in, ptr addrspace(1) %out) {
+ %tmp0 = call i64 @llvm.amdgcn.mov.dpp8.i64(i64 %in, i32 1) #0
+ store i64 %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+; GFX10PLUS-LABEL: {{^}}dpp8_i128:
+; GFX10PLUS: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: v_mov_b32_dpp v3, v3 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: global_store_{{dwordx4|b128}} v[4:5], v[0:3], off
+define amdgpu_ps void @dpp8_i128(i128 %in, ptr addrspace(1) %out) {
+ %tmp0 = call i128 @llvm.amdgcn.mov.dpp8.i128(i128 %in, i32 1) #0
+ store i128 %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+; GFX10PLUS-LABEL: {{^}}dpp8_i96:
+; GFX10PLUS: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: global_store_{{dwordx3|b96}} v[3:4], v[0:2], off
+define amdgpu_ps void @dpp8_i96(i96 %in, ptr addrspace(1) %out) {
+ %tmp0 = call i96 @llvm.amdgcn.mov.dpp8.i96(i96 %in, i32 1) #0
+ store i96 %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #0
attributes #0 = { nounwind readnone convergent }
Ping
@@ -2257,6 +2260,38 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
return true;
}

// Split unsupported wide integer calls.
bool AMDGPUCodeGenPrepareImpl::visitMovDppIntrinsic(IntrinsicInst &I) {
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is an optimization pass and cannot be relied on for lowering. This should be handled in codegen proper like the other dpp intrinsics
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Didn't notice it does not happen at -O0. It looks like there is no place for it at -O0. The problem with 'other dpp intrinsics' is that it expands pseudo MI V_MOV_DPP_B64 only, past ISel, and does not handle any other types but i64. It almost calls for another pass.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is purely lowering. We already have the code to split arbitrary readfirstlane etc. intrinsics, this is no different (e.g. see 5feb32b)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks. Dropping it in favor of #114296.
The int_amdgcn_mov_dpp8 is declared with llvm_anyint_ty, but we can only select i32. To allow a corresponding builtin to be overloaded the same way as int_amdgcn_mov_dpp we need it to be able to split unsupported i64 values.