Skip to content

[AMDGPU] Split wide integer dpp8 intrinsic calls #113500

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,7 @@ class AMDGPUCodeGenPrepareImpl
bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
bool visitMinNum(IntrinsicInst &I);
bool visitSqrt(IntrinsicInst &I);
bool visitMovDppIntrinsic(IntrinsicInst &I);
bool run(Function &F);
};

Expand Down Expand Up @@ -2099,6 +2100,8 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
return visitMinNum(I);
case Intrinsic::sqrt:
return visitSqrt(I);
case Intrinsic::amdgcn_mov_dpp8:
return visitMovDppIntrinsic(I);
default:
return false;
}
Expand Down Expand Up @@ -2257,6 +2260,38 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
return true;
}

// Split wide integer dpp8 intrinsic calls that the hardware cannot perform
// directly: the underlying lane-move operates on 32 bits, so an iN call with
// N > 32 is rewritten as N/32 independent i32 calls whose results are packed
// back into the original type.
bool AMDGPUCodeGenPrepareImpl::visitMovDppIntrinsic(IntrinsicInst &I) {
  Type *Ty = I.getType();
  assert(Ty->isIntegerTy());

  // 32-bit (and narrower) calls are directly selectable; leave them alone.
  const unsigned BitWidth = Ty->getPrimitiveSizeInBits();
  if (BitWidth <= 32)
    return false;
  assert(BitWidth % 32 == 0);

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  // View the wide integer source as a vector of i32 lanes.
  const unsigned NumPieces = BitWidth / 32;
  IntegerType *PieceTy = Builder.getInt32Ty();
  Type *PieceVecTy = VectorType::get(PieceTy, NumPieces, false);
  Value *SrcVec = Builder.CreateBitCast(I.getArgOperand(0), PieceVecTy);

  // Re-issue the intrinsic once per 32-bit piece. Only the data operand
  // (operand 0) changes; the trailing immediate operands are reused as-is.
  Intrinsic::ID IID = I.getIntrinsicID();
  SmallVector<Value *, 6> CallArgs(I.args());
  SmallVector<Value *, 4> Pieces;
  for (unsigned Idx = 0; Idx != NumPieces; ++Idx) {
    CallArgs[0] = Builder.CreateExtractElement(SrcVec, Idx);
    Pieces.push_back(Builder.CreateIntrinsic(PieceTy, IID, CallArgs));
  }

  // Reassemble the per-piece results and cast back to the original width.
  Value *ResVec = insertValues(Builder, PieceVecTy, Pieces);
  Value *Packed = Builder.CreateBitCast(ResVec, Ty);
  Packed->takeName(&I);
  I.replaceAllUsesWith(Packed);
  I.eraseFromParent();
  return true;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
Impl.Mod = &M;
Impl.DL = &Impl.Mod->getDataLayout();
Expand Down
33 changes: 33 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,39 @@ define amdgpu_kernel void @dpp8_wait_states(ptr addrspace(1) %out, i32 %in) {
ret void
}

; A 64-bit dpp8 call must be split into two independent 32-bit v_mov_b32_dpp
; instructions, one per 32-bit half of the value.
; GFX10PLUS-LABEL: {{^}}dpp8_i64:
; GFX10PLUS: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
define amdgpu_ps void @dpp8_i64(i64 %in, ptr addrspace(1) %out) {
  %tmp0 = call i64 @llvm.amdgcn.mov.dpp8.i64(i64 %in, i32 1) #0
  store i64 %tmp0, ptr addrspace(1) %out
  ret void
}

; A 128-bit dpp8 call is split into four 32-bit v_mov_b32_dpp instructions,
; one per 32-bit piece of the value.
; GFX10PLUS-LABEL: {{^}}dpp8_i128:
; GFX10PLUS: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS: v_mov_b32_dpp v3, v3 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS: global_store_{{dwordx4|b128}} v[4:5], v[0:3], off
define amdgpu_ps void @dpp8_i128(i128 %in, ptr addrspace(1) %out) {
  %tmp0 = call i128 @llvm.amdgcn.mov.dpp8.i128(i128 %in, i32 1) #0
  store i128 %tmp0, ptr addrspace(1) %out
  ret void
}

; A 96-bit (non-power-of-two, but still a multiple of 32) dpp8 call is split
; into three 32-bit v_mov_b32_dpp instructions.
; GFX10PLUS-LABEL: {{^}}dpp8_i96:
; GFX10PLUS: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS: global_store_{{dwordx3|b96}} v[3:4], v[0:2], off
define amdgpu_ps void @dpp8_i96(i96 %in, ptr addrspace(1) %out) {
  %tmp0 = call i96 @llvm.amdgcn.mov.dpp8.i96(i96 %in, i32 1) #0
  store i96 %tmp0, ptr addrspace(1) %out
  ret void
}

declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #0

attributes #0 = { nounwind readnone convergent }
Loading