Skip to content

[AMDGPU][True16][CodeGen] Skip combineDpp with t16 instructions #128918

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 31, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,11 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,

bool HasVOP3DPP = ST->hasVOP3DPP();
auto OrigOp = OrigMI.getOpcode();
if (ST->useRealTrue16Insts() && AMDGPU::isTrue16Inst(OrigOp)) {
LLVM_DEBUG(
dbgs() << " failed: Did not expect any 16-bit uses of dpp values\n");
return nullptr;
}
auto DPPOp = getDPPOp(OrigOp, IsShrinkable);
if (DPPOp == -1) {
LLVM_DEBUG(dbgs() << " failed: no DPP opcode\n");
Expand Down
27 changes: 27 additions & 0 deletions llvm/test/CodeGen/AMDGPU/dpp_combine-true16.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -mattr=+real-true16 -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN
# RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -run-pass=gcn-dpp-combine -mattr=+real-true16 -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN
# XUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=gcn-dpp-combine -mattr=+real-true16 -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1150

# FIXME-TRUE16 add gfx1200 runline when we have those true16 instructions supported

---

# V_MOV_B16_t16_e64_dpp cannot be combined into its user. Both 16-bit DPP
# moves below feed a V_ADD_NC_U16_t16_e64, and the checks expect each move to
# still be present, unchanged, after gcn-dpp-combine has run.
# GCN-LABEL: name: vop3_u16
# GCN: %4:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec
# GCN: %6:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %5, 0, 1, 15, 15, 1, implicit $exec
name: vop3_u16
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2

%0:vgpr_16 = COPY $vgpr0
%1:vgpr_16 = COPY $vgpr1
%2:vgpr_16 = COPY $vgpr2
%3:vgpr_16 = IMPLICIT_DEF
%4:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec
%5:vgpr_16 = V_ADD_NC_U16_t16_e64 0, %4, 0, %3, 0, 0, implicit $exec
%6:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %5, 0, 1, 15, 15, 1, implicit $exec
%7:vgpr_16 = V_ADD_NC_U16_t16_e64 4, %6, 8, %5, 0, 0, implicit $exec
...
34 changes: 30 additions & 4 deletions llvm/test/CodeGen/AMDGPU/dpp_combine.ll
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX9GFX10
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX9GFX10
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX11-TRUE16
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX11-FAKE16
; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX11-TRUE16
; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX11-FAKE16

; GCN-LABEL: {{^}}dpp_add:
; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]],
Expand Down Expand Up @@ -63,6 +65,30 @@ define amdgpu_kernel void @dpp_mul(ptr addrspace(1) %arg) {
ret void
}

; It is not expected to see a sequence of v_mov_b32_dpp feeding into a 16 bit instruction
; Expected codegen differs per check prefix:
;  - GFX9GFX10 prefix: the DPP controls are folded into a single v_add_f16_dpp.
;  - GFX11-TRUE16 prefix: the v_mov_b32_dpp stays separate and is followed by
;    a plain v_add_f16_e32.
;  - GFX11-FAKE16 prefix: the DPP controls are folded into v_add_f16_e64_dpp.
; GCN-LABEL: {{^}}dpp_fadd_f16:
; GFX9GFX10: global_load_{{dword|b32}} [[V:v[0-9]+]],
; GFX9GFX10: v_add_f16_dpp [[V]], [[V]], [[V]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; GFX11-TRUE16: v_mov_b32_dpp {{v[0-9]+}}, {{v[0-9]+}} quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX11-TRUE16: v_add_f16_e32
; GFX11-FAKE16: global_load_{{dword|b32}} [[V:v[0-9]+]],
; GFX11-FAKE16: v_add_f16_e64_dpp [[V]], [[V]], [[V]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
define amdgpu_kernel void @dpp_fadd_f16(ptr addrspace(1) %arg) {
; Load the i32 element of %arg selected by the workitem id.
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %id
%load = load i32, ptr addrspace(1) %gep
; 32-bit DPP update using the loaded value as both the old and the source operand.
%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %load, i32 %load, i32 1, i32 15, i32 15, i1 1) #0
; Reinterpret the low 16 bits of the DPP result as a half ...
%tmp01 = trunc i32 %tmp0 to i16
%tmp1 = bitcast i16 %tmp01 to half
; ... and likewise the low 16 bits of the original load.
%tt = trunc i32 %load to i16
%t = bitcast i16 %tt to half
; The 16-bit add is the user of the DPP value that the checks above describe.
%add = fadd half %tmp1, %t
; Zero-extend the half result to i32 and store it back to the loaded address.
%tmp2 = bitcast half %add to i16
%tmp3 = zext i16 %tmp2 to i32
store i32 %tmp3, ptr addrspace(1) %gep
ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0
declare float @llvm.ceil.f32(float)
Expand Down
124 changes: 124 additions & 0 deletions llvm/test/CodeGen/AMDGPU/vopc_dpp-true16.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN

---

# 32-bit DPP moves feeding VOPC compares, run through gcn-dpp-combine with
# +real-true16. Each V_MOV_B32_dpp / compare pair in the body is one case; the
# autogenerated checks record which pairs are folded into a *_dpp compare.
name: vopc
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2

; GCN-LABEL: name: vopc
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: V_CMP_LT_F32_e32_dpp 0, [[COPY1]], 0, [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $mode, implicit $exec
; GCN-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
; GCN-NEXT: V_CMPX_GT_U32_nosdst_e64 [[V_MOV_B32_dpp]], [[COPY]], implicit-def $exec, implicit $mode, implicit $exec
; GCN-NEXT: V_CMP_CLASS_F32_e32_dpp 2, [[COPY1]], [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $exec
; GCN-NEXT: V_CMP_NGE_F32_e32_dpp 0, [[COPY1]], 0, [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $mode, implicit $exec
; GCN-NEXT: [[V_CMP_NGE_F32_e64_dpp:%[0-9]+]]:sgpr_32 = V_CMP_NGE_F32_e64_dpp 0, [[COPY1]], 0, [[COPY]], 0, 1, 15, 15, 1, implicit $mode, implicit $exec
; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 [[V_CMP_NGE_F32_e64_dpp]], 10101, implicit-def $scc
; GCN-NEXT: V_CMP_GT_I32_e32_dpp [[COPY1]], [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $exec
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = COPY $vgpr2
%3:vgpr_32 = IMPLICIT_DEF

; Case 1: e32 compare writing $vcc; expected to fold into V_CMP_LT_F32_e32_dpp.
%4:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
V_CMP_LT_F32_e32 %4, %0, implicit-def $vcc, implicit $mode, implicit $exec

; Case 2: V_CMPX writing $exec; the expected output keeps the DPP move and the
; compare as two separate instructions.
%10:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
V_CMPX_GT_U32_nosdst_e64 %10, %0, implicit-def $exec, implicit $mode, implicit $exec

; Case 3: e64 class compare whose SGPR result has no user in this block;
; expected to become V_CMP_CLASS_F32_e32_dpp defining $vcc.
%11:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
%12:sgpr_32 = V_CMP_CLASS_F32_e64 2, %11, %0, implicit $mode, implicit $exec

; Case 4: e64 compare whose SGPR result has no user in this block; expected to
; become V_CMP_NGE_F32_e32_dpp defining $vcc.
%13:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
%14:sgpr_32 = V_CMP_NGE_F32_e64 0, %13, 0, %0, 0, implicit $mode, implicit $exec

; Case 5: same compare as case 4, but its result is read by S_AND_B32; the
; expected output keeps an SGPR destination via V_CMP_NGE_F32_e64_dpp.
%17:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
%18:sgpr_32 = V_CMP_NGE_F32_e64 0, %17, 0, %0, 0, implicit $mode, implicit $exec
%19:sgpr_32 = S_AND_B32 %18, 10101, implicit-def $scc

; Case 6: the DPP value is the second source; the expected output swaps the
; sources and uses V_CMP_GT_I32_e32_dpp in place of the LT compare.
%20:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
V_CMP_LT_I32_e32 %0, %20, implicit-def $vcc, implicit $exec

...
---

# V_MOV_B16_t16_e64_dpp cannot be combined into its user. Every case below is
# a 16-bit DPP move whose result feeds a true16 compare, and the checks expect
# each move / compare pair to come out of gcn-dpp-combine unchanged.
name: vopc_16
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0_lo16, $vgpr1_hi16, $vgpr255_hi16

; GCN-LABEL: name: vopc_16
; GCN: liveins: $vgpr0_lo16, $vgpr1_hi16, $vgpr255_hi16
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_16 = COPY $vgpr0_lo16
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY $vgpr1_hi16
; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY $vgpr255_hi16
; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
; GCN-NEXT: [[V_MOV_B16_t16_e64_dpp:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64_dpp [[DEF]], 0, [[COPY1]], 0, 1, 15, 15, 1, implicit $exec
; GCN-NEXT: V_CMPX_EQ_I16_t16_nosdst_e64 0, [[V_MOV_B16_t16_e64_dpp]], 0, [[COPY]], 0, implicit-def $exec, implicit-def $vcc_lo, implicit $mode, implicit $exec
; GCN-NEXT: [[V_MOV_B16_t16_e64_dpp1:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64_dpp [[DEF]], 0, [[COPY1]], 0, 1, 15, 15, 1, implicit $exec
; GCN-NEXT: [[V_CMP_CLASS_F16_t16_e64_:%[0-9]+]]:sgpr_32 = V_CMP_CLASS_F16_t16_e64 0, [[V_MOV_B16_t16_e64_dpp1]], 0, [[COPY]], 0, implicit-def $vcc_lo, implicit $mode, implicit $exec
; GCN-NEXT: [[V_MOV_B16_t16_e64_dpp2:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64_dpp [[DEF]], 0, [[COPY1]], 0, 1, 15, 15, 1, implicit $exec
; GCN-NEXT: [[V_CMP_GE_F16_t16_e64_:%[0-9]+]]:sgpr_32 = V_CMP_GE_F16_t16_e64 1, [[V_MOV_B16_t16_e64_dpp2]], 0, [[COPY]], 1, 0, implicit $mode, implicit $exec
; GCN-NEXT: [[V_MOV_B16_t16_e64_dpp3:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64_dpp [[DEF]], 0, [[COPY1]], 0, 1, 15, 15, 1, implicit $exec
; GCN-NEXT: [[V_CMP_NGE_F16_t16_e64_:%[0-9]+]]:sgpr_32 = V_CMP_NGE_F16_t16_e64 0, [[V_MOV_B16_t16_e64_dpp3]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
%0:vgpr_16 = COPY $vgpr0_lo16
%1:vgpr_16 = COPY $vgpr1_hi16
%2:vgpr_16 = COPY $vgpr255_hi16
%3:vgpr_16 = IMPLICIT_DEF

%5:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec
V_CMPX_EQ_I16_t16_nosdst_e64 0, %5, 0, %0, 0, implicit-def $exec, implicit-def $vcc, implicit $mode, implicit $exec

%6:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec
%7:sgpr_32 = V_CMP_CLASS_F16_t16_e64 0, %6, 0, %0, 0, implicit-def $vcc, implicit $mode, implicit $exec

%8:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec
%9:sgpr_32 = V_CMP_GE_F16_t16_e64 1, %8, 0, %0, 1, 0, implicit $mode, implicit $exec

; The compare reads the DPP move result (%15), like the three cases above.
%15:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec
%16:sgpr_32 = V_CMP_NGE_F16_t16_e64 0, %15, 0, %0, 0, 0, implicit $mode, implicit $exec

...
---

# DPP moves whose two mask operands are not both 15 (15/14 on the 16-bit move,
# 13/15 on the 32-bit move). The autogenerated checks expect both moves and
# both compares to be left as they are.
# NOTE(review): the trailing immediates are presumably dpp_ctrl, row_mask,
# bank_mask, bound_ctrl in that order - confirm against the instruction defs.
name: mask_not_full
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2

; GCN-LABEL: name: mask_not_full
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GCN-NEXT: [[V_MOV_B16_t16_e64_dpp:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64_dpp [[DEF]].lo16, 0, [[COPY1]].hi16, 0, 1, 15, 14, 1, implicit $exec
; GCN-NEXT: [[V_CMP_CLASS_F16_t16_e64_:%[0-9]+]]:sgpr_32 = V_CMP_CLASS_F16_t16_e64 0, [[V_MOV_B16_t16_e64_dpp]], 0, [[COPY]].lo16, 0, implicit-def $vcc_lo, implicit $mode, implicit $exec
; GCN-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[COPY1]], 1, 13, 15, 1, implicit $exec
; GCN-NEXT: [[V_CMP_GE_F32_e64_:%[0-9]+]]:sgpr_32 = V_CMP_GE_F32_e64 1, [[V_MOV_B32_dpp]], 0, [[COPY]], 1, implicit $mode, implicit $exec
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = IMPLICIT_DEF
%3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec

; 16-bit case: the move reads .lo16/.hi16 subregisters of 32-bit vregs, with
; an IMPLICIT_DEF old value and mask operands 15, 14.
%4:vgpr_16 = V_MOV_B16_t16_e64_dpp %2.lo16, 0, %1.hi16, 0, 1, 15, 14, 1, implicit $exec
%99:sgpr_32 = V_CMP_CLASS_F16_t16_e64 0, %4, 0, %0.lo16, 0, implicit-def $vcc, implicit $mode, implicit $exec

; 32-bit case: the old value is the constant 0 from %3, with mask operands 13, 15.
%5:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 13, 15, 1, implicit $exec
%6:sgpr_32 = V_CMP_GE_F32_e64 1, %5, 0, %0, 1, implicit $mode, implicit $exec

...