-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AMDGPU][True16][CodeGen] Skip combineDpp with t16 instructions #128918
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
✅ With the latest revision this PR passed the C/C++ code formatter. |
3ad3153
to
9921cdc
Compare
@llvm/pr-subscribers-backend-amdgpu Author: Brox Chen (broxigarchen) ChangesWe only emits v_mov_b32/64_dpp. Don't combine t16 instructions with mov dpp. Update the test inputs to be legal. It is future work to emit v_mov_b16_dpp, and then update GCNDPPCombine to combine it with the 16-bit instructions. Full diff: https://github.com/llvm/llvm-project/pull/128918.diff 5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index b22babb4a00d8..5439ea2f59111 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -215,6 +215,11 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
bool HasVOP3DPP = ST->hasVOP3DPP();
auto OrigOp = OrigMI.getOpcode();
+ if (ST->useRealTrue16Insts() && AMDGPU::isTrue16Inst(OrigOp)) {
+ LLVM_DEBUG(
+ dbgs() << " failed: Did not expect any 16-bit uses of dpp values\n");
+ return nullptr;
+ }
auto DPPOp = getDPPOp(OrigOp, IsShrinkable);
if (DPPOp == -1) {
LLVM_DEBUG(dbgs() << " failed: no DPP opcode\n");
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine-true16.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine-true16.mir
new file mode 100644
index 0000000000000..792acda60620e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine-true16.mir
@@ -0,0 +1,27 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -mattr=+real-true16 -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -run-pass=gcn-dpp-combine -mattr=+real-true16 -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN
+# XUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=gcn-dpp-combine -mattr=+real-true16 -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1150
+
+# FIXME-TRUE16 add gfx1200 runline when we have those true16 instructions supported
+
+---
+
+# V_MOV_B16_t16_e64_dpp is unsupported to combine
+# GCN-label: name: vop3_u16
+# GCN: %4:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec
+# GCN: %6:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %5, 0, 1, 15, 15, 1, implicit $exec
+name: vop3_u16
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ %0:vgpr_16 = COPY $vgpr0
+ %1:vgpr_16 = COPY $vgpr1
+ %2:vgpr_16 = COPY $vgpr2
+ %3:vgpr_16 = IMPLICIT_DEF
+ %4:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec
+ %5:vgpr_16 = V_ADD_NC_U16_t16_e64 0, %4, 0, %3, 0, 0, implicit $exec
+ %6:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %5, 0, 1, 15, 15, 1, implicit $exec
+ %7:vgpr_16 = V_ADD_NC_U16_t16_e64 4, %6, 8, %5, 0, 0, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine.ll b/llvm/test/CodeGen/AMDGPU/dpp_combine.ll
index 5162092f78aca..926c2a3f12aab 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine.ll
@@ -1,7 +1,9 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX9GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX9GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX11-TRUE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX11-FAKE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX11-TRUE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX11-FAKE16
; GCN-LABEL: {{^}}dpp_add:
; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]],
@@ -63,6 +65,30 @@ define amdgpu_kernel void @dpp_mul(ptr addrspace(1) %arg) {
ret void
}
+; It is not expected to see a sequence of v_mov_b32_dpp feeding into a 16 bit instruction
+; GCN-LABEL: {{^}}dpp_fadd_f16:
+; GFX9GFX10: global_load_{{dword|b32}} [[V:v[0-9]+]],
+; GFX9GFX10: v_add_f16_dpp [[V]], [[V]], [[V]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
+; GFX11-TRUE16: v_mov_b32_dpp {{v[0-9]+}}, {{v[0-9]+}} quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX11-TRUE16: v_add_f16_e32
+; GFX11-FAKE16: global_load_{{dword|b32}} [[V:v[0-9]+]],
+; GFX11-FAKE16: v_add_f16_e64_dpp [[V]], [[V]], [[V]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
+define amdgpu_kernel void @dpp_fadd_f16(ptr addrspace(1) %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %id
+ %load = load i32, ptr addrspace(1) %gep
+ %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %load, i32 %load, i32 1, i32 15, i32 15, i1 1) #0
+ %tmp01 = trunc i32 %tmp0 to i16
+ %tmp1 = bitcast i16 %tmp01 to half
+ %tt = trunc i32 %load to i16
+ %t = bitcast i16 %tt to half
+ %add = fadd half %tmp1, %t
+ %tmp2 = bitcast half %add to i16
+ %tmp3 = zext i16 %tmp2 to i32
+ store i32 %tmp3, ptr addrspace(1) %gep
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0
declare float @llvm.ceil.f32(float)
diff --git a/llvm/test/CodeGen/AMDGPU/vopc_dpp-true16.mir b/llvm/test/CodeGen/AMDGPU/vopc_dpp-true16.mir
new file mode 100644
index 0000000000000..8f63f6c8cb1c6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vopc_dpp-true16.mir
@@ -0,0 +1,124 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN
+
+---
+
+name: vopc
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; GCN-LABEL: name: vopc
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: V_CMP_LT_F32_e32_dpp 0, [[COPY1]], 0, [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
+ ; GCN-NEXT: V_CMPX_GT_U32_nosdst_e64 [[V_MOV_B32_dpp]], [[COPY]], implicit-def $exec, implicit $mode, implicit $exec
+ ; GCN-NEXT: V_CMP_CLASS_F32_e32_dpp 2, [[COPY1]], [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $exec
+ ; GCN-NEXT: V_CMP_NGE_F32_e32_dpp 0, [[COPY1]], 0, [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_CMP_NGE_F32_e64_dpp:%[0-9]+]]:sgpr_32 = V_CMP_NGE_F32_e64_dpp 0, [[COPY1]], 0, [[COPY]], 0, 1, 15, 15, 1, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 [[V_CMP_NGE_F32_e64_dpp]], 10101, implicit-def $scc
+ ; GCN-NEXT: V_CMP_GT_I32_e32_dpp [[COPY1]], [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $exec
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = COPY $vgpr1
+ %2:vgpr_32 = COPY $vgpr2
+ %3:vgpr_32 = IMPLICIT_DEF
+
+ %4:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
+ V_CMP_LT_F32_e32 %4, %0, implicit-def $vcc, implicit $mode, implicit $exec
+
+ %10:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
+ V_CMPX_GT_U32_nosdst_e64 %10, %0, implicit-def $exec, implicit $mode, implicit $exec
+
+ %11:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
+ %12:sgpr_32 = V_CMP_CLASS_F32_e64 2, %11, %0, implicit $mode, implicit $exec
+
+ %13:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
+ %14:sgpr_32 = V_CMP_NGE_F32_e64 0, %13, 0, %0, 0, implicit $mode, implicit $exec
+
+ %17:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
+ %18:sgpr_32 = V_CMP_NGE_F32_e64 0, %17, 0, %0, 0, implicit $mode, implicit $exec
+ %19:sgpr_32 = S_AND_B32 %18, 10101, implicit-def $scc
+
+ %20:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
+ V_CMP_LT_I32_e32 %0, %20, implicit-def $vcc, implicit $exec
+
+...
+---
+
+# V_MOV_B16_t16_e64_dpp is unsupported to combine
+name: vopc_16
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_lo16, $vgpr1_hi16, $vgpr255_hi16
+
+ ; GCN-LABEL: name: vopc_16
+ ; GCN: liveins: $vgpr0_lo16, $vgpr1_hi16, $vgpr255_hi16
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_16 = COPY $vgpr0_lo16
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY $vgpr1_hi16
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY $vgpr255_hi16
+ ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_MOV_B16_t16_e64_dpp:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64_dpp [[DEF]], 0, [[COPY1]], 0, 1, 15, 15, 1, implicit $exec
+ ; GCN-NEXT: V_CMPX_EQ_I16_t16_nosdst_e64 0, [[V_MOV_B16_t16_e64_dpp]], 0, [[COPY]], 0, implicit-def $exec, implicit-def $vcc_lo, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_MOV_B16_t16_e64_dpp1:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64_dpp [[DEF]], 0, [[COPY1]], 0, 1, 15, 15, 1, implicit $exec
+ ; GCN-NEXT: [[V_CMP_CLASS_F16_t16_e64_:%[0-9]+]]:sgpr_32 = V_CMP_CLASS_F16_t16_e64 0, [[V_MOV_B16_t16_e64_dpp1]], 0, [[COPY]], 0, implicit-def $vcc_lo, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_MOV_B16_t16_e64_dpp2:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64_dpp [[DEF]], 0, [[COPY1]], 0, 1, 15, 15, 1, implicit $exec
+ ; GCN-NEXT: [[V_CMP_GE_F16_t16_e64_:%[0-9]+]]:sgpr_32 = V_CMP_GE_F16_t16_e64 1, [[V_MOV_B16_t16_e64_dpp2]], 0, [[COPY]], 1, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_MOV_B16_t16_e64_dpp3:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64_dpp [[DEF]], 0, [[COPY1]], 0, 1, 15, 15, 1, implicit $exec
+ ; GCN-NEXT: [[V_CMP_NGE_F16_t16_e64_:%[0-9]+]]:sgpr_32 = V_CMP_NGE_F16_t16_e64 0, [[V_CMP_NGE_F16_t16_e64_]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ %0:vgpr_16 = COPY $vgpr0_lo16
+ %1:vgpr_16 = COPY $vgpr1_hi16
+ %2:vgpr_16 = COPY $vgpr255_hi16
+ %3:vgpr_16 = IMPLICIT_DEF
+
+ %5:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec
+ V_CMPX_EQ_I16_t16_nosdst_e64 0, %5, 0, %0, 0, implicit-def $exec, implicit-def $vcc, implicit $mode, implicit $exec
+
+ %6:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec
+ %7:sgpr_32 = V_CMP_CLASS_F16_t16_e64 0, %6, 0, %0, 0, implicit-def $vcc, implicit $mode, implicit $exec
+
+ %8:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec
+ %9:sgpr_32 = V_CMP_GE_F16_t16_e64 1, %8, 0, %0, 1, 0, implicit $mode, implicit $exec
+
+ %15:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec
+ %16:sgpr_32 = V_CMP_NGE_F16_t16_e64 0, %16, 0, %0, 0, 0, implicit $mode, implicit $exec
+
+...
+---
+
+name: mask_not_full
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; GCN-LABEL: name: mask_not_full
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[V_MOV_B16_t16_e64_dpp:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64_dpp [[DEF]].lo16, 0, [[COPY1]].hi16, 0, 1, 15, 14, 1, implicit $exec
+ ; GCN-NEXT: [[V_CMP_CLASS_F16_t16_e64_:%[0-9]+]]:sgpr_32 = V_CMP_CLASS_F16_t16_e64 0, [[V_MOV_B16_t16_e64_dpp]], 0, [[COPY]].lo16, 0, implicit-def $vcc_lo, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[COPY1]], 1, 13, 15, 1, implicit $exec
+ ; GCN-NEXT: [[V_CMP_GE_F32_e64_:%[0-9]+]]:sgpr_32 = V_CMP_GE_F32_e64 1, [[V_MOV_B32_dpp]], 0, [[COPY]], 1, implicit $mode, implicit $exec
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = COPY $vgpr1
+ %2:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+
+ %4:vgpr_16 = V_MOV_B16_t16_e64_dpp %2.lo16, 0, %1.hi16, 0, 1, 15, 14, 1, implicit $exec
+ %99:sgpr_32 = V_CMP_CLASS_F16_t16_e64 0, %4, 0, %0.lo16, 0, implicit-def $vcc, implicit $mode, implicit $exec
+
+ %5:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 13, 15, 1, implicit $exec
+ %6:sgpr_32 = V_CMP_GE_F32_e64 1, %5, 0, %0, 1, implicit $mode, implicit $exec
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/vopc_dpp.mir b/llvm/test/CodeGen/AMDGPU/vopc_dpp.mir
index 656c849bbd56b..249a2d31e551b 100644
--- a/llvm/test/CodeGen/AMDGPU/vopc_dpp.mir
+++ b/llvm/test/CodeGen/AMDGPU/vopc_dpp.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GCN
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -mattr=-real-true16 -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GCN
---
@@ -19,13 +19,15 @@ body: |
; GCN-NEXT: V_CMP_LT_F32_e32_dpp 0, [[COPY1]], 0, [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $mode, implicit $exec
; GCN-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
; GCN-NEXT: V_CMPX_EQ_I16_fake16_nosdst_e64 [[V_MOV_B32_dpp]], [[COPY]], implicit-def $exec, implicit-def $vcc_lo, implicit $mode, implicit $exec
- ; GCN-NEXT: [[V_CMP_CLASS_F16_fake16_e64_dpp:%[0-9]+]]:sgpr_32 = V_CMP_CLASS_F16_fake16_e64_dpp 0, [[COPY1]], 0, [[COPY]], 1, 15, 15, 1, implicit $exec
- ; GCN-NEXT: [[V_CMP_GE_F16_fake16_e64_dpp:%[0-9]+]]:sgpr_32 = V_CMP_GE_F16_fake16_e64_dpp 1, [[COPY1]], 0, [[COPY]], 1, 1, 15, 15, 1, implicit $mode, implicit $exec
; GCN-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
- ; GCN-NEXT: V_CMPX_GT_U32_nosdst_e64 [[V_MOV_B32_dpp1]], [[COPY]], implicit-def $exec, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_CMP_CLASS_F16_fake16_e64_:%[0-9]+]]:sgpr_32 = V_CMP_CLASS_F16_fake16_e64 0, [[V_MOV_B32_dpp1]], 0, [[COPY]], implicit-def $vcc_lo, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
+ ; GCN-NEXT: [[V_CMP_GE_F16_fake16_e64_:%[0-9]+]]:sgpr_32 = V_CMP_GE_F16_fake16_e64 1, [[V_MOV_B32_dpp2]], 0, [[COPY]], 1, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
+ ; GCN-NEXT: V_CMPX_GT_U32_nosdst_e64 [[V_MOV_B32_dpp3]], [[COPY]], implicit-def $exec, implicit $mode, implicit $exec
; GCN-NEXT: V_CMP_CLASS_F32_e32_dpp 2, [[COPY1]], [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $exec
; GCN-NEXT: V_CMP_NGE_F32_e32_dpp 0, [[COPY1]], 0, [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $mode, implicit $exec
- ; GCN-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
+ ; GCN-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
; GCN-NEXT: [[V_CMP_NGE_F16_fake16_e64_:%[0-9]+]]:sgpr_32 = V_CMP_NGE_F16_fake16_e64 0, [[V_CMP_NGE_F16_fake16_e64_]], 0, [[COPY]], 0, implicit $mode, implicit $exec
; GCN-NEXT: [[V_CMP_NGE_F32_e64_dpp:%[0-9]+]]:sgpr_32 = V_CMP_NGE_F32_e64_dpp 0, [[COPY1]], 0, [[COPY]], 0, 1, 15, 15, 1, implicit $mode, implicit $exec
; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 [[V_CMP_NGE_F32_e64_dpp]], 10101, implicit-def $scc
|
CI error is not related |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
rebased |
…#128918) We only emits v_mov_b32/64_dpp. Don't combine t16 instructions with mov dpp. Update the test inputs to be legal. It is future work to emit v_mov_b16_dpp, and then update GCNDPPCombine to combine it with the 16-bit instructions.
We only emits v_mov_b32/64_dpp. Don't combine t16 instructions with mov dpp. Update the test inputs to be legal.
It is future work to emit v_mov_b16_dpp, and then update GCNDPPCombine to combine it with the 16-bit instructions.