Skip to content

[AMDGPU] SIPeepholeSDWA: Handle V_CNDMASK_B32_e64 #137930

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 26 commits into from
May 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
390fd92
[AMDGPU] SIPeepholeSDWA: Reject V_CNDMASK_B32_e64 instead of V_CNDMAS…
frederik-h Mar 27, 2025
c9b7002
[AMDGPU] SIPeepholeSDWA: Handle V_CNDMASK_B32_e64
frederik-h Apr 4, 2025
16e4118
Change computeRegisterLiveness use
frederik-h Apr 30, 2025
c344d14
Stop moving carry-in def instruction
frederik-h Apr 30, 2025
c100594
Handle undef carry-in operand
frederik-h Apr 30, 2025
b2a5bab
Remove extra newline from debug output
frederik-h Apr 30, 2025
65d7dd1
Rename test files to indicate the different ISAs being tested
frederik-h Apr 30, 2025
b0e665e
Use COPY instead of V_CMP_EQ for copy to VCC
frederik-h Apr 30, 2025
fc50f87
Handle wave32
frederik-h Apr 30, 2025
f05ec81
Rename sdwa-peephole-cndmask-gfx{9,10} tests
frederik-h Apr 30, 2025
3b2dc23
Unify test names
frederik-h Apr 30, 2025
f807526
Update llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
frederik-h May 2, 2025
9bea2ed
clang-format changes
frederik-h May 2, 2025
af365ee
Rename convertToImplicitVcc and move CarryDef up
frederik-h May 2, 2025
3c8bc54
Extend tests
frederik-h May 2, 2025
952881f
clang-format changes
frederik-h May 2, 2025
5c4cae5
Change test prefix
frederik-h May 2, 2025
9e406a9
Apply suggestions from code review
frederik-h May 2, 2025
a8f5dc8
Adjusts tests
frederik-h May 2, 2025
e943523
Make sure that V_CND_MASK gets handled
frederik-h May 2, 2025
d027b65
Change tests to avoid the impression that the carry-in def will be re…
frederik-h May 2, 2025
135d3a0
Always copy from carry-in operand to VCC
frederik-h May 5, 2025
8d7825a
Apply suggestions from code review
frederik-h May 5, 2025
5a98da9
Apply suggestions from code review
frederik-h May 5, 2025
b79a9ce
Adjust tests
frederik-h May 5, 2025
1975582
Rename VOP2 test file and remove "-vop3" from other test names
frederik-h May 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 68 additions & 9 deletions llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ class SIPeepholeSDWA {
std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
void pseudoOpConvertToVOP2(MachineInstr &MI,
const GCNSubtarget &ST) const;
void convertVcndmaskToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const;
MachineInstr *createSDWAVersion(MachineInstr &MI);
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
Expand Down Expand Up @@ -1037,7 +1038,8 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
return;
// Make sure VCC or its subregs are dead before MI.
MachineBasicBlock &MBB = *MI.getParent();
auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
MachineBasicBlock::LivenessQueryResult Liveness =
MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
if (Liveness != MachineBasicBlock::LQR_Dead)
return;
// Check if VCC is referenced in range of (MI,MISucc].
Expand All @@ -1061,6 +1063,52 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
}

/// Try to convert \p MI, a V_CNDMASK_B32_e64 (VOP3) instruction which takes
/// its carry-in as the explicit src2 operand, into the corresponding VOP2
/// form which expects the carry-in in VCC. To this end, a copy from the
/// carry-in operand to VCC is inserted in front of the converted
/// instruction. The conversion is only applied if \p MI can be shrunk to
/// VOP2, if a definition of the carry-in register can be found, and if VCC
/// can be proven to be dead before \p MI; otherwise \p MI is left untouched.
///
/// \param MI the V_CNDMASK_B32_e64 instruction; it is erased on success.
/// \param ST the current subtarget (not used by this function at present).
void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  assert(MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64);

  LLVM_DEBUG(dbgs() << "Attempting VOP2 conversion: " << MI);
  if (!TII->canShrink(MI, *MRI)) {
    LLVM_DEBUG(dbgs() << "Cannot shrink instruction\n");
    return;
  }

  const MachineOperand &CarryIn =
      *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  Register CarryReg = CarryIn.getReg();
  // The defining instruction itself is not modified or moved; it only has to
  // exist for the conversion to go ahead.
  MachineInstr *CarryDef = MRI->getVRegDef(CarryReg);
  if (!CarryDef) {
    LLVM_DEBUG(dbgs() << "Missing carry-in operand definition\n");
    return;
  }

  // Make sure VCC or its subregs are dead before MI, since the copy inserted
  // below overwrites it.
  MCRegister Vcc = TRI->getVCC();
  MachineBasicBlock &MBB = *MI.getParent();
  MachineBasicBlock::LivenessQueryResult Liveness =
      MBB.computeRegisterLiveness(TRI, Vcc, MI);
  if (Liveness != MachineBasicBlock::LQR_Dead) {
    LLVM_DEBUG(dbgs() << "VCC not known to be dead before instruction\n");
    return;
  }

  // Materialize the carry-in in VCC right before the new instruction.
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), Vcc).add(CarryIn);

  // Rebuild MI in its VOP2 encoding: same vdst/src0/src1 and flags, with the
  // explicit src2 operand dropped.
  auto Converted = BuildMI(MBB, MI, MI.getDebugLoc(),
                           TII->get(AMDGPU::getVOPe32(MI.getOpcode())))
                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
                       .setMIFlags(MI.getFlags());
  // The implicit VCC operand of the VOP2 instruction is created from the
  // opcode description, whereas the copy above targets TRI->getVCC(). Let the
  // target adjust the implicit operand so that both name the same register,
  // even if the instruction is not turned into SDWA form afterwards.
  TII->fixImplicitOperands(*Converted);
  LLVM_DEBUG(dbgs() << "Converted to VOP2: " << *Converted);
  MI.eraseFromParent();
}

namespace {
bool isConvertibleToSDWA(MachineInstr &MI,
const GCNSubtarget &ST,
Expand All @@ -1070,6 +1118,11 @@ bool isConvertibleToSDWA(MachineInstr &MI,
if (TII->isSDWA(Opc))
return true;

// Can only be handled after earlier conversion to
// AMDGPU::V_CNDMASK_B32_e32 which is not always possible.
if (Opc == AMDGPU::V_CNDMASK_B32_e64)
return false;

// Check if this instruction has opcode that supports SDWA
if (AMDGPU::getSDWAOp(Opc) == -1)
Opc = AMDGPU::getVOPe32(Opc);
Expand Down Expand Up @@ -1108,10 +1161,6 @@ bool isConvertibleToSDWA(MachineInstr &MI,
if (TII->pseudoToMCOpcode(Opc) == -1)
return false;

// FIXME: has SDWA but require handling of implicit VCC use
if (Opc == AMDGPU::V_CNDMASK_B32_e32)
return false;

if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
if (!Src0->isReg() && !Src0->isImm())
return false;
Expand Down Expand Up @@ -1266,7 +1315,9 @@ MachineInstr *SIPeepholeSDWA::createSDWAVersion(MachineInstr &MI) {
SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
}

return SDWAInst.getInstr();
MachineInstr *Ret = SDWAInst.getInstr();
TII->fixImplicitOperands(*Ret);
return Ret;
}

bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
Expand Down Expand Up @@ -1384,10 +1435,18 @@ bool SIPeepholeSDWA::run(MachineFunction &MF) {
for (const auto &OperandPair : SDWAOperands) {
const auto &Operand = OperandPair.second;
MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
if (PotentialMI &&
(PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64))
if (!PotentialMI)
continue;

switch (PotentialMI->getOpcode()) {
case AMDGPU::V_ADD_CO_U32_e64:
case AMDGPU::V_SUB_CO_U32_e64:
pseudoOpConvertToVOP2(*PotentialMI, ST);
break;
case AMDGPU::V_CNDMASK_B32_e64:
convertVcndmaskToVOP2(*PotentialMI, ST);
break;
};
}
SDWAOperands.clear();

Expand Down
175 changes: 74 additions & 101 deletions llvm/test/CodeGen/AMDGPU/bf16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -38481,10 +38481,7 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
Expand All @@ -38494,9 +38491,7 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
Expand All @@ -38505,11 +38500,9 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
Expand Down Expand Up @@ -38577,44 +38570,37 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo
; GFX8-LABEL: v_vselect_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5]
; GFX8-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_vselect_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5]
; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_vselect_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v3
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v0
; GFX10-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v2, s4
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
Expand Down Expand Up @@ -38771,13 +38757,12 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; GFX8-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
Expand Down Expand Up @@ -38882,14 +38867,13 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_mov_b32_e32 v3, s0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
Expand Down Expand Up @@ -40792,48 +40776,42 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo
; GFX9-LABEL: v_vselect_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1
; GFX9-NEXT: v_and_b32_e32 v1, 1, v3
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX9-NEXT: v_and_b32_e32 v1, 1, v2
; GFX9-NEXT: v_cndmask_b32_sdwa v2, v7, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v6
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
; GFX9-NEXT: s_mov_b64 vcc, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v7, v5, s[6:7]
; GFX9-NEXT: v_cndmask_b32_sdwa v3, v6, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4
; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4
; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_vselect_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v4
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v6
; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc_lo
; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v1
; GFX10-NEXT: v_and_b32_e32 v1, 1, v2
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v0
; GFX10-NEXT: v_cndmask_b32_sdwa v2, v7, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: s_mov_b32 vcc_lo, s4
; GFX10-NEXT: v_cndmask_b32_sdwa v3, v6, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v4, s5
; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc_lo
; GFX10-NEXT: v_perm_b32 v0, v3, v0, 0x5040100
; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_vselect_v4bf16:
Expand Down Expand Up @@ -41081,42 +41059,37 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo
; GFX10-LABEL: v_vselect_v8bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
; GFX10-NEXT: v_and_b32_e32 v5, 1, v5
; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v10
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v14
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GFX10-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v8
; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v12
; GFX10-NEXT: v_cndmask_b32_e32 v5, v17, v16, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX10-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc_lo
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v1
; GFX10-NEXT: v_and_b32_e32 v1, 1, v5
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v3
; GFX10-NEXT: v_and_b32_e32 v3, 1, v6
; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v1
; GFX10-NEXT: v_and_b32_e32 v1, 1, v4
; GFX10-NEXT: v_cndmask_b32_sdwa v4, v15, v11, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX10-NEXT: v_cndmask_b32_e32 v5, v14, v10, vcc_lo
; GFX10-NEXT: s_mov_b32 vcc_lo, s6
; GFX10-NEXT: v_cndmask_b32_sdwa v6, v14, v10, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: s_mov_b32 vcc_lo, s5
; GFX10-NEXT: v_cndmask_b32_sdwa v1, v13, v9, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, v14, v10, vcc_lo
; GFX10-NEXT: s_mov_b32 vcc_lo, s4
; GFX10-NEXT: v_cndmask_b32_sdwa v7, v12, v8, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX10-NEXT: v_perm_b32 v0, v7, v0, 0x5040100
; GFX10-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v9, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
; GFX10-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc_lo
; GFX10-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
; GFX10-NEXT: v_cndmask_b32_e32 v3, v15, v11, vcc_lo
; GFX10-NEXT: v_perm_b32 v2, v6, v5, 0x5040100
; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_vselect_v8bf16:
Expand Down
Loading