Skip to content

Commit cbdbba9

Browse files
IanWood1 authored and Muzammiluddin-Syed-ECE committed
Revert "[AMDGPU] SIPeepholeSDWA: Handle V_CNDMASK_B32_e64 (llvm#137930)"
This reverts commit 721cba4. Signed-off-by: Ian Wood <[email protected]>
1 parent 311a66e commit cbdbba9

21 files changed

+2017
-2453
lines changed

llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp

Lines changed: 9 additions & 68 deletions
Original file line number | Diff line number | Diff line change
@@ -62,7 +62,6 @@ class SIPeepholeSDWA {
6262
std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
6363
void pseudoOpConvertToVOP2(MachineInstr &MI,
6464
const GCNSubtarget &ST) const;
65-
void convertVcndmaskToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const;
6665
MachineInstr *createSDWAVersion(MachineInstr &MI);
6766
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
6867
void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
@@ -1038,8 +1037,7 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
10381037
return;
10391038
// Make sure VCC or its subregs are dead before MI.
10401039
MachineBasicBlock &MBB = *MI.getParent();
1041-
MachineBasicBlock::LivenessQueryResult Liveness =
1042-
MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
1040+
auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
10431041
if (Liveness != MachineBasicBlock::LQR_Dead)
10441042
return;
10451043
// Check if VCC is referenced in range of (MI,MISucc].
@@ -1063,52 +1061,6 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
10631061
MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
10641062
}
10651063

1066-
/// Try to convert an \p MI in VOP3 which takes an src2 carry-in
1067-
/// operand into the corresponding VOP2 form which expects the
1068-
/// argument in VCC. To this end, add an copy from the carry-in to
1069-
/// VCC. The conversion will only be applied if \p MI can be shrunk
1070-
/// to VOP2 and if VCC can be proven to be dead before \p MI.
1071-
void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI,
1072-
const GCNSubtarget &ST) const {
1073-
assert(MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64);
1074-
1075-
LLVM_DEBUG(dbgs() << "Attempting VOP2 conversion: " << MI);
1076-
if (!TII->canShrink(MI, *MRI)) {
1077-
LLVM_DEBUG(dbgs() << "Cannot shrink instruction\n");
1078-
return;
1079-
}
1080-
1081-
const MachineOperand &CarryIn =
1082-
*TII->getNamedOperand(MI, AMDGPU::OpName::src2);
1083-
Register CarryReg = CarryIn.getReg();
1084-
MachineInstr *CarryDef = MRI->getVRegDef(CarryReg);
1085-
if (!CarryDef) {
1086-
LLVM_DEBUG(dbgs() << "Missing carry-in operand definition\n");
1087-
return;
1088-
}
1089-
1090-
// Make sure VCC or its subregs are dead before MI.
1091-
MCRegister Vcc = TRI->getVCC();
1092-
MachineBasicBlock &MBB = *MI.getParent();
1093-
MachineBasicBlock::LivenessQueryResult Liveness =
1094-
MBB.computeRegisterLiveness(TRI, Vcc, MI);
1095-
if (Liveness != MachineBasicBlock::LQR_Dead) {
1096-
LLVM_DEBUG(dbgs() << "VCC not known to be dead before instruction\n");
1097-
return;
1098-
}
1099-
1100-
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), Vcc).add(CarryIn);
1101-
1102-
auto Converted = BuildMI(MBB, MI, MI.getDebugLoc(),
1103-
TII->get(AMDGPU::getVOPe32(MI.getOpcode())))
1104-
.add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
1105-
.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
1106-
.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
1107-
.setMIFlags(MI.getFlags());
1108-
LLVM_DEBUG(dbgs() << "Converted to VOP2: " << *Converted);
1109-
MI.eraseFromParent();
1110-
}
1111-
11121064
namespace {
11131065
bool isConvertibleToSDWA(MachineInstr &MI,
11141066
const GCNSubtarget &ST,
@@ -1118,11 +1070,6 @@ bool isConvertibleToSDWA(MachineInstr &MI,
11181070
if (TII->isSDWA(Opc))
11191071
return true;
11201072

1121-
// Can only be handled after ealier conversion to
1122-
// AMDGPU::V_CNDMASK_B32_e32 which is not always possible.
1123-
if (Opc == AMDGPU::V_CNDMASK_B32_e64)
1124-
return false;
1125-
11261073
// Check if this instruction has opcode that supports SDWA
11271074
if (AMDGPU::getSDWAOp(Opc) == -1)
11281075
Opc = AMDGPU::getVOPe32(Opc);
@@ -1161,6 +1108,10 @@ bool isConvertibleToSDWA(MachineInstr &MI,
11611108
if (TII->pseudoToMCOpcode(Opc) == -1)
11621109
return false;
11631110

1111+
// FIXME: has SDWA but require handling of implicit VCC use
1112+
if (Opc == AMDGPU::V_CNDMASK_B32_e32)
1113+
return false;
1114+
11641115
if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
11651116
if (!Src0->isReg() && !Src0->isImm())
11661117
return false;
@@ -1315,9 +1266,7 @@ MachineInstr *SIPeepholeSDWA::createSDWAVersion(MachineInstr &MI) {
13151266
SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
13161267
}
13171268

1318-
MachineInstr *Ret = SDWAInst.getInstr();
1319-
TII->fixImplicitOperands(*Ret);
1320-
return Ret;
1269+
return SDWAInst.getInstr();
13211270
}
13221271

13231272
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
@@ -1435,18 +1384,10 @@ bool SIPeepholeSDWA::run(MachineFunction &MF) {
14351384
for (const auto &OperandPair : SDWAOperands) {
14361385
const auto &Operand = OperandPair.second;
14371386
MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
1438-
if (!PotentialMI)
1439-
continue;
1440-
1441-
switch (PotentialMI->getOpcode()) {
1442-
case AMDGPU::V_ADD_CO_U32_e64:
1443-
case AMDGPU::V_SUB_CO_U32_e64:
1387+
if (PotentialMI &&
1388+
(PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
1389+
PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64))
14441390
pseudoOpConvertToVOP2(*PotentialMI, ST);
1445-
break;
1446-
case AMDGPU::V_CNDMASK_B32_e64:
1447-
convertVcndmaskToVOP2(*PotentialMI, ST);
1448-
break;
1449-
};
14501391
}
14511392
SDWAOperands.clear();
14521393

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 101 additions & 74 deletions
Original file line number | Diff line number | Diff line change
@@ -38481,7 +38481,10 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
3848138481
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
3848238482
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
3848338483
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
38484-
; GFX8-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
38484+
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
38485+
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
38486+
; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
38487+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
3848538488
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3848638489
; GFX8-NEXT: s_setpc_b64 s[30:31]
3848738490
;
@@ -38491,7 +38494,9 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
3849138494
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
3849238495
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
3849338496
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
38494-
; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
38497+
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
38498+
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
38499+
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
3849538500
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
3849638501
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
3849738502
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -38500,9 +38505,11 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
3850038505
; GFX10: ; %bb.0:
3850138506
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3850238507
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
38508+
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1
38509+
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2
3850338510
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
3850438511
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
38505-
; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
38512+
; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc_lo
3850638513
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
3850738514
; GFX10-NEXT: s_setpc_b64 s[30:31]
3850838515
;
@@ -38570,37 +38577,44 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo
3857038577
; GFX8-LABEL: v_vselect_v2bf16:
3857138578
; GFX8: ; %bb.0:
3857238579
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38573-
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
3857438580
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
38581+
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
38582+
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
38583+
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
38584+
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
38585+
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
3857538586
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
38576-
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
38577-
; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5]
38578-
; GFX8-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
38587+
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
38588+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
3857938589
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3858038590
; GFX8-NEXT: s_setpc_b64 s[30:31]
3858138591
;
3858238592
; GFX9-LABEL: v_vselect_v2bf16:
3858338593
; GFX9: ; %bb.0:
3858438594
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38585-
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
3858638595
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
38596+
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
38597+
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
38598+
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
38599+
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
38600+
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
3858738601
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
38588-
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
38589-
; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5]
38590-
; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
38602+
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
3859138603
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
3859238604
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
3859338605
; GFX9-NEXT: s_setpc_b64 s[30:31]
3859438606
;
3859538607
; GFX10-LABEL: v_vselect_v2bf16:
3859638608
; GFX10: ; %bb.0:
3859738609
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38598-
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
3859938610
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
38611+
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
38612+
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2
38613+
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v3
38614+
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
38615+
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo
3860038616
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
38601-
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v0
38602-
; GFX10-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
38603-
; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v2, s4
38617+
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo
3860438618
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
3860538619
; GFX10-NEXT: s_setpc_b64 s[30:31]
3860638620
;
@@ -38757,12 +38771,13 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
3875738771
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
3875838772
; GFX8-NEXT: v_mov_b32_e32 v1, s3
3875938773
; GFX8-NEXT: v_mov_b32_e32 v2, s2
38760-
; GFX8-NEXT: v_mov_b32_e32 v3, s1
38761-
; GFX8-NEXT: v_mov_b32_e32 v4, s0
3876238774
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
38763-
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
38764-
; GFX8-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
38765-
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
38775+
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
38776+
; GFX8-NEXT: v_mov_b32_e32 v1, s1
38777+
; GFX8-NEXT: v_mov_b32_e32 v2, s0
38778+
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
38779+
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
38780+
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3876638781
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
3876738782
; GFX8-NEXT: ; return to shader part epilog
3876838783
;
@@ -38867,13 +38882,14 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
3886738882
; GFX8: ; %bb.0:
3886838883
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
3886938884
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
38885+
; GFX8-NEXT: v_mov_b32_e32 v2, s3
38886+
; GFX8-NEXT: v_mov_b32_e32 v3, s2
3887038887
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
38871-
; GFX8-NEXT: v_mov_b32_e32 v1, s3
38872-
; GFX8-NEXT: v_mov_b32_e32 v2, s2
38873-
; GFX8-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
38888+
; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
3887438889
; GFX8-NEXT: v_mov_b32_e32 v2, s1
3887538890
; GFX8-NEXT: v_mov_b32_e32 v3, s0
3887638891
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
38892+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
3887738893
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
3887838894
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3887938895
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
@@ -40776,42 +40792,48 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo
4077640792
; GFX9-LABEL: v_vselect_v4bf16:
4077740793
; GFX9: ; %bb.0:
4077840794
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40779-
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
40780-
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1
40781-
; GFX9-NEXT: v_and_b32_e32 v1, 1, v3
40795+
; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
40796+
; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
40797+
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
4078240798
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
40783-
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
40784-
; GFX9-NEXT: v_and_b32_e32 v1, 1, v2
40785-
; GFX9-NEXT: v_cndmask_b32_sdwa v2, v7, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
40799+
; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc
40800+
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
40801+
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
40802+
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
40803+
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
40804+
; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc
4078640805
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
40787-
; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v1
4078840806
; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc
40789-
; GFX9-NEXT: s_mov_b64 vcc, s[4:5]
40790-
; GFX9-NEXT: v_cndmask_b32_e64 v1, v7, v5, s[6:7]
40791-
; GFX9-NEXT: v_cndmask_b32_sdwa v3, v6, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
40807+
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
40808+
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v6
40809+
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
40810+
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
4079240811
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
40793-
; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4
40794-
; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4
40812+
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
40813+
; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4
4079540814
; GFX9-NEXT: s_setpc_b64 s[30:31]
4079640815
;
4079740816
; GFX10-LABEL: v_vselect_v4bf16:
4079840817
; GFX10: ; %bb.0:
4079940818
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40800-
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
40801-
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
40819+
; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
4080240820
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
40803-
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
40804-
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v1
40805-
; GFX10-NEXT: v_and_b32_e32 v1, 1, v2
40806-
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v0
40807-
; GFX10-NEXT: v_cndmask_b32_sdwa v2, v7, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
40808-
; GFX10-NEXT: s_mov_b32 vcc_lo, s4
40809-
; GFX10-NEXT: v_cndmask_b32_sdwa v3, v6, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
40821+
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
40822+
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
40823+
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v4
40824+
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
40825+
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v6
40826+
; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc_lo
40827+
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
40828+
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
40829+
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
40830+
; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo
4081040831
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
40811-
; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v4, s5
40812-
; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc_lo
40813-
; GFX10-NEXT: v_perm_b32 v0, v3, v0, 0x5040100
40814-
; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
40832+
; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo
40833+
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
40834+
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
40835+
; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc_lo
40836+
; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
4081540837
; GFX10-NEXT: s_setpc_b64 s[30:31]
4081640838
;
4081740839
; GFX11TRUE16-LABEL: v_vselect_v4bf16:
@@ -41059,37 +41081,42 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo
4105941081
; GFX10-LABEL: v_vselect_v8bf16:
4106041082
; GFX10: ; %bb.0:
4106141083
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41084+
; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
41085+
; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
41086+
; GFX10-NEXT: v_and_b32_e32 v5, 1, v5
41087+
; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
41088+
; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v10
41089+
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
41090+
; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v14
41091+
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
4106241092
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
41063-
; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
4106441093
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
41065-
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
41066-
; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
41067-
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v1
41068-
; GFX10-NEXT: v_and_b32_e32 v1, 1, v5
41069-
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
41070-
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v3
41071-
; GFX10-NEXT: v_and_b32_e32 v3, 1, v6
41072-
; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v1
41073-
; GFX10-NEXT: v_and_b32_e32 v1, 1, v4
41074-
; GFX10-NEXT: v_cndmask_b32_sdwa v4, v15, v11, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
41075-
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
41076-
; GFX10-NEXT: v_cndmask_b32_e32 v5, v14, v10, vcc_lo
41077-
; GFX10-NEXT: s_mov_b32 vcc_lo, s6
41078-
; GFX10-NEXT: v_cndmask_b32_sdwa v6, v14, v10, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
41079-
; GFX10-NEXT: s_mov_b32 vcc_lo, s5
41080-
; GFX10-NEXT: v_cndmask_b32_sdwa v1, v13, v9, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
41081-
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
41082-
; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc_lo
41083-
; GFX10-NEXT: s_mov_b32 vcc_lo, s4
41084-
; GFX10-NEXT: v_cndmask_b32_sdwa v7, v12, v8, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
41094+
; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc_lo
41095+
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
41096+
; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
41097+
; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v11
41098+
; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v15
41099+
; GFX10-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc_lo
41100+
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
41101+
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v8
41102+
; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v12
41103+
; GFX10-NEXT: v_cndmask_b32_e32 v5, v17, v16, vcc_lo
4108541104
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
41086-
; GFX10-NEXT: v_perm_b32 v0, v7, v0, 0x5040100
4108741105
; GFX10-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc_lo
41106+
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
41107+
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
41108+
; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v13
41109+
; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc_lo
41110+
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
41111+
; GFX10-NEXT: v_cndmask_b32_e32 v1, v14, v10, vcc_lo
4108841112
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
41089-
; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
41090-
; GFX10-NEXT: v_cndmask_b32_e32 v3, v15, v11, vcc_lo
41091-
; GFX10-NEXT: v_perm_b32 v2, v6, v5, 0x5040100
41092-
; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100
41113+
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
41114+
; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v9, vcc_lo
41115+
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
41116+
; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
41117+
; GFX10-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc_lo
41118+
; GFX10-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
41119+
; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
4109341120
; GFX10-NEXT: s_setpc_b64 s[30:31]
4109441121
;
4109541122
; GFX11TRUE16-LABEL: v_vselect_v8bf16:

0 commit comments

Comments
 (0)