Commit 125a6d4

[AMDGPU][SDAG] Enable v2i32 or/xor/and instructions
Make use of s_or_b64/s_and_b64/s_xor_b64 for v2i32 or/xor/and. Legalising these operations causes a number of test regressions, so extra work in the DAG combiner and in the TableGen patterns was necessary.
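
For illustration (not one of the commit's own tests; the kernel name and signature below are made up for this sketch), a uniform <2 x i32> bitwise operation of the following shape is the kind of input the new UniformBinFrag patterns in SOPInstructions.td are intended to select to a single 64-bit scalar instruction such as s_and_b64:

; Illustrative only: kernel arguments are uniform, so the 'and' below is a
; uniform v2i32 operation that should now be selectable as one s_and_b64.
define amdgpu_kernel void @v2i32_and_sketch(ptr addrspace(1) %out,
                                            <2 x i32> %a, <2 x i32> %b) {
  %r = and <2 x i32> %a, %b
  store <2 x i32> %r, ptr addrspace(1) %out
  ret void
}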
1 parent 034eaed · commit 125a6d4

19 files changed: +379 -203 lines changed

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp

Lines changed: 3 additions & 0 deletions
@@ -235,6 +235,7 @@ bool VectorLegalizer::Run() {
 
   LegalizedNodes.clear();
 
+
   // Remove dead nodes now.
   DAG.RemoveDeadNodes();
 
@@ -1282,6 +1283,8 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
   }
 
   SDValue Unrolled = DAG.UnrollVectorOp(Node);
+  LLVM_DEBUG(dbgs() << "\nUnrolled node: "; Unrolled->dump());
+  LLVM_DEBUG(dbgs() << "\n");
   if (Node->getNumValues() == 1) {
     Results.push_back(Unrolled);
   } else {

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 1 addition & 1 deletion
@@ -152,7 +152,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::LOAD, MVT::i128, Promote);
   AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
-
+
   // TODO: Would be better to consume as directly legal
   setOperationAction(ISD::ATOMIC_LOAD, MVT::f32, Promote);
   AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 97 additions & 4 deletions
@@ -40,6 +40,7 @@
 #include "llvm/IR/IntrinsicsR600.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/Support/CommandLine.h"
+
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/ModRef.h"
 #include "llvm/Transforms/Utils/LowerAtomic.h"
@@ -430,6 +431,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
   }
 
+  setOperationAction({ISD::AND, ISD::OR, ISD::XOR}, MVT::v2i32, Legal);
+  // Prevent SELECT from being implemented with the above bitwise ops and instead use cndmask.
+  setOperationAction(ISD::SELECT, MVT::v2i32, Custom);
+  // Enable MatchRotate to produce ISD::ROTR, which is later transformed to alignbit.
+  setOperationAction(ISD::ROTR, MVT::v2i32, Legal);
+
   setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
                      Custom);
 
@@ -835,6 +842,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
   } else {
     // Legalization hack.
+    // Hmm.
     setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
 
     setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
@@ -1986,6 +1994,13 @@ bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
   return true;
 }
 
+bool SITargetLowering::shouldFoldSelectWithIdentityConstant(unsigned BinOpcode,
+                                                            EVT VT) const {
+  return (BinOpcode == ISD::AND || BinOpcode == ISD::OR ||
+          BinOpcode == ISD::XOR) &&
+         VT.getScalarType() == MVT::i64;
+}
+
 bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                                unsigned Index) const {
   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
@@ -12872,6 +12887,52 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
     }
   }
 
+  // Detect identity v2i32 OR and replace with identity source node.
+  // Specifically an Or that has operands constructed from the same source node
+  // via extract_vector_elt and build_vector.
+  if (VT == MVT::v2i32) {
+    if (LHS->getOpcode() == ISD::BUILD_VECTOR &&
+        RHS->getOpcode() == ISD::BUILD_VECTOR) {
+      // DAG.canonicalizeCommutativeBinop(ISD::OR, RHS, LHS);
+      SDValue BVLHS = LHS->getOperand(0);
+      SDValue CLHS = LHS->getOperand(1);
+      SDValue CRHS = RHS->getOperand(0);
+      SDValue BVRHS = RHS->getOperand(1);
+      LLVM_DEBUG(
+          dbgs()
+          << "### Performing v2i32 SIISelLowering DAGCombine::CombineOR\n";);
+
+      auto *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
+      auto *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
+
+      if (LC && RC) {
+
+        // Test for and normalise build vectors.
+        if (LHS->getOpcode() == ISD::BUILD_VECTOR &&
+            RHS->getOpcode() == ISD::BUILD_VECTOR &&
+            // Check cast to constantnode here
+            LHS->getConstantOperandVal(1) == 0 &&
+            RHS->getConstantOperandVal(0) == 0) {
+
+          // Get the extract_vector_element operands.
+          SDValue LEVE = LHS->getOperand(0);
+          SDValue REVE = RHS->getOperand(1);
+
+          if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+              REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+            // Check that the elements are extracted from the same vector.
+            if (LEVE->getOperand(0) == REVE->getOperand(0) &&
+                LEVE->getOperand(1) != REVE->getOperand(1)) {
+              LLVM_DEBUG(dbgs() << "### Found identity OR, folding...\n";);
+              SDValue IdentitySrc = LEVE.getOperand(0);
+              return IdentitySrc;
+            }
+          }
+        }
+      }
+    }
+  }
+
   if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
     return SDValue();
 
@@ -12915,20 +12976,52 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
   if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
     return RV;
-
+
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
 
-  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
-  SelectionDAG &DAG = DCI.DAG;
+  if (VT == MVT::v2i32 && LHS.getNumOperands() > 1) {
+
+    const ConstantSDNode *CRHS_0 = dyn_cast<ConstantSDNode>(RHS.getOperand(0));
+    const ConstantSDNode *CRHS_1 = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+    SDValue LHS_0 = LHS.getOperand(0);
+    SDValue LHS_1 = LHS.getOperand(1);
+
+    if (LHS.getOpcode() == ISD::VSELECT && VT == MVT::v2i32) {
+      if (CRHS_0 && CRHS_0->getAPIntValue().isSignMask() &&
+          shouldFoldFNegIntoSrc(N, LHS_0))
+        if (CRHS_1 && CRHS_1->getAPIntValue().isSignMask() &&
+            shouldFoldFNegIntoSrc(N, LHS_1)) {
+          SDLoc DL(N);
+          SDValue CastLHS =
+              DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(1));
+          SDValue CastRHS =
+              DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, LHS->getOperand(2));
+          SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastLHS);
+          SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::v2f32, CastRHS);
+          SDValue NewSelect = DAG.getNode(ISD::VSELECT, DL, MVT::v2f32,
+                                          LHS->getOperand(0), FNegLHS, FNegRHS);
+          return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
+        }
+    }
+    // Possibly split vector here if one side does have a constant RHS.
+  }
 
-  EVT VT = N->getValueType(0);
+  // Add test for when only one of the RHS vector elements is a const. Might be possible to optimise this case.
+
+
+  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
+
+
   if (CRHS && VT == MVT::i64) {
     if (SDValue Split =
             splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
       return Split;
   }
 
+
   // Make sure to apply the 64-bit constant splitting fold before trying to fold
   // fneg-like xors into 64-bit select.
   if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
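
For reference, an IR-level sketch of the case the identity-OR fold added to performOrCombine above is aimed at (the function name and exact IR are illustrative, not taken from the commit): an OR of two v2i32 values that each carry one lane extracted from the same source vector and a zero in the other lane, so the OR is just the source vector again.

; Illustrative only: by the time the combiner runs, %lo and %hi can appear as
;   build_vector (extract_vector_elt %v, 0), 0
;   build_vector 0, (extract_vector_elt %v, 1)
; and the new combine folds the OR of the two back to %v.
define <2 x i32> @identity_or_sketch(<2 x i32> %v) {
  %e0 = extractelement <2 x i32> %v, i32 0
  %e1 = extractelement <2 x i32> %v, i32 1
  %lo = insertelement <2 x i32> zeroinitializer, i32 %e0, i32 0
  %hi = insertelement <2 x i32> zeroinitializer, i32 %e1, i32 1
  %r = or <2 x i32> %lo, %hi
  ret <2 x i32> %r
}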

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 3 additions & 0 deletions
@@ -366,6 +366,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                          Type *Ty) const override;
 
+  bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode,
+                                            EVT VT) const override;
+
   bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                unsigned Index) const override;
   bool isExtractVecEltCheap(EVT VT, unsigned Index) const override;

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 64 additions & 2 deletions
@@ -2334,9 +2334,9 @@ def : AMDGPUPatIgnoreCopies <
                          (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32))
 >;
 
-// 64-bit version
+foreach vt = [i64, v2i32] in {
 def : AMDGPUPatIgnoreCopies <
-  (DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
+  (DivergentBinFrag<xor> vt:$z, (and vt:$x, (xor vt:$y, vt:$z))),
   (REG_SEQUENCE VReg_64,
     (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
                    (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)),
@@ -2345,6 +2345,7 @@ def : AMDGPUPatIgnoreCopies <
                    (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)),
                    (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
 >;
+}
 
 def : AMDGPUPat <
   (fcopysign f32:$src0, f32:$src1),
@@ -2378,13 +2379,45 @@ def : AMDGPUPat <
 let True16Predicate = NotHasTrue16BitInsts in {
 def : ROTRPattern <V_ALIGNBIT_B32_e64>;
 
+def : AMDGPUPat <
+  (rotr v2i32:$src0, v2i32:$src1),
+  (REG_SEQUENCE VReg_64,
+    (V_ALIGNBIT_B32_e64
+      (i32 (EXTRACT_SUBREG VReg_64:$src0, sub0)),
+      (i32 (EXTRACT_SUBREG VReg_64:$src0, sub0)),
+      (i32 (EXTRACT_SUBREG VReg_64:$src1, sub0))), sub0,
+    (V_ALIGNBIT_B32_e64
+      (i32 (EXTRACT_SUBREG VReg_64:$src0, sub1)),
+      (i32 (EXTRACT_SUBREG VReg_64:$src0, sub1)),
+      (i32 (EXTRACT_SUBREG VReg_64:$src1, sub1))), sub1)
+>;
+
+// Prevents regression in fneg-modifier-casting.ll along with modifications to XorCombine().
+def : AMDGPUPat <
+  (fneg (select i1:$src0, (f32 (bitconvert i32:$src1)), (f32 (bitconvert i32:$src2)))),
+  (V_CNDMASK_B32_e64 (i32 1), $src2, (i32 1), $src1, $src0)>;
+
 def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
           (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
                               (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
 
 def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
           (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
                               (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
+
+def : GCNPat <
+  (rotr v2i32:$src0, v2i32:$src1),
+  (REG_SEQUENCE VReg_64,
+    (V_ALIGNBIT_B32_e64
+      (i32 (EXTRACT_SUBREG VReg_64:$src0, sub0)),
+      (i32 (EXTRACT_SUBREG VReg_64:$src0, sub0)),
+      (i32 (EXTRACT_SUBREG VReg_64:$src1, sub0))), sub0,
+    (V_ALIGNBIT_B32_e64
+      (i32 (EXTRACT_SUBREG VReg_64:$src0, sub1)),
+      (i32 (EXTRACT_SUBREG VReg_64:$src0, sub1)),
+      (i32 (EXTRACT_SUBREG VReg_64:$src1, sub1))), sub1)
+>;
+
 } // end True16Predicate = NotHasTrue16BitInsts
 
 let True16Predicate = UseRealTrue16Insts in {
@@ -2397,6 +2430,20 @@ def : GCNPat <
                           /* clamp */ 0, /* op_sel */ 0)
 >;
 
+def : GCNPat <
+  (rotr v2i32:$src0, v2i32:$src1),
+  (REG_SEQUENCE VReg_64,
+    (V_ALIGNBIT_B32_t16_e64
+      0, (i32 (EXTRACT_SUBREG VReg_64:$src0, sub0)),
+      0, (i32 (EXTRACT_SUBREG VReg_64:$src0, sub0)),
+      0, (EXTRACT_SUBREG (i32 (EXTRACT_SUBREG VReg_64:$src1, sub0)), lo16), 0, 0), sub0,
+    (V_ALIGNBIT_B32_t16_e64
+      0, (i32 (EXTRACT_SUBREG VReg_64:$src0, sub1)),
+      0, (i32 (EXTRACT_SUBREG VReg_64:$src0, sub1)),
+      0, (EXTRACT_SUBREG (i32 (EXTRACT_SUBREG VReg_64:$src1, sub0)), lo16), 0, 0), sub1)
+>;
+
+
 def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
      (V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */
          (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
@@ -2423,6 +2470,20 @@ def : GCNPat <
          $src1, /* clamp */ 0, /* op_sel */ 0)
 >;
 
+def : GCNPat <
+  (rotr v2i32:$src0, v2i32:$src1),
+  (REG_SEQUENCE VReg_64,
+    (V_ALIGNBIT_B32_fake16_e64
+      0, (i32 (EXTRACT_SUBREG VReg_64:$src0, sub0)),
+      0, (i32 (EXTRACT_SUBREG VReg_64:$src0, sub0)),
+      0, (i32 (EXTRACT_SUBREG VReg_64:$src1, sub0)), 0, 0), sub0,
+    (V_ALIGNBIT_B32_fake16_e64
+      0, (i32 (EXTRACT_SUBREG VReg_64:$src0, sub1)),
+      0, (i32 (EXTRACT_SUBREG VReg_64:$src0, sub1)),
+      0, (i32 (EXTRACT_SUBREG VReg_64:$src1, sub1)), 0, 0), sub1)
+>;
+
+
 def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
      (V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */
          (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
@@ -2449,6 +2510,7 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
 >;
 } // end True16Predicate = UseFakeTrue16Insts
 
+
 /********** ====================== **********/
 /**********   Indirect addressing  **********/
 /********** ====================== **********/

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 15 additions & 0 deletions
@@ -1779,6 +1779,21 @@ def : GCNPat <
   (S_MOV_B32 imm:$imm)
 >;
 
+def : GCNPat <
+  (v2i32 (UniformBinFrag<and> v2i32:$x, v2i32:$y)),
+  (S_AND_B64 SReg_64:$x, SReg_64:$y)
+>;
+
+def : GCNPat <
+  (v2i32 (UniformBinFrag<or> v2i32:$x, v2i32:$y)),
+  (S_OR_B64 SReg_64:$x, SReg_64:$y)
+>;
+
+def : GCNPat <
+  (v2i32 (UniformBinFrag<xor> v2i32:$x, v2i32:$y)),
+  (S_XOR_B64 SReg_64:$x, SReg_64:$y)
+>;
+
 // Same as a 32-bit inreg
 def : GCNPat<
   (i32 (UniformUnaryFrag<sext> i16:$src)),

llvm/lib/Target/AMDGPU/VOP2Instructions.td

Lines changed: 7 additions & 2 deletions
@@ -954,9 +954,9 @@ def : DivergentClampingBinOp<sub, V_SUB_CO_U32_e64>;
 def : DivergentClampingBinOp<sub, V_SUB_CO_U32_e64>;
 def : DivergentBinOp<adde, V_ADDC_U32_e32>;
 def : DivergentBinOp<sube, V_SUBB_U32_e32>;
 
-class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst> :
+class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst, ValueType vt = i64> :
   GCNPat<
-    (DivergentBinFrag<Op> i64:$src0, i64:$src1),
+    (DivergentBinFrag<Op> vt:$src0, vt:$src1),
     (REG_SEQUENCE VReg_64,
       (Inst
         (i32 (EXTRACT_SUBREG $src0, sub0)),
@@ -973,6 +973,11 @@ def : divergent_i64_BinOp <and, V_AND_B32_e64>;
 def : divergent_i64_BinOp <or, V_OR_B32_e64>;
 def : divergent_i64_BinOp <xor, V_XOR_B32_e64>;
 
+def : divergent_i64_BinOp <and, V_AND_B32_e64, v2i32>;
+def : divergent_i64_BinOp <or, V_OR_B32_e64, v2i32>;
+def : divergent_i64_BinOp <xor, V_XOR_B32_e64, v2i32>;
+
+
 // mul24 w/ 64 bit output.
 class mul24_64_Pat<SDPatternOperator Op, Instruction InstLo, Instruction InstHi> : GCNPat<
   (i64 (Op i32:$src0, i32:$src1)),

llvm/test/CodeGen/AMDGPU/and.ll

Lines changed: 1 addition & 2 deletions
@@ -8,8 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
 ; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
-; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+; SI: s_and_b64
 
 define amdgpu_kernel void @test2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
   %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1

llvm/test/CodeGen/AMDGPU/bf16-conversions.ll

Lines changed: 12 additions & 12 deletions
@@ -151,25 +151,25 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
 ; GFX-950-LABEL: v_test_cvt_v2f64_v2bf16_v:
 ; GFX-950:       ; %bb.0:
 ; GFX-950-NEXT:    v_cvt_f32_f64_e32 v6, v[2:3]
+; GFX-950-NEXT:    v_and_b32_e32 v4, 1, v6
+; GFX-950-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GFX-950-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
-; GFX-950-NEXT:    v_and_b32_e32 v7, 1, v6
 ; GFX-950-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, |v[4:5]|
-; GFX-950-NEXT:    v_cmp_nlg_f64_e32 vcc, v[2:3], v[4:5]
-; GFX-950-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v7
+; GFX-950-NEXT:    v_cmp_nlg_f64_e64 s[0:1], v[2:3], v[4:5]
+; GFX-950-NEXT:    v_cvt_f32_f64_e32 v7, v[0:1]
 ; GFX-950-NEXT:    v_cndmask_b32_e64 v2, -1, 1, s[2:3]
 ; GFX-950-NEXT:    v_add_u32_e32 v2, v6, v2
-; GFX-950-NEXT:    s_or_b64 vcc, vcc, s[0:1]
-; GFX-950-NEXT:    v_cvt_f32_f64_e32 v5, v[0:1]
+; GFX-950-NEXT:    s_or_b64 vcc, s[0:1], vcc
 ; GFX-950-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX-950-NEXT:    v_cvt_f64_f32_e32 v[2:3], v5
-; GFX-950-NEXT:    v_and_b32_e32 v6, 1, v5
+; GFX-950-NEXT:    v_cvt_f64_f32_e32 v[2:3], v7
+; GFX-950-NEXT:    v_and_b32_e32 v8, 1, v7
 ; GFX-950-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[2:3]|
-; GFX-950-NEXT:    v_cmp_nlg_f64_e32 vcc, v[0:1], v[2:3]
-; GFX-950-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v6
+; GFX-950-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
+; GFX-950-NEXT:    v_cmp_nlg_f64_e64 s[0:1], v[0:1], v[2:3]
 ; GFX-950-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s[2:3]
-; GFX-950-NEXT:    v_add_u32_e32 v0, v5, v0
-; GFX-950-NEXT:    s_or_b64 vcc, vcc, s[0:1]
-; GFX-950-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GFX-950-NEXT:    v_add_u32_e32 v0, v7, v0
+; GFX-950-NEXT:    s_or_b64 vcc, s[0:1], vcc
+; GFX-950-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
 ; GFX-950-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v4
 ; GFX-950-NEXT:    ; return to shader part epilog
   %res = fptrunc <2 x double> %src to <2 x bfloat>
