Skip to content

Commit d7e03df

Browse files
committed
[AMDGPU] Implement widening multiplies with v_mad_i64_i32/v_mad_u64_u32
Select SelectionDAG ops smul_lohi/umul_lohi to v_mad_i64_i32/v_mad_u64_u32 respectively, with an addend of 0. v_mul_lo, v_mul_hi and v_mad_i64/u64 are all quarter-rate instructions so it is better to use one instruction than two. Further improvements are possible to make better use of the addend operand, but this is already a strict improvement over what we have now. Differential Revision: https://reviews.llvm.org/D113986
1 parent 8a52bd8 commit d7e03df

17 files changed

+1126
-1130
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -654,6 +654,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
654654
SelectMAD_64_32(N);
655655
return;
656656
}
657+
case ISD::SMUL_LOHI:
658+
case ISD::UMUL_LOHI:
659+
return SelectMUL_LOHI(N);
657660
case ISD::CopyToReg: {
658661
const SITargetLowering& Lowering =
659662
*static_cast<const SITargetLowering*>(getTargetLowering());
@@ -1013,6 +1016,32 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
10131016
CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
10141017
}
10151018

1019+
// We need to handle this here because tablegen doesn't support matching
1020+
// instructions with multiple outputs.
1021+
void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
1022+
SDLoc SL(N);
1023+
bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
1024+
unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1025+
1026+
SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
1027+
SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1028+
SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
1029+
SDNode *Mad = CurDAG->getMachineNode(Opc, SL, N->getVTList(), Ops);
1030+
if (!SDValue(N, 0).use_empty()) {
1031+
SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
1032+
SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1033+
MVT::i32, SDValue(Mad, 0), Sub0);
1034+
ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
1035+
}
1036+
if (!SDValue(N, 1).use_empty()) {
1037+
SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
1038+
SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1039+
MVT::i32, SDValue(Mad, 0), Sub1);
1040+
ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
1041+
}
1042+
CurDAG->RemoveDeadNode(N);
1043+
}
1044+
10161045
bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
10171046
if (!isUInt<16>(Offset))
10181047
return false;

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
235235
void SelectUADDO_USUBO(SDNode *N);
236236
void SelectDIV_SCALE(SDNode *N);
237237
void SelectMAD_64_32(SDNode *N);
238+
void SelectMUL_LOHI(SDNode *N);
238239
void SelectFMA_W_CHAIN(SDNode *N);
239240
void SelectFMUL_W_CHAIN(SDNode *N);
240241
SDNode *getBFE32(bool IsSigned, const SDLoc &DL, SDValue Val, uint32_t Offset,

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -594,6 +594,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
594594
setTargetDAGCombine(ISD::SRL);
595595
setTargetDAGCombine(ISD::TRUNCATE);
596596
setTargetDAGCombine(ISD::MUL);
597+
setTargetDAGCombine(ISD::SMUL_LOHI);
598+
setTargetDAGCombine(ISD::UMUL_LOHI);
597599
setTargetDAGCombine(ISD::MULHU);
598600
setTargetDAGCombine(ISD::MULHS);
599601
setTargetDAGCombine(ISD::SELECT);
@@ -3462,6 +3464,50 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
34623464
return DAG.getSExtOrTrunc(Mul, DL, VT);
34633465
}
34643466

3467+
// DAG combine for i32 ISD::SMUL_LOHI / ISD::UMUL_LOHI: when both operands are
// known to fit in 24 bits, replace the widening multiply with a pair of 24-bit
// multiplies (one per result half). Returns an empty SDValue when the combine
// does not apply; otherwise replaces both results of N via CombineTo and
// returns SDValue(N, 0).
SDValue
AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (N->getValueType(0) != MVT::i32)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  const bool Signed = N->getOpcode() == ISD::SMUL_LOHI;

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
  // in the source into any_extends if the result of the mul is truncated. Since
  // we can assume the high bits are whatever we want, use the underlying value
  // to avoid the unknown high bits from interfering.
  if (N0.getOpcode() == ISD::ANY_EXTEND)
    N0 = N0.getOperand(0);
  if (N1.getOpcode() == ISD::ANY_EXTEND)
    N1 = N1.getOperand(0);

  // Try to use two fast 24-bit multiplies (one for each half of the result)
  // instead of one slow extending multiply.
  unsigned LoOpcode, HiOpcode;
  if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
    // Operands with the top 8 bits clear are non-negative as i32, so the
    // signed and unsigned 64-bit products are identical. The unsigned 24-bit
    // multiplies are therefore correct for both SMUL_LOHI and UMUL_LOHI.
    N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
    N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
    LoOpcode = AMDGPUISD::MUL_U24;
    HiOpcode = AMDGPUISD::MULHI_U24;
  } else if (Signed && Subtarget->hasMulI24() && isI24(N0, DAG) &&
             isI24(N1, DAG)) {
    // The signed 24-bit multiplies produce the *signed* high half, so they
    // must only replace SMUL_LOHI. For UMUL_LOHI a sign-extended negative
    // operand is a large unsigned value: umul_lohi(-1, -1) has high half
    // 0xFFFFFFFE, whereas the signed product of -1 and -1 has high half 0.
    N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
    N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
    LoOpcode = AMDGPUISD::MUL_I24;
    HiOpcode = AMDGPUISD::MULHI_I24;
  } else {
    return SDValue();
  }

  SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
  SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
  DCI.CombineTo(N, Lo, Hi);
  return SDValue(N, 0);
}
3510+
34653511
SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
34663512
DAGCombinerInfo &DCI) const {
34673513
EVT VT = N->getValueType(0);
@@ -4103,6 +4149,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
41034149
return performTruncateCombine(N, DCI);
41044150
case ISD::MUL:
41054151
return performMulCombine(N, DCI);
4152+
case ISD::SMUL_LOHI:
4153+
case ISD::UMUL_LOHI:
4154+
return performMulLoHiCombine(N, DCI);
41064155
case ISD::MULHS:
41074156
return performMulhsCombine(N, DCI);
41084157
case ISD::MULHU:

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ class AMDGPUTargetLowering : public TargetLowering {
9191
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
9292
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const;
9393
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
94+
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const;
9495
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
9596
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const;
9697
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -809,6 +809,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
809809
setOperationAction(ISD::SMULO, MVT::i64, Custom);
810810
setOperationAction(ISD::UMULO, MVT::i64, Custom);
811811

812+
if (Subtarget->hasMad64_32()) {
813+
setOperationAction(ISD::SMUL_LOHI, MVT::i32, Custom);
814+
setOperationAction(ISD::UMUL_LOHI, MVT::i32, Custom);
815+
}
816+
812817
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
813818
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
814819
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
@@ -4691,6 +4696,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
46914696
case ISD::SMULO:
46924697
case ISD::UMULO:
46934698
return lowerXMULO(Op, DAG);
4699+
case ISD::SMUL_LOHI:
4700+
case ISD::UMUL_LOHI:
4701+
return lowerXMUL_LOHI(Op, DAG);
46944702
case ISD::DYNAMIC_STACKALLOC:
46954703
return LowerDYNAMIC_STACKALLOC(Op, DAG);
46964704
}
@@ -5304,6 +5312,21 @@ SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
53045312
return DAG.getMergeValues({ Result, Overflow }, SL);
53055313
}
53065314

5315+
// Custom lowering hook for i32 [SU]MUL_LOHI. Returning Op keeps the node so it
// is later selected to V_MAD_[IU]64_[IU]32; returning an empty SDValue asks
// for the default expansion instead.
SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
  // Only a uniform multiply on a subtarget with scalar mul-hi is worth
  // expanding, since it becomes S_MUL_I32 + S_MUL_HI_[IU]32.
  const bool ExpandToScalar = !Op->isDivergent() && Subtarget->hasSMulHi();
  if (ExpandToScalar)
    return SDValue();

  // Either the multiply is divergent, or it is uniform but the high part
  // would need V_MUL_HI_[IU]32 anyway; in both cases do the whole thing with
  // a single V_MAD_[IU]64_[IU]32.
  return Op;
}
5329+
53075330
SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
53085331
if (!Subtarget->isTrapHandlerEnabled() ||
53095332
Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
135135
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
136136
SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
137137
SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const;
138+
SDValue lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
138139

139140
SDValue getSegmentAperture(unsigned AS, const SDLoc &DL,
140141
SelectionDAG &DAG) const;

llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll

Lines changed: 46 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -818,32 +818,29 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
818818
; GFX8-NEXT: s_mov_b32 s12, s6
819819
; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9]
820820
; GFX8-NEXT: v_mov_b32_e32 v0, s6
821-
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
822-
; GFX8-NEXT: s_mov_b32 s13, s7
823-
; GFX8-NEXT: s_mul_i32 s7, s1, s6
824-
; GFX8-NEXT: s_mul_i32 s6, s0, s6
821+
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s0, v0, 0
822+
; GFX8-NEXT: s_mul_i32 s6, s1, s6
825823
; GFX8-NEXT: s_mov_b32 s15, 0xf000
826824
; GFX8-NEXT: s_mov_b32 s14, -1
827-
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s7, v0
828-
; GFX8-NEXT: v_mov_b32_e32 v0, s6
825+
; GFX8-NEXT: s_mov_b32 s13, s7
826+
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1
829827
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
830828
; GFX8-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc
831829
; GFX8-NEXT: s_waitcnt vmcnt(0)
832830
; GFX8-NEXT: buffer_wbinvl1_vol
833831
; GFX8-NEXT: .LBB4_2:
834832
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
835-
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
836833
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
837-
; GFX8-NEXT: v_mul_lo_u32 v0, s1, v2
838-
; GFX8-NEXT: v_mul_hi_u32 v3, s0, v2
834+
; GFX8-NEXT: v_mul_lo_u32 v4, s1, v2
835+
; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s0, v2, 0
836+
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
839837
; GFX8-NEXT: v_readfirstlane_b32 s1, v1
840-
; GFX8-NEXT: v_mul_lo_u32 v1, s0, v2
841-
; GFX8-NEXT: s_mov_b32 s7, 0xf000
842-
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0
838+
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4
843839
; GFX8-NEXT: v_mov_b32_e32 v3, s1
844-
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v1
840+
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
841+
; GFX8-NEXT: s_mov_b32 s7, 0xf000
845842
; GFX8-NEXT: s_mov_b32 s6, -1
846-
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc
843+
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
847844
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
848845
; GFX8-NEXT: s_endpgm
849846
;
@@ -878,17 +875,16 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
878875
; GFX9-NEXT: .LBB4_2:
879876
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
880877
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
881-
; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2
882-
; GFX9-NEXT: v_mul_hi_u32 v4, s2, v2
878+
; GFX9-NEXT: v_mul_lo_u32 v4, s3, v2
879+
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
883880
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
884-
; GFX9-NEXT: v_mul_lo_u32 v0, s2, v2
885881
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
886-
; GFX9-NEXT: v_add_u32_e32 v1, v4, v3
887-
; GFX9-NEXT: v_mov_b32_e32 v2, s1
888-
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
882+
; GFX9-NEXT: v_add_u32_e32 v1, v3, v4
883+
; GFX9-NEXT: v_mov_b32_e32 v3, s1
884+
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
889885
; GFX9-NEXT: s_mov_b32 s7, 0xf000
890886
; GFX9-NEXT: s_mov_b32 s6, -1
891-
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
887+
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
892888
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
893889
; GFX9-NEXT: s_endpgm
894890
;
@@ -927,14 +923,13 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
927923
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
928924
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
929925
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
930-
; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v2
931-
; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v2
932-
; GFX1064-NEXT: v_mul_lo_u32 v2, s2, v2
926+
; GFX1064-NEXT: v_mul_lo_u32 v4, s3, v2
927+
; GFX1064-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
933928
; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
934929
; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
935930
; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
936931
; GFX1064-NEXT: s_mov_b32 s6, -1
937-
; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3
932+
; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v4
938933
; GFX1064-NEXT: v_add_co_u32 v0, vcc, s0, v2
939934
; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s1, v1, vcc
940935
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -974,14 +969,13 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
974969
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
975970
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
976971
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
977-
; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v2
978-
; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v2
979-
; GFX1032-NEXT: v_mul_lo_u32 v2, s2, v2
972+
; GFX1032-NEXT: v_mul_lo_u32 v4, s3, v2
973+
; GFX1032-NEXT: v_mad_u64_u32 v[2:3], s0, s2, v2, 0
980974
; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
981975
; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
982976
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
983977
; GFX1032-NEXT: s_mov_b32 s6, -1
984-
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3
978+
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v4
985979
; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, s0, v2
986980
; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
987981
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -1955,32 +1949,29 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
19551949
; GFX8-NEXT: s_mov_b32 s12, s6
19561950
; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9]
19571951
; GFX8-NEXT: v_mov_b32_e32 v0, s6
1958-
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
1959-
; GFX8-NEXT: s_mov_b32 s13, s7
1960-
; GFX8-NEXT: s_mul_i32 s7, s1, s6
1961-
; GFX8-NEXT: s_mul_i32 s6, s0, s6
1952+
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s0, v0, 0
1953+
; GFX8-NEXT: s_mul_i32 s6, s1, s6
19621954
; GFX8-NEXT: s_mov_b32 s15, 0xf000
19631955
; GFX8-NEXT: s_mov_b32 s14, -1
1964-
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s7, v0
1965-
; GFX8-NEXT: v_mov_b32_e32 v0, s6
1956+
; GFX8-NEXT: s_mov_b32 s13, s7
1957+
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1
19661958
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19671959
; GFX8-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc
19681960
; GFX8-NEXT: s_waitcnt vmcnt(0)
19691961
; GFX8-NEXT: buffer_wbinvl1_vol
19701962
; GFX8-NEXT: .LBB10_2:
19711963
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
1972-
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
19731964
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1974-
; GFX8-NEXT: v_mul_lo_u32 v0, s1, v2
1975-
; GFX8-NEXT: v_mul_hi_u32 v3, s0, v2
1965+
; GFX8-NEXT: v_mul_lo_u32 v4, s1, v2
1966+
; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s0, v2, 0
1967+
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
19761968
; GFX8-NEXT: v_readfirstlane_b32 s1, v1
1977-
; GFX8-NEXT: v_mul_lo_u32 v1, s0, v2
1978-
; GFX8-NEXT: s_mov_b32 s7, 0xf000
1979-
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0
1969+
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4
19801970
; GFX8-NEXT: v_mov_b32_e32 v3, s1
1981-
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v1
1971+
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v2
1972+
; GFX8-NEXT: s_mov_b32 s7, 0xf000
19821973
; GFX8-NEXT: s_mov_b32 s6, -1
1983-
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc
1974+
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
19841975
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
19851976
; GFX8-NEXT: s_endpgm
19861977
;
@@ -2015,17 +2006,16 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
20152006
; GFX9-NEXT: .LBB10_2:
20162007
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
20172008
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2018-
; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2
2019-
; GFX9-NEXT: v_mul_hi_u32 v4, s2, v2
2009+
; GFX9-NEXT: v_mul_lo_u32 v4, s3, v2
2010+
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
20202011
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
2021-
; GFX9-NEXT: v_mul_lo_u32 v0, s2, v2
20222012
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
2023-
; GFX9-NEXT: v_add_u32_e32 v1, v4, v3
2024-
; GFX9-NEXT: v_mov_b32_e32 v2, s1
2025-
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0
2013+
; GFX9-NEXT: v_add_u32_e32 v1, v3, v4
2014+
; GFX9-NEXT: v_mov_b32_e32 v3, s1
2015+
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v2
20262016
; GFX9-NEXT: s_mov_b32 s7, 0xf000
20272017
; GFX9-NEXT: s_mov_b32 s6, -1
2028-
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2018+
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
20292019
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
20302020
; GFX9-NEXT: s_endpgm
20312021
;
@@ -2064,14 +2054,13 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
20642054
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
20652055
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
20662056
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
2067-
; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v2
2068-
; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v2
2069-
; GFX1064-NEXT: v_mul_lo_u32 v2, s2, v2
2057+
; GFX1064-NEXT: v_mul_lo_u32 v4, s3, v2
2058+
; GFX1064-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
20702059
; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
20712060
; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
20722061
; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
20732062
; GFX1064-NEXT: s_mov_b32 s6, -1
2074-
; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3
2063+
; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v4
20752064
; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v2
20762065
; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
20772066
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -2111,14 +2100,13 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
21112100
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
21122101
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
21132102
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
2114-
; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v2
2115-
; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v2
2116-
; GFX1032-NEXT: v_mul_lo_u32 v2, s2, v2
2103+
; GFX1032-NEXT: v_mul_lo_u32 v4, s3, v2
2104+
; GFX1032-NEXT: v_mad_u64_u32 v[2:3], s0, s2, v2, 0
21172105
; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
21182106
; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
21192107
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
21202108
; GFX1032-NEXT: s_mov_b32 s6, -1
2121-
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3
2109+
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v4
21222110
; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v2
21232111
; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
21242112
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0

0 commit comments

Comments
 (0)