Skip to content

Commit 26f54ef

Browse files
committed
Address comments
1 parent 04fad5d commit 26f54ef

File tree

7 files changed

+67
-86
lines changed

7 files changed

+67
-86
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15170,7 +15170,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
1517015170
TLI.isTruncateFree(SrcVT, VT)) {
1517115171
if (!LegalOperations ||
1517215172
(TLI.isOperationLegal(ISD::SELECT, SrcVT) &&
15173-
TLI.isNarrowingProfitable(N0.getNode(), N0.getValueType(), VT))) {
15173+
TLI.isNarrowingProfitable(N0.getNode(), SrcVT, VT))) {
1517415174
SDLoc SL(N0);
1517515175
SDValue Cond = N0.getOperand(0);
1517615176
SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 10 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -6741,12 +6741,12 @@ static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
67416741
case ISD::SMIN:
67426742
case ISD::SMAX:
67436743
return ISD::SIGN_EXTEND;
6744-
case ISD::ADD:
6745-
case ISD::SUB:
67466744
case ISD::SRL:
67476745
case ISD::UMIN:
67486746
case ISD::UMAX:
67496747
return ISD::ZERO_EXTEND;
6748+
case ISD::ADD:
6749+
case ISD::SUB:
67506750
case ISD::AND:
67516751
case ISD::OR:
67526752
case ISD::XOR:
@@ -6811,42 +6811,27 @@ SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
68116811

68126812
const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
68136813
LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
6814-
RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
6814+
6815+
// Special case: for shifts (SHL/SRL/SRA), the RHS is the shift amount and always needs a zext.
6816+
if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL ||
6817+
Op.getOpcode() == ISD::SRA)
6818+
RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
6819+
else
6820+
RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
68156821

68166822
// setcc always return i1/i1 vec so no need to truncate after.
68176823
if (Opc == ISD::SETCC) {
68186824
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
68196825
return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
68206826
}
68216827

6822-
SDNodeFlags Flags;
6823-
switch (Op->getOpcode()) {
6824-
case ISD::ADD:
6825-
case ISD::SHL:
6826-
Flags.setNoUnsignedWrap(true);
6827-
Flags.setNoSignedWrap(true);
6828-
break;
6829-
case ISD::SUB:
6830-
Flags.setNoUnsignedWrap(Op->getFlags().hasNoUnsignedWrap());
6831-
Flags.setNoSignedWrap(true);
6832-
break;
6833-
case ISD::MUL:
6834-
Flags.setNoUnsignedWrap(true);
6835-
Flags.setNoSignedWrap(Op->getFlags().hasNoUnsignedWrap());
6836-
break;
6837-
default:
6838-
break;
6839-
}
6840-
6841-
Flags.setExact(Op->getFlags().hasExact());
6842-
68436828
// For other ops, we extend the operation's return type as well so we need to
68446829
// truncate back to the original type.
68456830
SDValue NewVal;
68466831
if (Opc == ISD::SELECT)
68476832
NewVal = DAG.getSelect(DL, ExtTy, Op->getOperand(0), LHS, RHS);
68486833
else
6849-
NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS}, Flags);
6834+
NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
68506835

68516836
return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
68526837
}

llvm/test/CodeGen/AMDGPU/idiv-licm.ll

Lines changed: 49 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -646,18 +646,18 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar
646646
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
647647
; GFX9-NEXT: .LBB4_1: ; %bb3
648648
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
649-
; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
650-
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2
651-
; GFX9-NEXT: s_lshl_b32 s3, s2, 1
649+
; GFX9-NEXT: s_and_b32 s3, 0xffff, s2
650+
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s3
652651
; GFX9-NEXT: s_add_i32 s2, s2, 1
653-
; GFX9-NEXT: v_mov_b32_e32 v3, s3
652+
; GFX9-NEXT: s_lshl_b32 s3, s3, 1
653+
; GFX9-NEXT: s_and_b32 s4, s2, 0xffff
654654
; GFX9-NEXT: v_mul_f32_e32 v4, v2, v1
655655
; GFX9-NEXT: v_trunc_f32_e32 v4, v4
656656
; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v4
657657
; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2
658-
; GFX9-NEXT: s_and_b32 s3, s2, 0xffff
659658
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
660-
; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400
659+
; GFX9-NEXT: v_mov_b32_e32 v3, s3
660+
; GFX9-NEXT: s_cmpk_eq_i32 s4, 0x400
661661
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc
662662
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
663663
; GFX9-NEXT: global_store_short v3, v2, s[0:1]
@@ -677,10 +677,10 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar
677677
; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0
678678
; GFX10-NEXT: .LBB4_1: ; %bb3
679679
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
680-
; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
681-
; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s2
682-
; GFX10-NEXT: s_lshl_b32 s3, s2, 1
680+
; GFX10-NEXT: s_and_b32 s3, 0xffff, s2
683681
; GFX10-NEXT: s_add_i32 s2, s2, 1
682+
; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s3
683+
; GFX10-NEXT: s_lshl_b32 s3, s3, 1
684684
; GFX10-NEXT: v_mov_b32_e32 v4, s3
685685
; GFX10-NEXT: s_and_b32 s3, s2, 0xffff
686686
; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1
@@ -709,22 +709,22 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar
709709
; GFX11-NEXT: .p2align 6
710710
; GFX11-NEXT: .LBB4_1: ; %bb3
711711
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
712-
; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
713-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
714-
; GFX11-NEXT: v_cvt_f32_u32_e32 v2, s2
715-
; GFX11-NEXT: s_lshl_b32 s3, s2, 1
712+
; GFX11-NEXT: s_and_b32 s3, 0xffff, s2
716713
; GFX11-NEXT: s_add_i32 s2, s2, 1
714+
; GFX11-NEXT: v_cvt_f32_u32_e32 v2, s3
715+
; GFX11-NEXT: s_lshl_b32 s3, s3, 1
716+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
717717
; GFX11-NEXT: v_mov_b32_e32 v4, s3
718718
; GFX11-NEXT: s_and_b32 s3, s2, 0xffff
719719
; GFX11-NEXT: s_waitcnt_depctr 0xfff
720720
; GFX11-NEXT: v_mul_f32_e32 v3, v2, v1
721721
; GFX11-NEXT: s_cmpk_eq_i32 s3, 0x400
722-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
723722
; GFX11-NEXT: v_trunc_f32_e32 v3, v3
723+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
724724
; GFX11-NEXT: v_fma_f32 v2, -v3, v0, v2
725725
; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3
726-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
727726
; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, v0
727+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
728728
; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo
729729
; GFX11-NEXT: global_store_b16 v4, v2, s[0:1]
730730
; GFX11-NEXT: s_cbranch_scc0 .LBB4_1
@@ -763,8 +763,9 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
763763
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
764764
; GFX9-NEXT: s_and_b32 s3, 0xffff, s2
765765
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s3
766+
; GFX9-NEXT: s_add_i32 s2, s2, 1
766767
; GFX9-NEXT: s_lshl_b32 s5, s3, 1
767-
; GFX9-NEXT: s_add_i32 s2, s3, 1
768+
; GFX9-NEXT: s_and_b32 s6, s2, 0xffff
768769
; GFX9-NEXT: v_mul_f32_e32 v3, v2, v1
769770
; GFX9-NEXT: v_trunc_f32_e32 v3, v3
770771
; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3
@@ -773,8 +774,7 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
773774
; GFX9-NEXT: v_mov_b32_e32 v3, s5
774775
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc
775776
; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4
776-
; GFX9-NEXT: s_and_b32 s5, s2, 0xffff
777-
; GFX9-NEXT: s_cmpk_eq_i32 s5, 0x400
777+
; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x400
778778
; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2
779779
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
780780
; GFX9-NEXT: global_store_short v3, v2, s[0:1]
@@ -795,9 +795,9 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
795795
; GFX10-NEXT: .LBB5_1: ; %bb3
796796
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
797797
; GFX10-NEXT: s_and_b32 s4, 0xffff, s3
798+
; GFX10-NEXT: s_add_i32 s3, s3, 1
798799
; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s4
799800
; GFX10-NEXT: s_lshl_b32 s5, s4, 1
800-
; GFX10-NEXT: s_add_i32 s3, s4, 1
801801
; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1
802802
; GFX10-NEXT: v_trunc_f32_e32 v3, v3
803803
; GFX10-NEXT: v_mad_f32 v2, -v3, v0, v2
@@ -829,24 +829,24 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
829829
; GFX11-NEXT: .LBB5_1: ; %bb3
830830
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
831831
; GFX11-NEXT: s_and_b32 s4, 0xffff, s3
832-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
832+
; GFX11-NEXT: s_add_i32 s3, s3, 1
833833
; GFX11-NEXT: v_cvt_f32_u32_e32 v2, s4
834834
; GFX11-NEXT: s_lshl_b32 s5, s4, 1
835-
; GFX11-NEXT: s_add_i32 s3, s4, 1
836835
; GFX11-NEXT: s_waitcnt_depctr 0xfff
837836
; GFX11-NEXT: v_mul_f32_e32 v3, v2, v1
837+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
838838
; GFX11-NEXT: v_trunc_f32_e32 v3, v3
839-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
840839
; GFX11-NEXT: v_fma_f32 v2, -v3, v0, v2
841840
; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3
841+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
842842
; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, v0
843-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
844843
; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo
845844
; GFX11-NEXT: v_mov_b32_e32 v3, s5
845+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
846846
; GFX11-NEXT: v_mul_lo_u32 v2, v2, s2
847-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
848847
; GFX11-NEXT: v_sub_nc_u32_e32 v2, s4, v2
849848
; GFX11-NEXT: s_and_b32 s4, s3, 0xffff
849+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
850850
; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400
851851
; GFX11-NEXT: global_store_b16 v3, v2, s[0:1]
852852
; GFX11-NEXT: s_cbranch_scc0 .LBB5_1
@@ -895,13 +895,13 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
895895
; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v2|, |v0|
896896
; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec
897897
; GFX9-NEXT: s_cselect_b32 s3, s3, 0
898-
; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
899-
; GFX9-NEXT: v_add_u32_e32 v2, s3, v3
900-
; GFX9-NEXT: s_lshl_b32 s3, s2, 1
898+
; GFX9-NEXT: s_and_b32 s5, 0xffff, s2
901899
; GFX9-NEXT: s_add_i32 s2, s2, 1
900+
; GFX9-NEXT: v_add_u32_e32 v2, s3, v3
901+
; GFX9-NEXT: s_lshl_b32 s3, s5, 1
902+
; GFX9-NEXT: s_and_b32 s5, s2, 0xffff
902903
; GFX9-NEXT: v_mov_b32_e32 v3, s3
903-
; GFX9-NEXT: s_and_b32 s3, s2, 0xffff
904-
; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400
904+
; GFX9-NEXT: s_cmpk_eq_i32 s5, 0x400
905905
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
906906
; GFX9-NEXT: global_store_short v3, v2, s[0:1]
907907
; GFX9-NEXT: s_cbranch_scc0 .LBB6_1
@@ -932,11 +932,11 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
932932
; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v3
933933
; GFX10-NEXT: s_and_b32 s5, s5, exec_lo
934934
; GFX10-NEXT: s_cselect_b32 s4, s4, 0
935-
; GFX10-NEXT: s_and_b32 s3, 0xffff, s3
936-
; GFX10-NEXT: v_add_nc_u32_e32 v2, s4, v2
937-
; GFX10-NEXT: s_lshl_b32 s4, s3, 1
935+
; GFX10-NEXT: s_and_b32 s5, 0xffff, s3
938936
; GFX10-NEXT: s_add_i32 s3, s3, 1
939-
; GFX10-NEXT: v_mov_b32_e32 v3, s4
937+
; GFX10-NEXT: s_lshl_b32 s5, s5, 1
938+
; GFX10-NEXT: v_add_nc_u32_e32 v2, s4, v2
939+
; GFX10-NEXT: v_mov_b32_e32 v3, s5
940940
; GFX10-NEXT: s_and_b32 s4, s3, 0xffff
941941
; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x400
942942
; GFX10-NEXT: global_store_short v3, v2, s[0:1]
@@ -975,11 +975,11 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
975975
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
976976
; GFX11-NEXT: s_and_b32 s5, s5, exec_lo
977977
; GFX11-NEXT: s_cselect_b32 s4, s4, 0
978-
; GFX11-NEXT: s_and_b32 s3, 0xffff, s3
978+
; GFX11-NEXT: s_and_b32 s5, 0xffff, s3
979979
; GFX11-NEXT: v_add_nc_u32_e32 v2, s4, v2
980-
; GFX11-NEXT: s_lshl_b32 s4, s3, 1
980+
; GFX11-NEXT: s_lshl_b32 s5, s5, 1
981981
; GFX11-NEXT: s_add_i32 s3, s3, 1
982-
; GFX11-NEXT: v_mov_b32_e32 v3, s4
982+
; GFX11-NEXT: v_mov_b32_e32 v3, s5
983983
; GFX11-NEXT: s_and_b32 s4, s3, 0xffff
984984
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
985985
; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400
@@ -1032,12 +1032,12 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
10321032
; GFX9-NEXT: s_cselect_b32 s5, s5, 0
10331033
; GFX9-NEXT: v_add_u32_e32 v2, s5, v3
10341034
; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4
1035-
; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
1036-
; GFX9-NEXT: s_lshl_b32 s5, s2, 1
1035+
; GFX9-NEXT: s_and_b32 s6, 0xffff, s2
10371036
; GFX9-NEXT: s_add_i32 s2, s2, 1
1037+
; GFX9-NEXT: s_lshl_b32 s5, s6, 1
1038+
; GFX9-NEXT: s_and_b32 s6, s2, 0xffff
10381039
; GFX9-NEXT: v_mov_b32_e32 v3, s5
1039-
; GFX9-NEXT: s_and_b32 s5, s2, 0xffff
1040-
; GFX9-NEXT: s_cmpk_eq_i32 s5, 0x400
1040+
; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x400
10411041
; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2
10421042
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
10431043
; GFX9-NEXT: global_store_short v3, v2, s[0:1]
@@ -1069,12 +1069,12 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
10691069
; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v3
10701070
; GFX10-NEXT: s_and_b32 s6, s6, exec_lo
10711071
; GFX10-NEXT: s_cselect_b32 s5, s5, 0
1072-
; GFX10-NEXT: s_and_b32 s3, 0xffff, s3
10731072
; GFX10-NEXT: v_add_nc_u32_e32 v2, s5, v2
1074-
; GFX10-NEXT: s_lshl_b32 s5, s3, 1
1073+
; GFX10-NEXT: s_and_b32 s5, 0xffff, s3
10751074
; GFX10-NEXT: s_add_i32 s3, s3, 1
1076-
; GFX10-NEXT: v_mov_b32_e32 v3, s5
1075+
; GFX10-NEXT: s_lshl_b32 s5, s5, 1
10771076
; GFX10-NEXT: v_mul_lo_u32 v2, v2, s2
1077+
; GFX10-NEXT: v_mov_b32_e32 v3, s5
10781078
; GFX10-NEXT: v_sub_nc_u32_e32 v2, s4, v2
10791079
; GFX10-NEXT: s_and_b32 s4, s3, 0xffff
10801080
; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x400
@@ -1111,16 +1111,17 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
11111111
; GFX11-NEXT: v_fma_f32 v2, -v3, v0, v2
11121112
; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v2|, |v0|
11131113
; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v3
1114-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
1114+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
11151115
; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
11161116
; GFX11-NEXT: s_cselect_b32 s5, s5, 0
1117-
; GFX11-NEXT: s_and_b32 s3, 0xffff, s3
1117+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
11181118
; GFX11-NEXT: v_add_nc_u32_e32 v2, s5, v2
1119-
; GFX11-NEXT: s_lshl_b32 s5, s3, 1
1119+
; GFX11-NEXT: s_and_b32 s5, 0xffff, s3
11201120
; GFX11-NEXT: s_add_i32 s3, s3, 1
1121-
; GFX11-NEXT: v_mov_b32_e32 v3, s5
1122-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1121+
; GFX11-NEXT: s_lshl_b32 s5, s5, 1
1122+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
11231123
; GFX11-NEXT: v_mul_lo_u32 v2, v2, s2
1124+
; GFX11-NEXT: v_mov_b32_e32 v3, s5
11241125
; GFX11-NEXT: v_sub_nc_u32_e32 v2, s4, v2
11251126
; GFX11-NEXT: s_and_b32 s4, s3, 0xffff
11261127
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)

llvm/test/CodeGen/AMDGPU/srem.ll

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %i
1717
; GCN-NEXT: s_mulk_i32 s0, 0x4925
1818
; GCN-NEXT: s_lshr_b32 s1, s0, 31
1919
; GCN-NEXT: s_ashr_i32 s0, s0, 17
20-
; GCN-NEXT: s_and_b32 s0, s0, 0xffff
2120
; GCN-NEXT: s_add_i32 s0, s0, s1
2221
; GCN-NEXT: s_mul_i32 s0, s0, 7
2322
; GCN-NEXT: v_subrev_u32_e32 v1, s0, v1
@@ -63,7 +62,6 @@ define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %i
6362
; TONGA-NEXT: s_mulk_i32 s0, 0x4925
6463
; TONGA-NEXT: s_lshr_b32 s1, s0, 31
6564
; TONGA-NEXT: s_ashr_i32 s0, s0, 17
66-
; TONGA-NEXT: s_and_b32 s0, s0, 0xffff
6765
; TONGA-NEXT: s_add_i32 s0, s0, s1
6866
; TONGA-NEXT: s_mul_i32 s0, s0, 7
6967
; TONGA-NEXT: v_subrev_u32_e32 v2, vcc, s0, v2

llvm/test/CodeGen/AMDGPU/uaddo.ll

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -491,17 +491,16 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
491491
; VI-NEXT: v_mov_b32_e32 v0, s4
492492
; VI-NEXT: v_mov_b32_e32 v1, s5
493493
; VI-NEXT: v_mov_b32_e32 v2, s6
494-
; VI-NEXT: flat_load_ushort v4, v[0:1]
495494
; VI-NEXT: v_mov_b32_e32 v3, s7
495+
; VI-NEXT: flat_load_ushort v4, v[0:1]
496496
; VI-NEXT: flat_load_ushort v5, v[2:3]
497497
; VI-NEXT: v_mov_b32_e32 v0, s0
498498
; VI-NEXT: v_mov_b32_e32 v1, s1
499499
; VI-NEXT: v_mov_b32_e32 v2, s2
500500
; VI-NEXT: v_mov_b32_e32 v3, s3
501-
; VI-NEXT: s_waitcnt vmcnt(1)
502-
; VI-NEXT: v_and_b32_e32 v4, 0xffff, v4
503501
; VI-NEXT: s_waitcnt vmcnt(0)
504502
; VI-NEXT: v_add_u32_e32 v5, vcc, v4, v5
503+
; VI-NEXT: v_and_b32_e32 v4, 0xffff, v4
505504
; VI-NEXT: v_and_b32_e32 v6, 0xffff, v5
506505
; VI-NEXT: v_cmp_lt_u32_e32 vcc, v6, v4
507506
; VI-NEXT: flat_store_short v[0:1], v5
@@ -517,7 +516,7 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
517516
; GFX9-NEXT: global_load_ushort v1, v0, s[8:9]
518517
; GFX9-NEXT: global_load_ushort v2, v0, s[10:11]
519518
; GFX9-NEXT: s_waitcnt vmcnt(0)
520-
; GFX9-NEXT: v_add_u32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
519+
; GFX9-NEXT: v_add_u32_e32 v2, v1, v2
521520
; GFX9-NEXT: v_cmp_lt_u32_sdwa s[0:1], v2, v1 src0_sel:WORD_0 src1_sel:WORD_0
522521
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
523522
; GFX9-NEXT: global_store_short v0, v2, s[4:5]

llvm/test/CodeGen/AMDGPU/usubo.ll

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -491,17 +491,16 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
491491
; VI-NEXT: v_mov_b32_e32 v0, s4
492492
; VI-NEXT: v_mov_b32_e32 v1, s5
493493
; VI-NEXT: v_mov_b32_e32 v2, s6
494-
; VI-NEXT: flat_load_ushort v4, v[0:1]
495494
; VI-NEXT: v_mov_b32_e32 v3, s7
495+
; VI-NEXT: flat_load_ushort v4, v[0:1]
496496
; VI-NEXT: flat_load_ushort v5, v[2:3]
497497
; VI-NEXT: v_mov_b32_e32 v0, s0
498498
; VI-NEXT: v_mov_b32_e32 v1, s1
499499
; VI-NEXT: v_mov_b32_e32 v2, s2
500500
; VI-NEXT: v_mov_b32_e32 v3, s3
501-
; VI-NEXT: s_waitcnt vmcnt(1)
502-
; VI-NEXT: v_and_b32_e32 v4, 0xffff, v4
503501
; VI-NEXT: s_waitcnt vmcnt(0)
504502
; VI-NEXT: v_sub_u32_e32 v5, vcc, v4, v5
503+
; VI-NEXT: v_and_b32_e32 v4, 0xffff, v4
505504
; VI-NEXT: v_and_b32_e32 v6, 0xffff, v5
506505
; VI-NEXT: v_cmp_gt_u32_e32 vcc, v6, v4
507506
; VI-NEXT: flat_store_short v[0:1], v5
@@ -517,7 +516,7 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
517516
; GFX9-NEXT: global_load_ushort v1, v0, s[8:9]
518517
; GFX9-NEXT: global_load_ushort v2, v0, s[10:11]
519518
; GFX9-NEXT: s_waitcnt vmcnt(0)
520-
; GFX9-NEXT: v_sub_u32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
519+
; GFX9-NEXT: v_sub_u32_e32 v2, v1, v2
521520
; GFX9-NEXT: v_cmp_gt_u32_sdwa s[0:1], v2, v1 src0_sel:WORD_0 src1_sel:WORD_0
522521
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
523522
; GFX9-NEXT: global_store_short v0, v2, s[4:5]

llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -348,10 +348,9 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) {
348348
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
349349
; GFX11-NEXT: s_or_b32 s0, s0, s1
350350
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
351-
; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
352351
; GFX11-NEXT: s_addk_i32 s0, 0x2c00
353-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
354352
; GFX11-NEXT: s_or_b32 s0, s0, 0x300
353+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
355354
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
356355
; GFX11-NEXT: global_store_b16 v[0:1], v2, off
357356
; GFX11-NEXT: s_nop 0

0 commit comments

Comments
 (0)