Skip to content

Commit 98bddba

Browse files
committed
[AMDGPU] Fix sign confusion in performMulLoHiCombine
SMUL_LOHI and UMUL_LOHI are different operations because the high part of the result is different, so it is not OK to optimize the signed version to MUL_U24/MULHI_U24 or the unsigned version to MUL_I24/MULHI_I24.
1 parent c8ba317 commit 98bddba

File tree

2 files changed

+74
-64
lines changed

2 files changed

+74
-64
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4346,6 +4346,7 @@ AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
43464346
SelectionDAG &DAG = DCI.DAG;
43474347
SDLoc DL(N);
43484348

4349+
bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
43494350
SDValue N0 = N->getOperand(0);
43504351
SDValue N1 = N->getOperand(1);
43514352

@@ -4360,20 +4361,25 @@ AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
43604361

43614362
// Try to use two fast 24-bit multiplies (one for each half of the result)
43624363
// instead of one slow extending multiply.
4363-
unsigned LoOpcode, HiOpcode;
4364-
if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4365-
N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4366-
N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4367-
LoOpcode = AMDGPUISD::MUL_U24;
4368-
HiOpcode = AMDGPUISD::MULHI_U24;
4369-
} else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4370-
N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4371-
N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4372-
LoOpcode = AMDGPUISD::MUL_I24;
4373-
HiOpcode = AMDGPUISD::MULHI_I24;
4364+
unsigned LoOpcode = 0;
4365+
unsigned HiOpcode = 0;
4366+
if (Signed) {
4367+
if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4368+
N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4369+
N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4370+
LoOpcode = AMDGPUISD::MUL_I24;
4371+
HiOpcode = AMDGPUISD::MULHI_I24;
4372+
}
43744373
} else {
4375-
return SDValue();
4374+
if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4375+
N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4376+
N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4377+
LoOpcode = AMDGPUISD::MUL_U24;
4378+
HiOpcode = AMDGPUISD::MULHI_U24;
4379+
}
43764380
}
4381+
if (!LoOpcode)
4382+
return SDValue();
43774383

43784384
SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
43794385
SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);

llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll

Lines changed: 56 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1052,21 +1052,22 @@ define noundef i64 @srem64_i32max(i64 noundef %i) {
10521052
; GFX9-NEXT: s_mov_b32 s6, 0x80000001
10531053
; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v1
10541054
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, 3, v[2:3]
1055-
; GFX9-NEXT: v_mul_i32_i24_e32 v2, 3, v6
1056-
; GFX9-NEXT: v_mul_hi_i32_i24_e32 v7, 3, v6
1057-
; GFX9-NEXT: v_mov_b32_e32 v8, v5
1055+
; GFX9-NEXT: v_mul_i32_i24_e32 v8, 3, v6
1056+
; GFX9-NEXT: v_lshl_add_u32 v9, v6, 31, v6
1057+
; GFX9-NEXT: v_mov_b32_e32 v10, v5
10581058
; GFX9-NEXT: v_mov_b32_e32 v5, v3
1059-
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, s6, v[4:5]
1060-
; GFX9-NEXT: v_lshl_add_u32 v6, v6, 31, v6
1061-
; GFX9-NEXT: v_add3_u32 v3, v7, v6, v2
1062-
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4
1063-
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, -1, v[2:3]
1064-
; GFX9-NEXT: v_addc_co_u32_e64 v5, s[4:5], 0, 0, vcc
1065-
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, s6, v[4:5]
1066-
; GFX9-NEXT: v_sub_u32_e32 v3, v3, v1
1067-
; GFX9-NEXT: v_sub_u32_e32 v3, v3, v0
1068-
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2
1069-
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc
1059+
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[4:5]
1060+
; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, 3, 0
1061+
; GFX9-NEXT: v_mov_b32_e32 v2, v3
1062+
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2
1063+
; GFX9-NEXT: v_add3_u32 v7, v7, v9, v8
1064+
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, -1, v[6:7]
1065+
; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc
1066+
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3]
1067+
; GFX9-NEXT: v_sub_u32_e32 v5, v5, v1
1068+
; GFX9-NEXT: v_sub_u32_e32 v5, v5, v0
1069+
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
1070+
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
10701071
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, 1, v[2:3]
10711072
; GFX9-NEXT: s_brev_b32 s6, -2
10721073
; GFX9-NEXT: v_add_u32_e32 v3, v1, v3
@@ -1083,11 +1084,11 @@ define noundef i64 @srem64_i32max(i64 noundef %i) {
10831084
; GFX942-LABEL: srem64_i32max:
10841085
; GFX942: ; %bb.0: ; %entry
10851086
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1086-
; GFX942-NEXT: v_ashrrev_i32_e32 v3, 31, v1
1087-
; GFX942-NEXT: v_mul_i32_i24_e32 v2, 3, v3
1088-
; GFX942-NEXT: v_mul_hi_i32_i24_e32 v4, 3, v3
1089-
; GFX942-NEXT: v_lshl_add_u32 v3, v3, 31, v3
1090-
; GFX942-NEXT: v_add3_u32 v3, v4, v3, v2
1087+
; GFX942-NEXT: v_ashrrev_i32_e32 v2, 31, v1
1088+
; GFX942-NEXT: v_mul_i32_i24_e32 v4, 3, v2
1089+
; GFX942-NEXT: v_lshl_add_u32 v5, v2, 31, v2
1090+
; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, 3, 0
1091+
; GFX942-NEXT: v_add3_u32 v3, v3, v5, v4
10911092
; GFX942-NEXT: v_mul_hi_u32 v4, v0, 3
10921093
; GFX942-NEXT: v_mov_b32_e32 v5, 0
10931094
; GFX942-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v1, 3, v[4:5]
@@ -1124,16 +1125,17 @@ define noundef i64 @srem64_i32max(i64 noundef %i) {
11241125
; GFX1030-NEXT: v_mul_hi_u32 v2, v0, 3
11251126
; GFX1030-NEXT: v_mov_b32_e32 v3, 0
11261127
; GFX1030-NEXT: v_ashrrev_i32_e32 v6, 31, v1
1127-
; GFX1030-NEXT: v_mul_hi_i32_i24_e32 v8, 3, v6
1128+
; GFX1030-NEXT: v_mul_i32_i24_e32 v7, 3, v6
11281129
; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, v1, 3, v[2:3]
1129-
; GFX1030-NEXT: v_mul_i32_i24_e32 v2, 3, v6
1130-
; GFX1030-NEXT: v_lshl_add_u32 v6, v6, 31, v6
1131-
; GFX1030-NEXT: v_mov_b32_e32 v7, v5
1130+
; GFX1030-NEXT: v_mov_b32_e32 v8, v5
11321131
; GFX1030-NEXT: v_mov_b32_e32 v5, v3
1133-
; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0x80000001, v0, v[4:5]
1134-
; GFX1030-NEXT: v_add3_u32 v3, v8, v6, v2
1132+
; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v6, 3, 0
1133+
; GFX1030-NEXT: v_lshl_add_u32 v6, v6, 31, v6
1134+
; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, 0x80000001, v0, v[4:5]
1135+
; GFX1030-NEXT: v_add3_u32 v3, v3, v6, v7
1136+
; GFX1030-NEXT: v_mov_b32_e32 v4, v5
11351137
; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v0, -1, v[2:3]
1136-
; GFX1030-NEXT: v_add_co_u32 v4, s4, v7, v4
1138+
; GFX1030-NEXT: v_add_co_u32 v4, s4, v8, v4
11371139
; GFX1030-NEXT: v_add_co_ci_u32_e64 v5, null, 0, 0, s4
11381140
; GFX1030-NEXT: v_sub_nc_u32_e32 v6, v3, v1
11391141
; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0x80000001, v1, v[4:5]
@@ -1165,21 +1167,22 @@ define noundef i64 @sdiv64_i32max(i64 noundef %i) {
11651167
; GFX9-NEXT: s_mov_b32 s6, 0x80000001
11661168
; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v1
11671169
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, 3, v[2:3]
1168-
; GFX9-NEXT: v_mul_i32_i24_e32 v2, 3, v6
1169-
; GFX9-NEXT: v_mul_hi_i32_i24_e32 v7, 3, v6
1170-
; GFX9-NEXT: v_mov_b32_e32 v8, v5
1170+
; GFX9-NEXT: v_mul_i32_i24_e32 v8, 3, v6
1171+
; GFX9-NEXT: v_lshl_add_u32 v9, v6, 31, v6
1172+
; GFX9-NEXT: v_mov_b32_e32 v10, v5
11711173
; GFX9-NEXT: v_mov_b32_e32 v5, v3
1172-
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, s6, v[4:5]
1173-
; GFX9-NEXT: v_lshl_add_u32 v6, v6, 31, v6
1174-
; GFX9-NEXT: v_add3_u32 v3, v7, v6, v2
1175-
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4
1176-
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, -1, v[2:3]
1177-
; GFX9-NEXT: v_addc_co_u32_e64 v5, s[4:5], 0, 0, vcc
1178-
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, s6, v[4:5]
1179-
; GFX9-NEXT: v_sub_u32_e32 v3, v3, v1
1180-
; GFX9-NEXT: v_sub_u32_e32 v3, v3, v0
1181-
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2
1182-
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc
1174+
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[4:5]
1175+
; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, 3, 0
1176+
; GFX9-NEXT: v_mov_b32_e32 v2, v3
1177+
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2
1178+
; GFX9-NEXT: v_add3_u32 v7, v7, v9, v8
1179+
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, -1, v[6:7]
1180+
; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc
1181+
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3]
1182+
; GFX9-NEXT: v_sub_u32_e32 v5, v5, v1
1183+
; GFX9-NEXT: v_sub_u32_e32 v5, v5, v0
1184+
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
1185+
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
11831186
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, 1, v[2:3]
11841187
; GFX9-NEXT: v_add_u32_e32 v3, v1, v3
11851188
; GFX9-NEXT: v_ashrrev_i64 v[0:1], 30, v[2:3]
@@ -1191,11 +1194,11 @@ define noundef i64 @sdiv64_i32max(i64 noundef %i) {
11911194
; GFX942-LABEL: sdiv64_i32max:
11921195
; GFX942: ; %bb.0: ; %entry
11931196
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1194-
; GFX942-NEXT: v_ashrrev_i32_e32 v3, 31, v1
1195-
; GFX942-NEXT: v_mul_i32_i24_e32 v2, 3, v3
1196-
; GFX942-NEXT: v_mul_hi_i32_i24_e32 v4, 3, v3
1197-
; GFX942-NEXT: v_lshl_add_u32 v3, v3, 31, v3
1198-
; GFX942-NEXT: v_add3_u32 v3, v4, v3, v2
1197+
; GFX942-NEXT: v_ashrrev_i32_e32 v2, 31, v1
1198+
; GFX942-NEXT: v_mul_i32_i24_e32 v4, 3, v2
1199+
; GFX942-NEXT: v_lshl_add_u32 v5, v2, 31, v2
1200+
; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, 3, 0
1201+
; GFX942-NEXT: v_add3_u32 v3, v3, v5, v4
11991202
; GFX942-NEXT: v_mul_hi_u32 v4, v0, 3
12001203
; GFX942-NEXT: v_mov_b32_e32 v5, 0
12011204
; GFX942-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v1, 3, v[4:5]
@@ -1224,16 +1227,17 @@ define noundef i64 @sdiv64_i32max(i64 noundef %i) {
12241227
; GFX1030-NEXT: v_mul_hi_u32 v2, v0, 3
12251228
; GFX1030-NEXT: v_mov_b32_e32 v3, 0
12261229
; GFX1030-NEXT: v_ashrrev_i32_e32 v6, 31, v1
1227-
; GFX1030-NEXT: v_mul_hi_i32_i24_e32 v8, 3, v6
1230+
; GFX1030-NEXT: v_mul_i32_i24_e32 v7, 3, v6
12281231
; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, v1, 3, v[2:3]
1229-
; GFX1030-NEXT: v_mul_i32_i24_e32 v2, 3, v6
1230-
; GFX1030-NEXT: v_lshl_add_u32 v6, v6, 31, v6
1231-
; GFX1030-NEXT: v_mov_b32_e32 v7, v5
1232+
; GFX1030-NEXT: v_mov_b32_e32 v8, v5
12321233
; GFX1030-NEXT: v_mov_b32_e32 v5, v3
1233-
; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0x80000001, v0, v[4:5]
1234-
; GFX1030-NEXT: v_add3_u32 v3, v8, v6, v2
1234+
; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v6, 3, 0
1235+
; GFX1030-NEXT: v_lshl_add_u32 v6, v6, 31, v6
1236+
; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, 0x80000001, v0, v[4:5]
1237+
; GFX1030-NEXT: v_add3_u32 v3, v3, v6, v7
1238+
; GFX1030-NEXT: v_mov_b32_e32 v4, v5
12351239
; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v0, -1, v[2:3]
1236-
; GFX1030-NEXT: v_add_co_u32 v4, s4, v7, v4
1240+
; GFX1030-NEXT: v_add_co_u32 v4, s4, v8, v4
12371241
; GFX1030-NEXT: v_add_co_ci_u32_e64 v5, null, 0, 0, s4
12381242
; GFX1030-NEXT: v_sub_nc_u32_e32 v6, v3, v1
12391243
; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0x80000001, v1, v[4:5]

0 commit comments

Comments
 (0)