Skip to content

Commit 04226ba

Browse files
committed
[AMDGPU] Fix sign confusion in performMulLoHiCombine (llvm#105831)
SMUL_LOHI and UMUL_LOHI are different operations because the high part of the result is different, so it is not OK to optimize the signed version to MUL_U24/MULHI_U24 or the unsigned version to MUL_I24/MULHI_I24.
1 parent 6d7e428 commit 04226ba

File tree

2 files changed

+116
-12
lines changed

2 files changed

+116
-12
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4349,6 +4349,7 @@ AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
43494349
SelectionDAG &DAG = DCI.DAG;
43504350
SDLoc DL(N);
43514351

4352+
bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
43524353
SDValue N0 = N->getOperand(0);
43534354
SDValue N1 = N->getOperand(1);
43544355

@@ -4363,20 +4364,25 @@ AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
43634364

43644365
// Try to use two fast 24-bit multiplies (one for each half of the result)
43654366
// instead of one slow extending multiply.
4366-
unsigned LoOpcode, HiOpcode;
4367-
if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4368-
N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4369-
N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4370-
LoOpcode = AMDGPUISD::MUL_U24;
4371-
HiOpcode = AMDGPUISD::MULHI_U24;
4372-
} else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4373-
N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4374-
N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4375-
LoOpcode = AMDGPUISD::MUL_I24;
4376-
HiOpcode = AMDGPUISD::MULHI_I24;
4367+
unsigned LoOpcode = 0;
4368+
unsigned HiOpcode = 0;
4369+
if (Signed) {
4370+
if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4371+
N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4372+
N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4373+
LoOpcode = AMDGPUISD::MUL_I24;
4374+
HiOpcode = AMDGPUISD::MULHI_I24;
4375+
}
43774376
} else {
4378-
return SDValue();
4377+
if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4378+
N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4379+
N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4380+
LoOpcode = AMDGPUISD::MUL_U24;
4381+
HiOpcode = AMDGPUISD::MULHI_U24;
4382+
}
43794383
}
4384+
if (!LoOpcode)
4385+
return SDValue();
43804386

43814387
SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
43824388
SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);

llvm/test/CodeGen/AMDGPU/mul_int24.ll

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -813,4 +813,102 @@ bb7:
813813
ret void
814814

815815
}
816+
817+
define amdgpu_kernel void @test_umul_i24(ptr addrspace(1) %out, i32 %arg) {
818+
; SI-LABEL: test_umul_i24:
819+
; SI: ; %bb.0:
820+
; SI-NEXT: s_load_dword s1, s[2:3], 0xb
821+
; SI-NEXT: v_mov_b32_e32 v0, 0xff803fe1
822+
; SI-NEXT: s_mov_b32 s0, 0
823+
; SI-NEXT: s_mov_b32 s3, 0xf000
824+
; SI-NEXT: s_waitcnt lgkmcnt(0)
825+
; SI-NEXT: s_lshr_b32 s1, s1, 9
826+
; SI-NEXT: v_mul_hi_u32 v0, s1, v0
827+
; SI-NEXT: s_mul_i32 s1, s1, 0xff803fe1
828+
; SI-NEXT: v_alignbit_b32 v0, v0, s1, 1
829+
; SI-NEXT: s_mov_b32 s2, -1
830+
; SI-NEXT: s_mov_b32 s1, s0
831+
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
832+
; SI-NEXT: s_endpgm
833+
;
834+
; VI-LABEL: test_umul_i24:
835+
; VI: ; %bb.0:
836+
; VI-NEXT: s_load_dword s0, s[2:3], 0x2c
837+
; VI-NEXT: v_mov_b32_e32 v0, 0xff803fe1
838+
; VI-NEXT: s_mov_b32 s3, 0xf000
839+
; VI-NEXT: s_mov_b32 s2, -1
840+
; VI-NEXT: s_waitcnt lgkmcnt(0)
841+
; VI-NEXT: s_lshr_b32 s0, s0, 9
842+
; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s0, v0, 0
843+
; VI-NEXT: s_mov_b32 s0, 0
844+
; VI-NEXT: s_mov_b32 s1, s0
845+
; VI-NEXT: v_alignbit_b32 v0, v1, v0, 1
846+
; VI-NEXT: s_nop 1
847+
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
848+
; VI-NEXT: s_endpgm
849+
;
850+
; GFX9-LABEL: test_umul_i24:
851+
; GFX9: ; %bb.0:
852+
; GFX9-NEXT: s_load_dword s1, s[2:3], 0x2c
853+
; GFX9-NEXT: s_mov_b32 s0, 0
854+
; GFX9-NEXT: s_mov_b32 s3, 0xf000
855+
; GFX9-NEXT: s_mov_b32 s2, -1
856+
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
857+
; GFX9-NEXT: s_lshr_b32 s1, s1, 9
858+
; GFX9-NEXT: s_mul_hi_u32 s4, s1, 0xff803fe1
859+
; GFX9-NEXT: s_mul_i32 s1, s1, 0xff803fe1
860+
; GFX9-NEXT: v_mov_b32_e32 v0, s1
861+
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1
862+
; GFX9-NEXT: s_mov_b32 s1, s0
863+
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
864+
; GFX9-NEXT: s_endpgm
865+
;
866+
; EG-LABEL: test_umul_i24:
867+
; EG: ; %bb.0:
868+
; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
869+
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
870+
; EG-NEXT: CF_END
871+
; EG-NEXT: PAD
872+
; EG-NEXT: ALU clause starting at 4:
873+
; EG-NEXT: LSHR * T0.W, KC0[2].Z, literal.x,
874+
; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00)
875+
; EG-NEXT: MULHI * T0.X, PV.W, literal.x,
876+
; EG-NEXT: -8372255(nan), 0(0.000000e+00)
877+
; EG-NEXT: MULLO_INT * T0.Y, T0.W, literal.x,
878+
; EG-NEXT: -8372255(nan), 0(0.000000e+00)
879+
; EG-NEXT: BIT_ALIGN_INT T0.X, T0.X, PS, 1,
880+
; EG-NEXT: MOV * T1.X, literal.x,
881+
; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
882+
;
883+
; CM-LABEL: test_umul_i24:
884+
; CM: ; %bb.0:
885+
; CM-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[]
886+
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
887+
; CM-NEXT: CF_END
888+
; CM-NEXT: PAD
889+
; CM-NEXT: ALU clause starting at 4:
890+
; CM-NEXT: LSHR * T0.W, KC0[2].Z, literal.x,
891+
; CM-NEXT: 9(1.261169e-44), 0(0.000000e+00)
892+
; CM-NEXT: MULHI T0.X, T0.W, literal.x,
893+
; CM-NEXT: MULHI T0.Y (MASKED), T0.W, literal.x,
894+
; CM-NEXT: MULHI T0.Z (MASKED), T0.W, literal.x,
895+
; CM-NEXT: MULHI * T0.W (MASKED), T0.W, literal.x,
896+
; CM-NEXT: -8372255(nan), 0(0.000000e+00)
897+
; CM-NEXT: MULLO_INT T0.X (MASKED), T0.W, literal.x,
898+
; CM-NEXT: MULLO_INT T0.Y, T0.W, literal.x,
899+
; CM-NEXT: MULLO_INT T0.Z (MASKED), T0.W, literal.x,
900+
; CM-NEXT: MULLO_INT * T0.W (MASKED), T0.W, literal.x,
901+
; CM-NEXT: -8372255(nan), 0(0.000000e+00)
902+
; CM-NEXT: BIT_ALIGN_INT * T0.X, T0.X, PV.Y, 1,
903+
; CM-NEXT: MOV * T1.X, literal.x,
904+
; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
905+
%i = lshr i32 %arg, 9
906+
%i1 = zext i32 %i to i64
907+
%i2 = mul i64 %i1, 4286595041
908+
%i3 = lshr i64 %i2, 1
909+
%i4 = trunc i64 %i3 to i32
910+
store i32 %i4, ptr addrspace(1) null, align 4
911+
ret void
912+
}
913+
816914
attributes #0 = { nounwind }

0 commit comments

Comments
 (0)