Skip to content

Commit 458fd8b

Browse files
committed
Simplify SRA i64 for shift amounts in range [33:62]
Signed-off-by: John Lu <[email protected]>
1 parent fc9ce03 commit 458fd8b

15 files changed

+2301
-2160
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4165,22 +4165,23 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
41654165
SDLoc SL(N);
41664166
unsigned RHSVal = RHS->getZExtValue();
41674167

4168-
// (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
4169-
if (RHSVal == 32) {
4168+
// For C >= 32
4169+
// (sra i64:x, C) -> build_pair (sra hi_32(x), C - 32), (sra hi_32(x), 31)
4170+
if (32 <= RHSVal) {
41704171
SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
4171-
SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4172-
DAG.getConstant(31, SL, MVT::i32));
4173-
4174-
SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
4175-
return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4176-
}
4172+
SDValue HiShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4173+
DAG.getConstant(31, SL, MVT::i32));
4174+
SDValue LoShift;
4175+
4176+
if (RHSVal == 63)
4177+
LoShift = HiShift;
4178+
else if (RHSVal == 32)
4179+
LoShift = Hi;
4180+
else
4181+
LoShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4182+
DAG.getConstant(RHSVal - 32, SL, MVT::i32));
41774183

4178-
// (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
4179-
if (RHSVal == 63) {
4180-
SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
4181-
SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4182-
DAG.getConstant(31, SL, MVT::i32));
4183-
SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
4184+
SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {LoShift, HiShift});
41844185
return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
41854186
}
41864187

llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -150,9 +150,9 @@ define i32 @mul_one_bit_hi_hi_u32_lshr_ashr(i32 %arg, i32 %arg1, ptr %arg2) {
150150
; CHECK-LABEL: mul_one_bit_hi_hi_u32_lshr_ashr:
151151
; CHECK: ; %bb.0: ; %bb
152152
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
153-
; CHECK-NEXT: v_mul_hi_u32 v4, v1, v0
154-
; CHECK-NEXT: v_ashrrev_i64 v[0:1], 33, v[3:4]
155-
; CHECK-NEXT: flat_store_dword v[2:3], v4
153+
; CHECK-NEXT: v_mul_hi_u32 v0, v1, v0
154+
; CHECK-NEXT: flat_store_dword v[2:3], v0
155+
; CHECK-NEXT: v_ashrrev_i32_e32 v0, 1, v0
156156
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
157157
; CHECK-NEXT: s_setpc_b64 s[30:31]
158158
bb:

llvm/test/CodeGen/AMDGPU/div_i128.ll

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4398,9 +4398,10 @@ define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) {
43984398
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
43994399
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
44004400
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[2:3]
4401-
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v4
4402-
; GFX9-NEXT: v_ashrrev_i64 v[2:3], 33, v[2:3]
4403-
; GFX9-NEXT: v_or_b32_e32 v0, v4, v0
4401+
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v4
4402+
; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
4403+
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 1, v3
4404+
; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v3
44044405
; GFX9-NEXT: s_setpc_b64 s[30:31]
44054406
;
44064407
; GFX9-O0-LABEL: v_sdiv_i128_v_pow2k:

llvm/test/CodeGen/AMDGPU/load-constant-i16.ll

Lines changed: 479 additions & 449 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/load-constant-i8.ll

Lines changed: 913 additions & 972 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/load-global-i16.ll

Lines changed: 264 additions & 236 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -110,10 +110,10 @@ define i64 @range_metadata_sext_i8_signed_range_i64(ptr addrspace(1) %ptr) {
110110
; SDAG-LABEL: range_metadata_sext_i8_signed_range_i64:
111111
; SDAG: ; %bb.0:
112112
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
113-
; SDAG-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc
113+
; SDAG-NEXT: global_load_dwordx2 v[1:2], v[0:1], off glc
114114
; SDAG-NEXT: s_waitcnt vmcnt(0)
115-
; SDAG-NEXT: v_lshlrev_b32_e32 v1, 23, v0
116-
; SDAG-NEXT: v_ashrrev_i64 v[0:1], 55, v[0:1]
115+
; SDAG-NEXT: v_bfe_i32 v0, v1, 0, 9
116+
; SDAG-NEXT: v_bfe_i32 v1, v1, 8, 1
117117
; SDAG-NEXT: s_setpc_b64 s[30:31]
118118
;
119119
; GISEL-LABEL: range_metadata_sext_i8_signed_range_i64:

llvm/test/CodeGen/AMDGPU/mad_64_32.ll

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -463,14 +463,12 @@ define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 {
463463
; SI-LABEL: mad_i64_i32_sextops_i31_i63:
464464
; SI: ; %bb.0:
465465
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
466-
; SI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
467-
; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v1
468-
; SI-NEXT: v_ashr_i64 v[4:5], v[3:4], 33
469-
; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 33
470-
; SI-NEXT: v_mul_lo_u32 v1, v4, v0
471-
; SI-NEXT: v_mul_hi_i32 v4, v4, v0
472-
; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v2
473-
; SI-NEXT: v_addc_u32_e32 v1, vcc, v4, v3, vcc
466+
; SI-NEXT: v_bfe_i32 v0, v0, 0, 31
467+
; SI-NEXT: v_bfe_i32 v1, v1, 0, 31
468+
; SI-NEXT: v_mul_lo_u32 v4, v0, v1
469+
; SI-NEXT: v_mul_hi_i32 v1, v0, v1
470+
; SI-NEXT: v_add_i32_e32 v0, vcc, v4, v2
471+
; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
474472
; SI-NEXT: s_setpc_b64 s[30:31]
475473
;
476474
; GFX9-LABEL: mad_i64_i32_sextops_i31_i63:

llvm/test/CodeGen/AMDGPU/mul_int24.ll

Lines changed: 17 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -463,18 +463,16 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a,
463463
define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) #0 {
464464
; SI-LABEL: test_smul24_i33:
465465
; SI: ; %bb.0: ; %entry
466+
; SI-NEXT: s_load_dword s6, s[4:5], 0xd
466467
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
467-
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
468-
; SI-NEXT: s_load_dword s4, s[4:5], 0xd
468+
; SI-NEXT: s_load_dword s4, s[4:5], 0xb
469469
; SI-NEXT: s_mov_b32 s3, 0xf000
470470
; SI-NEXT: s_mov_b32 s2, -1
471471
; SI-NEXT: s_waitcnt lgkmcnt(0)
472-
; SI-NEXT: s_lshl_b32 s5, s6, 8
473-
; SI-NEXT: s_lshl_b32 s7, s4, 8
474-
; SI-NEXT: s_ashr_i64 s[6:7], s[6:7], 40
475-
; SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 40
476-
; SI-NEXT: v_mov_b32_e32 v0, s6
477-
; SI-NEXT: s_mul_i32 s5, s4, s6
472+
; SI-NEXT: s_bfe_i32 s5, s6, 0x180000
473+
; SI-NEXT: s_bfe_i32 s4, s4, 0x180000
474+
; SI-NEXT: v_mov_b32_e32 v0, s5
475+
; SI-NEXT: s_mul_i32 s5, s4, s5
478476
; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s4, v0
479477
; SI-NEXT: v_mov_b32_e32 v0, s5
480478
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 31
@@ -485,14 +483,12 @@ define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b
485483
; VI-LABEL: test_smul24_i33:
486484
; VI: ; %bb.0: ; %entry
487485
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
488-
; VI-NEXT: s_load_dword s6, s[4:5], 0x34
486+
; VI-NEXT: s_load_dword s3, s[4:5], 0x34
489487
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
490488
; VI-NEXT: s_waitcnt lgkmcnt(0)
491-
; VI-NEXT: s_lshl_b32 s3, s2, 8
492-
; VI-NEXT: s_lshl_b32 s5, s6, 8
493-
; VI-NEXT: s_ashr_i64 s[4:5], s[4:5], 40
494-
; VI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40
495-
; VI-NEXT: v_mov_b32_e32 v0, s4
489+
; VI-NEXT: s_bfe_i32 s2, s2, 0x180000
490+
; VI-NEXT: s_bfe_i32 s3, s3, 0x180000
491+
; VI-NEXT: v_mov_b32_e32 v0, s3
496492
; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s2, v0
497493
; VI-NEXT: v_mul_i32_i24_e32 v0, s2, v0
498494
; VI-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
@@ -505,15 +501,13 @@ define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b
505501
; GFX9-LABEL: test_smul24_i33:
506502
; GFX9: ; %bb.0: ; %entry
507503
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c
508-
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
509504
; GFX9-NEXT: s_load_dword s7, s[4:5], 0x34
505+
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
510506
; GFX9-NEXT: s_mov_b32 s3, 0xf000
511507
; GFX9-NEXT: s_mov_b32 s2, -1
512508
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
513-
; GFX9-NEXT: s_lshl_b32 s5, s6, 8
514-
; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 40
515-
; GFX9-NEXT: s_lshl_b32 s5, s7, 8
516-
; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 40
509+
; GFX9-NEXT: s_bfe_i32 s4, s6, 0x180000
510+
; GFX9-NEXT: s_bfe_i32 s6, s7, 0x180000
517511
; GFX9-NEXT: s_mul_hi_i32 s5, s4, s6
518512
; GFX9-NEXT: s_mul_i32 s4, s4, s6
519513
; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 31
@@ -609,16 +603,14 @@ define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33
609603
; GFX9-LABEL: test_smulhi24_i33:
610604
; GFX9: ; %bb.0: ; %entry
611605
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c
612-
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
613606
; GFX9-NEXT: s_load_dword s7, s[4:5], 0x34
607+
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
614608
; GFX9-NEXT: s_mov_b32 s3, 0xf000
615609
; GFX9-NEXT: s_mov_b32 s2, -1
616610
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
617-
; GFX9-NEXT: s_lshl_b32 s5, s6, 8
618-
; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 40
619-
; GFX9-NEXT: s_lshl_b32 s5, s7, 8
620-
; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 40
621-
; GFX9-NEXT: s_mul_hi_i32 s4, s4, s6
611+
; GFX9-NEXT: s_bfe_i32 s4, s6, 0x180000
612+
; GFX9-NEXT: s_bfe_i32 s5, s7, 0x180000
613+
; GFX9-NEXT: s_mul_hi_i32 s4, s4, s5
622614
; GFX9-NEXT: s_and_b32 s4, s4, 1
623615
; GFX9-NEXT: v_mov_b32_e32 v0, s4
624616
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0

0 commit comments

Comments
 (0)