Skip to content

Commit 16742f0

Browse files
committed
Reduce shl64 to shl32 if shift range is [63-32]
Signed-off-by: John Lu <[email protected]>
1 parent 92e3cd7 commit 16742f0

File tree

2 files changed

+91
-8
lines changed

2 files changed

+91
-8
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4040,19 +4040,35 @@ SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
40404040
SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
40414041
DAGCombinerInfo &DCI) const {
40424042
EVT VT = N->getValueType(0);
4043+
SDValue LHS = N->getOperand(0);
4044+
SDValue RHS = N->getOperand(1);
4045+
ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4046+
SDLoc SL(N);
4047+
SelectionDAG &DAG = DCI.DAG;
40434048

4044-
ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
4045-
if (!RHS)
4049+
if (!CRHS) {
4050+
// shl i64 X, Y -> [0, shl i32 X, (Y - 32)]
4051+
if (VT == MVT::i64) {
4052+
KnownBits Known = DAG.computeKnownBits(RHS);
4053+
if (Known.getMinValue().getZExtValue() >= 32) {
4054+
SDValue truncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, RHS);
4055+
const SDValue C32 = DAG.getConstant(32, SL, MVT::i32);
4056+
SDValue ShiftAmt =
4057+
DAG.getNode(ISD::SUB, SL, MVT::i32, truncShiftAmt, C32);
4058+
SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
4059+
SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
4060+
const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4061+
SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
4062+
return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4063+
}
4064+
}
40464065
return SDValue();
4066+
}
40474067

4048-
SDValue LHS = N->getOperand(0);
4049-
unsigned RHSVal = RHS->getZExtValue();
4068+
unsigned RHSVal = CRHS->getZExtValue();
40504069
if (!RHSVal)
40514070
return LHS;
40524071

4053-
SDLoc SL(N);
4054-
SelectionDAG &DAG = DCI.DAG;
4055-
40564072
switch (LHS->getOpcode()) {
40574073
default:
40584074
break;
@@ -4078,7 +4094,7 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
40784094
if (LZ < RHSVal)
40794095
break;
40804096
EVT XVT = X.getValueType();
4081-
SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
4097+
SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(CRHS, 0));
40824098
return DAG.getZExtOrTrunc(Shl, SL, VT);
40834099
}
40844100
}
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
;; Test reduction of:
2+
;;
3+
;; DST = shl i64 X, Y
4+
;;
5+
;; where Y is in the range [32, 63] to:
6+
;;
7+
;; DST = [0, shl i32 X, (Y - 32)]
8+
9+
; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s
10+
11+
; FIXME: This case should be reduced, but SelectionDAG::computeKnownBits() cannot
12+
; determine the minimum from metadata in this case. Match current results
13+
; for now.
14+
define i64 @shl_metadata(i64 noundef %arg0, ptr %arg1.ptr) {
15+
%shift.amt = load i64, ptr %arg1.ptr, !range !0
16+
%shl = shl i64 %arg0, %shift.amt
17+
ret i64 %shl
18+
19+
; CHECK: .globl shl_metadata
20+
; CHECK: v_lshl_b64 v[0:1], v[0:1], v2
21+
}
22+
23+
!0 = !{i64 32, i64 64}
24+
25+
; This case is reduced because computeKnownBits() can calculate a minimum of 32
26+
; based on the OR with 32.
27+
define i64 @shl_or32(i64 noundef %arg0, ptr %arg1.ptr) {
28+
%shift.amt = load i64, ptr %arg1.ptr
29+
%or = or i64 %shift.amt, 32
30+
%shl = shl i64 %arg0, %or
31+
ret i64 %shl
32+
33+
; CHECK: .globl shl_or32
34+
; CHECK: v_or_b32_e32 v1, 32, v1
35+
; CHECK: v_subrev_i32_e32 v1, vcc, 32, v1
36+
; CHECK: v_lshlrev_b32_e32 v1, v1, v0
37+
; CHECK: v_mov_b32_e32 v0, 0
38+
}
39+
40+
; This case must not be reduced because the known minimum shift amount, 16, is below 32.
41+
define i64 @shl_or16(i64 noundef %arg0, ptr %arg1.ptr) {
42+
%shift.amt = load i64, ptr %arg1.ptr
43+
%or = or i64 %shift.amt, 16
44+
%shl = shl i64 %arg0, %or
45+
ret i64 %shl
46+
47+
; CHECK: .globl shl_or16
48+
; CHECK: v_or_b32_e32 v2, 16, v2
49+
; CHECK: v_lshl_b64 v[0:1], v[0:1], v2
50+
}
51+
52+
; FIXME: This case should be reduced too, but computeKnownBits() cannot
53+
; determine the range. Match current results for now.
54+
define i64 @shl_maxmin(i64 noundef %arg0, i64 noundef %arg1) {
55+
%max = call i64 @llvm.umax.i64(i64 %arg1, i64 32)
56+
%min = call i64 @llvm.umin.i64(i64 %max, i64 63)
57+
%shl = shl i64 %arg0, %min
58+
ret i64 %shl
59+
60+
; CHECK: .globl shl_maxmin
61+
; CHECK: v_cmp_lt_u64_e32 vcc, 32, v[2:3]
62+
; CHECK: v_cndmask_b32_e32 v3, 0, v3, vcc
63+
; CHECK: v_cndmask_b32_e32 v2, 32, v2, vcc
64+
; CHECK: v_cmp_gt_u64_e32 vcc, 63, v[2:3]
65+
; CHECK: v_cndmask_b32_e32 v2, 63, v2, vcc
66+
; CHECK: v_lshl_b64 v[0:1], v[0:1], v2
67+
}

0 commit comments

Comments
 (0)