Skip to content

Commit 827f2ad

Browse files
authored
AMDGPU: Convert vector 64-bit shl to 32-bit if shift amt >= 32 (#132964)
Convert vector 64-bit shl to 32-bit if the shift amount is known to be >= 32. --------- Signed-off-by: John Lu <[email protected]>
1 parent 6b1acdb commit 827f2ad

File tree

2 files changed

+53
-25
lines changed

2 files changed

+53
-25
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 33 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4084,29 +4084,32 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
40844084
}
40854085
}
40864086

4087-
if (VT != MVT::i64)
4087+
if (VT.getScalarType() != MVT::i64)
40884088
return SDValue();
40894089

40904090
// i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
40914091

40924092
// On some subtargets, 64-bit shift is a quarter rate instruction. In the
40934093
// common case, splitting this into a move and a 32-bit shift is faster and
40944094
// the same code size.
4095-
EVT TargetType = VT.getHalfSizedIntegerVT(*DAG.getContext());
4096-
EVT TargetVecPairType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
40974095
KnownBits Known = DAG.computeKnownBits(RHS);
40984096

4099-
if (Known.getMinValue().getZExtValue() < TargetType.getSizeInBits())
4097+
EVT ElementType = VT.getScalarType();
4098+
EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4099+
EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
4100+
: TargetScalarType;
4101+
4102+
if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
41004103
return SDValue();
41014104
SDValue ShiftAmt;
41024105

41034106
if (CRHS) {
4104-
ShiftAmt =
4105-
DAG.getConstant(RHSVal - TargetType.getSizeInBits(), SL, TargetType);
4107+
ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4108+
TargetType);
41064109
} else {
41074110
SDValue truncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
41084111
const SDValue ShiftMask =
4109-
DAG.getConstant(TargetType.getSizeInBits() - 1, SL, TargetType);
4112+
DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
41104113
// This AND instruction will clamp out of bounds shift values.
41114114
// It will also be removed during later instruction selection.
41124115
ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, truncShiftAmt, ShiftMask);
@@ -4116,9 +4119,23 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
41164119
SDValue NewShift =
41174120
DAG.getNode(ISD::SHL, SL, TargetType, Lo, ShiftAmt, N->getFlags());
41184121

4119-
const SDValue Zero = DAG.getConstant(0, SL, TargetType);
4122+
const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4123+
SDValue Vec;
41204124

4121-
SDValue Vec = DAG.getBuildVector(TargetVecPairType, SL, {Zero, NewShift});
4125+
if (VT.isVector()) {
4126+
EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4127+
unsigned NElts = TargetType.getVectorNumElements();
4128+
SmallVector<SDValue, 8> HiOps;
4129+
SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4130+
4131+
DAG.ExtractVectorElements(NewShift, HiOps, 0, NElts);
4132+
for (unsigned I = 0; I != NElts; ++I)
4133+
HiAndLoOps[2 * I + 1] = HiOps[I];
4134+
Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4135+
} else {
4136+
EVT ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4137+
Vec = DAG.getBuildVector(ConcatType, SL, {Zero, NewShift});
4138+
}
41224139
return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
41234140
}
41244141

@@ -5182,9 +5199,14 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
51825199
break;
51835200
}
51845201
case ISD::SHL: {
5185-
if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5202+
// Range metadata can be invalidated when loads are converted to legal types
5203+
// (e.g. v2i64 -> v4i32).
5204+
// Try to convert vector shl before type legalization so that range metadata
5205+
// can be utilized.
5206+
if (!(N->getValueType(0).isVector() &&
5207+
DCI.getDAGCombineLevel() == BeforeLegalizeTypes) &&
5208+
DCI.getDAGCombineLevel() < AfterLegalizeDAG)
51865209
break;
5187-
51885210
return performShlCombine(N, DCI);
51895211
}
51905212
case ISD::SRL: {

llvm/test/CodeGen/AMDGPU/shl64_reduce.ll

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -72,39 +72,41 @@ define i64 @shl_metadata_cant_be_narrowed_to_i32(i64 %arg0, ptr %arg1.ptr) {
7272
ret i64 %shl
7373
}
7474

75-
; FIXME: This case should be reduced
7675
define <2 x i64> @shl_v2_metadata(<2 x i64> %arg0, ptr %arg1.ptr) {
7776
; CHECK-LABEL: shl_v2_metadata:
7877
; CHECK: ; %bb.0:
7978
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
80-
; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
79+
; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[4:5]
8180
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
82-
; CHECK-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
83-
; CHECK-NEXT: v_lshlrev_b64 v[2:3], v6, v[2:3]
81+
; CHECK-NEXT: v_lshlrev_b32_e32 v1, v3, v0
82+
; CHECK-NEXT: v_lshlrev_b32_e32 v3, v5, v2
83+
; CHECK-NEXT: v_mov_b32_e32 v0, 0
84+
; CHECK-NEXT: v_mov_b32_e32 v2, 0
8485
; CHECK-NEXT: s_setpc_b64 s[30:31]
8586
%shift.amt = load <2 x i64>, ptr %arg1.ptr, !range !0, !noundef !{}
8687
%shl = shl <2 x i64> %arg0, %shift.amt
8788
ret <2 x i64> %shl
8889
}
8990

90-
; FIXME: This case should be reduced
9191
define <3 x i64> @shl_v3_metadata(<3 x i64> %arg0, ptr %arg1.ptr) {
9292
; CHECK-LABEL: shl_v3_metadata:
9393
; CHECK: ; %bb.0:
9494
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
95-
; CHECK-NEXT: flat_load_dword v12, v[6:7] offset:16
95+
; CHECK-NEXT: flat_load_dword v1, v[6:7] offset:16
9696
; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[6:7]
9797
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
98-
; CHECK-NEXT: v_lshlrev_b64 v[4:5], v12, v[4:5]
99-
; CHECK-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
100-
; CHECK-NEXT: v_lshlrev_b64 v[2:3], v10, v[2:3]
98+
; CHECK-NEXT: v_lshlrev_b32_e32 v5, v1, v4
99+
; CHECK-NEXT: v_lshlrev_b32_e32 v1, v8, v0
100+
; CHECK-NEXT: v_lshlrev_b32_e32 v3, v10, v2
101+
; CHECK-NEXT: v_mov_b32_e32 v0, 0
102+
; CHECK-NEXT: v_mov_b32_e32 v2, 0
103+
; CHECK-NEXT: v_mov_b32_e32 v4, 0
101104
; CHECK-NEXT: s_setpc_b64 s[30:31]
102105
%shift.amt = load <3 x i64>, ptr %arg1.ptr, !range !0, !noundef !{}
103106
%shl = shl <3 x i64> %arg0, %shift.amt
104107
ret <3 x i64> %shl
105108
}
106109

107-
; FIXME: This case should be reduced
108110
define <4 x i64> @shl_v4_metadata(<4 x i64> %arg0, ptr %arg1.ptr) {
109111
; CHECK-LABEL: shl_v4_metadata:
110112
; CHECK: ; %bb.0:
@@ -113,11 +115,15 @@ define <4 x i64> @shl_v4_metadata(<4 x i64> %arg0, ptr %arg1.ptr) {
113115
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
114116
; CHECK-NEXT: flat_load_dwordx4 v[13:16], v[8:9] offset:16
115117
; CHECK-NEXT: ; kill: killed $vgpr8 killed $vgpr9
116-
; CHECK-NEXT: v_lshlrev_b64 v[0:1], v10, v[0:1]
117-
; CHECK-NEXT: v_lshlrev_b64 v[2:3], v12, v[2:3]
118+
; CHECK-NEXT: v_lshlrev_b32_e32 v1, v10, v0
119+
; CHECK-NEXT: v_lshlrev_b32_e32 v3, v12, v2
118120
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
119-
; CHECK-NEXT: v_lshlrev_b64 v[4:5], v13, v[4:5]
120-
; CHECK-NEXT: v_lshlrev_b64 v[6:7], v15, v[6:7]
121+
; CHECK-NEXT: v_lshlrev_b32_e32 v5, v13, v4
122+
; CHECK-NEXT: v_lshlrev_b32_e32 v7, v15, v6
123+
; CHECK-NEXT: v_mov_b32_e32 v0, 0
124+
; CHECK-NEXT: v_mov_b32_e32 v2, 0
125+
; CHECK-NEXT: v_mov_b32_e32 v4, 0
126+
; CHECK-NEXT: v_mov_b32_e32 v6, 0
121127
; CHECK-NEXT: s_setpc_b64 s[30:31]
122128
%shift.amt = load <4 x i64>, ptr %arg1.ptr, !range !0, !noundef !{}
123129
%shl = shl <4 x i64> %arg0, %shift.amt

0 commit comments

Comments (0)