
Commit bd0b6b0
AMDGPU: Add combine for short vector extract_vector_elts
Try to access pieces 4 bytes at a time. This helps various hasOneUse extract_vector_elt combines, such as load width reductions. Avoids test regressions in a future commit.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@334836 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 9e41f53 commit bd0b6b0

File tree: 4 files changed (+176, −1)

lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 42 additions & 1 deletion
@@ -7097,8 +7097,11 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
 SDValue SITargetLowering::performExtractVectorEltCombine(
   SDNode *N, DAGCombinerInfo &DCI) const {
   SDValue Vec = N->getOperand(0);
-
   SelectionDAG &DAG = DCI.DAG;
+
+  EVT VecVT = Vec.getValueType();
+  EVT EltVT = VecVT.getVectorElementType();
+
   if ((Vec.getOpcode() == ISD::FNEG ||
        Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
     SDLoc SL(N);
@@ -7139,6 +7142,44 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
                                    Vec.getOperand(1), Idx));
     }
   }
+
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  unsigned VecSize = VecVT.getSizeInBits();
+  unsigned EltSize = EltVT.getSizeInBits();
+
+  // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
+  // elements. This exposes more load reduction opportunities by replacing
+  // multiple small extract_vector_elements with a single 32-bit extract.
+  auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (EltSize <= 16 &&
+      EltVT.isByteSized() &&
+      VecSize > 32 &&
+      VecSize % 32 == 0 &&
+      Idx) {
+    EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
+
+    unsigned BitIndex = Idx->getZExtValue() * EltSize;
+    unsigned EltIdx = BitIndex / 32;
+    unsigned LeftoverBitIdx = BitIndex % 32;
+    SDLoc SL(N);
+
+    SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
+    DCI.AddToWorklist(Cast.getNode());
+
+    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
+                              DAG.getConstant(EltIdx, SL, MVT::i32));
+    DCI.AddToWorklist(Elt.getNode());
+    SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
+                              DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
+    DCI.AddToWorklist(Srl.getNode());
+
+    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl);
+    DCI.AddToWorklist(Trunc.getNode());
+    return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc);
+  }
+
   return SDValue();
 }
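The index arithmetic in the new combine is easy to sanity-check against the tests below. The standalone sketch that follows (hypothetical names; not part of the patch) mirrors the BitIndex/EltIdx/LeftoverBitIdx computation: element 3 of a <16 x half> vector lands in 32-bit word 1 with a 16-bit shift (the _23 tests), and element 3 of a <8 x i8> vector lands in word 0 with a 24-bit shift (the _0123 test).

// Minimal sketch of the combine's index math; names are hypothetical.
#include <cstdio>

struct SubDwordAccess {
  unsigned EltIdx;         // which 32-bit word holds the element
  unsigned LeftoverBitIdx; // shift amount (in bits) within that word
};

// For an EltSize-bit element at index Idx in a vector viewed as packed
// 32-bit words, split the absolute bit position into a word index and a
// leftover bit offset, exactly as the combine does.
static SubDwordAccess computeAccess(unsigned Idx, unsigned EltSize) {
  unsigned BitIndex = Idx * EltSize;
  return {BitIndex / 32, BitIndex % 32};
}

int main() {
  SubDwordAccess A = computeAccess(3, 16); // <16 x half>, element 3
  std::printf("f16[3]: word %u, srl %u\n", A.EltIdx, A.LeftoverBitIdx); // word 1, srl 16

  SubDwordAccess B = computeAccess(3, 8);  // <8 x i8>, element 3
  std::printf("i8[3]:  word %u, srl %u\n", B.EltIdx, B.LeftoverBitIdx); // word 0, srl 24
  return 0;
}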

test/CodeGen/AMDGPU/extract_vector_elt-f16.ll

Lines changed: 30 additions & 0 deletions
@@ -141,6 +141,36 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(half addrspace(1)*
   ret void
 }
 
+; GCN-LABEL: {{^}}reduce_load_vector_v8f16_extract_01:
+; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]],
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(<16 x half> addrspace(4)* %ptr) #0 {
+  %load = load <16 x half>, <16 x half> addrspace(4)* %ptr
+  %elt0 = extractelement <16 x half> %load, i32 0
+  %elt1 = extractelement <16 x half> %load, i32 1
+  store volatile half %elt0, half addrspace(1)* undef, align 2
+  store volatile half %elt1, half addrspace(1)* undef, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}reduce_load_vector_v8f16_extract_23:
+; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]],
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], {{0x1|0x4}}
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(<16 x half> addrspace(4)* %ptr) #0 {
+  %load = load <16 x half>, <16 x half> addrspace(4)* %ptr
+  %elt2 = extractelement <16 x half> %load, i32 2
+  %elt3 = extractelement <16 x half> %load, i32 3
+  store volatile half %elt2, half addrspace(1)* undef, align 2
+  store volatile half %elt3, half addrspace(1)* undef, align 2
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 attributes #0 = { nounwind }

test/CodeGen/AMDGPU/extract_vector_elt-i16.ll

Lines changed: 30 additions & 0 deletions
@@ -142,6 +142,36 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_sgpr(i16 addrspace(1)*
   ret void
 }
 
+; GCN-LABEL: {{^}}reduce_load_vector_v8i16_extract_01:
+; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]],
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+define amdgpu_kernel void @reduce_load_vector_v8i16_extract_01(<16 x i16> addrspace(4)* %ptr) #0 {
+  %load = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
+  %elt0 = extractelement <16 x i16> %load, i32 0
+  %elt1 = extractelement <16 x i16> %load, i32 1
+  store volatile i16 %elt0, i16 addrspace(1)* undef, align 2
+  store volatile i16 %elt1, i16 addrspace(1)* undef, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}reduce_load_vector_v8i16_extract_23:
+; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]],
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], {{0x1|0x4}}
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+define amdgpu_kernel void @reduce_load_vector_v8i16_extract_23(<16 x i16> addrspace(4)* %ptr) #0 {
+  %load = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
+  %elt2 = extractelement <16 x i16> %load, i32 2
+  %elt3 = extractelement <16 x i16> %load, i32 3
+  store volatile i16 %elt2, i16 addrspace(1)* undef, align 2
+  store volatile i16 %elt3, i16 addrspace(1)* undef, align 2
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 attributes #0 = { nounwind }

test/CodeGen/AMDGPU/extract_vector_elt-i8.ll

Lines changed: 74 additions & 0 deletions
@@ -199,4 +199,78 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(i8 addrspace(1)* %out
   ret void
 }
 
+; GCN-LABEL: {{^}}reduce_load_vector_v8i8_extract_0123:
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_load_dword s
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 24
+define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 {
+  %load = load <8 x i8>, <8 x i8> addrspace(4)* null
+  %elt0 = extractelement <8 x i8> %load, i32 0
+  %elt1 = extractelement <8 x i8> %load, i32 1
+  %elt2 = extractelement <8 x i8> %load, i32 2
+  %elt3 = extractelement <8 x i8> %load, i32 3
+  store volatile i8 %elt0, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt1, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt2, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt3, i8 addrspace(1)* undef, align 1
+  ret void
+}
+
+; GCN-LABEL: {{^}}reduce_load_vector_v8i8_extract_0145:
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_load_dwordx2
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
+define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 {
+  %load = load <8 x i8>, <8 x i8> addrspace(4)* null
+  %elt0 = extractelement <8 x i8> %load, i32 0
+  %elt1 = extractelement <8 x i8> %load, i32 1
+  %elt4 = extractelement <8 x i8> %load, i32 4
+  %elt5 = extractelement <8 x i8> %load, i32 5
+  store volatile i8 %elt0, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt1, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt4, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt5, i8 addrspace(1)* undef, align 1
+  ret void
+}
+
+; GCN-LABEL: {{^}}reduce_load_vector_v8i8_extract_45:
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_mov_b64 [[PTR:s\[[0-9]+:[0-9]+\]]], 4{{$}}
+; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0{{$}}
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
+define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 {
+  %load = load <8 x i8>, <8 x i8> addrspace(4)* null
+  %elt4 = extractelement <8 x i8> %load, i32 4
+  %elt5 = extractelement <8 x i8> %load, i32 5
+  store volatile i8 %elt4, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt5, i8 addrspace(1)* undef, align 1
+  ret void
+}
+
+; FIXME: ought to be able to eliminate high half of load
+; GCN-LABEL: {{^}}reduce_load_vector_v16i8_extract_0145:
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_load_dwordx4
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
+define amdgpu_kernel void @reduce_load_vector_v16i8_extract_0145() #0 {
+  %load = load <16 x i8>, <16 x i8> addrspace(4)* null
+  %elt0 = extractelement <16 x i8> %load, i32 0
+  %elt1 = extractelement <16 x i8> %load, i32 1
+  %elt4 = extractelement <16 x i8> %load, i32 4
+  %elt5 = extractelement <16 x i8> %load, i32 5
+  store volatile i8 %elt0, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt1, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt4, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt5, i8 addrspace(1)* undef, align 1
+  ret void
+}
+
 attributes #0 = { nounwind }
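As a rough cross-check of the v8i8_extract_45 expectations above, the same arithmetic predicts both the byte offset of the surviving dword load and the single s_lshr_b32 amount; a small sketch under the same assumptions (not part of the patch):

#include <cstdio>
#include <initializer_list>

int main() {
  // Elements used by reduce_load_vector_v8i8_extract_45.
  for (unsigned Idx : {4u, 5u}) {
    unsigned BitIndex = Idx * 8;               // absolute bit position
    unsigned ByteOffset = (BitIndex / 32) * 4; // byte offset of the dword read
    unsigned Shift = BitIndex % 32;            // expected s_lshr_b32 amount
    std::printf("v8i8[%u]: dword at byte %u, shift %u\n", Idx, ByteOffset, Shift);
  }
  // Prints byte offset 4 with shifts 0 and 8: the checks' s_mov_b64 [[PTR]], 4
  // followed by s_load_dword ... 0x0 and one s_lshr_b32 by 8.
  return 0;
}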
