Skip to content

Commit 445973c

Browse files
authored
[LegalizeTypes] Handle non byte-sized elt types when splitting INSERT/EXTRACT_VECTOR_ELT (#93357)
DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT and DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT did not handle non byte-sized elements properly. In fact, they only dealt with elements smaller than 8 bits (as well as byte-sized elements). This patch generalizes the support for non byte-sized elements by always widening the vector elements to the next "round integer type" (a power of 2 bit size). This should make sure that we can access a single element via a simple byte-addressed scalar load/store. Also remove a suspicious CustomLowerNode call from SplitVecRes_INSERT_VECTOR_ELT. Considering that it did not reset the Lo/Hi out arguments before the return, I think that DAGTypeLegalizer::SplitVectorResult could be fooled into registering the input vector as being the result. This should however not have caused any problems, since DAGTypeLegalizer::SplitVectorResult is doing the same CustomLowerNode call, making the code removed by this patch redundant.
1 parent f440239 commit 445973c

File tree

4 files changed

+104
-50
lines changed

4 files changed

+104
-50
lines changed

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp

Lines changed: 12 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1839,17 +1839,12 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
18391839
}
18401840
}
18411841

1842-
// See if the target wants to custom expand this node.
1843-
if (CustomLowerNode(N, N->getValueType(0), true))
1844-
return;
1845-
18461842
// Make the vector elements byte-addressable if they aren't already.
18471843
EVT VecVT = Vec.getValueType();
18481844
EVT EltVT = VecVT.getVectorElementType();
1849-
if (VecVT.getScalarSizeInBits() < 8) {
1850-
EltVT = MVT::i8;
1851-
VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
1852-
VecVT.getVectorElementCount());
1845+
if (!EltVT.isByteSized()) {
1846+
EltVT = EltVT.changeTypeToInteger().getRoundIntegerType(*DAG.getContext());
1847+
VecVT = VecVT.changeElementType(EltVT);
18531848
Vec = DAG.getNode(ISD::ANY_EXTEND, dl, VecVT, Vec);
18541849
// Extend the element type to match if needed.
18551850
if (EltVT.bitsGT(Elt.getValueType()))
@@ -3457,11 +3452,13 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
34573452
// Make the vector elements byte-addressable if they aren't already.
34583453
SDLoc dl(N);
34593454
EVT EltVT = VecVT.getVectorElementType();
3460-
if (VecVT.getScalarSizeInBits() < 8) {
3461-
EltVT = MVT::i8;
3462-
VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
3463-
VecVT.getVectorElementCount());
3455+
if (!EltVT.isByteSized()) {
3456+
EltVT = EltVT.changeTypeToInteger().getRoundIntegerType(*DAG.getContext());
3457+
VecVT = VecVT.changeElementType(EltVT);
34643458
Vec = DAG.getNode(ISD::ANY_EXTEND, dl, VecVT, Vec);
3459+
SDValue NewExtract =
3460+
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vec, Idx);
3461+
return DAG.getAnyExtOrTrunc(NewExtract, dl, N->getValueType(0));
34653462
}
34663463

34673464
// Store the vector to the stack.
@@ -3479,13 +3476,9 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
34793476
// Load back the required element.
34803477
StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
34813478

3482-
// FIXME: This is to handle i1 vectors with elements promoted to i8.
3483-
// i1 vector handling needs general improvement.
3484-
if (N->getValueType(0).bitsLT(EltVT)) {
3485-
SDValue Load = DAG.getLoad(EltVT, dl, Store, StackPtr,
3486-
MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()));
3487-
return DAG.getZExtOrTrunc(Load, dl, N->getValueType(0));
3488-
}
3479+
// EXTRACT_VECTOR_ELT can extend the element type to the width of the return
3480+
// type, leaving the high bits undefined. But it can't truncate.
3481+
assert(N->getValueType(0).bitsGE(EltVT) && "Illegal EXTRACT_VECTOR_ELT.");
34893482

34903483
return DAG.getExtLoad(
34913484
ISD::EXTLOAD, dl, N->getValueType(0), Store, StackPtr,

llvm/test/CodeGen/AMDGPU/extract-load-i1.ll

Lines changed: 21 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -8,27 +8,28 @@ define i1 @extractloadi1(ptr %ptr, i32 %idx) {
88
; CHECK: ; %bb.0:
99
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1010
; CHECK-NEXT: flat_load_ubyte v0, v[0:1]
11-
; CHECK-NEXT: v_and_b32_e32 v1, 7, v2
12-
; CHECK-NEXT: v_lshr_b32_e64 v2, s32, 6
13-
; CHECK-NEXT: v_or_b32_e32 v1, v2, v1
1411
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15-
; CHECK-NEXT: v_bfe_u32 v2, v0, 1, 1
16-
; CHECK-NEXT: v_bfe_u32 v3, v0, 2, 2
17-
; CHECK-NEXT: v_bfe_u32 v4, v0, 3, 1
18-
; CHECK-NEXT: v_lshrrev_b32_e32 v5, 4, v0
19-
; CHECK-NEXT: v_bfe_u32 v6, v0, 5, 1
20-
; CHECK-NEXT: v_lshrrev_b32_e32 v7, 6, v0
21-
; CHECK-NEXT: v_lshrrev_b32_e32 v8, 7, v0
22-
; CHECK-NEXT: buffer_store_byte v0, off, s[0:3], s32
23-
; CHECK-NEXT: buffer_store_byte v8, off, s[0:3], s32 offset:7
24-
; CHECK-NEXT: buffer_store_byte v7, off, s[0:3], s32 offset:6
25-
; CHECK-NEXT: buffer_store_byte v6, off, s[0:3], s32 offset:5
26-
; CHECK-NEXT: buffer_store_byte v5, off, s[0:3], s32 offset:4
27-
; CHECK-NEXT: buffer_store_byte v4, off, s[0:3], s32 offset:3
28-
; CHECK-NEXT: buffer_store_byte v3, off, s[0:3], s32 offset:2
29-
; CHECK-NEXT: buffer_store_byte v2, off, s[0:3], s32 offset:1
30-
; CHECK-NEXT: buffer_load_ubyte v0, v1, s[0:3], 0 offen
31-
; CHECK-NEXT: s_waitcnt vmcnt(0)
12+
; CHECK-NEXT: v_lshrrev_b32_e32 v1, 2, v0
13+
; CHECK-NEXT: v_lshlrev_b32_e32 v3, 5, v0
14+
; CHECK-NEXT: v_and_b32_e32 v4, 2, v0
15+
; CHECK-NEXT: v_lshrrev_b32_e32 v5, 6, v0
16+
; CHECK-NEXT: v_lshrrev_b32_e32 v6, 4, v0
17+
; CHECK-NEXT: v_lshlrev_b32_e32 v7, 3, v0
18+
; CHECK-NEXT: v_lshlrev_b32_e32 v8, 1, v0
19+
; CHECK-NEXT: v_or_b32_e32 v1, v1, v3
20+
; CHECK-NEXT: v_and_b32_e32 v3, 0x100, v7
21+
; CHECK-NEXT: v_and_b32_e32 v7, 0x100, v8
22+
; CHECK-NEXT: v_lshlrev_b32_e32 v4, 7, v4
23+
; CHECK-NEXT: v_or_b32_e32 v3, v6, v3
24+
; CHECK-NEXT: v_or_b32_e32 v5, v5, v7
25+
; CHECK-NEXT: v_or_b32_e32 v0, v0, v4
26+
; CHECK-NEXT: v_and_b32_e32 v1, 0x103, v1
27+
; CHECK-NEXT: v_lshlrev_b32_e32 v4, 16, v5
28+
; CHECK-NEXT: v_lshlrev_b32_e32 v5, 16, v1
29+
; CHECK-NEXT: v_or_b32_e32 v1, v3, v4
30+
; CHECK-NEXT: v_or_b32_e32 v0, v0, v5
31+
; CHECK-NEXT: v_lshlrev_b32_e32 v2, 3, v2
32+
; CHECK-NEXT: v_lshr_b64 v[0:1], v[0:1], v2
3233
; CHECK-NEXT: s_setpc_b64 s[30:31]
3334
%val = load <8 x i1>, ptr %ptr
3435
%ret = extractelement <8 x i1> %val, i32 %idx

llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -410,17 +410,17 @@ entry:
410410
}
411411

412412
; GCN-LABEL: {{^}}bit4_extelt:
413-
; FIXME: One v_mov_b32_e32 vN, 0 should suffice
414-
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0
415-
; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
416-
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
417-
; GCN-DAG: buffer_store_byte [[ZERO]],
418-
; GCN-DAG: buffer_store_byte [[ONE]],
419-
; GCN-DAG: buffer_store_byte [[ZERO]],
420-
; GCN-DAG: buffer_store_byte [[ONE]],
421-
; GCN: buffer_load_ubyte [[LOAD:v[0-9]+]],
422-
; GCN: v_and_b32_e32 [[RES:v[0-9]+]], 1, [[LOAD]]
423-
; GCN: flat_store_dword v[{{[0-9:]+}}], [[RES]]
413+
; GCN: ; %bb.0: ; %entry
414+
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
415+
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
416+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
417+
; GCN-NEXT: s_lshl_b32 s2, s2, 3
418+
; GCN-NEXT: s_lshr_b32 s2, 0x1000100, s2
419+
; GCN-NEXT: s_and_b32 s2, s2, 1
420+
; GCN-NEXT: v_mov_b32_e32 v0, s0
421+
; GCN-NEXT: v_mov_b32_e32 v1, s1
422+
; GCN-NEXT: v_mov_b32_e32 v2, s2
423+
; GCN-NEXT: flat_store_dword v[0:1], v2
424424
define amdgpu_kernel void @bit4_extelt(ptr addrspace(1) %out, i32 %sel) {
425425
entry:
426426
%ext = extractelement <4 x i1> <i1 0, i1 1, i1 0, i1 1>, i32 %sel
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=x86_64-- -o - %s| FileCheck %s
3+
4+
; Verify that we support non byte-sized elements, together with variable index.
5+
6+
define void @Legalize_SplitVectorResult_insert_i28(i28 %elt, i16 %idx, ptr %p1, ptr %p2) nounwind {
7+
; CHECK-LABEL: Legalize_SplitVectorResult_insert_i28:
8+
; CHECK: # %bb.0:
9+
; CHECK-NEXT: # kill: def $esi killed $esi def $rsi
10+
; CHECK-NEXT: xorps %xmm0, %xmm0
11+
; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
12+
; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
13+
; CHECK-NEXT: andl $7, %esi
14+
; CHECK-NEXT: movl %edi, -40(%rsp,%rsi,4)
15+
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [268435455,268435455,268435455,268435455]
16+
; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1
17+
; CHECK-NEXT: andps %xmm0, %xmm1
18+
; CHECK-NEXT: andps -{{[0-9]+}}(%rsp), %xmm0
19+
; CHECK-NEXT: movaps %xmm0, 16(%rcx)
20+
; CHECK-NEXT: movaps %xmm1, (%rcx)
21+
; CHECK-NEXT: retq
22+
%vec1 = insertelement <8 x i28> zeroinitializer, i28 %elt, i16 %idx
23+
%vec2 = zext <8 x i28> %vec1 to <8 x i32>
24+
store <8 x i32> %vec2, ptr %p2
25+
ret void
26+
}
27+
28+
define void @Legalize_SplitVectorResult_extract_i12(i16 %idx, ptr %p1, ptr %p2) nounwind {
29+
; CHECK-LABEL: Legalize_SplitVectorResult_extract_i12:
30+
; CHECK: # %bb.0:
31+
; CHECK-NEXT: pushq %rax
32+
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
33+
; CHECK-NEXT: movaps (%rsi), %xmm0
34+
; CHECK-NEXT: movaps 16(%rsi), %xmm1
35+
; CHECK-NEXT: movaps 32(%rsi), %xmm2
36+
; CHECK-NEXT: movaps 48(%rsi), %xmm3
37+
; CHECK-NEXT: movaps 64(%rsi), %xmm4
38+
; CHECK-NEXT: movaps 80(%rsi), %xmm5
39+
; CHECK-NEXT: movaps 96(%rsi), %xmm6
40+
; CHECK-NEXT: movaps 112(%rsi), %xmm7
41+
; CHECK-NEXT: movaps %xmm7, -{{[0-9]+}}(%rsp)
42+
; CHECK-NEXT: movaps %xmm6, -{{[0-9]+}}(%rsp)
43+
; CHECK-NEXT: movaps %xmm5, -{{[0-9]+}}(%rsp)
44+
; CHECK-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
45+
; CHECK-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
46+
; CHECK-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
47+
; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
48+
; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
49+
; CHECK-NEXT: andl $63, %edi
50+
; CHECK-NEXT: movzwl -128(%rsp,%rdi,2), %eax
51+
; CHECK-NEXT: andl $4095, %eax # imm = 0xFFF
52+
; CHECK-NEXT: movw %ax, (%rdx)
53+
; CHECK-NEXT: popq %rax
54+
; CHECK-NEXT: retq
55+
%vec = load <64 x i16>, ptr %p1
56+
%trunc = trunc <64 x i16> %vec to <64 x i12>
57+
%elt = extractelement <64 x i12> %trunc, i16 %idx
58+
store i12 %elt, ptr %p2
59+
ret void
60+
}

0 commit comments

Comments
 (0)