Skip to content

Commit 13c359a

Browse files
authored
[X86] ReplaceNodeResults - truncate sub-128-bit vectors as shuffles directly (#83120)
We were scalarizing these truncations, but in most cases we can widen the source vector to 128 bits and perform the truncation as a shuffle directly (which will usually lower as a PACK or PSHUFB). For the cases where the widening and shuffle aren't legal, we can leave it to generic legalization to scalarize for us. Fixes #81883.
1 parent 8c2ae42 commit 13c359a

File tree

4 files changed

+26
-46
lines changed

4 files changed

+26
-46
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 15 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -32341,20 +32341,22 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
3234132341
}
3234232342
}
3234332343

32344-
if (128 % InBits == 0) {
32344+
if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
3234532345
// 128 bit and smaller inputs should avoid truncate all together and
32346-
// just use a build_vector that will become a shuffle.
32347-
// TODO: Widen and use a shuffle directly?
32348-
SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
32349-
// Use the original element count so we don't do more scalar opts than
32350-
// necessary.
32351-
for (unsigned i=0; i < MinElts; ++i) {
32352-
SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
32353-
DAG.getIntPtrConstant(i, dl));
32354-
Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
32355-
}
32356-
Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
32357-
return;
32346+
// use a shuffle.
32347+
if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
32348+
int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
32349+
SmallVector<int, 16> TruncMask(WidenNumElts, -1);
32350+
for (unsigned I = 0; I < MinElts; ++I)
32351+
TruncMask[I] = Scale * I;
32352+
SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
32353+
assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) &&
32354+
"Illegal vector type in truncation");
32355+
WidenIn = DAG.getBitcast(WidenVT, WidenIn);
32356+
Results.push_back(
32357+
DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
32358+
return;
32359+
}
3235832360
}
3235932361

3236032362
// With AVX512 there are some cases that can use a target specific

llvm/test/CodeGen/X86/extract-concat.ll

Lines changed: 7 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -9,22 +9,17 @@ define void @foo(<4 x float> %in, ptr %out) {
99
; SSE2-LABEL: foo:
1010
; SSE2: # %bb.0:
1111
; SSE2-NEXT: cvttps2dq %xmm0, %xmm0
12-
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
13-
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
14-
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
15-
; SSE2-NEXT: shll $8, %ecx
16-
; SSE2-NEXT: orl %eax, %ecx
17-
; SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %eax
18-
; SSE2-NEXT: shll $16, %eax
19-
; SSE2-NEXT: orl %ecx, %eax
20-
; SSE2-NEXT: orl $-16777216, %eax # imm = 0xFF000000
21-
; SSE2-NEXT: movl %eax, (%rdi)
12+
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
13+
; SSE2-NEXT: packuswb %xmm0, %xmm0
14+
; SSE2-NEXT: packuswb %xmm0, %xmm0
15+
; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
16+
; SSE2-NEXT: movd %xmm0, (%rdi)
2217
; SSE2-NEXT: retq
2318
;
2419
; SSE42-LABEL: foo:
2520
; SSE42: # %bb.0:
2621
; SSE42-NEXT: cvttps2dq %xmm0, %xmm0
27-
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
22+
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u]
2823
; SSE42-NEXT: movl $255, %eax
2924
; SSE42-NEXT: pinsrb $3, %eax, %xmm0
3025
; SSE42-NEXT: movd %xmm0, (%rdi)
@@ -33,7 +28,7 @@ define void @foo(<4 x float> %in, ptr %out) {
3328
; AVX-LABEL: foo:
3429
; AVX: # %bb.0:
3530
; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
36-
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
31+
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u]
3732
; AVX-NEXT: movl $255, %eax
3833
; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
3934
; AVX-NEXT: vmovd %xmm0, (%rdi)

llvm/test/CodeGen/X86/vec_anyext.ll

Lines changed: 3 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -112,27 +112,10 @@ define <4 x i8> @func_8_16(ptr %a, ptr %b) nounwind {
112112
;
113113
; X64-LABEL: func_8_16:
114114
; X64: # %bb.0:
115-
; X64-NEXT: movq (%rdi), %rax
116-
; X64-NEXT: vmovd %eax, %xmm0
117-
; X64-NEXT: movl %eax, %ecx
118-
; X64-NEXT: shrl $16, %ecx
119-
; X64-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
120-
; X64-NEXT: movq %rax, %rcx
121-
; X64-NEXT: shrq $32, %rcx
122-
; X64-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
123-
; X64-NEXT: shrq $48, %rax
124-
; X64-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
125-
; X64-NEXT: movq (%rsi), %rax
126-
; X64-NEXT: vmovd %eax, %xmm1
127-
; X64-NEXT: movl %eax, %ecx
128-
; X64-NEXT: shrl $16, %ecx
129-
; X64-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1
130-
; X64-NEXT: movq %rax, %rcx
131-
; X64-NEXT: shrq $32, %rcx
132-
; X64-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
133-
; X64-NEXT: shrq $48, %rax
134-
; X64-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
115+
; X64-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
116+
; X64-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
135117
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
118+
; X64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
136119
; X64-NEXT: retq
137120
%F = load <4 x i16>, ptr %a
138121
%G = trunc <4 x i16> %F to <4 x i8>

llvm/test/CodeGen/X86/vec_cast.ll

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -156,7 +156,7 @@ define <3 x i16> @h(<3 x i32> %a) nounwind {
156156
; CHECK-WIN-LABEL: h:
157157
; CHECK-WIN: # %bb.0:
158158
; CHECK-WIN-NEXT: movdqa (%rcx), %xmm0
159-
; CHECK-WIN-NEXT: movl (%rcx), %eax
159+
; CHECK-WIN-NEXT: movd %xmm0, %eax
160160
; CHECK-WIN-NEXT: pextrw $2, %xmm0, %edx
161161
; CHECK-WIN-NEXT: pextrw $4, %xmm0, %ecx
162162
; CHECK-WIN-NEXT: # kill: def $ax killed $ax killed $eax

0 commit comments

Comments
 (0)