Skip to content

Commit dcd0f2b

Browse files
committed
[X86] combineExtractFromVectorLoad - support extraction from a source vector whose type differs from the vector type used for the extraction type/index
combineExtractFromVectorLoad no longer uses the type of the vector we're extracting from to compute the pointer offset, allowing us to extract from loaded vectors that have been bitcast to a different type to work with specific target shuffles. Fixes #85419
1 parent c335acc commit dcd0f2b

File tree

6 files changed

+179
-292
lines changed

6 files changed

+179
-292
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -43999,18 +43999,18 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
4399943999
// integer, that requires a potentially expensive XMM -> GPR transfer.
4400044000
// Additionally, if we can convert to a scalar integer load, that will likely
4400144001
// be folded into a subsequent integer op.
44002+
// Note: SrcVec's type may differ from VecVT, but it must be the same size.
4400244003
// Note: Unlike the related fold for this in DAGCombiner, this is not limited
4400344004
// to a single-use of the loaded vector. For the reasons above, we
4400444005
// expect this to be profitable even if it creates an extra load.
4400544006
static SDValue
44006-
combineExtractFromVectorLoad(SDNode *N, SDValue InputVector, uint64_t Idx,
44007+
combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx,
4400744008
const SDLoc &dl, SelectionDAG &DAG,
4400844009
TargetLowering::DAGCombinerInfo &DCI) {
4400944010
assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
4401044011
"Only EXTRACT_VECTOR_ELT supported so far");
4401144012

4401244013
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44013-
EVT SrcVT = InputVector.getValueType();
4401444014
EVT VT = N->getValueType(0);
4401544015

4401644016
bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
@@ -44019,12 +44019,13 @@ combineExtractFromVectorLoad(SDNode *N, SDValue InputVector, uint64_t Idx,
4401944019
Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
4402044020
});
4402144021

44022-
auto *LoadVec = dyn_cast<LoadSDNode>(InputVector);
44022+
auto *LoadVec = dyn_cast<LoadSDNode>(SrcVec);
4402344023
if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
44024-
SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() &&
44025-
!LikelyUsedAsVector && LoadVec->isSimple()) {
44024+
VecVT.getVectorElementType() == VT &&
44025+
VecVT.getSizeInBits() == SrcVec.getValueSizeInBits() &&
44026+
DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) {
4402644027
SDValue NewPtr = TLI.getVectorElementPointer(
44027-
DAG, LoadVec->getBasePtr(), SrcVT, DAG.getVectorIdxConstant(Idx, dl));
44028+
DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl));
4402844029
unsigned PtrOff = VT.getSizeInBits() * Idx / 8;
4402944030
MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
4403044031
Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
@@ -44234,10 +44235,9 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
4423444235
if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
4423544236
return DAG.getZExtOrTrunc(V, dl, VT);
4423644237

44237-
if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT &&
44238-
SrcOp.getValueType() == SrcVT)
44239-
if (SDValue V =
44240-
combineExtractFromVectorLoad(N, SrcOp, ExtractIdx, dl, DAG, DCI))
44238+
if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT)
44239+
if (SDValue V = combineExtractFromVectorLoad(
44240+
N, SrcVT, peekThroughBitcasts(SrcOp), ExtractIdx, dl, DAG, DCI))
4424144241
return V;
4424244242

4424344243
return SDValue();
@@ -44651,7 +44651,8 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
4465144651

4465244652
if (CIdx)
4465344653
if (SDValue V = combineExtractFromVectorLoad(
44654-
N, InputVector, CIdx->getZExtValue(), dl, DAG, DCI))
44654+
N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
44655+
dl, DAG, DCI))
4465544656
return V;
4465644657

4465744658
// Attempt to extract a i1 element by using MOVMSK to extract the signbits

llvm/test/CodeGen/X86/extractelement-load.ll

Lines changed: 12 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,9 @@ bb:
7676
define i64 @t4(ptr %a) {
7777
; X86-SSE2-LABEL: t4:
7878
; X86-SSE2: # %bb.0:
79-
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
80-
; X86-SSE2-NEXT: movdqa (%eax), %xmm0
81-
; X86-SSE2-NEXT: movd %xmm0, %eax
82-
; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
83-
; X86-SSE2-NEXT: movd %xmm0, %edx
79+
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
80+
; X86-SSE2-NEXT: movl (%ecx), %eax
81+
; X86-SSE2-NEXT: movl 4(%ecx), %edx
8482
; X86-SSE2-NEXT: retl
8583
;
8684
; X64-LABEL: t4:
@@ -289,24 +287,15 @@ define i32 @PR85419(ptr %p0) {
289287
; X86-SSE2-NEXT: .LBB8_2:
290288
; X86-SSE2-NEXT: retl
291289
;
292-
; X64-SSSE3-LABEL: PR85419:
293-
; X64-SSSE3: # %bb.0:
294-
; X64-SSSE3-NEXT: xorl %ecx, %ecx
295-
; X64-SSSE3-NEXT: cmpq $0, (%rdi)
296-
; X64-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = mem[2,3,2,3]
297-
; X64-SSSE3-NEXT: movd %xmm0, %eax
298-
; X64-SSSE3-NEXT: cmovel %ecx, %eax
299-
; X64-SSSE3-NEXT: retq
300-
;
301-
; X64-AVX-LABEL: PR85419:
302-
; X64-AVX: # %bb.0:
303-
; X64-AVX-NEXT: xorl %eax, %eax
304-
; X64-AVX-NEXT: cmpq $0, (%rdi)
305-
; X64-AVX-NEXT: je .LBB8_2
306-
; X64-AVX-NEXT: # %bb.1:
307-
; X64-AVX-NEXT: movl 8(%rdi), %eax
308-
; X64-AVX-NEXT: .LBB8_2:
309-
; X64-AVX-NEXT: retq
290+
; X64-LABEL: PR85419:
291+
; X64: # %bb.0:
292+
; X64-NEXT: xorl %eax, %eax
293+
; X64-NEXT: cmpq $0, (%rdi)
294+
; X64-NEXT: je .LBB8_2
295+
; X64-NEXT: # %bb.1:
296+
; X64-NEXT: movl 8(%rdi), %eax
297+
; X64-NEXT: .LBB8_2:
298+
; X64-NEXT: retq
310299
%load = load <2 x i64>, ptr %p0, align 16
311300
%vecext.i = extractelement <2 x i64> %load, i64 0
312301
%cmp = icmp eq i64 %vecext.i, 0

llvm/test/CodeGen/X86/pr45378.ll

Lines changed: 12 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=SSE2
3-
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefix=SSE41
4-
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=AVX
5-
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=AVX
6-
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefix=AVX
7-
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=AVX
2+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE2
3+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE41
4+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=CHECK,AVX
5+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX
6+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefixes=CHECK,AVX
7+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefixes=CHECK,AVX
88

99
declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>)
1010

@@ -71,28 +71,12 @@ define i1 @parseHeaders2_scalar_or(ptr %ptr) nounwind {
7171
}
7272

7373
define i1 @parseHeaders2_scalar_and(ptr %ptr) nounwind {
74-
; SSE2-LABEL: parseHeaders2_scalar_and:
75-
; SSE2: # %bb.0:
76-
; SSE2-NEXT: movdqu (%rdi), %xmm0
77-
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
78-
; SSE2-NEXT: movq %xmm0, %rax
79-
; SSE2-NEXT: testq %rax, (%rdi)
80-
; SSE2-NEXT: sete %al
81-
; SSE2-NEXT: retq
82-
;
83-
; SSE41-LABEL: parseHeaders2_scalar_and:
84-
; SSE41: # %bb.0:
85-
; SSE41-NEXT: movq (%rdi), %rax
86-
; SSE41-NEXT: testq %rax, 8(%rdi)
87-
; SSE41-NEXT: sete %al
88-
; SSE41-NEXT: retq
89-
;
90-
; AVX-LABEL: parseHeaders2_scalar_and:
91-
; AVX: # %bb.0:
92-
; AVX-NEXT: movq (%rdi), %rax
93-
; AVX-NEXT: testq %rax, 8(%rdi)
94-
; AVX-NEXT: sete %al
95-
; AVX-NEXT: retq
74+
; CHECK-LABEL: parseHeaders2_scalar_and:
75+
; CHECK: # %bb.0:
76+
; CHECK-NEXT: movq (%rdi), %rax
77+
; CHECK-NEXT: testq %rax, 8(%rdi)
78+
; CHECK-NEXT: sete %al
79+
; CHECK-NEXT: retq
9680
%vload = load <2 x i64>, ptr %ptr, align 8
9781
%v1 = extractelement <2 x i64> %vload, i32 0
9882
%v2 = extractelement <2 x i64> %vload, i32 1

llvm/test/CodeGen/X86/setcc-non-simple-type.ll

Lines changed: 15 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -60,36 +60,30 @@ define void @failing(ptr %0, ptr %1) nounwind {
6060
; CHECK-NEXT: .LBB0_2: # %vector.body
6161
; CHECK-NEXT: # Parent Loop BB0_1 Depth=1
6262
; CHECK-NEXT: # => This Inner Loop Header: Depth=2
63-
; CHECK-NEXT: movdqu 1024(%rdx,%rdi), %xmm5
64-
; CHECK-NEXT: movdqu 1040(%rdx,%rdi), %xmm6
65-
; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
66-
; CHECK-NEXT: movq %xmm5, %r8
67-
; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
68-
; CHECK-NEXT: movq %xmm5, %r9
69-
; CHECK-NEXT: cmpq 1040(%rdx,%rdi), %rsi
70-
; CHECK-NEXT: movq %rcx, %r10
71-
; CHECK-NEXT: sbbq %r9, %r10
72-
; CHECK-NEXT: setge %r9b
73-
; CHECK-NEXT: movzbl %r9b, %r9d
74-
; CHECK-NEXT: andl $1, %r9d
75-
; CHECK-NEXT: negq %r9
76-
; CHECK-NEXT: movq %r9, %xmm5
7763
; CHECK-NEXT: cmpq 1024(%rdx,%rdi), %rsi
78-
; CHECK-NEXT: movq %rcx, %r9
79-
; CHECK-NEXT: sbbq %r8, %r9
64+
; CHECK-NEXT: movq %rcx, %r8
65+
; CHECK-NEXT: sbbq 1032(%rdx,%rdi), %r8
66+
; CHECK-NEXT: setge %r8b
67+
; CHECK-NEXT: movzbl %r8b, %r8d
68+
; CHECK-NEXT: andl $1, %r8d
69+
; CHECK-NEXT: negq %r8
70+
; CHECK-NEXT: movq %r8, %xmm5
71+
; CHECK-NEXT: cmpq 1040(%rdx,%rdi), %rsi
72+
; CHECK-NEXT: movq %rcx, %r8
73+
; CHECK-NEXT: sbbq 1048(%rdx,%rdi), %r8
8074
; CHECK-NEXT: setge %r8b
8175
; CHECK-NEXT: movzbl %r8b, %r8d
8276
; CHECK-NEXT: andl $1, %r8d
8377
; CHECK-NEXT: negq %r8
8478
; CHECK-NEXT: movq %r8, %xmm6
85-
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0]
86-
; CHECK-NEXT: movdqa %xmm1, %xmm5
87-
; CHECK-NEXT: psllq %xmm4, %xmm5
79+
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
80+
; CHECK-NEXT: movdqa %xmm1, %xmm6
81+
; CHECK-NEXT: psllq %xmm4, %xmm6
8882
; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3]
8983
; CHECK-NEXT: movdqa %xmm1, %xmm8
9084
; CHECK-NEXT: psllq %xmm7, %xmm8
91-
; CHECK-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1]
92-
; CHECK-NEXT: andpd %xmm6, %xmm8
85+
; CHECK-NEXT: movsd {{.*#+}} xmm8 = xmm6[0],xmm8[1]
86+
; CHECK-NEXT: andpd %xmm5, %xmm8
9387
; CHECK-NEXT: orpd %xmm8, %xmm3
9488
; CHECK-NEXT: paddq %xmm2, %xmm4
9589
; CHECK-NEXT: addq $32, %rdi

llvm/test/CodeGen/X86/var-permute-128.ll

Lines changed: 12 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1101,17 +1101,13 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in
11011101
define void @indices_convert() {
11021102
; SSE3-LABEL: indices_convert:
11031103
; SSE3: # %bb.0: # %bb
1104-
; SSE3-NEXT: movdqa (%rax), %xmm0
1105-
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1106-
; SSE3-NEXT: movd %xmm1, %eax
1107-
; SSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
1108-
; SSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
1104+
; SSE3-NEXT: movaps (%rax), %xmm0
1105+
; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
1106+
; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
1107+
; SSE3-NEXT: movl (%rax), %eax
1108+
; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
1109+
; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
11091110
; SSE3-NEXT: andl $3, %eax
1110-
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
1111-
; SSE3-NEXT: movd %xmm1, %ecx
1112-
; SSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
1113-
; SSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
1114-
; SSE3-NEXT: andl $3, %ecx
11151111
; SSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
11161112
; SSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
11171113
; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
@@ -1120,17 +1116,13 @@ define void @indices_convert() {
11201116
;
11211117
; SSSE3-LABEL: indices_convert:
11221118
; SSSE3: # %bb.0: # %bb
1123-
; SSSE3-NEXT: movdqa (%rax), %xmm0
1124-
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1125-
; SSSE3-NEXT: movd %xmm1, %eax
1126-
; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
1127-
; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
1119+
; SSSE3-NEXT: movaps (%rax), %xmm0
1120+
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
1121+
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
1122+
; SSSE3-NEXT: movl (%rax), %eax
1123+
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
1124+
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
11281125
; SSSE3-NEXT: andl $3, %eax
1129-
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
1130-
; SSSE3-NEXT: movd %xmm1, %ecx
1131-
; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
1132-
; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
1133-
; SSSE3-NEXT: andl $3, %ecx
11341126
; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
11351127
; SSSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
11361128
; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]

0 commit comments

Comments
 (0)