[DAG] isSplatValue - only treat binop splats with repeated undef elements as undef #135945
Conversation
…ents as undef

llvm#135597 didn't correctly fix the issue of binops with an undef element from only one operand: reporting only the common undef elements could incorrectly recognise splats where a (binop X, undef) fold might actually produce a different value. We need to ensure both operands have the same demanded undef elements for certainty.

Fixes llvm#135917
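To make the failure mode concrete, here is a minimal toy model of the old and new combining logic in plain C++ (not the LLVM APIs; SplatInfo, combineOld and combineNew are invented names, and undef lanes are tracked with a simple bitmask):

// Toy model of the splat check over 4-lane vectors: a bitmask per operand
// marks the undef lanes (bit i set => lane i is undef).
#include <cassert>
#include <cstdint>

struct SplatInfo {
  bool IsSplat;      // every demanded, non-undef lane holds the same value
  uint8_t UndefElts; // lanes whose value is undef
};

// Old (buggy) combine: intersecting the masks hides a lane that is undef in
// only ONE operand. For and(<5, undef, 5, 5>, <3, 3, 3, 3>), lane 1 is
// (and undef, 3), which may fold to anything, yet the intersection is empty,
// so every lane is claimed to equal the splat value.
SplatInfo combineOld(const SplatInfo &L, const SplatInfo &R, uint8_t /*Demanded*/) {
  if (L.IsSplat && R.IsSplat)
    return {true, uint8_t(L.UndefElts & R.UndefElts)};
  return {false, 0};
}

// Fixed combine: only report a splat when both operands are undef in exactly
// the same demanded lanes, so only (binop undef, undef) lanes are reported.
SplatInfo combineNew(const SplatInfo &L, const SplatInfo &R, uint8_t Demanded) {
  if (L.IsSplat && R.IsSplat &&
      (Demanded & L.UndefElts) == (Demanded & R.UndefElts))
    return {true, uint8_t(L.UndefElts | R.UndefElts)};
  return {false, 0};
}

int main() {
  // LHS = <5, undef, 5, 5>, RHS = <3, 3, 3, 3>, all four lanes demanded.
  SplatInfo LHS{true, 0b0010}, RHS{true, 0b0000};
  assert(combineOld(LHS, RHS, 0b1111).IsSplat);  // wrongly claims a splat
  assert(!combineNew(LHS, RHS, 0b1111).IsSplat); // conservatively rejects it
  return 0;
}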
@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-llvm-selectiondag

Author: Simon Pilgrim (RKSimon)

Changes

#135597 didn't correctly fix the issue of binops with an undef element from only one operand: reporting only the common undef elements could incorrectly recognise splats where a (binop X, undef) fold might actually produce a different value. We need to ensure both operands have the same demanded undef elements for certainty.

Fixes #135917

Patch is 44.41 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/135945.diff

8 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 46fc8856640de..8682c40898046 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3002,12 +3002,14 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts,
APInt UndefLHS, UndefRHS;
SDValue LHS = V.getOperand(0);
SDValue RHS = V.getOperand(1);
- // Only propagate common undef elts for both operands, otherwise we might
- // fail to handle binop-specific undef handling.
+ // Only recognize splats where both operands have the same demanded
+ // undef elements, otherwise we might miss binop-specific undef folds
+ // that make per-element results differ.
// e.g. (and undef, 0) -> 0 etc.
if (isSplatValue(LHS, DemandedElts, UndefLHS, Depth + 1) &&
- isSplatValue(RHS, DemandedElts, UndefRHS, Depth + 1)) {
- UndefElts = UndefLHS & UndefRHS;
+ isSplatValue(RHS, DemandedElts, UndefRHS, Depth + 1) &&
+ (DemandedElts & UndefLHS) == (DemandedElts & UndefRHS)) {
+ UndefElts = UndefLHS | UndefRHS;
return true;
}
return false;
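Two properties of the new guard are worth spelling out (a reading of the change, not text from the PR). First, when the demanded undef lanes of both operands coincide, a demanded lane of the result is undef only where it is (binop undef, undef), which genuinely folds to an arbitrary value, so reporting it undef is sound. Second, under that guard the switch from & to | changes nothing on the demanded lanes, while preserving undef information on the non-demanded ones. A tiny standalone check of that mask identity, with illustrative values:

#include <cassert>

int main() {
  // If the guard (D & A) == (D & B) holds for demanded mask D ...
  unsigned D = 0b1011, A = 0b0110, B = 0b0010;
  assert((D & A) == (D & B));
  // ... then union and intersection agree on the demanded lanes.
  assert((D & (A | B)) == (D & (A & B)));
  return 0;
}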
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
index 3df63b4de82e3..dbbb8362144ca 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
@@ -452,7 +452,8 @@ define void @buggy(i32 %0) #0 {
; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV64-NEXT: vmv.v.x v8, a0
; RV64-NEXT: vor.vi v8, v8, 1
-; RV64-NEXT: vse32.v v8, (zero)
+; RV64-NEXT: vrgather.vi v9, v8, 0
+; RV64-NEXT: vse32.v v9, (zero)
; RV64-NEXT: ret
entry:
%mul.us.us.i.3 = shl i32 %0, 1
diff --git a/llvm/test/CodeGen/X86/pr134602.ll b/llvm/test/CodeGen/X86/pr134602.ll
index 50efcde462532..063b6f31fe630 100644
--- a/llvm/test/CodeGen/X86/pr134602.ll
+++ b/llvm/test/CodeGen/X86/pr134602.ll
@@ -17,7 +17,7 @@ define i32 @PR134602(i16 %a0) {
; X64-NEXT: movzwl %di, %eax
; X64-NEXT: movd %eax, %xmm0
; X64-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,2,2,2,4,5,6,7]
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X64-NEXT: paddw %xmm0, %xmm1
; X64-NEXT: movdqa %xmm1, %xmm0
; X64-NEXT: psrld $16, %xmm0
diff --git a/llvm/test/CodeGen/X86/pr135917.ll b/llvm/test/CodeGen/X86/pr135917.ll
index 9eed955128b74..2061e3e7cc395 100644
--- a/llvm/test/CodeGen/X86/pr135917.ll
+++ b/llvm/test/CodeGen/X86/pr135917.ll
@@ -1,46 +1,26 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefix=SSE4
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefix=AVX512
define i32 @PR135917(i1 %a0) {
-; SSE2-LABEL: PR135917:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movd %edi, %xmm0
-; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: movd %xmm0, %ecx
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: addl %ecx, %eax
-; SSE2-NEXT: retq
-;
-; SSE4-LABEL: PR135917:
-; SSE4: # %bb.0:
-; SSE4-NEXT: movd %edi, %xmm0
-; SSE4-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE4-NEXT: movd %xmm0, %ecx
-; SSE4-NEXT: pextrd $1, %xmm0, %eax
-; SSE4-NEXT: addl %ecx, %eax
-; SSE4-NEXT: retq
-;
-; AVX2-LABEL: PR135917:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovd %edi, %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
-; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %ecx
-; AVX2-NEXT: vpextrd $1, %xmm0, %eax
-; AVX2-NEXT: addl %ecx, %eax
-; AVX2-NEXT: retq
+; CHECK-LABEL: PR135917:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT: notl %edi
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: leal (%rdi,%rdi), %eax
+; CHECK-NEXT: retq
;
; AVX512-LABEL: PR135917:
; AVX512: # %bb.0:
-; AVX512-NEXT: andb $1, %dil
-; AVX512-NEXT: negb %dil
; AVX512-NEXT: kmovd %edi, %k0
; AVX512-NEXT: knotw %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512-NEXT: vpmovd2m %xmm0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
; AVX512-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %ecx
; AVX512-NEXT: vpextrd $1, %xmm0, %eax
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
index 8523cb4973827..9ecc6296a844a 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
@@ -162,42 +162,72 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %amt) nounwind {
define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %amt) nounwind {
; SSE2-LABEL: splatvar_funnnel_v2i32:
; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE2-NEXT: pslld $23, %xmm1
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
-; SSE2-NEXT: psllq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; SSE2-NEXT: psllq %xmm1, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
+; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_funnnel_v2i32:
; SSE41: # %bb.0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE41-NEXT: pslld $23, %xmm1
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
-; SSE41-NEXT: psllq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; SSE41-NEXT: psllq %xmm1, %xmm0
-; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
+; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE41-NEXT: pmuludq %xmm2, %xmm3
+; SSE41-NEXT: pmuludq %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_funnnel_v2i32:
; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
-; AVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
+; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_funnnel_v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
-; AVX2-NEXT: vpsllq %xmm1, %xmm2, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX2-NEXT: vpsllq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
+; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
+; AVX2-NEXT: vpslld %xmm2, %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
+; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX2-NEXT: vpsrld %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v2i32:
@@ -259,12 +289,22 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %amt) nounwind
;
; X86-SSE2-LABEL: splatvar_funnnel_v2i32:
; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; X86-SSE2-NEXT: pslld $23, %xmm1
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
-; X86-SSE2-NEXT: psllq %xmm1, %xmm2
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; X86-SSE2-NEXT: psllq %xmm1, %xmm0
-; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
+; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE2-NEXT: por %xmm3, %xmm0
; X86-SSE2-NEXT: retl
%splat = shufflevector <2 x i32> %amt, <2 x i32> undef, <2 x i32> zeroinitializer
%res = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %x, <2 x i32> %splat)
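(A note on the test churn, inferred from the diff rather than stated in the PR: these funnel-shift tests splat the shift amount with a shufflevector whose second operand is undef, and after the <2 x i32> values are widened the undef lanes appear on only one side of the amount-masking binop. The stricter check now conservatively rejects that as a splat, so the cheap shift-by-splat lowerings no longer fire and codegen falls back to the longer generic variable-amount sequences.)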
diff --git a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
index eb4d84b8d7dd6..322ebe22671e6 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
@@ -248,27 +248,162 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt)
;
define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt) nounwind {
-; SSE-LABEL: splatvar_funnnel_v2i32:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE-NEXT: psllq %xmm2, %xmm3
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: psllq %xmm2, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: splatvar_funnnel_v2i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31]
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: pandn %xmm4, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: psrld $1, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psrld %xmm2, %xmm6
+; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrld %xmm7, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: psrld %xmm6, %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrld %xmm5, %xmm1
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[0,3]
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: pslld $23, %xmm3
+; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: cvttps2dq %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: retq
;
-; AVX-LABEL: splatvar_funnnel_v2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpsllq %xmm2, %xmm3, %xmm3
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX-NEXT: vpsllq %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
-; AVX-NEXT: retq
+; SSE41-LABEL: splatvar_funnnel_v2i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; SSE41-NEXT: pmovsxbd {{.*#+}} xmm3 = [31,31,31,31]
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pandn %xmm3, %xmm4
+; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: psrld $1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: psrld %xmm5, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm8
+; SSE41-NEXT: psrld %xmm7, %xmm8
+; SSE41-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: psrld %xmm4, %xmm6
+; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: psrld %xmm4, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5],xmm8[6,7]
+; SSE41-NEXT: pand %xmm3, %xmm2
+; SSE41-NEXT: pslld $23, %xmm2
+; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE41-NEXT: cvttps2dq %xmm2, %xmm1
+; SSE41-NEXT: pmulld %xmm1, %xmm0
+; SSE41-NEXT: por %xmm6, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: splatvar_funnnel_v2i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
+; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
+; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5
+; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm6
+; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
+; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
+; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_funnnel_v2i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd %xmm2, %xmm3
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31]
+; AVX2-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpsrld $1, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlvd %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX2-NEXT: vpslld %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: splatvar_funnnel_v2i32:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpbroadcastd %xmm2, %xmm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31]
+; AVX512F-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX512F-NEXT: vpsrld $1, %xmm1, %xmm1
+; AVX512F-NEXT: vpsrlvd %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512F-NEXT: vpslld %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: splatvar_funnnel_v2i32:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31]
+; AVX512VL-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpsrlvd %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512VL-NEXT: vpslld %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: splatvar_funnnel_v2i32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpbroadcastd %xmm2, %xmm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31]
+; AVX512BW-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT: vpsrld $1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsrlvd %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512BW-NEXT: vpslld %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VLBW-LABEL: splatvar_funnnel_v2i32:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [31,31,31,31]
+; AVX512VLBW-NEXT: vpandn %xmm4, %xmm3, %xmm3
+; AVX512VLBW-NEXT: vpsrld $1, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpsrlvd %xmm3, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX512VLBW-NEXT: vpslld %xmm2, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v2i32:
; AVX512VBMI2: # %bb.0:
@@ -286,26 +421,67 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %
; AVX512VLVBMI2-NEXT: vpshldvd %xmm2, %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
-; XOP-LABEL: splatvar_funnnel_v2i32:
-; XOP: # %bb.0:
-; XOP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; XOP-NEXT: vpsllq %xmm2, %xmm3, %xmm3
-; XOP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; XOP-NEXT: vpsllq %xmm2, %xm...
[truncated]
LGTM.