[X86] splitAndLowerShuffle - split a v8f32 bitcast from v8i32 operands as 2 v4i32 shuffles #143493
Conversation
AVX1 performs v8i32 shuffles as bitcast v8f32, but if we split these back to v4f32 instead of peeking through the bitcasts, we can lose track of the original domain.

Fixes an issue I noticed working on llvm#142972 where we were using v4f32 blends instead of v8i16, resulting in a lot of domain crossing.

It also helps avoid unnecessary use of VINSERTPS nodes, which can be tricky to commute or concatenate back to 256-bit vectors.
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

AVX1 performs v8i32 shuffles as bitcast v8f32, but if we split these back to v4f32 instead of peeking through the bitcasts, we can lose track of the original domain.

Fixes an issue I noticed working on #142972 where we were using v4f32 blends instead of v8i16, resulting in a lot of domain crossing.

It also helps avoid unnecessary use of VINSERTPS nodes, which can be tricky to commute or concatenate back to 256-bit vectors.

Patch is 242.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/143493.diff

11 Files Affected:
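To make the domain issue concrete, here is a minimal LLVM IR sketch (hypothetical, not taken from the patch's test files; the function name and shuffle mask are illustrative) of the kind of v8i32 shuffle involved. On AVX1 a cross-lane v8i32 shuffle like this is commonly split into two 128-bit halves; before this change the halves were lowered as v4f32, hiding the integer domain, whereas peeking through the v8f32 bitcast keeps them as v4i32. Something like llc -mtriple=x86_64-- -mattr=+avx can be used to inspect the resulting lowering.

; Hypothetical reduced case: interleave the low halves of two <8 x i32> vectors.
; The result needs elements to cross 128-bit lanes, and AVX1 has no 256-bit
; integer shuffles, so the lowering typically splits this into 128-bit halves.
; With the patch those halves are lowered as v4i32 shuffles rather than v4f32,
; so the selected instructions can stay in the integer domain.
define <8 x i32> @zip_lo_v8i32(<8 x i32> %a, <8 x i32> %b) {
  %r = shufflevector <8 x i32> %a, <8 x i32> %b,
       <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  ret <8 x i32> %r
}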
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b34215b316128..a983de7019bd9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -15101,6 +15101,17 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
assert(V1.getSimpleValueType() == VT && "Bad operand type!");
assert(V2.getSimpleValueType() == VT && "Bad operand type!");
+ // If this came from the AVX1 v8i32 -> v8f32 bitcast, split using v4i32.
+ if (VT == MVT::v8f32) {
+ SDValue BC1 = peekThroughBitcasts(V1);
+ SDValue BC2 = peekThroughBitcasts(V2);
+ if (BC1.getValueType() == MVT::v8i32 && BC2.getValueType() == MVT::v8i32) {
+ if (SDValue Split = splitAndLowerShuffle(DL, MVT::v8i32, BC1, BC2, Mask,
+ DAG, SimpleOnly))
+ return DAG.getBitcast(VT, Split);
+ }
+ }
+
ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index f5802150d5353..dc723eb713c28 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -3467,9 +3467,9 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
-; AVX-NEXT: vbroadcastss (%rdi), %xmm2
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],mem[2,3,4,5],xmm1[6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
@@ -3664,13 +3664,13 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
;
; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0,1],xmm0[2,3,4,5,6,7]
-; AVX-NEXT: vbroadcastss (%rdi), %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index edc8404993996..6b9a86343ea10 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -1688,16 +1688,16 @@ define void @interleave_24i32_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; AVX1-NEXT: vmovups (%rsi), %xmm2
; AVX1-NEXT: vmovups 16(%rsi), %xmm3
; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm3[3,3],xmm1[3,3]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[0,2]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm3 = mem[0,1,0,1]
; AVX1-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,3,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm0[1]
-; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[1,1],xmm3[0,2]
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[1,1,2,2]
+; AVX1-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2],xmm3[3]
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vbroadcastsd (%rcx), %ymm2
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
@@ -1808,16 +1808,16 @@ define void @interleave_24i32_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; XOP-NEXT: vmovups (%rsi), %xmm3
; XOP-NEXT: vmovups 16(%rsi), %xmm4
; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,3],xmm2[3,3]
-; XOP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
-; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1],xmm4[0,2]
+; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1,2,2]
+; XOP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3]
; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm4 = mem[0,1,0,1]
; XOP-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,3]
; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
-; XOP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm3[1],xmm1[1]
-; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1],xmm4[0,2]
-; XOP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,1]
+; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1,2,2]
+; XOP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2],xmm4[3]
+; XOP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2]
; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; XOP-NEXT: vbroadcastsd (%rcx), %ymm3
; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
index 7303f6124afcb..acf9bad81736d 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
@@ -243,19 +243,19 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vmovaps (%rdi), %xmm0
; AVX-NEXT: vmovaps (%rsi), %xmm1
; AVX-NEXT: vmovaps (%rdx), %xmm2
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm0[1],xmm1[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[1,1],xmm3[0,2]
-; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm0[0]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,1]
-; AVX-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm2[0],xmm4[3]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2]
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm2[0,1,0,1]
+; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2],xmm4[3]
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm2[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX-NEXT: vmovaps %xmm0, 32(%rcx)
-; AVX-NEXT: vmovaps %ymm3, (%rcx)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovaps %xmm4, 16(%rcx)
+; AVX-NEXT: vmovaps %xmm3, (%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: store_i32_stride3_vf4:
@@ -458,20 +458,20 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX-LABEL: store_i32_stride3_vf8:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps (%rsi), %xmm0
-; AVX-NEXT: vmovaps 16(%rsi), %xmm1
-; AVX-NEXT: vmovaps (%rdi), %xmm2
-; AVX-NEXT: vmovaps 16(%rdi), %xmm3
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm2[1],xmm0[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm0[1,1],xmm4[0,2]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,1]
+; AVX-NEXT: vmovaps (%rdi), %xmm0
+; AVX-NEXT: vmovaps 16(%rdi), %xmm1
+; AVX-NEXT: vmovaps (%rsi), %xmm2
+; AVX-NEXT: vmovaps 16(%rsi), %xmm3
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm2[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2],xmm4[3]
+; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2]
; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX-NEXT: vbroadcastsd (%rdx), %ymm2
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,3],xmm1[3,3]
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[0,2]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3],xmm3[3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0,0,3,3]
@@ -779,39 +779,39 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
;
; AVX-LABEL: store_i32_stride3_vf16:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps (%rsi), %xmm0
-; AVX-NEXT: vmovaps 16(%rsi), %xmm1
-; AVX-NEXT: vmovaps 32(%rsi), %xmm2
-; AVX-NEXT: vmovaps 48(%rsi), %xmm3
-; AVX-NEXT: vmovaps (%rdi), %xmm4
-; AVX-NEXT: vmovaps 16(%rdi), %xmm5
-; AVX-NEXT: vmovaps 32(%rdi), %xmm6
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm4[1],xmm0[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm0[1,1],xmm7[0,2]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[2,1]
+; AVX-NEXT: vmovaps (%rdi), %xmm0
+; AVX-NEXT: vmovaps 16(%rdi), %xmm1
+; AVX-NEXT: vmovaps 32(%rdi), %xmm2
+; AVX-NEXT: vmovaps (%rsi), %xmm3
+; AVX-NEXT: vmovaps 16(%rsi), %xmm4
+; AVX-NEXT: vmovaps 32(%rsi), %xmm5
+; AVX-NEXT: vmovaps 48(%rsi), %xmm6
+; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm3[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2],xmm7[3]
+; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2]
; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0
-; AVX-NEXT: vbroadcastsd (%rdx), %ymm4
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7]
-; AVX-NEXT: vmovaps 48(%rdi), %xmm4
-; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm4[3,3],xmm3[3,3]
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[0,2]
+; AVX-NEXT: vbroadcastsd (%rdx), %ymm3
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7]
+; AVX-NEXT: vmovaps 48(%rdi), %xmm3
+; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm3[3,3],xmm6[3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2],xmm6[3]
; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm4 = mem[0,1,0,1]
-; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm2[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm2[1,1],xmm4[0,2]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0]
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm6[2,1]
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; AVX-NEXT: vbroadcastsd 32(%rdx), %ymm4
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm5[3,3],xmm1[3,3]
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm5[0,2]
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm6 = mem[0,1,0,1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0,0,3,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5,6],ymm6[7]
+; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm5[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm2[2],xmm6[3]
+; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2]
+; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
+; AVX-NEXT: vbroadcastsd 32(%rdx), %ymm5
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7]
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm1[3,3],xmm4[3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3]
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm4 = mem[0,1,0,1]
; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6],ymm4[7]
@@ -1375,74 +1375,74 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
;
; AVX-LABEL: store_i32_stride3_vf32:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps (%rsi), %xmm0
-; AVX-NEXT: vmovaps 16(%rsi), %xmm3
-; AVX-NEXT: vmovaps 32(%rsi), %xmm6
+; AVX-NEXT: vmovaps (%rdi), %xmm0
+; AVX-NEXT: vmovaps 16(%rdi), %xmm3
+; AVX-NEXT: vmovaps 32(%rdi), %xmm6
+; AVX-NEXT: vmovaps (%rsi), %xmm1
+; AVX-NEXT: vmovaps 16(%rsi), %xmm4
+; AVX-NEXT: vmovaps 32(%rsi), %xmm7
; AVX-NEXT: vmovaps 48(%rsi), %xmm5
-; AVX-NEXT: vmovaps (%rdi), %xmm1
-; AVX-NEXT: vmovaps 16(%rdi), %xmm4
-; AVX-NEXT: vmovaps 32(%rdi), %xmm7
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,1],xmm2[0,2]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3]
+; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT: vbroadcastsd (%rdx), %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX-NEXT: vmovaps 80(%rsi), %xmm1
; AVX-NEXT: vmovaps 80(%rdi), %xmm2
; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm2[3,3],xmm1[3,3]
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[0,2]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0,0,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
-; AVX-NEXT: vmovaps 64(%rsi), %xmm2
-; AVX-NEXT: vmovaps 64(%rdi), %xmm8
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm2[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm2[1,1],xmm9[0,2]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm8[0]
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm8[2,1]
+; AVX-NEXT: vmovaps 64(%rdi), %xmm2
+; AVX-NEXT: vmovaps 64(%rsi), %xmm8
+; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm8[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm2[2],xmm9[3]
+; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2]
; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm2
; AVX-NEXT: vbroadcastsd 64(%rdx), %ymm8
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm8[2],ymm2[3,4],ymm8[5],ymm2[6,7]
; AVX-NEXT: vmovaps 48(%rdi), %xmm8
; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm8[3,3],xmm5[3,3]
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm5[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,1],xmm8[0,2]
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2],xmm5[3]
; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm8 = mem[0,1,0,1]
; AVX-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0,0,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3],ymm8[4],ymm5[5,6],ymm8[7]
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm7[1],xmm6[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm6[1,1],xmm8[0,2]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0],xmm7[2,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm7[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm6[2],xmm8[3]
+; AVX-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
+; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2]
; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6
; AVX-NEXT: vbroadcastsd 32(%rdx), %ymm7
; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7]
; AVX-NEXT: vmovaps 112(%rsi), %xmm7
; AVX-NEXT: vmovaps 112(%rdi), %xmm8
; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm8[3,3],xmm7[3,3]
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm7[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1],xmm8[0,2]
+; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3]
; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm8 = mem[0,1,0,1]
; AVX-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0,0,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7]
-; AVX-NEXT: vmovaps 96(%rsi), %xmm8
-; AVX-NEXT: vmovaps 96(%rdi), %xmm9
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm9[1],xmm8[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm8[1,1],xmm10[0,2]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm8 = xmm8[0],xmm9[0]
-; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[2,0],xmm9[2,1]
+; AVX-NEXT: vmovaps 96(%rdi), %xmm8
+; AVX-NEXT: vmovaps 96(%rsi), %xmm9
+; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm9[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm8[2],xmm10[3]
+; AVX-NEXT: vunpcklps {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
+; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,2]
; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8
; AVX-NEXT: vbroadcastsd 96(%rdx), %ymm9
; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
-; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm4[3,3],xmm3[3,3]
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[0,2]
+; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm3[3,3],xmm4[3,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3]
; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm3, %ymm3
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm4 = mem[0,1,0,1]
; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,3]
@@ -2526,52 +2526,52 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-LABEL: store_i32_stride3_vf64:
; AVX: # %bb.0:
; AVX-NEXT: subq $168, %rsp
-; AVX-NEXT: vmovaps (%rsi), %xmm0
-; AVX-NEXT: vmovaps 16(%rsi), %xmm1
-; AVX-NEXT: vmovaps 32(%rsi), %xmm2
-; AVX-NEXT: vmovaps 48(%rsi), %xmm3
-; AVX-NEXT: vmovaps (%rdi), %xmm4
-; AVX-NEXT: vmovaps 16(%rdi), %xmm5
-; AVX-NEXT: vmovaps 32(%rdi), %xmm6
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm4[1],xmm0[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm0[1,1],xmm7[0,2]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[2...
[truncated]
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/162/builds/24220. Here is the relevant piece of the build log for reference: