Skip to content

Commit fb054e6

Browse files
authored
[X86] splitAndLowerShuffle - split a v8f32 bitcast from v8i32 operands as 2 v4i32 shuffles (#143493)
AVX1 performs v8i32 shuffles as bitcast v8f32, but if we split these back to v4f32 instead of peeking through the bitcasts, we can lose track of the original domain. Fixes an issue I noticed working on #142972 where we were using v4f32 blends instead of v8i16, resulting in a lot of domain crossing. It also helps avoid unnecessary use of VINSERTPS nodes, which can be tricky to commute or concatenate back to 256-bit vectors.
1 parent 7e471c1 commit fb054e6

11 files changed

+1617
-1837
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15101,6 +15101,17 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
1510115101
assert(V1.getSimpleValueType() == VT && "Bad operand type!");
1510215102
assert(V2.getSimpleValueType() == VT && "Bad operand type!");
1510315103

15104+
// If this came from the AVX1 v8i32 -> v8f32 bitcast, split using v4i32.
15105+
if (VT == MVT::v8f32) {
15106+
SDValue BC1 = peekThroughBitcasts(V1);
15107+
SDValue BC2 = peekThroughBitcasts(V2);
15108+
if (BC1.getValueType() == MVT::v8i32 && BC2.getValueType() == MVT::v8i32) {
15109+
if (SDValue Split = splitAndLowerShuffle(DL, MVT::v8i32, BC1, BC2, Mask,
15110+
DAG, SimpleOnly))
15111+
return DAG.getBitcast(VT, Split);
15112+
}
15113+
}
15114+
1510415115
ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
1510515116
ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
1510615117

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3467,9 +3467,9 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
34673467
; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
34683468
; AVX: # %bb.0:
34693469
; AVX-NEXT: vmovdqa (%rdi), %xmm0
3470-
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7]
3471-
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
3472-
; AVX-NEXT: vbroadcastss (%rdi), %xmm2
3470+
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
3471+
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],mem[2,3,4,5],xmm1[6,7]
3472+
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
34733473
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
34743474
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
34753475
; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
@@ -3664,13 +3664,13 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
36643664
;
36653665
; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
36663666
; AVX: # %bb.0:
3667-
; AVX-NEXT: vmovdqa 48(%rdi), %xmm0
3668-
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0,1],xmm0[2,3,4,5,6,7]
3669-
; AVX-NEXT: vbroadcastss (%rdi), %xmm1
3670-
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3671-
; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
3672-
; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
3673-
; AVX-NEXT: vmovdqa %xmm0, (%rdx)
3667+
; AVX-NEXT: vmovdqa (%rdi), %xmm0
3668+
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7]
3669+
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3670+
; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
3671+
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
3672+
; AVX-NEXT: vmovdqa %xmm1, (%rdx)
3673+
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
36743674
; AVX-NEXT: retq
36753675
;
36763676
; AVX2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:

llvm/test/CodeGen/X86/oddshuffles.ll

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1688,16 +1688,16 @@ define void @interleave_24i32_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
16881688
; AVX1-NEXT: vmovups (%rsi), %xmm2
16891689
; AVX1-NEXT: vmovups 16(%rsi), %xmm3
16901690
; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm3[3,3],xmm1[3,3]
1691-
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
1692-
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[0,2]
1691+
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2]
1692+
; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3]
16931693
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
16941694
; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm3 = mem[0,1,0,1]
16951695
; AVX1-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,3,3]
16961696
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7]
1697-
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm0[1]
1698-
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[1,1],xmm3[0,2]
1699-
; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
1700-
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,1]
1697+
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[1,1,2,2]
1698+
; AVX1-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2],xmm3[3]
1699+
; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1700+
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2]
17011701
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
17021702
; AVX1-NEXT: vbroadcastsd (%rcx), %ymm2
17031703
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
@@ -1808,16 +1808,16 @@ define void @interleave_24i32_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
18081808
; XOP-NEXT: vmovups (%rsi), %xmm3
18091809
; XOP-NEXT: vmovups 16(%rsi), %xmm4
18101810
; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,3],xmm2[3,3]
1811-
; XOP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
1812-
; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1],xmm4[0,2]
1811+
; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1,2,2]
1812+
; XOP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3]
18131813
; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
18141814
; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm4 = mem[0,1,0,1]
18151815
; XOP-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,3]
18161816
; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
1817-
; XOP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm3[1],xmm1[1]
1818-
; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1],xmm4[0,2]
1819-
; XOP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
1820-
; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,1]
1817+
; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1,2,2]
1818+
; XOP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2],xmm4[3]
1819+
; XOP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
1820+
; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2]
18211821
; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
18221822
; XOP-NEXT: vbroadcastsd (%rcx), %ymm3
18231823
; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]

0 commit comments

Comments
 (0)