Skip to content

Commit 31ed36d

Browse files
committed
[X86] SimplifyDemandedVectorElts - attempt to recombine target shuffle using DemandedElts mask (REAPPLIED)
[X86] SimplifyDemandedVectorElts - attempt to recombine target shuffle using DemandedElts mask (REAPPLIED). If we don't demand all elements, then attempt to combine to a simpler shuffle. At the moment we can only do this if Depth == 0, as combineX86ShufflesRecursively uses Depth to track whether the shuffle has really changed or not — we'll need to change this before we can properly start merging combineX86ShufflesRecursively into SimplifyDemandedVectorElts (see D66004). This reapplies rL368307 (reverted at rL369167) after the fix for the infinite loop reported at PR43024 was applied at rG3f087e38a2e7b87a5adaaac1c1b61e51220e7ff3.
1 parent 3169f01 commit 31ed36d

File tree

5 files changed

+273
-254
lines changed

5 files changed

+273
-254
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -34750,6 +34750,23 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
3475034750
return true;
3475134751
}
3475234752

34753+
// If we don't demand all elements, then attempt to combine to a simpler
34754+
// shuffle.
34755+
// TODO: Handle other depths, but first we need to handle the fact that
34756+
// it might combine to the same shuffle.
34757+
if (!DemandedElts.isAllOnesValue() && Depth == 0) {
34758+
SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
34759+
for (int i = 0; i != NumElts; ++i)
34760+
if (DemandedElts[i])
34761+
DemandedMask[i] = i;
34762+
34763+
SDValue NewShuffle = combineX86ShufflesRecursively(
34764+
{Op}, 0, Op, DemandedMask, {}, Depth, /*HasVarMask*/ false,
34765+
/*AllowVarMask*/ true, TLO.DAG, Subtarget);
34766+
if (NewShuffle)
34767+
return TLO.CombineTo(Op, NewShuffle);
34768+
}
34769+
3475334770
return false;
3475434771
}
3475534772

llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -8320,7 +8320,7 @@ define float @test_mm512_reduce_max_ps(<16 x float> %__W) {
83208320
; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
83218321
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
83228322
; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
8323-
; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
8323+
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
83248324
; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm0
83258325
; X86-NEXT: vmovss %xmm0, (%esp)
83268326
; X86-NEXT: flds (%esp)
@@ -8337,7 +8337,7 @@ define float @test_mm512_reduce_max_ps(<16 x float> %__W) {
83378337
; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
83388338
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
83398339
; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
8340-
; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
8340+
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
83418341
; X64-NEXT: vmaxss %xmm1, %xmm0, %xmm0
83428342
; X64-NEXT: vzeroupper
83438343
; X64-NEXT: retq
@@ -8446,7 +8446,7 @@ define float @test_mm512_reduce_min_ps(<16 x float> %__W) {
84468446
; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
84478447
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
84488448
; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
8449-
; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
8449+
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
84508450
; X86-NEXT: vminss %xmm1, %xmm0, %xmm0
84518451
; X86-NEXT: vmovss %xmm0, (%esp)
84528452
; X86-NEXT: flds (%esp)
@@ -8463,7 +8463,7 @@ define float @test_mm512_reduce_min_ps(<16 x float> %__W) {
84638463
; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
84648464
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
84658465
; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
8466-
; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
8466+
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
84678467
; X64-NEXT: vminss %xmm1, %xmm0, %xmm0
84688468
; X64-NEXT: vzeroupper
84698469
; X64-NEXT: retq
@@ -8624,7 +8624,7 @@ define float @test_mm512_mask_reduce_max_ps(i16 zeroext %__M, <16 x float> %__W)
86248624
; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
86258625
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
86268626
; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
8627-
; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
8627+
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
86288628
; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm0
86298629
; X86-NEXT: vmovss %xmm0, (%esp)
86308630
; X86-NEXT: flds (%esp)
@@ -8644,7 +8644,7 @@ define float @test_mm512_mask_reduce_max_ps(i16 zeroext %__M, <16 x float> %__W)
86448644
; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
86458645
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
86468646
; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
8647-
; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
8647+
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
86488648
; X64-NEXT: vmaxss %xmm1, %xmm0, %xmm0
86498649
; X64-NEXT: vzeroupper
86508650
; X64-NEXT: retq
@@ -8809,7 +8809,7 @@ define float @test_mm512_mask_reduce_min_ps(i16 zeroext %__M, <16 x float> %__W)
88098809
; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
88108810
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
88118811
; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
8812-
; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
8812+
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
88138813
; X86-NEXT: vminss %xmm1, %xmm0, %xmm0
88148814
; X86-NEXT: vmovss %xmm0, (%esp)
88158815
; X86-NEXT: flds (%esp)
@@ -8829,7 +8829,7 @@ define float @test_mm512_mask_reduce_min_ps(i16 zeroext %__M, <16 x float> %__W)
88298829
; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
88308830
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
88318831
; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
8832-
; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
8832+
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
88338833
; X64-NEXT: vminss %xmm1, %xmm0, %xmm0
88348834
; X64-NEXT: vzeroupper
88358835
; X64-NEXT: retq

llvm/test/CodeGen/X86/shrink_vmul.ll

Lines changed: 104 additions & 98 deletions
Original file line number | Diff line number | Diff line change
@@ -2085,85 +2085,88 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
20852085
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
20862086
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
20872087
; X86-SSE-NEXT: movdqa (%eax), %xmm5
2088-
; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2088+
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
20892089
; X86-SSE-NEXT: movdqa (%ecx), %xmm2
20902090
; X86-SSE-NEXT: movdqa 16(%ecx), %xmm6
2091-
; X86-SSE-NEXT: pxor %xmm0, %xmm0
2092-
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2091+
; X86-SSE-NEXT: pxor %xmm1, %xmm1
2092+
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
20932093
; X86-SSE-NEXT: movdqa %xmm5, %xmm4
2094-
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
2095-
; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
2096-
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,1,2,3]
2097-
; X86-SSE-NEXT: movd %xmm0, %eax
2098-
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,1,2,3]
2099-
; X86-SSE-NEXT: movd %xmm0, %esi
2094+
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
2095+
; X86-SSE-NEXT: movdqa %xmm5, %xmm3
2096+
; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
2097+
; X86-SSE-NEXT: movdqa %xmm5, %xmm1
2098+
; X86-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2099+
; X86-SSE-NEXT: movd %xmm1, %eax
2100+
; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,1,2,3]
2101+
; X86-SSE-NEXT: movd %xmm1, %esi
21002102
; X86-SSE-NEXT: xorl %edx, %edx
21012103
; X86-SSE-NEXT: divl %esi
2102-
; X86-SSE-NEXT: movd %edx, %xmm0
2103-
; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,0,1]
2104-
; X86-SSE-NEXT: movd %xmm3, %eax
2105-
; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,3,0,1]
2106-
; X86-SSE-NEXT: movd %xmm3, %esi
2104+
; X86-SSE-NEXT: movd %edx, %xmm1
2105+
; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,0,1]
2106+
; X86-SSE-NEXT: movd %xmm7, %eax
2107+
; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,0,1]
2108+
; X86-SSE-NEXT: movd %xmm7, %esi
21072109
; X86-SSE-NEXT: xorl %edx, %edx
21082110
; X86-SSE-NEXT: divl %esi
21092111
; X86-SSE-NEXT: movd %edx, %xmm7
2110-
; X86-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
2111-
; X86-SSE-NEXT: movd %xmm5, %eax
2112+
; X86-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
2113+
; X86-SSE-NEXT: movd %xmm3, %eax
21122114
; X86-SSE-NEXT: movd %xmm6, %esi
21132115
; X86-SSE-NEXT: xorl %edx, %edx
21142116
; X86-SSE-NEXT: divl %esi
2117+
; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
2118+
; X86-SSE-NEXT: movd %xmm3, %eax
21152119
; X86-SSE-NEXT: movd %edx, %xmm3
2116-
; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
2117-
; X86-SSE-NEXT: movd %xmm5, %eax
2118-
; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
2119-
; X86-SSE-NEXT: movd %xmm5, %esi
2120-
; X86-SSE-NEXT: xorl %edx, %edx
2121-
; X86-SSE-NEXT: divl %esi
2122-
; X86-SSE-NEXT: movd %edx, %xmm5
2123-
; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
2124-
; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
2125-
; X86-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[3,1,2,3]
2126-
; X86-SSE-NEXT: movd %xmm6, %eax
2127-
; X86-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[3,1,2,3]
2120+
; X86-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,2,3]
21282121
; X86-SSE-NEXT: movd %xmm6, %esi
21292122
; X86-SSE-NEXT: xorl %edx, %edx
21302123
; X86-SSE-NEXT: divl %esi
21312124
; X86-SSE-NEXT: movd %edx, %xmm6
2132-
; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,0,1]
2125+
; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
2126+
; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
2127+
; X86-SSE-NEXT: movdqa %xmm5, %xmm7
2128+
; X86-SSE-NEXT: psrld $16, %xmm7
21332129
; X86-SSE-NEXT: movd %xmm7, %eax
2134-
; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,0,1]
2130+
; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,2,3]
21352131
; X86-SSE-NEXT: movd %xmm7, %esi
21362132
; X86-SSE-NEXT: xorl %edx, %edx
21372133
; X86-SSE-NEXT: divl %esi
21382134
; X86-SSE-NEXT: movd %edx, %xmm7
2139-
; X86-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
21402135
; X86-SSE-NEXT: movd %xmm4, %eax
21412136
; X86-SSE-NEXT: movd %xmm2, %esi
21422137
; X86-SSE-NEXT: xorl %edx, %edx
21432138
; X86-SSE-NEXT: divl %esi
2144-
; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
2139+
; X86-SSE-NEXT: psrlq $48, %xmm5
2140+
; X86-SSE-NEXT: movd %xmm5, %eax
2141+
; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[3,1,2,3]
2142+
; X86-SSE-NEXT: movd %xmm5, %esi
2143+
; X86-SSE-NEXT: movd %edx, %xmm5
2144+
; X86-SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
2145+
; X86-SSE-NEXT: xorl %edx, %edx
2146+
; X86-SSE-NEXT: divl %esi
2147+
; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
21452148
; X86-SSE-NEXT: movd %xmm4, %eax
21462149
; X86-SSE-NEXT: movd %edx, %xmm4
2147-
; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
2150+
; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
21482151
; X86-SSE-NEXT: movd %xmm2, %esi
21492152
; X86-SSE-NEXT: xorl %edx, %edx
21502153
; X86-SSE-NEXT: divl %esi
21512154
; X86-SSE-NEXT: movd %edx, %xmm2
2155+
; X86-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2156+
; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0]
2157+
; X86-SSE-NEXT: movd %xmm0, %eax
2158+
; X86-SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm4[0,0]
2159+
; X86-SSE-NEXT: movdqa {{.*#+}} xmm0 = [8199,8199,8199,8199]
2160+
; X86-SSE-NEXT: pmuludq %xmm0, %xmm7
2161+
; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3]
2162+
; X86-SSE-NEXT: pmuludq %xmm0, %xmm5
2163+
; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
21522164
; X86-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
2153-
; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
2154-
; X86-SSE-NEXT: movd %xmm1, %eax
2155-
; X86-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm6[0,0]
2156-
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
2157-
; X86-SSE-NEXT: pmuludq %xmm1, %xmm4
2158-
; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2159-
; X86-SSE-NEXT: pmuludq %xmm1, %xmm2
2160-
; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2161-
; X86-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
2162-
; X86-SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm0[0,0]
2163-
; X86-SSE-NEXT: pmuludq %xmm1, %xmm3
2164-
; X86-SSE-NEXT: pmuludq %xmm1, %xmm5
2165+
; X86-SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,0],xmm1[0,0]
2166+
; X86-SSE-NEXT: pmuludq %xmm0, %xmm3
2167+
; X86-SSE-NEXT: pmuludq %xmm0, %xmm6
21652168
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
2166-
; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3]
2169+
; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,2,3]
21672170
; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
21682171
; X86-SSE-NEXT: xorl %edx, %edx
21692172
; X86-SSE-NEXT: divl 32(%ecx)
@@ -2324,92 +2327,95 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
23242327
; X64-SSE-LABEL: PR34947:
23252328
; X64-SSE: # %bb.0:
23262329
; X64-SSE-NEXT: movdqa (%rdi), %xmm5
2327-
; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2330+
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
23282331
; X64-SSE-NEXT: movdqa (%rsi), %xmm2
23292332
; X64-SSE-NEXT: movdqa 16(%rsi), %xmm6
2330-
; X64-SSE-NEXT: pxor %xmm0, %xmm0
2331-
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2333+
; X64-SSE-NEXT: pxor %xmm1, %xmm1
2334+
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
23322335
; X64-SSE-NEXT: movdqa %xmm5, %xmm3
2333-
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
2334-
; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
2335-
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,1,2,3]
2336-
; X64-SSE-NEXT: movd %xmm0, %eax
2337-
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,1,2,3]
2338-
; X64-SSE-NEXT: movd %xmm0, %ecx
2336+
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
2337+
; X64-SSE-NEXT: movdqa %xmm5, %xmm7
2338+
; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7]
2339+
; X64-SSE-NEXT: movdqa %xmm5, %xmm1
2340+
; X64-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2341+
; X64-SSE-NEXT: movd %xmm1, %eax
2342+
; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,1,2,3]
2343+
; X64-SSE-NEXT: movd %xmm1, %ecx
23392344
; X64-SSE-NEXT: xorl %edx, %edx
23402345
; X64-SSE-NEXT: divl %ecx
23412346
; X64-SSE-NEXT: movd %edx, %xmm8
2342-
; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1]
2343-
; X64-SSE-NEXT: movd %xmm4, %eax
2344-
; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,0,1]
2345-
; X64-SSE-NEXT: movd %xmm4, %ecx
2347+
; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1]
2348+
; X64-SSE-NEXT: movd %xmm1, %eax
2349+
; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,0,1]
2350+
; X64-SSE-NEXT: movd %xmm1, %ecx
23462351
; X64-SSE-NEXT: xorl %edx, %edx
23472352
; X64-SSE-NEXT: divl %ecx
2348-
; X64-SSE-NEXT: movd %edx, %xmm7
2349-
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
2350-
; X64-SSE-NEXT: movd %xmm5, %eax
2353+
; X64-SSE-NEXT: movd %edx, %xmm1
2354+
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1]
2355+
; X64-SSE-NEXT: movd %xmm7, %eax
23512356
; X64-SSE-NEXT: movd %xmm6, %ecx
23522357
; X64-SSE-NEXT: xorl %edx, %edx
23532358
; X64-SSE-NEXT: divl %ecx
23542359
; X64-SSE-NEXT: movd %edx, %xmm4
2355-
; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
2356-
; X64-SSE-NEXT: movd %xmm5, %eax
2357-
; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
2358-
; X64-SSE-NEXT: movd %xmm5, %ecx
2359-
; X64-SSE-NEXT: xorl %edx, %edx
2360-
; X64-SSE-NEXT: divl %ecx
2361-
; X64-SSE-NEXT: movd %edx, %xmm5
2362-
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
2363-
; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
2364-
; X64-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[3,1,2,3]
2365-
; X64-SSE-NEXT: movd %xmm6, %eax
2366-
; X64-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[3,1,2,3]
2360+
; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,2,3]
2361+
; X64-SSE-NEXT: movd %xmm7, %eax
2362+
; X64-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,2,3]
23672363
; X64-SSE-NEXT: movd %xmm6, %ecx
23682364
; X64-SSE-NEXT: xorl %edx, %edx
23692365
; X64-SSE-NEXT: divl %ecx
23702366
; X64-SSE-NEXT: movd %edx, %xmm6
2371-
; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,0,1]
2372-
; X64-SSE-NEXT: movd %xmm7, %eax
2373-
; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,0,1]
2374-
; X64-SSE-NEXT: movd %xmm7, %ecx
2367+
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
2368+
; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
2369+
; X64-SSE-NEXT: movdqa %xmm5, %xmm1
2370+
; X64-SSE-NEXT: psrld $16, %xmm1
2371+
; X64-SSE-NEXT: movd %xmm1, %eax
2372+
; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,3]
2373+
; X64-SSE-NEXT: movd %xmm1, %ecx
23752374
; X64-SSE-NEXT: xorl %edx, %edx
23762375
; X64-SSE-NEXT: divl %ecx
23772376
; X64-SSE-NEXT: movd %edx, %xmm7
2378-
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
23792377
; X64-SSE-NEXT: movd %xmm3, %eax
23802378
; X64-SSE-NEXT: movd %xmm2, %ecx
23812379
; X64-SSE-NEXT: xorl %edx, %edx
23822380
; X64-SSE-NEXT: divl %ecx
2383-
; X64-SSE-NEXT: movd %edx, %xmm0
2384-
; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
2381+
; X64-SSE-NEXT: movd %edx, %xmm1
2382+
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
2383+
; X64-SSE-NEXT: psrlq $48, %xmm5
2384+
; X64-SSE-NEXT: movd %xmm5, %eax
2385+
; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[3,1,2,3]
2386+
; X64-SSE-NEXT: movd %xmm5, %ecx
2387+
; X64-SSE-NEXT: xorl %edx, %edx
2388+
; X64-SSE-NEXT: divl %ecx
2389+
; X64-SSE-NEXT: movd %edx, %xmm5
2390+
; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
23852391
; X64-SSE-NEXT: movd %xmm3, %eax
2386-
; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
2392+
; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
23872393
; X64-SSE-NEXT: movd %xmm2, %ecx
23882394
; X64-SSE-NEXT: xorl %edx, %edx
23892395
; X64-SSE-NEXT: divl %ecx
23902396
; X64-SSE-NEXT: movd %edx, %xmm2
2391-
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2392-
; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
2393-
; X64-SSE-NEXT: movd %xmm1, %eax
2397+
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
2398+
; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2399+
; X64-SSE-NEXT: movd %xmm0, %eax
23942400
; X64-SSE-NEXT: xorl %edx, %edx
23952401
; X64-SSE-NEXT: divl 32(%rsi)
2396-
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
2397-
; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
2398-
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2399-
; X64-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm6[0,0]
2400-
; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
2401-
; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2402-
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2403-
; X64-SSE-NEXT: pmuludq %xmm1, %xmm4
2402+
; X64-SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm5[0,0]
2403+
; X64-SSE-NEXT: movdqa {{.*#+}} xmm0 = [8199,8199,8199,8199]
2404+
; X64-SSE-NEXT: pmuludq %xmm0, %xmm7
2405+
; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3]
2406+
; X64-SSE-NEXT: pmuludq %xmm0, %xmm1
2407+
; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2408+
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2409+
; X64-SSE-NEXT: pmuludq %xmm0, %xmm4
24042410
; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
2405-
; X64-SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm8[0,0]
2406-
; X64-SSE-NEXT: pmuludq %xmm1, %xmm5
2407-
; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3]
2408-
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2411+
; X64-SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,0],xmm8[0,0]
2412+
; X64-SSE-NEXT: pmuludq %xmm0, %xmm6
2413+
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
2414+
; X64-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
24092415
; X64-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007
24102416
; X64-SSE-NEXT: movl %eax, (%rax)
24112417
; X64-SSE-NEXT: movdqa %xmm2, (%rax)
2412-
; X64-SSE-NEXT: movdqa %xmm0, (%rax)
2418+
; X64-SSE-NEXT: movdqa %xmm1, (%rax)
24132419
; X64-SSE-NEXT: retq
24142420
;
24152421
; X64-AVX1-LABEL: PR34947:

0 commit comments

Comments (0)