
Commit 3a2f3c2

[X86][SSE] getFauxShuffleMask - Fix shuffle mask adjustment for multiple inserted subvectors
Part of the issue discovered in PR39483, although it's not fully exposed until I reapply rL345395 (by reverting rL345451).

llvm-svn: 345520
1 parent 220fd33 commit 3a2f3c2

File tree

2 files changed: +100, -4 lines changed


llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 3 additions & 4 deletions
@@ -6379,13 +6379,12 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
       Mask.push_back(i);
     for (int i = 0; i != (int)NumSubElts; ++i) {
       int M = SubMask[i];
-      if (M < 0) {
-        Mask[i + InsertIdx] = M;
-      } else {
+      if (0 <= M) {
         int InputIdx = M / NumSubElts;
         int ExtractIdx = SubInputs[InputIdx].getConstantOperandVal(1);
-        Mask[i + InsertIdx] = (NumElts * (1 + InputIdx)) + ExtractIdx + M;
+        M = (NumElts * (1 + InputIdx)) + ExtractIdx + (M % NumSubElts);
       }
+      Mask[i + InsertIdx] = M;
     }
     // TODO - Add support for more than 1 subinput.
     return Ops.size() <= 2;
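To make the off-by-subvector issue concrete, here is a minimal standalone sketch of the index arithmetic this hunk changes; the helper name, the toy vector widths, and the assert are illustrative only, not LLVM code. With more than one inserted subvector, a sub-mask element M already encodes which sub-input it refers to (InputIdx = M / NumSubElts), so only the lane within that subvector (M % NumSubElts) should be added on top of the rebased operand offset. The old code added all of M, over-shooting by InputIdx * NumSubElts.

#include <cassert>

// Illustrative only: maps one sub-mask element M onto the combined mask, where
// operand 0 is the base vector (lanes 0..NumElts-1) and inserted sub-input
// InputIdx occupies lanes starting at NumElts * (1 + InputIdx).
static int adjustMaskElt(int M, int NumElts, int NumSubElts, int ExtractIdx) {
  if (M < 0)
    return M;                        // undef/sentinel lanes pass through unchanged
  int InputIdx = M / NumSubElts;     // which inserted subvector M targets
  int SubLane = M % NumSubElts;      // lane within that subvector
  return NumElts * (1 + InputIdx) + ExtractIdx + SubLane;
}

int main() {
  // Two <4 x float> subvectors inserted into an <8 x float>:
  // NumElts = 8, NumSubElts = 4. M = 5 is lane 1 of the *second* sub-input.
  // Fixed formula: 8 * (1 + 1) + 0 + 1 == 17.
  // The old formula added all of M (5), giving 21, i.e. off by
  // InputIdx * NumSubElts = 4.
  assert(adjustMaskElt(5, /*NumElts=*/8, /*NumSubElts=*/4, /*ExtractIdx=*/0) == 17);
  return 0;
}

The hunk above also hoists the final Mask[i + InsertIdx] = M store out of the if/else, so undef lanes and real lanes now share a single assignment.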

llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll

Lines changed: 97 additions & 0 deletions
@@ -435,3 +435,100 @@ define <8 x float> @constant_fold_vpermilvar_ps_256() {
   %1 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x i32> <i32 4, i32 0, i32 2, i32 1, i32 0, i32 1, i32 1, i32 1>)
   ret <8 x float> %1
 }
+
+define void @PR39483() {
+; X32-AVX1-LABEL: PR39483:
+; X32-AVX1: # %bb.0: # %entry
+; X32-AVX1-NEXT: vmovups 32, %ymm0
+; X32-AVX1-NEXT: vmovups 64, %ymm1
+; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X32-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,3]
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; X32-AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X32-AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; X32-AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; X32-AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; X32-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X32-AVX1-NEXT: vmulps %ymm1, %ymm0, %ymm0
+; X32-AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; X32-AVX1-NEXT: vmovups %ymm0, (%eax)
+;
+; X32-AVX2-LABEL: PR39483:
+; X32-AVX2: # %bb.0: # %entry
+; X32-AVX2-NEXT: vmovups 32, %ymm0
+; X32-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; X32-AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
+; X32-AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; X32-AVX2-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,1,0,3,4,5,4,7]
+; X32-AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
+; X32-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; X32-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X32-AVX2-NEXT: vmulps %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: vmovups %ymm0, (%eax)
+;
+; X32-AVX512-LABEL: PR39483:
+; X32-AVX512: # %bb.0: # %entry
+; X32-AVX512-NEXT: vmovups 0, %zmm0
+; X32-AVX512-NEXT: vmovups 64, %ymm1
+; X32-AVX512-NEXT: vmovaps {{.*#+}} zmm2 = <2,5,8,11,14,17,20,23,u,u,u,u,u,u,u,u>
+; X32-AVX512-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2
+; X32-AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; X32-AVX512-NEXT: vmulps %ymm0, %ymm2, %ymm1
+; X32-AVX512-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; X32-AVX512-NEXT: vmovups %ymm0, (%eax)
+;
+; X64-AVX1-LABEL: PR39483:
+; X64-AVX1: # %bb.0: # %entry
+; X64-AVX1-NEXT: vmovups 32, %ymm0
+; X64-AVX1-NEXT: vmovups 64, %ymm1
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,3]
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; X64-AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X64-AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; X64-AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; X64-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-AVX1-NEXT: vmulps %ymm1, %ymm0, %ymm0
+; X64-AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; X64-AVX1-NEXT: vmovups %ymm0, (%rax)
+;
+; X64-AVX2-LABEL: PR39483:
+; X64-AVX2: # %bb.0: # %entry
+; X64-AVX2-NEXT: vmovups 32, %ymm0
+; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; X64-AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
+; X64-AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,1,0,3,4,5,4,7]
+; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
+; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; X64-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-AVX2-NEXT: vmulps %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovups %ymm0, (%rax)
+;
+; X64-AVX512-LABEL: PR39483:
+; X64-AVX512: # %bb.0: # %entry
+; X64-AVX512-NEXT: vmovups 0, %zmm0
+; X64-AVX512-NEXT: vmovups 64, %ymm1
+; X64-AVX512-NEXT: vmovaps {{.*#+}} zmm2 = <2,5,8,11,14,17,20,23,u,u,u,u,u,u,u,u>
+; X64-AVX512-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2
+; X64-AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmulps %ymm0, %ymm2, %ymm1
+; X64-AVX512-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; X64-AVX512-NEXT: vmovups %ymm0, (%rax)
+entry:
+  %wide.vec = load <24 x float>, <24 x float>* null, align 4
+  %strided.vec18 = shufflevector <24 x float> %wide.vec, <24 x float> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+  %0 = fmul <8 x float> %strided.vec18, zeroinitializer
+  %1 = fadd <8 x float> zeroinitializer, %0
+  store <8 x float> %1, <8 x float>* undef, align 16
+  unreachable
+}
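For readers who have not looked at PR39483 itself, the IR above is a reduced stride-3 deinterleave: the shufflevector picks elements 2, 5, 8, ..., 23 out of a 24-float block, and the result is then multiplied and added with zero. A plain C++ rendering of the same computation (the function name and pointer parameters are illustrative, not taken from the bug report) would be roughly:

#include <cstddef>

// Scalar rendering of @PR39483's body: a stride-3 deinterleave of a 24-float
// block starting at element 2, followed by fmul/fadd with zeroinitializer.
void pr39483_reference(const float *wide, float *out) {
  for (size_t i = 0; i != 8; ++i) {
    float lane = wide[3 * i + 2]; // lane i of %strided.vec18
    out[i] = lane * 0.0f + 0.0f;  // %0 = fmul ...; %1 = fadd ...
  }
}

The X32/X64 check blocks above are the different lowerings of that deinterleave per target feature level; with the mask fix applied, the shuffle combine no longer mis-indexes the second inserted subvector.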
