Commit 8756269

[X86][AVX] Fix handling of out-of-bounds shift amounts in AVX2 vector shift nodes #83840
1 parent 81e2047 commit 8756269

File tree

2 files changed: +286 -0 lines changed

  llvm/lib/Target/X86/X86ISelLowering.cpp
  llvm/test/CodeGen/X86/combine-sra.ll


llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 11 additions & 0 deletions
@@ -47334,6 +47334,17 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
   if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
     return V;
 
+  APInt ShiftAmt;
+  SDNode *UMinNode = N1.getNode();
+  if (supportedVectorVarShift(VT, Subtarget, ISD::SRA) &&
+      UMinNode->getOpcode() == ISD::UMIN &&
+      ISD::isConstantSplatVector(UMinNode->getOperand(1).getNode(), ShiftAmt) &&
+      ShiftAmt == VT.getScalarSizeInBits() - 1) {
+    SDValue ShrAmtVal = UMinNode->getOperand(0);
+    SDLoc DL(N);
+    return DAG.getNode(X86ISD::VSRAV, DL, N->getVTList(), N0, ShrAmtVal);
+  }
+
   // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
   // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
   // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
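
The new combine recognizes an arithmetic-shift amount that has already been clamped with a umin against a splat of (element bits - 1) and lowers the shift directly to X86ISD::VSRAV. A minimal LLVM IR sketch of the pattern (the function name here is illustrative, not part of the patch; it mirrors the <4 x i32> test added below):

define <4 x i32> @ashr_amount_clamped(<4 x i32> %x, <4 x i32> %y) {
  ; Clamp each lane's shift amount to 31 (i32 bit width minus one)...
  %amt = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %y, <4 x i32> <i32 31, i32 31, i32 31, i32 31>)
  ; ...so the variable ashr can be selected as a single variable-shift instruction.
  %r = ashr <4 x i32> %x, %amt
  ret <4 x i32> %r
}
declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>)

On AVX targets this is expected to select to a single vpsravd, which is what the AVX check in combine_vec32_ashr_out_of_bound below verifies.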

llvm/test/CodeGen/X86/combine-sra.ll

Lines changed: 275 additions & 0 deletions
@@ -521,3 +521,278 @@ define <4 x i32> @combine_vec_ashr_positive_splat(<4 x i32> %x, <4 x i32> %y) {
   %2 = ashr <4 x i32> %1, <i32 10, i32 10, i32 10, i32 10>
   ret <4 x i32> %2
 }
+
+define <8 x i16> @combine_vec16_ashr_out_of_bound(<8 x i16> %x, <8 x i16> %y) {
+; SSE2-LABEL: combine_vec16_ashr_out_of_bound:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm1
+; SSE2-NEXT: psllw $12, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $4, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: psraw $15, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: psraw $1, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: combine_vec16_ashr_out_of_bound:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psllw $12, %xmm0
+; SSE41-NEXT: psllw $4, %xmm1
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: paddw %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psraw $8, %xmm3
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psraw $4, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psraw $2, %xmm3
+; SSE41-NEXT: paddw %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psraw $1, %xmm3
+; SSE41-NEXT: paddw %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX2-LABEL: combine_vec16_ashr_out_of_bound:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: combine_vec16_ashr_out_of_bound:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT: vpsravw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+  %1 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %y, <8 x i16> <i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31>)
+  %2 = ashr <8 x i16> %x, %1
+  ret <8 x i16> %2
+}
+
+define <4 x i32> @combine_vec32_ashr_out_of_bound(<4 x i32> %x, <4 x i32> %y) {
+; SSE2-LABEL: combine_vec32_ashr_out_of_bound:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm3
+; SSE2-NEXT: psrld $27, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrad %xmm1, %xmm3
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrad %xmm4, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrad %xmm3, %xmm4
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrad %xmm2, %xmm0
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: combine_vec32_ashr_out_of_bound:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrad %xmm2, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: psrad %xmm4, %xmm5
+; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrad %xmm1, %xmm3
+; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: psrad %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_vec32_ashr_out_of_bound:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsravd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+  %1 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %y, <4 x i32> <i32 31, i32 31, i32 31, i32 31>)
+  %2 = ashr <4 x i32> %x, %1
+  ret <4 x i32> %2
+}
+
+define <4 x i64> @combine_vec64_ashr_out_of_bound(<4 x i64> %x, <4 x i64> %y) {
+; SSE2-LABEL: combine_vec64_ashr_out_of_bound:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483679,2147483679,2147483679,2147483679]
+; SSE2-NEXT: movdqa %xmm7, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [31,31]
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: pandn %xmm6, %xmm4
+; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm3
+; SSE2-NEXT: pand %xmm7, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: pandn %xmm6, %xmm3
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psrlq %xmm3, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
+; SSE2-NEXT: movdqa %xmm2, %xmm7
+; SSE2-NEXT: psrlq %xmm6, %xmm7
+; SSE2-NEXT: movsd {{.*#+}} xmm7 = xmm5[0],xmm7[1]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psrlq %xmm3, %xmm5
+; SSE2-NEXT: psrlq %xmm6, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1]
+; SSE2-NEXT: xorpd %xmm7, %xmm0
+; SSE2-NEXT: psubq %xmm7, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: psrlq %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; SSE2-NEXT: psrlq %xmm5, %xmm2
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psrlq %xmm4, %xmm3
+; SSE2-NEXT: psrlq %xmm5, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
+; SSE2-NEXT: xorpd %xmm2, %xmm1
+; SSE2-NEXT: psubq %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: combine_vec64_ashr_out_of_bound:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pxor %xmm7, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259487,9223372039002259487]
+; SSE41-NEXT: movdqa %xmm8, %xmm6
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483679,2147483679,2147483679,2147483679]
+; SSE41-NEXT: movdqa %xmm5, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: movapd {{.*#+}} xmm9 = [31,31]
+; SSE41-NEXT: movapd %xmm9, %xmm6
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6
+; SSE41-NEXT: pxor %xmm2, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
+; SSE41-NEXT: pand %xmm8, %xmm5
+; SSE41-NEXT: movdqa %xmm5, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrlq %xmm9, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3]
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: psrlq %xmm3, %xmm5
+; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm4, %xmm2
+; SSE41-NEXT: psrlq %xmm9, %xmm2
+; SSE41-NEXT: psrlq %xmm3, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT: pxor %xmm5, %xmm4
+; SSE41-NEXT: psubq %xmm5, %xmm4
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrlq %xmm6, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3]
+; SSE41-NEXT: psrlq %xmm3, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlq %xmm6, %xmm2
+; SSE41-NEXT: psrlq %xmm3, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pxor %xmm0, %xmm1
+; SSE41-NEXT: psubq %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX2-LABEL: combine_vec64_ashr_out_of_bound:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775838,9223372036854775838,9223372036854775838,9223372036854775838]
+; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [31,31,31,31]
+; AVX2-NEXT: vblendvpd %ymm3, %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2
+; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: combine_vec64_ashr_out_of_bound:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm1
+; AVX512-NEXT: vpsravq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
+  %1 = tail call <4 x i64> @llvm.umin.v4i64(<4 x i64> %y, <4 x i64> <i64 31, i64 31, i64 31, i64 31>)
+  %2 = ashr <4 x i64> %x, %1
+  ret <4 x i64> %2
+}
