
Commit 65e07f1

[X86][AVX] Fix handling of out-of-bounds shift amounts in AVX2 vector shift nodes #83840
1 parent 81e2047 commit 65e07f1
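
In short, the new combine matches a vector arithmetic shift whose per-lane amount has already been clamped with an unsigned minimum against (element bit width - 1) and lowers it directly to the variable-shift node X86ISD::VSRAV. This is sound because the AVX2/AVX-512 variable arithmetic shifts (vpsravd, vpsravw, vpsravq) already saturate out-of-range counts by filling the lane with the sign bit, which is the same result as shifting by BitWidth - 1. A minimal scalar sketch of that equivalence for one i32 lane, in C++ (function names and the test driver are illustrative, not part of the patch; it assumes the usual arithmetic behaviour of >> on signed values):

#include <cassert>
#include <cstdint>

// The pattern being combined: ashr x, umin(y, 31).
int32_t ashr_clamped(int32_t x, uint32_t y) {
  uint32_t amt = y < 31u ? y : 31u;  // umin(y, 31)
  return x >> amt;                   // arithmetic (sign-preserving) shift
}

// One-lane model of VPSRAVD: counts >= 32 fill the lane with the sign bit.
int32_t vpsravd_lane(int32_t x, uint32_t y) {
  return y >= 32u ? (x >> 31) : (x >> y);
}

int main() {
  // The two agree for every count, so the umin clamp can be dropped
  // once the shift is lowered to VSRAV.
  for (uint32_t y = 0; y < 100; ++y)
    assert(ashr_clamped(-12345, y) == vpsravd_lane(-12345, y));
  return 0;
}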

File tree

2 files changed, 283 insertions(+), 0 deletions(-)


llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 10 additions & 0 deletions
@@ -47334,6 +47334,16 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
   if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
     return V;

+  APInt ShiftAmt;
+  if (supportedVectorVarShift(VT, Subtarget, ISD::SRA) &&
+      N1.getOpcode() == ISD::UMIN &&
+      ISD::isConstantSplatVector(N1.getOperand(1).getNode(), ShiftAmt) &&
+      ShiftAmt == VT.getScalarSizeInBits() - 1) {
+    SDValue ShrAmtVal = N1.getOperand(0);
+    SDLoc DL(N);
+    return DAG.getNode(X86ISD::VSRAV, DL, N->getVTList(), N0, ShrAmtVal);
+  }
+
   // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
   // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
   // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
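
The guard in the hunk above requires a legal variable arithmetic shift for the vector type (supportedVectorVarShift with ISD::SRA) and a shift amount of the form umin(Y, splat(BitWidth - 1)); when both hold, the clamp is dropped and the shift is rebuilt as X86ISD::VSRAV of the original amount. The tests added below exercise this for v8i16, v4i32 and v4i64. As a rough illustration of where such IR comes from, a scalar loop such as the following C++ (illustrative only, not part of the patch) computes the same per-lane result that the v4i32 test checks:

#include <cstdint>

// Per-lane "shift right by y, clamped to 31" over four i32 elements --
// the same shape as combine_vec4i32_ashr_clamped below.
void ashr_clamped_v4i32(int32_t x[4], const uint32_t y[4]) {
  for (int i = 0; i < 4; ++i) {
    uint32_t amt = y[i] < 31u ? y[i] : 31u;  // umin(y, 31)
    x[i] >>= amt;                            // arithmetic shift per lane
  }
}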

llvm/test/CodeGen/X86/combine-sra.ll

Lines changed: 273 additions & 0 deletions
@@ -521,3 +521,276 @@ define <4 x i32> @combine_vec_ashr_positive_splat(<4 x i32> %x, <4 x i32> %y) {
   %2 = ashr <4 x i32> %1, <i32 10, i32 10, i32 10, i32 10>
   ret <4 x i32> %2
 }
+
+define <8 x i16> @combine_vec8i16_ashr_clamped(<8 x i16> %x, <8 x i16> %y) {
+; SSE2-LABEL: combine_vec8i16_ashr_clamped:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm1
+; SSE2-NEXT: psllw $12, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $4, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: psraw $2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm1
+; SSE2-NEXT: psraw $15, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: psraw $1, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: combine_vec8i16_ashr_clamped:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psllw $12, %xmm0
+; SSE41-NEXT: psllw $4, %xmm1
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: paddw %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psraw $8, %xmm3
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psraw $4, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psraw $2, %xmm3
+; SSE41-NEXT: paddw %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psraw $1, %xmm3
+; SSE41-NEXT: paddw %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX2-LABEL: combine_vec8i16_ashr_clamped:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: combine_vec8i16_ashr_clamped:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsravw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+  %1 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %y, <8 x i16> <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>)
+  %2 = ashr <8 x i16> %x, %1
+  ret <8 x i16> %2
+}
+
+define <4 x i32> @combine_vec4i32_ashr_clamped(<4 x i32> %x, <4 x i32> %y) {
+; SSE2-LABEL: combine_vec4i32_ashr_clamped:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm3
+; SSE2-NEXT: psrld $27, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrad %xmm1, %xmm3
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrad %xmm4, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrad %xmm3, %xmm4
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrad %xmm2, %xmm0
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: combine_vec4i32_ashr_clamped:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrad %xmm2, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: psrad %xmm4, %xmm5
+; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrad %xmm1, %xmm3
+; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: psrad %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_vec4i32_ashr_clamped:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsravd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+  %1 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %y, <4 x i32> <i32 31, i32 31, i32 31, i32 31>)
+  %2 = ashr <4 x i32> %x, %1
+  ret <4 x i32> %2
+}
+
+define <4 x i64> @combine_vec4i64_ashr_clamped(<4 x i64> %x, <4 x i64> %y) {
+; SSE2-LABEL: combine_vec4i64_ashr_clamped:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483711,2147483711,2147483711,2147483711]
+; SSE2-NEXT: movdqa %xmm7, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [63,63]
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: pandn %xmm6, %xmm4
+; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm3
+; SSE2-NEXT: pand %xmm7, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: pandn %xmm6, %xmm3
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psrlq %xmm3, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
+; SSE2-NEXT: movdqa %xmm2, %xmm7
+; SSE2-NEXT: psrlq %xmm6, %xmm7
+; SSE2-NEXT: movsd {{.*#+}} xmm7 = xmm5[0],xmm7[1]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psrlq %xmm3, %xmm5
+; SSE2-NEXT: psrlq %xmm6, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1]
+; SSE2-NEXT: xorpd %xmm7, %xmm0
+; SSE2-NEXT: psubq %xmm7, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: psrlq %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; SSE2-NEXT: psrlq %xmm5, %xmm2
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psrlq %xmm4, %xmm3
+; SSE2-NEXT: psrlq %xmm5, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
+; SSE2-NEXT: xorpd %xmm2, %xmm1
+; SSE2-NEXT: psubq %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: combine_vec4i64_ashr_clamped:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: pxor %xmm7, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259519,9223372039002259519]
+; SSE41-NEXT: movdqa %xmm8, %xmm6
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483711,2147483711,2147483711,2147483711]
+; SSE41-NEXT: movdqa %xmm5, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: movapd {{.*#+}} xmm9 = [63,63]
+; SSE41-NEXT: movapd %xmm9, %xmm6
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6
+; SSE41-NEXT: pxor %xmm2, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm7, %xmm8
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
+; SSE41-NEXT: pand %xmm8, %xmm5
+; SSE41-NEXT: movdqa %xmm5, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrlq %xmm9, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3]
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: psrlq %xmm3, %xmm5
+; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm4, %xmm2
+; SSE41-NEXT: psrlq %xmm9, %xmm2
+; SSE41-NEXT: psrlq %xmm3, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT: pxor %xmm5, %xmm4
+; SSE41-NEXT: psubq %xmm5, %xmm4
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrlq %xmm6, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3]
+; SSE41-NEXT: psrlq %xmm3, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlq %xmm6, %xmm2
+; SSE41-NEXT: psrlq %xmm3, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pxor %xmm0, %xmm1
+; SSE41-NEXT: psubq %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX2-LABEL: combine_vec4i64_ashr_clamped:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775870,9223372036854775870,9223372036854775870,9223372036854775870]
+; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [63,63,63,63]
+; AVX2-NEXT: vblendvpd %ymm3, %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2
+; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: combine_vec4i64_ashr_clamped:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsravq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
+  %1 = tail call <4 x i64> @llvm.umin.v4i64(<4 x i64> %y, <4 x i64> <i64 63, i64 63, i64 63, i64 63>)
+  %2 = ashr <4 x i64> %x, %1
+  ret <4 x i64> %2
+}
