Skip to content

Commit 6d48496

Browse files
authored
[InstCombine] lshr (mul (X, 2^N + 1)), N -> add (X, lshr(X, N)) (#92907)
Alive2 Proofs: https://alive2.llvm.org/ce/z/eSinJY https://alive2.llvm.org/ce/z/vyKvde https://alive2.llvm.org/ce/z/dRFsfV
1 parent c6ce937 commit 6d48496

File tree

3 files changed

+318
-10
lines changed

3 files changed

+318
-10
lines changed

llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp

Lines changed: 43 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1457,13 +1457,24 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) {
14571457

14581458
const APInt *MulC;
14591459
if (match(Op0, m_NUWMul(m_Value(X), m_APInt(MulC)))) {
1460-
// Look for a "splat" mul pattern - it replicates bits across each half of
1461-
// a value, so a right shift is just a mask of the low bits:
1462-
// lshr i[2N] (mul nuw X, (2^N)+1), N --> and iN X, (2^N)-1
1463-
// TODO: Generalize to allow more than just half-width shifts?
1464-
if (BitWidth > 2 && ShAmtC * 2 == BitWidth && (*MulC - 1).isPowerOf2() &&
1465-
MulC->logBase2() == ShAmtC)
1466-
return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, *MulC - 2));
1460+
if (BitWidth > 2 && (*MulC - 1).isPowerOf2() &&
1461+
MulC->logBase2() == ShAmtC) {
1462+
// Look for a "splat" mul pattern - it replicates bits across each half
1463+
// of a value, so a right shift is just a mask of the low bits:
1464+
// lshr i[2N] (mul nuw X, (2^N)+1), N --> and iN X, (2^N)-1
1465+
if (ShAmtC * 2 == BitWidth)
1466+
return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, *MulC - 2));
1467+
1468+
// lshr (mul nuw X, (2^N)+1), N --> add nuw X, (lshr X, N)
1469+
if (Op0->hasOneUse()) {
1470+
auto *NewAdd = BinaryOperator::CreateNUWAdd(
1471+
X, Builder.CreateLShr(X, ConstantInt::get(Ty, ShAmtC), "",
1472+
I.isExact()));
1473+
NewAdd->setHasNoSignedWrap(
1474+
cast<OverflowingBinaryOperator>(Op0)->hasNoSignedWrap());
1475+
return NewAdd;
1476+
}
1477+
}
14671478

14681479
// The one-use check is not strictly necessary, but codegen may not be
14691480
// able to invert the transform and perf may suffer with an extra mul
@@ -1483,6 +1494,16 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) {
14831494
}
14841495
}
14851496

1497+
// lshr (mul nsw X, (2^N)+1), N --> add nsw X, (lshr X, N)
1498+
if (match(Op0, m_OneUse(m_NSWMul(m_Value(X), m_APInt(MulC))))) {
1499+
if (BitWidth > 2 && (*MulC - 1).isPowerOf2() &&
1500+
MulC->logBase2() == ShAmtC) {
1501+
return BinaryOperator::CreateNSWAdd(
1502+
X, Builder.CreateLShr(X, ConstantInt::get(Ty, ShAmtC), "",
1503+
I.isExact()));
1504+
}
1505+
}
1506+
14861507
// Try to narrow bswap.
14871508
// In the case where the shift amount equals the bitwidth difference, the
14881509
// shift is eliminated.
@@ -1686,6 +1707,21 @@ Instruction *InstCombinerImpl::visitAShr(BinaryOperator &I) {
16861707
if (match(Op0, m_OneUse(m_NSWSub(m_Value(X), m_Value(Y)))))
16871708
return new SExtInst(Builder.CreateICmpSLT(X, Y), Ty);
16881709
}
1710+
1711+
const APInt *MulC;
1712+
if (match(Op0, m_OneUse(m_NSWMul(m_Value(X), m_APInt(MulC)))) &&
1713+
(BitWidth > 2 && (*MulC - 1).isPowerOf2() &&
1714+
MulC->logBase2() == ShAmt &&
1715+
(ShAmt < BitWidth - 1))) /* Minus 1 for the sign bit */ {
1716+
1717+
// ashr (mul nsw X, (2^N)+1), N --> add nsw X, (ashr X, N)
1718+
auto *NewAdd = BinaryOperator::CreateNSWAdd(
1719+
X,
1720+
Builder.CreateAShr(X, ConstantInt::get(Ty, ShAmt), "", I.isExact()));
1721+
NewAdd->setHasNoUnsignedWrap(
1722+
cast<OverflowingBinaryOperator>(Op0)->hasNoUnsignedWrap());
1723+
return NewAdd;
1724+
}
16891725
}
16901726

16911727
const SimplifyQuery Q = SQ.getWithInstruction(&I);

llvm/test/Transforms/InstCombine/ashr-lshr.ll

Lines changed: 259 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -604,3 +604,262 @@ define <2 x i8> @ashr_known_pos_exact_vec(<2 x i8> %x, <2 x i8> %y) {
604604
%r = ashr exact <2 x i8> %p, %y
605605
ret <2 x i8> %r
606606
}
607+
608+
define i32 @lshr_mul_times_3_div_2(i32 %0) {
609+
; CHECK-LABEL: @lshr_mul_times_3_div_2(
610+
; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP0:%.*]], 1
611+
; CHECK-NEXT: [[LSHR:%.*]] = add nuw nsw i32 [[TMP2]], [[TMP0]]
612+
; CHECK-NEXT: ret i32 [[LSHR]]
613+
;
614+
%mul = mul nsw nuw i32 %0, 3
615+
%lshr = lshr i32 %mul, 1
616+
ret i32 %lshr
617+
}
618+
619+
define i32 @lshr_mul_times_3_div_2_exact(i32 %x) {
620+
; CHECK-LABEL: @lshr_mul_times_3_div_2_exact(
621+
; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i32 [[X:%.*]], 1
622+
; CHECK-NEXT: [[LSHR:%.*]] = add nsw i32 [[TMP1]], [[X]]
623+
; CHECK-NEXT: ret i32 [[LSHR]]
624+
;
625+
%mul = mul nsw i32 %x, 3
626+
%lshr = lshr exact i32 %mul, 1
627+
ret i32 %lshr
628+
}
629+
630+
; Negative test
631+
632+
define i32 @lshr_mul_times_3_div_2_no_flags(i32 %0) {
633+
; CHECK-LABEL: @lshr_mul_times_3_div_2_no_flags(
634+
; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[TMP0:%.*]], 3
635+
; CHECK-NEXT: [[LSHR:%.*]] = lshr i32 [[MUL]], 1
636+
; CHECK-NEXT: ret i32 [[LSHR]]
637+
;
638+
%mul = mul i32 %0, 3
639+
%lshr = lshr i32 %mul, 1
640+
ret i32 %lshr
641+
}
642+
643+
; Negative test
644+
645+
define i32 @mul_times_3_div_2_multiuse_lshr(i32 %x) {
646+
; CHECK-LABEL: @mul_times_3_div_2_multiuse_lshr(
647+
; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[X:%.*]], 3
648+
; CHECK-NEXT: [[RES:%.*]] = lshr i32 [[MUL]], 1
649+
; CHECK-NEXT: call void @use(i32 [[MUL]])
650+
; CHECK-NEXT: ret i32 [[RES]]
651+
;
652+
%mul = mul nuw i32 %x, 3
653+
%res = lshr i32 %mul, 1
654+
call void @use(i32 %mul)
655+
ret i32 %res
656+
}
657+
658+
define i32 @lshr_mul_times_3_div_2_exact_2(i32 %x) {
659+
; CHECK-LABEL: @lshr_mul_times_3_div_2_exact_2(
660+
; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i32 [[X:%.*]], 1
661+
; CHECK-NEXT: [[LSHR:%.*]] = add nuw i32 [[TMP1]], [[X]]
662+
; CHECK-NEXT: ret i32 [[LSHR]]
663+
;
664+
%mul = mul nuw i32 %x, 3
665+
%lshr = lshr exact i32 %mul, 1
666+
ret i32 %lshr
667+
}
668+
669+
define i32 @lshr_mul_times_5_div_4(i32 %0) {
670+
; CHECK-LABEL: @lshr_mul_times_5_div_4(
671+
; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP0:%.*]], 2
672+
; CHECK-NEXT: [[LSHR:%.*]] = add nuw nsw i32 [[TMP2]], [[TMP0]]
673+
; CHECK-NEXT: ret i32 [[LSHR]]
674+
;
675+
%mul = mul nsw nuw i32 %0, 5
676+
%lshr = lshr i32 %mul, 2
677+
ret i32 %lshr
678+
}
679+
680+
define i32 @lshr_mul_times_5_div_4_exact(i32 %x) {
681+
; CHECK-LABEL: @lshr_mul_times_5_div_4_exact(
682+
; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i32 [[X:%.*]], 2
683+
; CHECK-NEXT: [[LSHR:%.*]] = add nsw i32 [[TMP1]], [[X]]
684+
; CHECK-NEXT: ret i32 [[LSHR]]
685+
;
686+
%mul = mul nsw i32 %x, 5
687+
%lshr = lshr exact i32 %mul, 2
688+
ret i32 %lshr
689+
}
690+
691+
; Negative test
692+
693+
define i32 @lshr_mul_times_5_div_4_no_flags(i32 %0) {
694+
; CHECK-LABEL: @lshr_mul_times_5_div_4_no_flags(
695+
; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[TMP0:%.*]], 5
696+
; CHECK-NEXT: [[LSHR:%.*]] = lshr i32 [[MUL]], 2
697+
; CHECK-NEXT: ret i32 [[LSHR]]
698+
;
699+
%mul = mul i32 %0, 5
700+
%lshr = lshr i32 %mul, 2
701+
ret i32 %lshr
702+
}
703+
704+
; Negative test
705+
706+
define i32 @mul_times_5_div_4_multiuse_lshr(i32 %x) {
707+
; CHECK-LABEL: @mul_times_5_div_4_multiuse_lshr(
708+
; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[X:%.*]], 5
709+
; CHECK-NEXT: [[RES:%.*]] = lshr i32 [[MUL]], 2
710+
; CHECK-NEXT: call void @use(i32 [[MUL]])
711+
; CHECK-NEXT: ret i32 [[RES]]
712+
;
713+
%mul = mul nuw i32 %x, 5
714+
%res = lshr i32 %mul, 2
715+
call void @use(i32 %mul)
716+
ret i32 %res
717+
}
718+
719+
define i32 @lshr_mul_times_5_div_4_exact_2(i32 %x) {
720+
; CHECK-LABEL: @lshr_mul_times_5_div_4_exact_2(
721+
; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i32 [[X:%.*]], 2
722+
; CHECK-NEXT: [[LSHR:%.*]] = add nuw i32 [[TMP1]], [[X]]
723+
; CHECK-NEXT: ret i32 [[LSHR]]
724+
;
725+
%mul = mul nuw i32 %x, 5
726+
%lshr = lshr exact i32 %mul, 2
727+
ret i32 %lshr
728+
}
729+
730+
define i32 @ashr_mul_times_3_div_2(i32 %0) {
731+
; CHECK-LABEL: @ashr_mul_times_3_div_2(
732+
; CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[TMP0:%.*]], 1
733+
; CHECK-NEXT: [[ASHR:%.*]] = add nuw nsw i32 [[TMP2]], [[TMP0]]
734+
; CHECK-NEXT: ret i32 [[ASHR]]
735+
;
736+
%mul = mul nuw nsw i32 %0, 3
737+
%ashr = ashr i32 %mul, 1
738+
ret i32 %ashr
739+
}
740+
741+
define i32 @ashr_mul_times_3_div_2_exact(i32 %x) {
742+
; CHECK-LABEL: @ashr_mul_times_3_div_2_exact(
743+
; CHECK-NEXT: [[TMP1:%.*]] = ashr exact i32 [[X:%.*]], 1
744+
; CHECK-NEXT: [[ASHR:%.*]] = add nsw i32 [[TMP1]], [[X]]
745+
; CHECK-NEXT: ret i32 [[ASHR]]
746+
;
747+
%mul = mul nsw i32 %x, 3
748+
%ashr = ashr exact i32 %mul, 1
749+
ret i32 %ashr
750+
}
751+
752+
; Negative test
753+
754+
define i32 @ashr_mul_times_3_div_2_no_flags(i32 %0) {
755+
; CHECK-LABEL: @ashr_mul_times_3_div_2_no_flags(
756+
; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[TMP0:%.*]], 3
757+
; CHECK-NEXT: [[ASHR:%.*]] = ashr i32 [[MUL]], 1
758+
; CHECK-NEXT: ret i32 [[ASHR]]
759+
;
760+
%mul = mul i32 %0, 3
761+
%ashr = ashr i32 %mul, 1
762+
ret i32 %ashr
763+
}
764+
765+
; Negative test
766+
767+
define i32 @ashr_mul_times_3_div_2_no_nsw(i32 %0) {
768+
; CHECK-LABEL: @ashr_mul_times_3_div_2_no_nsw(
769+
; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[TMP0:%.*]], 3
770+
; CHECK-NEXT: [[ASHR:%.*]] = ashr i32 [[MUL]], 1
771+
; CHECK-NEXT: ret i32 [[ASHR]]
772+
;
773+
%mul = mul nuw i32 %0, 3
774+
%ashr = ashr i32 %mul, 1
775+
ret i32 %ashr
776+
}
777+
778+
; Negative test
779+
780+
define i32 @mul_times_3_div_2_multiuse_ashr(i32 %x) {
781+
; CHECK-LABEL: @mul_times_3_div_2_multiuse_ashr(
782+
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[X:%.*]], 3
783+
; CHECK-NEXT: [[RES:%.*]] = ashr i32 [[MUL]], 1
784+
; CHECK-NEXT: call void @use(i32 [[MUL]])
785+
; CHECK-NEXT: ret i32 [[RES]]
786+
;
787+
%mul = mul nsw i32 %x, 3
788+
%res = ashr i32 %mul, 1
789+
call void @use(i32 %mul)
790+
ret i32 %res
791+
}
792+
793+
define i32 @ashr_mul_times_3_div_2_exact_2(i32 %x) {
794+
; CHECK-LABEL: @ashr_mul_times_3_div_2_exact_2(
795+
; CHECK-NEXT: [[TMP1:%.*]] = ashr exact i32 [[X:%.*]], 1
796+
; CHECK-NEXT: [[ASHR:%.*]] = add nsw i32 [[TMP1]], [[X]]
797+
; CHECK-NEXT: ret i32 [[ASHR]]
798+
;
799+
%mul = mul nsw i32 %x, 3
800+
%ashr = ashr exact i32 %mul, 1
801+
ret i32 %ashr
802+
}
803+
804+
define i32 @ashr_mul_times_5_div_4(i32 %0) {
805+
; CHECK-LABEL: @ashr_mul_times_5_div_4(
806+
; CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[TMP0:%.*]], 2
807+
; CHECK-NEXT: [[ASHR:%.*]] = add nuw nsw i32 [[TMP2]], [[TMP0]]
808+
; CHECK-NEXT: ret i32 [[ASHR]]
809+
;
810+
%mul = mul nuw nsw i32 %0, 5
811+
%ashr = ashr i32 %mul, 2
812+
ret i32 %ashr
813+
}
814+
815+
define i32 @ashr_mul_times_5_div_4_exact(i32 %x) {
816+
; CHECK-LABEL: @ashr_mul_times_5_div_4_exact(
817+
; CHECK-NEXT: [[TMP1:%.*]] = ashr exact i32 [[X:%.*]], 2
818+
; CHECK-NEXT: [[ASHR:%.*]] = add nsw i32 [[TMP1]], [[X]]
819+
; CHECK-NEXT: ret i32 [[ASHR]]
820+
;
821+
%mul = mul nsw i32 %x, 5
822+
%ashr = ashr exact i32 %mul, 2
823+
ret i32 %ashr
824+
}
825+
826+
; Negative test
827+
828+
define i32 @ashr_mul_times_5_div_4_no_flags(i32 %0) {
829+
; CHECK-LABEL: @ashr_mul_times_5_div_4_no_flags(
830+
; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[TMP0:%.*]], 5
831+
; CHECK-NEXT: [[ASHR:%.*]] = ashr i32 [[MUL]], 2
832+
; CHECK-NEXT: ret i32 [[ASHR]]
833+
;
834+
%mul = mul i32 %0, 5
835+
%ashr = ashr i32 %mul, 2
836+
ret i32 %ashr
837+
}
838+
839+
; Negative test
840+
841+
define i32 @mul_times_5_div_4_multiuse_ashr(i32 %x) {
842+
; CHECK-LABEL: @mul_times_5_div_4_multiuse_ashr(
843+
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[X:%.*]], 5
844+
; CHECK-NEXT: [[RES:%.*]] = ashr i32 [[MUL]], 2
845+
; CHECK-NEXT: call void @use(i32 [[MUL]])
846+
; CHECK-NEXT: ret i32 [[RES]]
847+
;
848+
%mul = mul nsw i32 %x, 5
849+
%res = ashr i32 %mul, 2
850+
call void @use(i32 %mul)
851+
ret i32 %res
852+
}
853+
854+
define i32 @ashr_mul_times_5_div_4_exact_2(i32 %x) {
855+
; CHECK-LABEL: @ashr_mul_times_5_div_4_exact_2(
856+
; CHECK-NEXT: [[TMP1:%.*]] = ashr exact i32 [[X:%.*]], 2
857+
; CHECK-NEXT: [[ASHR:%.*]] = add nsw i32 [[TMP1]], [[X]]
858+
; CHECK-NEXT: ret i32 [[ASHR]]
859+
;
860+
%mul = mul nsw i32 %x, 5
861+
%ashr = ashr exact i32 %mul, 2
862+
ret i32 %ashr
863+
}
864+
865+
declare void @use(i32)

llvm/test/Transforms/InstCombine/lshr.ll

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -628,19 +628,32 @@ define i32 @mul_splat_fold_wrong_lshr_const(i32 %x) {
628628
ret i32 %t
629629
}
630630

631-
; Negative test
631+
; Negative test (but simplifies into a different transform)
632632

633633
define i32 @mul_splat_fold_no_nuw(i32 %x) {
634634
; CHECK-LABEL: @mul_splat_fold_no_nuw(
635-
; CHECK-NEXT: [[M:%.*]] = mul nsw i32 [[X:%.*]], 65537
636-
; CHECK-NEXT: [[T:%.*]] = lshr i32 [[M]], 16
635+
; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X:%.*]], 16
636+
; CHECK-NEXT: [[T:%.*]] = add nsw i32 [[TMP1]], [[X]]
637637
; CHECK-NEXT: ret i32 [[T]]
638638
;
639639
%m = mul nsw i32 %x, 65537
640640
%t = lshr i32 %m, 16
641641
ret i32 %t
642642
}
643643

644+
; Negative test
645+
646+
define i32 @mul_splat_fold_no_flags(i32 %x) {
647+
; CHECK-LABEL: @mul_splat_fold_no_flags(
648+
; CHECK-NEXT: [[M:%.*]] = mul i32 [[X:%.*]], 65537
649+
; CHECK-NEXT: [[T:%.*]] = lshr i32 [[M]], 16
650+
; CHECK-NEXT: ret i32 [[T]]
651+
;
652+
%m = mul i32 %x, 65537
653+
%t = lshr i32 %m, 16
654+
ret i32 %t
655+
}
656+
644657
; Negative test (but simplifies before we reach the mul_splat transform)- need more than 2 bits
645658

646659
define i2 @mul_splat_fold_too_narrow(i2 %x) {

0 commit comments

Comments
 (0)