Skip to content

Commit 95d584c

Browse files
authored
[InstCombine] Convert or concat to fshl if opposite or concat exists (#68502)
If two 'or' instructions concatenate the same variables in opposite order and the first 'or' dominates the second one, the second 'or' can be rewritten as an fshl that rotates the result of the first 'or'. This eliminates a shl and exposes more optimization opportunities for bswap/bitreverse.
1 parent 3494c55 commit 95d584c

File tree

2 files changed

+205
-2
lines changed

2 files changed

+205
-2
lines changed

llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp

Lines changed: 61 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2699,7 +2699,8 @@ Instruction *InstCombinerImpl::matchBSwapOrBitReverse(Instruction &I,
26992699
}
27002700

27012701
/// Match UB-safe variants of the funnel shift intrinsic.
2702-
static Instruction *matchFunnelShift(Instruction &Or, InstCombinerImpl &IC) {
2702+
static Instruction *matchFunnelShift(Instruction &Or, InstCombinerImpl &IC,
2703+
const DominatorTree &DT) {
27032704
// TODO: Can we reduce the code duplication between this and the related
27042705
// rotate matching code under visitSelect and visitTrunc?
27052706
unsigned Width = Or.getType()->getScalarSizeInBits();
@@ -2804,6 +2805,64 @@ static Instruction *matchFunnelShift(Instruction &Or, InstCombinerImpl &IC) {
28042805
return nullptr;
28052806

28062807
FShiftArgs = {ShVal0, ShVal1, ShAmt};
2808+
} else if (isa<ZExtInst>(Or0) || isa<ZExtInst>(Or1)) {
2809+
// If there are two 'or' instructions that concat variables in opposite order:
2810+
//
2811+
// Slot1 and Slot2 are all zero bits.
2812+
// | Slot1 | Low | Slot2 | High |
2813+
// LowHigh = or (shl (zext Low), ZextLowShlAmt), (zext High)
2814+
// | Slot2 | High | Slot1 | Low |
2815+
// HighLow = or (shl (zext High), ZextHighShlAmt), (zext Low)
2816+
//
2817+
// the latter 'or' can be safely converted to
2818+
// -> HighLow = fshl LowHigh, LowHigh, ZextHighShlAmt
2819+
// if ZextLowShlAmt + ZextHighShlAmt == Width.
2820+
if (!isa<ZExtInst>(Or1))
2821+
std::swap(Or0, Or1);
2822+
2823+
Value *High, *ZextHigh, *Low;
2824+
const APInt *ZextHighShlAmt;
2825+
if (!match(Or0,
2826+
m_OneUse(m_Shl(m_Value(ZextHigh), m_APInt(ZextHighShlAmt)))))
2827+
return nullptr;
2828+
2829+
if (!match(Or1, m_ZExt(m_Value(Low))) ||
2830+
!match(ZextHigh, m_ZExt(m_Value(High))))
2831+
return nullptr;
2832+
2833+
unsigned HighSize = High->getType()->getScalarSizeInBits();
2834+
unsigned LowSize = Low->getType()->getScalarSizeInBits();
2835+
// Make sure High does not overlap with Low and most significant bits of
2836+
// High aren't shifted out.
2837+
if (ZextHighShlAmt->ult(LowSize) || ZextHighShlAmt->ugt(Width - HighSize))
2838+
return nullptr;
2839+
2840+
for (User *U : ZextHigh->users()) {
2841+
Value *X, *Y;
2842+
if (!match(U, m_Or(m_Value(X), m_Value(Y))))
2843+
continue;
2844+
2845+
if (!isa<ZExtInst>(Y))
2846+
std::swap(X, Y);
2847+
2848+
const APInt *ZextLowShlAmt;
2849+
if (!match(X, m_Shl(m_Specific(Or1), m_APInt(ZextLowShlAmt))) ||
2850+
!match(Y, m_Specific(ZextHigh)) || !DT.dominates(U, &Or))
2851+
continue;
2852+
2853+
// HighLow is a good concat. If the two shift amounts sum to Width,
2854+
// LowHigh must also be a good concat.
2855+
if (*ZextLowShlAmt + *ZextHighShlAmt != Width)
2856+
continue;
2857+
2858+
// Low must not overlap with High and most significant bits of Low must
2859+
// not be shifted out.
2860+
assert(ZextLowShlAmt->uge(HighSize) &&
2861+
ZextLowShlAmt->ule(Width - LowSize) && "Invalid concat");
2862+
2863+
FShiftArgs = {U, U, ConstantInt::get(Or0->getType(), *ZextHighShlAmt)};
2864+
break;
2865+
}
28072866
}
28082867

28092868
if (FShiftArgs.empty())
@@ -3305,7 +3364,7 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
33053364
/*MatchBitReversals*/ true))
33063365
return BitOp;
33073366

3308-
if (Instruction *Funnel = matchFunnelShift(I, *this))
3367+
if (Instruction *Funnel = matchFunnelShift(I, *this, DT))
33093368
return Funnel;
33103369

33113370
if (Instruction *Concat = matchOrConcat(I, Builder))

llvm/test/Transforms/InstCombine/funnel.ll

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -354,6 +354,150 @@ define <2 x i64> @fshl_select_vector(<2 x i64> %x, <2 x i64> %y, <2 x i64> %sham
354354
ret <2 x i64> %r
355355
}
356356

357+
; Convert 'or concat' to fshl if opposite 'or concat' exists.
358+
359+
define i32 @fshl_concat_i8_i24(i8 %x, i24 %y, ptr %addr) {
360+
; CHECK-LABEL: @fshl_concat_i8_i24(
361+
; CHECK-NEXT: [[ZEXT_X:%.*]] = zext i8 [[X:%.*]] to i32
362+
; CHECK-NEXT: [[SLX:%.*]] = shl nuw i32 [[ZEXT_X]], 24
363+
; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i24 [[Y:%.*]] to i32
364+
; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], [[ZEXT_Y]]
365+
; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4
366+
; CHECK-NEXT: [[YX:%.*]] = call i32 @llvm.fshl.i32(i32 [[XY]], i32 [[XY]], i32 8)
367+
; CHECK-NEXT: ret i32 [[YX]]
368+
;
369+
%zext.x = zext i8 %x to i32
370+
%slx = shl i32 %zext.x, 24
371+
%zext.y = zext i24 %y to i32
372+
%xy = or i32 %zext.y, %slx
373+
store i32 %xy, ptr %addr, align 4
374+
%sly = shl i32 %zext.y, 8
375+
%yx = or i32 %zext.x, %sly
376+
ret i32 %yx
377+
}
378+
379+
define i32 @fshl_concat_i8_i8(i8 %x, i8 %y, ptr %addr) {
380+
; CHECK-LABEL: @fshl_concat_i8_i8(
381+
; CHECK-NEXT: [[ZEXT_X:%.*]] = zext i8 [[X:%.*]] to i32
382+
; CHECK-NEXT: [[SLX:%.*]] = shl nuw nsw i32 [[ZEXT_X]], 13
383+
; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i8 [[Y:%.*]] to i32
384+
; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], [[ZEXT_Y]]
385+
; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4
386+
; CHECK-NEXT: [[YX:%.*]] = call i32 @llvm.fshl.i32(i32 [[XY]], i32 [[XY]], i32 19)
387+
; CHECK-NEXT: ret i32 [[YX]]
388+
;
389+
%zext.x = zext i8 %x to i32
390+
%slx = shl i32 %zext.x, 13
391+
%zext.y = zext i8 %y to i32
392+
%xy = or i32 %zext.y, %slx
393+
store i32 %xy, ptr %addr, align 4
394+
%sly = shl i32 %zext.y, 19
395+
%yx = or i32 %zext.x, %sly
396+
ret i32 %yx
397+
}
398+
399+
define i32 @fshl_concat_i8_i8_overlap(i8 %x, i8 %y, ptr %addr) {
400+
; CHECK-LABEL: @fshl_concat_i8_i8_overlap(
401+
; CHECK-NEXT: [[ZEXT_X:%.*]] = zext i8 [[X:%.*]] to i32
402+
; CHECK-NEXT: [[SLX:%.*]] = shl i32 [[ZEXT_X]], 25
403+
; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i8 [[Y:%.*]] to i32
404+
; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], [[ZEXT_Y]]
405+
; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4
406+
; CHECK-NEXT: [[SLY:%.*]] = shl nuw nsw i32 [[ZEXT_Y]], 7
407+
; CHECK-NEXT: [[YX:%.*]] = or i32 [[SLY]], [[ZEXT_X]]
408+
; CHECK-NEXT: ret i32 [[YX]]
409+
;
410+
; Test sly overlap.
411+
%zext.x = zext i8 %x to i32
412+
%slx = shl i32 %zext.x, 25
413+
%zext.y = zext i8 %y to i32
414+
%xy = or i32 %zext.y, %slx
415+
store i32 %xy, ptr %addr, align 4
416+
%sly = shl i32 %zext.y, 7
417+
%yx = or i32 %zext.x, %sly
418+
ret i32 %yx
419+
}
420+
421+
define i32 @fshl_concat_i8_i8_drop(i8 %x, i8 %y, ptr %addr) {
422+
; CHECK-LABEL: @fshl_concat_i8_i8_drop(
423+
; CHECK-NEXT: [[ZEXT_X:%.*]] = zext i8 [[X:%.*]] to i32
424+
; CHECK-NEXT: [[SLX:%.*]] = shl nuw nsw i32 [[ZEXT_X]], 7
425+
; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i8 [[Y:%.*]] to i32
426+
; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], [[ZEXT_Y]]
427+
; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4
428+
; CHECK-NEXT: [[SLY:%.*]] = shl i32 [[ZEXT_Y]], 25
429+
; CHECK-NEXT: [[YX:%.*]] = or i32 [[SLY]], [[ZEXT_X]]
430+
; CHECK-NEXT: ret i32 [[YX]]
431+
;
432+
; Test sly drop.
433+
%zext.x = zext i8 %x to i32
434+
%slx = shl i32 %zext.x, 7
435+
%zext.y = zext i8 %y to i32
436+
%xy = or i32 %zext.y, %slx
437+
store i32 %xy, ptr %addr, align 4
438+
%sly = shl i32 %zext.y, 25
439+
%yx = or i32 %zext.x, %sly
440+
ret i32 %yx
441+
}
442+
443+
define i32 @fshl_concat_i8_i8_different_slot(i8 %x, i8 %y, ptr %addr) {
444+
; CHECK-LABEL: @fshl_concat_i8_i8_different_slot(
445+
; CHECK-NEXT: [[ZEXT_X:%.*]] = zext i8 [[X:%.*]] to i32
446+
; CHECK-NEXT: [[SLX:%.*]] = shl nuw nsw i32 [[ZEXT_X]], 9
447+
; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i8 [[Y:%.*]] to i32
448+
; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], [[ZEXT_Y]]
449+
; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4
450+
; CHECK-NEXT: [[SLY:%.*]] = shl nuw nsw i32 [[ZEXT_Y]], 22
451+
; CHECK-NEXT: [[YX:%.*]] = or i32 [[SLY]], [[ZEXT_X]]
452+
; CHECK-NEXT: ret i32 [[YX]]
453+
;
454+
%zext.x = zext i8 %x to i32
455+
%slx = shl i32 %zext.x, 9
456+
%zext.y = zext i8 %y to i32
457+
%xy = or i32 %zext.y, %slx
458+
store i32 %xy, ptr %addr, align 4
459+
%sly = shl i32 %zext.y, 22
460+
%yx = or i32 %zext.x, %sly
461+
ret i32 %yx
462+
}
463+
464+
define i32 @fshl_concat_unknown_source(i32 %zext.x, i32 %zext.y, ptr %addr) {
465+
; CHECK-LABEL: @fshl_concat_unknown_source(
466+
; CHECK-NEXT: [[SLX:%.*]] = shl i32 [[ZEXT_X:%.*]], 16
467+
; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], [[ZEXT_Y:%.*]]
468+
; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4
469+
; CHECK-NEXT: [[SLY:%.*]] = shl i32 [[ZEXT_Y]], 16
470+
; CHECK-NEXT: [[YX:%.*]] = or i32 [[SLY]], [[ZEXT_X]]
471+
; CHECK-NEXT: ret i32 [[YX]]
472+
;
473+
%slx = shl i32 %zext.x, 16
474+
%xy = or i32 %zext.y, %slx
475+
store i32 %xy, ptr %addr, align 4
476+
%sly = shl i32 %zext.y, 16
477+
%yx = or i32 %zext.x, %sly
478+
ret i32 %yx
479+
}
480+
481+
define <2 x i32> @fshl_concat_vector(<2 x i8> %x, <2 x i24> %y, ptr %addr) {
482+
; CHECK-LABEL: @fshl_concat_vector(
483+
; CHECK-NEXT: [[ZEXT_X:%.*]] = zext <2 x i8> [[X:%.*]] to <2 x i32>
484+
; CHECK-NEXT: [[SLX:%.*]] = shl nuw <2 x i32> [[ZEXT_X]], <i32 24, i32 24>
485+
; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext <2 x i24> [[Y:%.*]] to <2 x i32>
486+
; CHECK-NEXT: [[XY:%.*]] = or <2 x i32> [[SLX]], [[ZEXT_Y]]
487+
; CHECK-NEXT: store <2 x i32> [[XY]], ptr [[ADDR:%.*]], align 4
488+
; CHECK-NEXT: [[YX:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[XY]], <2 x i32> [[XY]], <2 x i32> <i32 8, i32 8>)
489+
; CHECK-NEXT: ret <2 x i32> [[YX]]
490+
;
491+
%zext.x = zext <2 x i8> %x to <2 x i32>
492+
%slx = shl <2 x i32> %zext.x, <i32 24, i32 24>
493+
%zext.y = zext <2 x i24> %y to <2 x i32>
494+
%xy = or <2 x i32> %slx, %zext.y
495+
store <2 x i32> %xy, ptr %addr, align 4
496+
%sly = shl <2 x i32> %zext.y, <i32 8, i32 8>
497+
%yx = or <2 x i32> %sly, %zext.x
498+
ret <2 x i32> %yx
499+
}
500+
357501
; Negative test - an oversized shift in the narrow type would produce the wrong value.
358502

359503
define i8 @unmasked_shlop_unmasked_shift_amount(i32 %x, i32 %y, i32 %shamt) {

0 commit comments

Comments
 (0)