Skip to content

Commit cc39c3b

Browse files
committed
[Codegen][LegalizeIntegerTypes] New legalization strategy for scalar shifts: shift through stack
https://reviews.llvm.org/D140493 is going to teach SROA how to promote allocas that have variably-indexed loads. That does bring up questions of cost model, since that requires creating wide shifts. Indeed, our legalization for them is not optimal. We either split it into parts, or lower it into a libcall. But if the shift amount is by a multiple of CHAR_BIT, we can also legalize it throught stack. The basic idea is very simple: 1. Get a stack slot 2x the width of the shift type 2. store the value we are shifting into one half of the slot 3. pad the other half of the slot. for logical shifts, with zero, for arithmetic shift with signbit 4. index into the slot (starting from the base half into which we spilled, either upwards or downwards) 5. load 6. split loaded integer This works for both little-endian and big-endian machines: https://alive2.llvm.org/ce/z/YNVwd5 And better yet, if the original shift amount was not a multiple of CHAR_BIT, we can just shift by that remainder afterwards: https://alive2.llvm.org/ce/z/pz5G-K I think, if we are going perform shift->shift-by-parts expansion more than once, we should instead go through stack, which is what this patch does. Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D140638
1 parent 6996162 commit cc39c3b

31 files changed

+11226
-43023
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -915,11 +915,19 @@ class TargetLoweringBase {
915915
return RepRegClassCostForVT[VT.SimpleTy];
916916
}
917917

918-
/// Return true if SHIFT instructions should be expanded to SHIFT_PARTS
919-
/// instructions, and false if a library call is preferred (e.g for code-size
920-
/// reasons).
921-
virtual bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const {
922-
return true;
918+
/// Return the preferred strategy to legalize tihs SHIFT instruction, with
919+
/// \p ExpansionFactor being the recursion depth - how many expansion needed.
920+
enum class ShiftLegalizationStrategy {
921+
ExpandToParts,
922+
ExpandThroughStack,
923+
LowerToLibcall
924+
};
925+
virtual ShiftLegalizationStrategy
926+
preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N,
927+
unsigned ExpansionFactor) const {
928+
if (ExpansionFactor == 1)
929+
return ShiftLegalizationStrategy::ExpandToParts;
930+
return ShiftLegalizationStrategy::ExpandThroughStack;
923931
}
924932

925933
/// Return true if the target has native support for the specified value type.

llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp

Lines changed: 123 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4140,6 +4140,111 @@ void DAGTypeLegalizer::ExpandIntRes_SDIV(SDNode *N,
41404140
SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first, Lo, Hi);
41414141
}
41424142

4143+
void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
4144+
SDValue &Hi) {
4145+
SDLoc dl(N);
4146+
SDValue Shiftee = N->getOperand(0);
4147+
EVT VT = Shiftee.getValueType();
4148+
SDValue ShAmt = N->getOperand(1);
4149+
EVT ShAmtVT = ShAmt.getValueType();
4150+
4151+
// This legalization is optimal when the shift is by a multiple of byte width,
4152+
// %x * 8 <-> %x << 3 so 3 low bits should be be known zero.
4153+
bool ShiftByByteMultiple =
4154+
DAG.computeKnownBits(ShAmt).countMinTrailingZeros() >= 3;
4155+
4156+
// If we can't do it as one step, we'll have two uses of shift amount,
4157+
// and thus must freeze it.
4158+
if (!ShiftByByteMultiple)
4159+
ShAmt = DAG.getFreeze(ShAmt);
4160+
4161+
unsigned VTBitWidth = VT.getScalarSizeInBits();
4162+
assert(VTBitWidth % 8 == 0 && "Shifting a not byte multiple value?");
4163+
unsigned VTByteWidth = VTBitWidth / 8;
4164+
assert(isPowerOf2_32(VTByteWidth) &&
4165+
"Shiftee type size is not a power of two!");
4166+
unsigned StackSlotByteWidth = 2 * VTByteWidth;
4167+
unsigned StackSlotBitWidth = 8 * StackSlotByteWidth;
4168+
EVT StackSlotVT = EVT::getIntegerVT(*DAG.getContext(), StackSlotBitWidth);
4169+
4170+
// Get a temporary stack slot 2x the width of our VT.
4171+
// FIXME: reuse stack slots?
4172+
// FIXME: should we be more picky about alignment?
4173+
Align StackSlotAlignment(1);
4174+
SDValue StackPtr = DAG.CreateStackTemporary(
4175+
TypeSize::getFixed(StackSlotByteWidth), StackSlotAlignment);
4176+
EVT PtrTy = StackPtr.getValueType();
4177+
SDValue Ch = DAG.getEntryNode();
4178+
4179+
MachinePointerInfo StackPtrInfo = MachinePointerInfo::getFixedStack(
4180+
DAG.getMachineFunction(),
4181+
cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex());
4182+
4183+
// Extend the value, that is being shifted, to the entire stack slot's width.
4184+
SDValue Init;
4185+
if (N->getOpcode() != ISD::SHL) {
4186+
unsigned WideningOpc =
4187+
N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4188+
Init = DAG.getNode(WideningOpc, dl, StackSlotVT, Shiftee);
4189+
} else {
4190+
// For left-shifts, pad the Shiftee's LSB with zeros to twice it's width.
4191+
SDValue AllZeros = DAG.getConstant(0, dl, VT);
4192+
Init = DAG.getNode(ISD::BUILD_PAIR, dl, StackSlotVT, AllZeros, Shiftee);
4193+
}
4194+
// And spill it into the stack slot.
4195+
Ch = DAG.getStore(Ch, dl, Init, StackPtr, StackPtrInfo, StackSlotAlignment);
4196+
4197+
// Now, compute the full-byte offset into stack slot from where we can load.
4198+
// We have shift amount, which is in bits, but in multiples of byte.
4199+
// So just divide by CHAR_BIT.
4200+
SDNodeFlags Flags;
4201+
if (ShiftByByteMultiple)
4202+
Flags.setExact(true);
4203+
SDValue ByteOffset = DAG.getNode(ISD::SRL, dl, ShAmtVT, ShAmt,
4204+
DAG.getConstant(3, dl, ShAmtVT), Flags);
4205+
// And clamp it, because OOB load is an immediate UB,
4206+
// while shift overflow would have *just* been poison.
4207+
ByteOffset = DAG.getNode(ISD::AND, dl, ShAmtVT, ByteOffset,
4208+
DAG.getConstant(VTByteWidth - 1, dl, ShAmtVT));
4209+
// We have exactly two strategies on indexing into stack slot here:
4210+
// 1. upwards starting from the beginning of the slot
4211+
// 2. downwards starting from the middle of the slot
4212+
// On little-endian machine, we pick 1. for right shifts and 2. for left-shift
4213+
// and vice versa on big-endian machine.
4214+
bool WillIndexUpwards = N->getOpcode() != ISD::SHL;
4215+
if (DAG.getDataLayout().isBigEndian())
4216+
WillIndexUpwards = !WillIndexUpwards;
4217+
4218+
SDValue AdjStackPtr;
4219+
if (WillIndexUpwards) {
4220+
AdjStackPtr = StackPtr;
4221+
} else {
4222+
AdjStackPtr = DAG.getMemBasePlusOffset(
4223+
StackPtr, DAG.getConstant(VTByteWidth, dl, PtrTy), dl);
4224+
ByteOffset = DAG.getNegative(ByteOffset, dl, ShAmtVT);
4225+
}
4226+
4227+
// Get the pointer somewhere into the stack slot from which we need to load.
4228+
ByteOffset = DAG.getSExtOrTrunc(ByteOffset, dl, PtrTy);
4229+
AdjStackPtr = DAG.getMemBasePlusOffset(AdjStackPtr, ByteOffset, dl);
4230+
4231+
// And load it! While the load is not legal, legalizing it is obvious.
4232+
SDValue Res = DAG.getLoad(
4233+
VT, dl, Ch, AdjStackPtr,
4234+
MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), Align(1));
4235+
// We've performed the shift by a CHAR_BIT * [_ShAmt / CHAR_BIT_]
4236+
4237+
// If we may still have a less-than-CHAR_BIT to shift by, do so now.
4238+
if (!ShiftByByteMultiple) {
4239+
SDValue ShAmtRem = DAG.getNode(ISD::AND, dl, ShAmtVT, ShAmt,
4240+
DAG.getConstant(7, dl, ShAmtVT));
4241+
Res = DAG.getNode(N->getOpcode(), dl, VT, Res, ShAmtRem);
4242+
}
4243+
4244+
// Finally, split the computed value.
4245+
SplitInteger(Res, Lo, Hi);
4246+
}
4247+
41434248
void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N,
41444249
SDValue &Lo, SDValue &Hi) {
41454250
EVT VT = N->getValueType(0);
@@ -4175,7 +4280,24 @@ void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N,
41754280
(Action == TargetLowering::Legal && TLI.isTypeLegal(NVT)) ||
41764281
Action == TargetLowering::Custom;
41774282

4178-
if (LegalOrCustom && TLI.shouldExpandShift(DAG, N)) {
4283+
unsigned ExpansionFactor = 1;
4284+
// That VT->NVT expansion is one step. But will we re-expand NVT?
4285+
for (EVT TmpVT = NVT;;) {
4286+
EVT NewTMPVT = TLI.getTypeToTransformTo(*DAG.getContext(), TmpVT);
4287+
if (NewTMPVT == TmpVT)
4288+
break;
4289+
TmpVT = NewTMPVT;
4290+
++ExpansionFactor;
4291+
}
4292+
4293+
TargetLowering::ShiftLegalizationStrategy S =
4294+
TLI.preferredShiftLegalizationStrategy(DAG, N, ExpansionFactor);
4295+
4296+
if (S == TargetLowering::ShiftLegalizationStrategy::ExpandThroughStack)
4297+
return ExpandIntRes_ShiftThroughStack(N, Lo, Hi);
4298+
4299+
if (LegalOrCustom &&
4300+
S != TargetLowering::ShiftLegalizationStrategy::LowerToLibcall) {
41794301
// Expand the subcomponents.
41804302
SDValue LHSL, LHSH;
41814303
GetExpandedInteger(N->getOperand(0), LHSL, LHSH);

llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
457457
void ExpandIntRes_SREM (SDNode *N, SDValue &Lo, SDValue &Hi);
458458
void ExpandIntRes_UDIV (SDNode *N, SDValue &Lo, SDValue &Hi);
459459
void ExpandIntRes_UREM (SDNode *N, SDValue &Lo, SDValue &Hi);
460+
void ExpandIntRes_ShiftThroughStack (SDNode *N, SDValue &Lo, SDValue &Hi);
460461
void ExpandIntRes_Shift (SDNode *N, SDValue &Lo, SDValue &Hi);
461462

462463
void ExpandIntRes_MINMAX (SDNode *N, SDValue &Lo, SDValue &Hi);

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22545,12 +22545,14 @@ bool AArch64TargetLowering::
2254522545
return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
2254622546
}
2254722547

22548-
bool AArch64TargetLowering::shouldExpandShift(SelectionDAG &DAG,
22549-
SDNode *N) const {
22548+
TargetLowering::ShiftLegalizationStrategy
22549+
AArch64TargetLowering::preferredShiftLegalizationStrategy(
22550+
SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
2255022551
if (DAG.getMachineFunction().getFunction().hasMinSize() &&
2255122552
!Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
22552-
return false;
22553-
return true;
22553+
return ShiftLegalizationStrategy::LowerToLibcall;
22554+
return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
22555+
ExpansionFactor);
2255422556
}
2255522557

2255622558
void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -799,7 +799,9 @@ class AArch64TargetLowering : public TargetLowering {
799799
unsigned OldShiftOpcode, unsigned NewShiftOpcode,
800800
SelectionDAG &DAG) const override;
801801

802-
bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override;
802+
ShiftLegalizationStrategy
803+
preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N,
804+
unsigned ExpansionFactor) const override;
803805

804806
bool shouldTransformSignedTruncationCheck(EVT XVT,
805807
unsigned KeptBits) const override {

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21233,8 +21233,13 @@ bool ARMTargetLowering::isMaskAndCmp0FoldingBeneficial(
2123321233
: ARM_AM::getSOImmVal(MaskVal)) != -1;
2123421234
}
2123521235

21236-
bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const {
21237-
return !Subtarget->hasMinSize() || Subtarget->isTargetWindows();
21236+
TargetLowering::ShiftLegalizationStrategy
21237+
ARMTargetLowering::preferredShiftLegalizationStrategy(
21238+
SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
21239+
if (Subtarget->hasMinSize() && !Subtarget->isTargetWindows())
21240+
return ShiftLegalizationStrategy::LowerToLibcall;
21241+
return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
21242+
ExpansionFactor);
2123821243
}
2123921244

2124021245
Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy,

llvm/lib/Target/ARM/ARMISelLowering.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -697,7 +697,9 @@ class VectorType;
697697
return HasStandaloneRem;
698698
}
699699

700-
bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override;
700+
ShiftLegalizationStrategy
701+
preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N,
702+
unsigned ExpansionFactor) const override;
701703

702704
CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const;
703705
CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const;

llvm/lib/Target/AVR/AVRISelLowering.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,12 @@ class AVRTargetLowering : public TargetLowering {
147147
return false;
148148
}
149149

150+
ShiftLegalizationStrategy
151+
preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N,
152+
unsigned ExpansionFactor) const override {
153+
return ShiftLegalizationStrategy::LowerToLibcall;
154+
}
155+
150156
private:
151157
SDValue getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AVRcc,
152158
SelectionDAG &DAG, SDLoc dl) const;

llvm/lib/Target/RISCV/RISCVISelLowering.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -483,11 +483,15 @@ class RISCVTargetLowering : public TargetLowering {
483483
return ISD::SIGN_EXTEND;
484484
}
485485

486-
bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override {
486+
TargetLowering::ShiftLegalizationStrategy
487+
preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N,
488+
unsigned ExpansionFactor) const override {
487489
if (DAG.getMachineFunction().getFunction().hasMinSize())
488-
return false;
489-
return true;
490+
return ShiftLegalizationStrategy::LowerToLibcall;
491+
return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
492+
ExpansionFactor);
490493
}
494+
491495
bool isDesirableToCommuteWithShift(const SDNode *N,
492496
CombineLevel Level) const override;
493497

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6045,12 +6045,14 @@ bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
60456045
return true;
60466046
}
60476047

6048-
bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,
6049-
SDNode *N) const {
6048+
TargetLowering::ShiftLegalizationStrategy
6049+
X86TargetLowering::preferredShiftLegalizationStrategy(
6050+
SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
60506051
if (DAG.getMachineFunction().getFunction().hasMinSize() &&
60516052
!Subtarget.isOSWindows())
6052-
return false;
6053-
return true;
6053+
return ShiftLegalizationStrategy::LowerToLibcall;
6054+
return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
6055+
ExpansionFactor);
60546056
}
60556057

60566058
bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {

llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1114,7 +1114,9 @@ namespace llvm {
11141114
return VTIsOk(XVT) && VTIsOk(KeptBitsVT);
11151115
}
11161116

1117-
bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override;
1117+
ShiftLegalizationStrategy
1118+
preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N,
1119+
unsigned ExpansionFactor) const override;
11181120

11191121
bool shouldSplatInsEltVarIndex(EVT VT) const override;
11201122

0 commit comments

Comments
 (0)