[Codegen][LegalizeIntegerTypes] Improve shift through stack #96151
@llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-llvm-selectiondag

Author: None (futog)

Changes: Minor improvement on cc39c3b. If the target does not support unaligned memory access, use the native register alignment instead of byte alignment. The shift amount is also split based on the native alignment, so loads happen only at aligned addresses.

Patch is 257.30 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/96151.diff

4 Files Affected:
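To make the split concrete: the shift amount decomposes into an aligned byte offset into the stack slot plus a residual in-register shift. Below is a minimal standalone sketch of that decomposition, assuming a 32-bit shift unit; the names are illustrative and not taken from the patch itself.

```cpp
// Sketch of the shift-amount split, assuming ShiftUnitInBits = 32 (the
// native register width). Illustration only, not compiler code.
#include <cstdio>

int main() {
  const unsigned ShiftUnitInBits = 32;
  const unsigned Samples[] = {0, 3, 32, 37, 100};
  for (unsigned ShAmt : Samples) {
    // Byte offset into the stack slot, rounded down to a register boundary,
    // so the subsequent load is naturally aligned.
    unsigned ByteOffset = (ShAmt / ShiftUnitInBits) * (ShiftUnitInBits / 8);
    // Residual shift applied after the load; zero means the aligned load
    // alone performs the whole shift (a "one-step" shift).
    unsigned ShAmtRem = ShAmt % ShiftUnitInBits;
    std::printf("shamt=%3u -> byte offset %2u, residual %2u bits\n", ShAmt,
                ByteOffset, ShAmtRem);
  }
  return 0;
}
```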
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index a058b509b3aca..f21ed7581a5af 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -4530,14 +4530,25 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
SDValue ShAmt = N->getOperand(1);
EVT ShAmtVT = ShAmt.getValueType();
- // This legalization is optimal when the shift is by a multiple of byte width,
- // %x * 8 <-> %x << 3 so 3 low bits should be be known zero.
- bool ShiftByByteMultiple =
- DAG.computeKnownBits(ShAmt).countMinTrailingZeros() >= 3;
+ EVT LoadStoreVT = VT;
+ do {
+ LoadStoreVT = TLI.getTypeToTransformTo(*DAG.getContext(), LoadStoreVT);
+ } while (!TLI.isTypeLegal(LoadStoreVT));
+
+ const Align LoadStoreAlign = [&]() -> Align {
+ if (TLI.allowsMisalignedMemoryAccesses(LoadStoreVT))
+ return Align(1);
+
+ return DAG.getReducedAlign(LoadStoreVT, /*UseABI=*/false);
+ }();
+
+ const unsigned ShiftUnitInBits = LoadStoreAlign.value() * 8;
+ const bool IsOneStepShift =
+ DAG.computeKnownBits(ShAmt).countMinTrailingZeros() >= Log2_32(ShiftUnitInBits);
// If we can't do it as one step, we'll have two uses of shift amount,
// and thus must freeze it.
- if (!ShiftByByteMultiple)
+ if (!IsOneStepShift)
ShAmt = DAG.getFreeze(ShAmt);
unsigned VTBitWidth = VT.getScalarSizeInBits();
@@ -4551,8 +4562,7 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
// Get a temporary stack slot 2x the width of our VT.
// FIXME: reuse stack slots?
- // FIXME: should we be more picky about alignment?
- Align StackSlotAlignment(1);
+ Align StackSlotAlignment(LoadStoreAlign);
SDValue StackPtr = DAG.CreateStackTemporary(
TypeSize::getFixed(StackSlotByteWidth), StackSlotAlignment);
EVT PtrTy = StackPtr.getValueType();
@@ -4577,16 +4587,22 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
Ch = DAG.getStore(Ch, dl, Init, StackPtr, StackPtrInfo, StackSlotAlignment);
// Now, compute the full-byte offset into stack slot from where we can load.
- // We have shift amount, which is in bits, but in multiples of byte.
- // So just divide by CHAR_BIT.
+ // We have the shift amount in bits. The offset must point to an aligned
+ // address.
SDNodeFlags Flags;
- if (ShiftByByteMultiple)
+ if (IsOneStepShift)
Flags.setExact(true);
- SDValue ByteOffset = DAG.getNode(ISD::SRL, dl, ShAmtVT, ShAmt,
- DAG.getConstant(3, dl, ShAmtVT), Flags);
+ SDValue OffsetInBits = DAG.getNode(
+     ISD::SHL, dl, ShAmtVT,
+     DAG.getNode(ISD::SRL, dl, ShAmtVT, ShAmt,
+                 DAG.getConstant(Log2_32(ShiftUnitInBits), dl, ShAmtVT),
+                 Flags),
+     DAG.getConstant(Log2_32(ShiftUnitInBits), dl, ShAmtVT));
+ Flags.setExact(true);
+ SDValue Offset =
+     DAG.getNode(ISD::SRL, dl, ShAmtVT, OffsetInBits,
+                 DAG.getConstant(3, dl, ShAmtVT), Flags);
// And clamp it, because OOB load is an immediate UB,
// while shift overflow would have *just* been poison.
- ByteOffset = DAG.getNode(ISD::AND, dl, ShAmtVT, ByteOffset,
+ Offset = DAG.getNode(ISD::AND, dl, ShAmtVT, Offset,
DAG.getConstant(VTByteWidth - 1, dl, ShAmtVT));
// We have exactly two strategies on indexing into stack slot here:
// 1. upwards starting from the beginning of the slot
@@ -4603,23 +4619,23 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
} else {
AdjStackPtr = DAG.getMemBasePlusOffset(
StackPtr, DAG.getConstant(VTByteWidth, dl, PtrTy), dl);
- ByteOffset = DAG.getNegative(ByteOffset, dl, ShAmtVT);
+ Offset = DAG.getNegative(Offset, dl, ShAmtVT);
}
// Get the pointer somewhere into the stack slot from which we need to load.
- ByteOffset = DAG.getSExtOrTrunc(ByteOffset, dl, PtrTy);
- AdjStackPtr = DAG.getMemBasePlusOffset(AdjStackPtr, ByteOffset, dl);
+ Offset = DAG.getSExtOrTrunc(Offset, dl, PtrTy);
+ AdjStackPtr = DAG.getMemBasePlusOffset(AdjStackPtr, Offset, dl);
// And load it! While the load is not legal, legalizing it is obvious.
SDValue Res = DAG.getLoad(
VT, dl, Ch, AdjStackPtr,
- MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), Align(1));
- // We've performed the shift by a CHAR_BIT * [_ShAmt / CHAR_BIT_]
+ MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), LoadStoreAlign);
+ // We've performed the shift by ShiftUnitInBits * [ShAmt / ShiftUnitInBits]
- // If we may still have a less-than-CHAR_BIT to shift by, do so now.
- if (!ShiftByByteMultiple) {
+ // If we may still have remaining bits to shift by, do so now.
+ if (!IsOneStepShift) {
SDValue ShAmtRem = DAG.getNode(ISD::AND, dl, ShAmtVT, ShAmt,
- DAG.getConstant(7, dl, ShAmtVT));
+ DAG.getConstant(ShiftUnitInBits - 1, dl, ShAmtVT));
Res = DAG.getNode(N->getOpcode(), dl, VT, Res, ShAmtRem);
}
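To see what the legalized sequence now computes end to end, here is a hedged C++ model of a 128-bit logical right shift through a word-aligned stack slot, assuming a 32-bit shift unit and little-endian word order; it mirrors the shape of the generated code, not the compiler source itself.

```cpp
// Model of the post-patch legalization of an i128 lshr on a 32-bit target.
// Assumptions: x[0] is the least significant word, 32-bit shift unit.
#include <cstdint>
#include <cstring>

void lshr128_via_stack(const uint32_t x[4], unsigned shamt, uint32_t out[4]) {
  // 2x-wide, word-aligned stack slot: the value in the low half, zeros
  // above (an arithmetic shift would store sign words there instead).
  uint32_t slot[8] = {x[0], x[1], x[2], x[3], 0, 0, 0, 0};
  // Word-aligned byte offset, masked to stay inside the slot: an OOB load
  // would be immediate UB, while an oversized shift is merely poison.
  unsigned offset = ((shamt / 32) * 4) & 15;
  unsigned rem = shamt % 32; // residual sub-word shift
  uint32_t tmp[4];
  std::memcpy(tmp, reinterpret_cast<const char *>(slot) + offset,
              sizeof(tmp)); // every load here is word-aligned
  for (int i = 0; i < 4; ++i) {
    uint32_t hi = (i < 3) ? tmp[i + 1] : 0;
    // Guard rem == 0: shifting a 32-bit value by 32 is undefined in C++.
    out[i] = rem ? (tmp[i] >> rem) | (hi << (32 - rem)) : tmp[i];
  }
}
```

When shamt is a known multiple of 32, rem is zero and the final loop degenerates to a plain copy; that is the one-step case the patch detects up front via known bits.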
diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll
index f61cbfd3ed725..5ba8755201ddf 100644
--- a/llvm/test/CodeGen/RISCV/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/shifts.ll
@@ -157,106 +157,33 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: lw a4, 4(a1)
; RV32I-NEXT: lw a5, 8(a1)
; RV32I-NEXT: lw a1, 12(a1)
-; RV32I-NEXT: sb zero, 31(sp)
-; RV32I-NEXT: sb zero, 30(sp)
-; RV32I-NEXT: sb zero, 29(sp)
-; RV32I-NEXT: sb zero, 28(sp)
-; RV32I-NEXT: sb zero, 27(sp)
-; RV32I-NEXT: sb zero, 26(sp)
-; RV32I-NEXT: sb zero, 25(sp)
-; RV32I-NEXT: sb zero, 24(sp)
-; RV32I-NEXT: sb zero, 23(sp)
-; RV32I-NEXT: sb zero, 22(sp)
-; RV32I-NEXT: sb zero, 21(sp)
-; RV32I-NEXT: sb zero, 20(sp)
-; RV32I-NEXT: sb zero, 19(sp)
-; RV32I-NEXT: sb zero, 18(sp)
-; RV32I-NEXT: sb zero, 17(sp)
-; RV32I-NEXT: sb zero, 16(sp)
-; RV32I-NEXT: sb a1, 12(sp)
-; RV32I-NEXT: sb a5, 8(sp)
-; RV32I-NEXT: sb a4, 4(sp)
-; RV32I-NEXT: sb a3, 0(sp)
-; RV32I-NEXT: srli a6, a1, 24
-; RV32I-NEXT: sb a6, 15(sp)
-; RV32I-NEXT: srli a6, a1, 16
-; RV32I-NEXT: sb a6, 14(sp)
-; RV32I-NEXT: srli a1, a1, 8
-; RV32I-NEXT: sb a1, 13(sp)
-; RV32I-NEXT: srli a1, a5, 24
-; RV32I-NEXT: sb a1, 11(sp)
-; RV32I-NEXT: srli a1, a5, 16
-; RV32I-NEXT: sb a1, 10(sp)
-; RV32I-NEXT: srli a5, a5, 8
-; RV32I-NEXT: sb a5, 9(sp)
-; RV32I-NEXT: srli a1, a4, 24
-; RV32I-NEXT: sb a1, 7(sp)
-; RV32I-NEXT: srli a1, a4, 16
-; RV32I-NEXT: sb a1, 6(sp)
-; RV32I-NEXT: srli a4, a4, 8
-; RV32I-NEXT: sb a4, 5(sp)
-; RV32I-NEXT: srli a1, a3, 24
-; RV32I-NEXT: sb a1, 3(sp)
-; RV32I-NEXT: srli a1, a3, 16
-; RV32I-NEXT: sb a1, 2(sp)
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 1(sp)
-; RV32I-NEXT: slli a1, a2, 25
-; RV32I-NEXT: srli a1, a1, 28
+; RV32I-NEXT: sw zero, 28(sp)
+; RV32I-NEXT: sw zero, 24(sp)
+; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw a1, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: srli a1, a2, 3
+; RV32I-NEXT: andi a1, a1, 12
; RV32I-NEXT: mv a3, sp
; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: lbu a3, 1(a1)
-; RV32I-NEXT: lbu a4, 0(a1)
-; RV32I-NEXT: lbu a5, 2(a1)
-; RV32I-NEXT: lbu a6, 3(a1)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: andi a2, a2, 7
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: lw a4, 4(a1)
; RV32I-NEXT: srl a3, a3, a2
-; RV32I-NEXT: lbu a4, 5(a1)
-; RV32I-NEXT: lbu a5, 4(a1)
-; RV32I-NEXT: lbu a6, 6(a1)
-; RV32I-NEXT: lbu a7, 7(a1)
-; RV32I-NEXT: slli a4, a4, 8
-; RV32I-NEXT: or a4, a4, a5
-; RV32I-NEXT: slli a6, a6, 16
-; RV32I-NEXT: slli a7, a7, 24
-; RV32I-NEXT: or a5, a7, a6
-; RV32I-NEXT: or a4, a5, a4
; RV32I-NEXT: slli a5, a4, 1
-; RV32I-NEXT: xori a6, a2, 31
+; RV32I-NEXT: andi a6, a2, 31
+; RV32I-NEXT: xori a6, a6, 31
+; RV32I-NEXT: lw a7, 8(a1)
; RV32I-NEXT: sll a5, a5, a6
; RV32I-NEXT: or a3, a3, a5
; RV32I-NEXT: srl a4, a4, a2
-; RV32I-NEXT: lbu a5, 9(a1)
-; RV32I-NEXT: lbu a7, 8(a1)
-; RV32I-NEXT: lbu t0, 10(a1)
-; RV32I-NEXT: lbu t1, 11(a1)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a7
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli t1, t1, 24
-; RV32I-NEXT: or a7, t1, t0
-; RV32I-NEXT: or a5, a7, a5
-; RV32I-NEXT: slli a7, a5, 1
-; RV32I-NEXT: not t0, a2
-; RV32I-NEXT: lbu t1, 13(a1)
-; RV32I-NEXT: sll a7, a7, t0
-; RV32I-NEXT: or a4, a4, a7
-; RV32I-NEXT: lbu a7, 12(a1)
-; RV32I-NEXT: slli t1, t1, 8
-; RV32I-NEXT: lbu t0, 14(a1)
-; RV32I-NEXT: lbu a1, 15(a1)
-; RV32I-NEXT: or a7, t1, a7
-; RV32I-NEXT: srl a5, a5, a2
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, t0
-; RV32I-NEXT: or a1, a1, a7
+; RV32I-NEXT: slli a5, a7, 1
+; RV32I-NEXT: lw a1, 12(a1)
+; RV32I-NEXT: sll a5, a5, a6
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: srl a5, a7, a2
; RV32I-NEXT: slli a7, a1, 1
; RV32I-NEXT: sll a6, a7, a6
; RV32I-NEXT: or a5, a5, a6
@@ -299,110 +226,34 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: lw a4, 8(a1)
; RV32I-NEXT: lw a5, 4(a1)
; RV32I-NEXT: lw a1, 0(a1)
-; RV32I-NEXT: sb a3, 12(sp)
-; RV32I-NEXT: sb a4, 8(sp)
-; RV32I-NEXT: sb a5, 4(sp)
-; RV32I-NEXT: sb a1, 0(sp)
-; RV32I-NEXT: srai a6, a3, 31
-; RV32I-NEXT: sb a6, 28(sp)
-; RV32I-NEXT: sb a6, 24(sp)
-; RV32I-NEXT: sb a6, 20(sp)
-; RV32I-NEXT: sb a6, 16(sp)
-; RV32I-NEXT: srli a7, a3, 24
-; RV32I-NEXT: sb a7, 15(sp)
-; RV32I-NEXT: srli a7, a3, 16
-; RV32I-NEXT: sb a7, 14(sp)
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 13(sp)
-; RV32I-NEXT: srli a3, a4, 24
-; RV32I-NEXT: sb a3, 11(sp)
-; RV32I-NEXT: srli a3, a4, 16
-; RV32I-NEXT: sb a3, 10(sp)
-; RV32I-NEXT: srli a4, a4, 8
-; RV32I-NEXT: sb a4, 9(sp)
-; RV32I-NEXT: srli a3, a5, 24
-; RV32I-NEXT: sb a3, 7(sp)
-; RV32I-NEXT: srli a3, a5, 16
-; RV32I-NEXT: sb a3, 6(sp)
-; RV32I-NEXT: srli a5, a5, 8
-; RV32I-NEXT: sb a5, 5(sp)
-; RV32I-NEXT: srli a3, a1, 24
-; RV32I-NEXT: sb a3, 3(sp)
-; RV32I-NEXT: srli a3, a1, 16
-; RV32I-NEXT: sb a3, 2(sp)
-; RV32I-NEXT: srli a1, a1, 8
-; RV32I-NEXT: sb a1, 1(sp)
-; RV32I-NEXT: srli a1, a6, 24
-; RV32I-NEXT: sb a1, 31(sp)
-; RV32I-NEXT: srli a3, a6, 16
-; RV32I-NEXT: sb a3, 30(sp)
-; RV32I-NEXT: srli a4, a6, 8
-; RV32I-NEXT: sb a4, 29(sp)
-; RV32I-NEXT: sb a1, 27(sp)
-; RV32I-NEXT: sb a3, 26(sp)
-; RV32I-NEXT: sb a4, 25(sp)
-; RV32I-NEXT: sb a1, 23(sp)
-; RV32I-NEXT: sb a3, 22(sp)
-; RV32I-NEXT: sb a4, 21(sp)
-; RV32I-NEXT: sb a1, 19(sp)
-; RV32I-NEXT: sb a3, 18(sp)
-; RV32I-NEXT: sb a4, 17(sp)
-; RV32I-NEXT: slli a1, a2, 25
-; RV32I-NEXT: srli a1, a1, 28
+; RV32I-NEXT: sw a3, 12(sp)
+; RV32I-NEXT: sw a4, 8(sp)
+; RV32I-NEXT: sw a5, 4(sp)
+; RV32I-NEXT: sw a1, 0(sp)
+; RV32I-NEXT: srai a3, a3, 31
+; RV32I-NEXT: sw a3, 28(sp)
+; RV32I-NEXT: sw a3, 24(sp)
+; RV32I-NEXT: sw a3, 20(sp)
+; RV32I-NEXT: sw a3, 16(sp)
+; RV32I-NEXT: srli a1, a2, 3
+; RV32I-NEXT: andi a1, a1, 12
; RV32I-NEXT: mv a3, sp
; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: lbu a3, 1(a1)
-; RV32I-NEXT: lbu a4, 0(a1)
-; RV32I-NEXT: lbu a5, 2(a1)
-; RV32I-NEXT: lbu a6, 3(a1)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: andi a2, a2, 7
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: lw a4, 4(a1)
; RV32I-NEXT: srl a3, a3, a2
-; RV32I-NEXT: lbu a4, 5(a1)
-; RV32I-NEXT: lbu a5, 4(a1)
-; RV32I-NEXT: lbu a6, 6(a1)
-; RV32I-NEXT: lbu a7, 7(a1)
-; RV32I-NEXT: slli a4, a4, 8
-; RV32I-NEXT: or a4, a4, a5
-; RV32I-NEXT: slli a6, a6, 16
-; RV32I-NEXT: slli a7, a7, 24
-; RV32I-NEXT: or a5, a7, a6
-; RV32I-NEXT: or a4, a5, a4
; RV32I-NEXT: slli a5, a4, 1
-; RV32I-NEXT: xori a6, a2, 31
+; RV32I-NEXT: andi a6, a2, 31
+; RV32I-NEXT: xori a6, a6, 31
+; RV32I-NEXT: lw a7, 8(a1)
; RV32I-NEXT: sll a5, a5, a6
; RV32I-NEXT: or a3, a3, a5
; RV32I-NEXT: srl a4, a4, a2
-; RV32I-NEXT: lbu a5, 9(a1)
-; RV32I-NEXT: lbu a7, 8(a1)
-; RV32I-NEXT: lbu t0, 10(a1)
-; RV32I-NEXT: lbu t1, 11(a1)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a7
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli t1, t1, 24
-; RV32I-NEXT: or a7, t1, t0
-; RV32I-NEXT: or a5, a7, a5
-; RV32I-NEXT: slli a7, a5, 1
-; RV32I-NEXT: not t0, a2
-; RV32I-NEXT: lbu t1, 13(a1)
-; RV32I-NEXT: sll a7, a7, t0
-; RV32I-NEXT: or a4, a4, a7
-; RV32I-NEXT: lbu a7, 12(a1)
-; RV32I-NEXT: slli t1, t1, 8
-; RV32I-NEXT: lbu t0, 14(a1)
-; RV32I-NEXT: lbu a1, 15(a1)
-; RV32I-NEXT: or a7, t1, a7
-; RV32I-NEXT: srl a5, a5, a2
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, t0
-; RV32I-NEXT: or a1, a1, a7
+; RV32I-NEXT: slli a5, a7, 1
+; RV32I-NEXT: lw a1, 12(a1)
+; RV32I-NEXT: sll a5, a5, a6
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: srl a5, a7, a2
; RV32I-NEXT: slli a7, a1, 1
; RV32I-NEXT: sll a6, a7, a6
; RV32I-NEXT: or a5, a5, a6
@@ -445,114 +296,41 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: lw a4, 4(a1)
; RV32I-NEXT: lw a5, 8(a1)
; RV32I-NEXT: lw a1, 12(a1)
-; RV32I-NEXT: sb zero, 15(sp)
-; RV32I-NEXT: sb zero, 14(sp)
-; RV32I-NEXT: sb zero, 13(sp)
-; RV32I-NEXT: sb zero, 12(sp)
-; RV32I-NEXT: sb zero, 11(sp)
-; RV32I-NEXT: sb zero, 10(sp)
-; RV32I-NEXT: sb zero, 9(sp)
-; RV32I-NEXT: sb zero, 8(sp)
-; RV32I-NEXT: sb zero, 7(sp)
-; RV32I-NEXT: sb zero, 6(sp)
-; RV32I-NEXT: sb zero, 5(sp)
-; RV32I-NEXT: sb zero, 4(sp)
-; RV32I-NEXT: sb zero, 3(sp)
-; RV32I-NEXT: sb zero, 2(sp)
-; RV32I-NEXT: sb zero, 1(sp)
-; RV32I-NEXT: sb zero, 0(sp)
-; RV32I-NEXT: sb a1, 28(sp)
-; RV32I-NEXT: sb a5, 24(sp)
-; RV32I-NEXT: sb a4, 20(sp)
-; RV32I-NEXT: sb a3, 16(sp)
-; RV32I-NEXT: srli a6, a1, 24
-; RV32I-NEXT: sb a6, 31(sp)
-; RV32I-NEXT: srli a6, a1, 16
-; RV32I-NEXT: sb a6, 30(sp)
-; RV32I-NEXT: srli a1, a1, 8
-; RV32I-NEXT: sb a1, 29(sp)
-; RV32I-NEXT: srli a1, a5, 24
-; RV32I-NEXT: sb a1, 27(sp)
-; RV32I-NEXT: srli a1, a5, 16
-; RV32I-NEXT: sb a1, 26(sp)
-; RV32I-NEXT: srli a5, a5, 8
-; RV32I-NEXT: sb a5, 25(sp)
-; RV32I-NEXT: srli a1, a4, 24
-; RV32I-NEXT: sb a1, 23(sp)
-; RV32I-NEXT: srli a1, a4, 16
-; RV32I-NEXT: sb a1, 22(sp)
-; RV32I-NEXT: srli a4, a4, 8
-; RV32I-NEXT: sb a4, 21(sp)
-; RV32I-NEXT: srli a1, a3, 24
-; RV32I-NEXT: sb a1, 19(sp)
-; RV32I-NEXT: srli a1, a3, 16
-; RV32I-NEXT: sb a1, 18(sp)
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 17(sp)
-; RV32I-NEXT: slli a1, a2, 25
-; RV32I-NEXT: srli a1, a1, 28
+; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: sw zero, 8(sp)
+; RV32I-NEXT: sw zero, 4(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw a1, 28(sp)
+; RV32I-NEXT: sw a5, 24(sp)
+; RV32I-NEXT: sw a4, 20(sp)
+; RV32I-NEXT: sw a3, 16(sp)
+; RV32I-NEXT: srli a1, a2, 3
+; RV32I-NEXT: andi a1, a1, 12
; RV32I-NEXT: addi a3, sp, 16
-; RV32I-NEXT: sub a1, a3, a1
-; RV32I-NEXT: lbu a3, 5(a1)
-; RV32I-NEXT: lbu a4, 4(a1)
-; RV32I-NEXT: lbu a5, 6(a1)
-; RV32I-NEXT: lbu a6, 7(a1)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: andi a2, a2, 7
-; RV32I-NEXT: sll a4, a3, a2
-; RV32I-NEXT: lbu a5, 1(a1)
-; RV32I-NEXT: lbu a6, 0(a1)
-; RV32I-NEXT: lbu a7, 2(a1)
-; RV32I-NEXT: lbu t0, 3(a1)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a6
-; RV32I-NEXT: slli a7, a7, 16
-; RV32I-NEXT: slli t0, t0, 24
-; RV32I-NEXT: or a6, t0, a7
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: srli a6, a5, 1
-; RV32I-NEXT: xori a7, a2, 31
+; RV32I-NEXT: sub a3, a3, a1
+; RV32I-NEXT: lw a1, 4(a3)
+; RV32I-NEXT: lw a4, 0(a3)
+; RV32I-NEXT: sll a5, a1, a2
+; RV32I-NEXT: srli a6, a4, 1
+; RV32I-NEXT: andi a7, a2, 31
+; RV32I-NEXT: lw t0, 8(a3)
+; RV32I-NEXT: xori a7, a7, 31
; RV32I-NEXT: srl a6, a6, a7
-; RV32I-NEXT: or a4, a4, a6
-; RV32I-NEXT: lbu a6, 9(a1)
-; RV32I-NEXT: lbu t0, 8(a1)
-; RV32I-NEXT: lbu t1, 10(a1)
-; RV32I-NEXT: lbu t2, 11(a1)
-; RV32I-NEXT: slli a6, a6, 8
-; RV32I-NEXT: or a6, a6, t0
-; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or t0, t2, t1
-; RV32I-NEXT: or a6, t0, a6
-; RV32I-NEXT: sll t0, a6, a2
-; RV32I-NEXT: srli a3, a3, 1
-; RV32I-NEXT: not t1, a2
-; RV32I-NEXT: srl a3, a3, t1
-; RV32I-NEXT: or a3, t0, a3
-; RV32I-NEXT: lbu t0, 13(a1)
-; RV32I-NEXT: lbu t1, 12(a1)
-; RV32I-NEXT: lbu t2, 14(a1)
-; RV32I-NEXT: lbu a1, 15(a1)
-; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: or t0, t0, t1
-; RV32I-NEXT: slli t2, t2, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, t2
-; RV32I-NEXT: or a1, a1, t0
-; RV32I-NEXT: sll a1, a1, a2
-; RV32I-NEXT: srli a6, a6, 1
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: sll a6, t0, a2
+; RV32I-NEXT: lw a3, 12(a3)
+; RV32I-NEXT: srli a1, a1, 1
+; RV32I-NEXT: srl a1, a1, a7
+; RV32I-NEXT: or a1, a6, a1
+; RV32I-NEXT: sll a3, a3, a2
+; RV32I-NEXT: srli a6, t0, 1
; RV32I-NEXT: srl a6, a6, a7
-; RV32I-NEXT: or a1, a1, a6
-; RV32I-NEXT: sll a2, a5, a2
+; RV32I-NEXT: or a3, a3, a6
+; RV32I-NEXT: sll a2, a4, a2
; RV32I-NEXT: sw a2, 0(a0)
-; RV32I-NEXT: sw a1, 12(a0)
-; RV32I-NEXT: sw a3, 8(a0)
-; RV32I-NEXT: sw a4, 4(a0)
+; RV32I-NEXT: sw a3, 12(a0)
+; RV32I-NEXT: sw a1, 8(a0)
+; RV32I-NEXT: sw a5, 4(a0)
; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
index b0d435368e92b..0b87bb05cfd63 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -723,98 +723,117 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: lshr_16bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -48
-; RV32I-NEXT: sw s0, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
; RV32I-NEXT: lbu a5, 2(a0)
; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu a7, 4(a0)
-; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s1, 13(a0)
-; RV32I-NEXT: lbu s2, 14(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: ...
[truncated]
Force-pushed from b34d1c3 to 90bdd43.
Force-pushed from 0dd7ef4 to 5794d2c.
I think the code change looks fine, but the AArch64 and RISCV cases look like regressions. Is there an explanation for those? Is the type transform loop excessively widening these now?
Review thread on llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll (outdated, resolved).
Review thread on llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll (resolved).
Hi @arsenm, do you have any more concerns with this PR?
Force-pushed from 7f3acd3 to 1d92d08.
Two review threads on llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll (resolved).
Force-pushed from f3af653 to 4506bf1.
✅ With the latest revision this PR passed the C/C++ code formatter.
Hi @arsenm, do you have any suggestions on how to proceed with this PR? I think every obvious additional instruction is explained now. Regards,
Still don't really get why the tests were using unaligned loads in the first place, or why this regresses them, but it shouldn't be directly related to this patch
@futog please can you rebase?
Force-pushed from fda38ef to f512d3b.
Hi @arsenm, could you please do the merge? I don't have the rights to do so. Thanks.
Force-pushed from 1ca6337 to 551b106.
Force-pushed from 2465f13 to ccd69a6.
Minor improvement on cc39c3b. Use an aligned stack slot to store the shifted value. Use the native register width as the shifting unit, so the load of the shift result is aligned. If the shift amount is a multiple of the native register width, there is no need to do a follow-up shift after the load. I added new tests for these cases.
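As a rough illustration of that one-step condition (the real code compares DAG.computeKnownBits(ShAmt).countMinTrailingZeros() against Log2_32(ShiftUnitInBits); the helper below is a hypothetical stand-in):

```cpp
// Hedged sketch of the "one-step shift" test: when the shift amount is
// known to be a multiple of the shift unit, the aligned load alone yields
// the final value and no follow-up shift is emitted. Hypothetical helper,
// not the LegalizeIntegerTypes.cpp code.
#include <cassert>

bool isOneStepShift(unsigned knownTrailingZeroBits, unsigned shiftUnitInBits) {
  unsigned log2Unit = 0; // floor(log2(shiftUnitInBits))
  while ((2u << log2Unit) <= shiftUnitInBits)
    ++log2Unit;
  return knownTrailingZeroBits >= log2Unit;
}

int main() {
  assert(isOneStepShift(5, 32));  // shamt a known multiple of 32: one step
  assert(!isOneStepShift(3, 32)); // only a byte multiple: residual needed
  assert(isOneStepShift(3, 8));   // byte-sized unit: the pre-patch behavior
  return 0;
}
```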
Force-pushed from ccd69a6 to 03c0f5c.
LLVM Buildbot has detected a new failure on a builder. Full details, including the relevant piece of the build log, are available at: https://lab.llvm.org/buildbot/#/builders/30/builds/6785