
Patch series related to DAGCombiner::ReduceLoadOpStoreWidth #119140

Closed · wants to merge 5 commits
125 changes: 77 additions & 48 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -138,6 +138,11 @@ static cl::opt<bool> EnableReduceLoadOpStoreWidth(
"combiner-reduce-load-op-store-width", cl::Hidden, cl::init(true),
cl::desc("DAG combiner enable reducing the width of load/op/store "
"sequence"));
static cl::opt<bool> ReduceLoadOpStoreWidthForceNarrowingProfitable(
"combiner-reduce-load-op-store-width-force-narrowing-profitable",
cl::Hidden, cl::init(false),
cl::desc("DAG combiner force override the narrowing profitable check when"
"reducing the width of load/op/store sequences"));

static cl::opt<bool> EnableShrinkLoadReplaceStoreWithStore(
"combiner-shrink-load-replace-store-with-store", cl::Hidden, cl::init(true),
@@ -20334,74 +20339,98 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
ST->getPointerInfo().getAddrSpace())
return SDValue();

// Find the type to narrow it the load / op / store to.
// Find the type NewVT to narrow the load / op / store to.
SDValue N1 = Value.getOperand(1);
unsigned BitWidth = N1.getValueSizeInBits();
APInt Imm = N1->getAsAPIntVal();
if (Opc == ISD::AND)
Imm ^= APInt::getAllOnes(BitWidth);
if (Imm == 0 || Imm.isAllOnes())
return SDValue();
unsigned ShAmt = Imm.countr_zero();
unsigned MSB = BitWidth - Imm.countl_zero() - 1;
unsigned NewBW = NextPowerOf2(MSB - ShAmt);
// Find the least/most significant bits that need to be part of the narrowed
// operation. We assume the target will need to address/access full bytes, so
// we make sure to align LSB and MSB at byte boundaries.
unsigned BitsPerByteMask = 7u;
unsigned LSB = Imm.countr_zero() & ~BitsPerByteMask;
unsigned MSB = (Imm.getActiveBits() - 1) | BitsPerByteMask;
unsigned NewBW = NextPowerOf2(MSB - LSB);
EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
// The narrowing should be profitable, the load/store operation should be
// legal (or custom) and the store size should be equal to the NewVT width.
while (NewBW < BitWidth && (NewVT.getStoreSizeInBits() != NewBW ||
!TLI.isOperationLegalOrCustom(Opc, NewVT) ||
!TLI.isNarrowingProfitable(N, VT, NewVT))) {
while (NewBW < BitWidth &&
(NewVT.getStoreSizeInBits() != NewBW ||
!TLI.isOperationLegalOrCustom(Opc, NewVT) ||
!(TLI.isNarrowingProfitable(N, VT, NewVT) ||
ReduceLoadOpStoreWidthForceNarrowingProfitable))) {
NewBW = NextPowerOf2(NewBW);
NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
}
if (NewBW >= BitWidth)
return SDValue();

// If the lsb changed does not start at the type bitwidth boundary,
// start at the previous one.
if (ShAmt % NewBW)
ShAmt = (((ShAmt + NewBW - 1) / NewBW) * NewBW) - NewBW;
APInt Mask = APInt::getBitsSet(BitWidth, ShAmt,
std::min(BitWidth, ShAmt + NewBW));
if ((Imm & Mask) == Imm) {
APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW);
if (Opc == ISD::AND)
NewImm ^= APInt::getAllOnes(NewBW);
uint64_t PtrOff = ShAmt / 8;
// For big endian targets, we need to adjust the offset to the pointer to
// load the correct bytes.
if (DAG.getDataLayout().isBigEndian())
PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff;
// If we come this far, NewVT/NewBW reflect a power-of-2 sized type that is
// large enough to cover all bits that should be modified. This type might
// however be larger than really needed (such as i32 while we actually only
// need to modify one byte). Now we need to find out how to align the memory
// accesses to satisfy preferred alignments as well as avoid accessing
// memory outside the store size of the original access.

unsigned VTStoreSize = VT.getStoreSizeInBits().getFixedValue();

// Let ShAmt denote the number of bits to skip, counted from the least
// significant bits of Imm. And let PtrOff denote how much the pointer needs
// to be offset (in bytes) for the new access.
unsigned ShAmt = 0;
uint64_t PtrOff = 0;
for (; ShAmt + NewBW <= VTStoreSize; ShAmt += 8) {
// Make sure the range [ShAmt, ShAmt+NewBW) covers both LSB and MSB.
if (ShAmt > LSB)
return SDValue();
if (ShAmt + NewBW < MSB)
continue;

// Calculate PtrOff.
unsigned PtrAdjustmentInBits = DAG.getDataLayout().isBigEndian()
? VTStoreSize - NewBW - ShAmt
: ShAmt;
PtrOff = PtrAdjustmentInBits / 8;

// Now check if narrow access is allowed and fast, considering alignments.
unsigned IsFast = 0;
Align NewAlign = commonAlignment(LD->getAlign(), PtrOff);
if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), NewVT,
LD->getAddressSpace(), NewAlign,
LD->getMemOperand()->getFlags(), &IsFast) ||
!IsFast)
return SDValue();

SDValue NewPtr =
DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(PtrOff), SDLoc(LD));
SDValue NewLD =
DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr,
LD->getPointerInfo().getWithOffset(PtrOff), NewAlign,
LD->getMemOperand()->getFlags(), LD->getAAInfo());
SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
DAG.getConstant(NewImm, SDLoc(Value),
NewVT));
SDValue NewST =
DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr,
ST->getPointerInfo().getWithOffset(PtrOff), NewAlign);

AddToWorklist(NewPtr.getNode());
AddToWorklist(NewLD.getNode());
AddToWorklist(NewVal.getNode());
WorklistRemover DeadNodes(*this);
DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
++OpsNarrowed;
return NewST;
if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), NewVT,
LD->getAddressSpace(), NewAlign,
LD->getMemOperand()->getFlags(), &IsFast) &&
IsFast)
break;
}
// If the loop above did not find any accepted ShAmt, we need to exit here.
if (ShAmt + NewBW > VTStoreSize)
return SDValue();

APInt NewImm = Imm.lshr(ShAmt).trunc(NewBW);
if (Opc == ISD::AND)
NewImm ^= APInt::getAllOnes(NewBW);
Align NewAlign = commonAlignment(LD->getAlign(), PtrOff);
SDValue NewPtr =
DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(PtrOff), SDLoc(LD));
SDValue NewLD =
DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr,
LD->getPointerInfo().getWithOffset(PtrOff), NewAlign,
LD->getMemOperand()->getFlags(), LD->getAAInfo());
SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
DAG.getConstant(NewImm, SDLoc(Value), NewVT));
SDValue NewST =
DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr,
ST->getPointerInfo().getWithOffset(PtrOff), NewAlign);

AddToWorklist(NewPtr.getNode());
AddToWorklist(NewLD.getNode());
AddToWorklist(NewVal.getNode());
WorklistRemover DeadNodes(*this);
DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
++OpsNarrowed;
return NewST;
}

return SDValue();
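
To make the new placement logic easier to follow, here is a minimal standalone sketch of the search (plain C++ with no LLVM dependencies; nextPow2 is a stand-in for llvm::NextPowerOf2, the allowsMemoryAccess/alignment check is omitted, and the input values are taken from the i80 reproducer in the ARM test below — an illustration under those assumptions, not the actual DAGCombiner code):

#include <cstdint>
#include <cstdio>

// Stand-in for llvm::NextPowerOf2: smallest power of two strictly
// greater than V.
static uint64_t nextPow2(uint64_t V) {
  uint64_t P = 1;
  while (P <= V)
    P <<= 1;
  return P;
}

int main() {
  // Input from the ARM reproducer below: an i80 load/or/store where the
  // OR immediate has bits [65, 80) set (or i80 %load, -1 << 65).
  unsigned VTStoreSize = 80;   // store size of the original access, in bits
  unsigned TrailingZeros = 65; // Imm.countr_zero()
  unsigned ActiveBits = 80;    // Imm.getActiveBits()

  // Byte-align the least/most significant modified bit, as in the patch.
  unsigned LSB = TrailingZeros & ~7u;                  // 64
  unsigned MSB = (ActiveBits - 1) | 7u;                // 79
  unsigned NewBW = (unsigned)nextPow2(MSB - LSB);      // nextPow2(15) == 16

  // Scan byte-aligned windows [ShAmt, ShAmt + NewBW); the window must
  // cover [LSB, MSB] and must not reach past VTStoreSize.
  for (unsigned ShAmt = 0; ShAmt + NewBW <= VTStoreSize; ShAmt += 8) {
    if (ShAmt > LSB)
      break;    // window can no longer cover LSB: give up
    if (ShAmt + NewBW < MSB)
      continue; // window does not cover MSB yet: keep sliding
    bool BigEndian = false; // flip for the armv7eb RUN lines
    unsigned PtrAdjustmentInBits =
        BigEndian ? VTStoreSize - NewBW - ShAmt : ShAmt;
    printf("ShAmt=%u NewBW=%u PtrOff=%u\n", ShAmt, NewBW,
           PtrAdjustmentInBits / 8);
    // Prints ShAmt=64 NewBW=16 PtrOff=8: an i16 access at byte offset 8,
    // matching the ldrh/strh [r0, #8] in the CHECK-LE-NORMAL output.
    break;
  }
  return 0;
}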
66 changes: 66 additions & 0 deletions llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll
@@ -0,0 +1,66 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple armv7 -O1 | FileCheck %s -check-prefix=CHECK-LE-NORMAL
; RUN: llc < %s -mtriple armv7 -O1 -combiner-reduce-load-op-store-width-force-narrowing-profitable=1 | FileCheck %s -check-prefix=CHECK-LE-NARROW
; RUN: llc < %s -mtriple armv7eb -O1 | FileCheck %s -check-prefix=CHECK-BE-NORMAL
; RUN: llc < %s -mtriple armv7eb -O1 -combiner-reduce-load-op-store-width-force-narrowing-profitable=1 | FileCheck %s -check-prefix=CHECK-BE-NARROW

; This is a reproducer for a bug where DAGCombiner::ReduceLoadOpStoreWidth
; would end up narrowing the load-op-store sequence into this SDNode sequence
; for little-endian:
;
; t18: i32,ch = load<(load (s32) from %ir.p1 + 8, align 8)> t0, t17, undef:i32
; t20: i32 = or t18, Constant:i32<65534>
; t21: ch = store<(store (s32) into %ir.p1 + 8, align 8)> t18:1, t20, t17, undef:i32
;
; This was wrong since it accesses memory above %ir.p1+9, which is outside
; the "store size" of the original store.
;
; For big-endian we used to hit an assertion due to passing a negative offset
; to getMemBasePlusOffset (at least after commit 3e1b55cafc95d4ef4, while
; before that commit we got load/store instructions that accessed memory at a
; negative offset from %p1).
;
define i16 @test(ptr %p1) {
; CHECK-LE-NORMAL-LABEL: test:
; CHECK-LE-NORMAL: @ %bb.0: @ %entry
; CHECK-LE-NORMAL-NEXT: ldrh r1, [r0, #8]
; CHECK-LE-NORMAL-NEXT: movw r2, #65534
; CHECK-LE-NORMAL-NEXT: orr r1, r1, r2
; CHECK-LE-NORMAL-NEXT: strh r1, [r0, #8]
; CHECK-LE-NORMAL-NEXT: mov r0, #0
; CHECK-LE-NORMAL-NEXT: bx lr
;
; CHECK-LE-NARROW-LABEL: test:
; CHECK-LE-NARROW: @ %bb.0: @ %entry
; CHECK-LE-NARROW-NEXT: ldr r1, [r0, #6]
; CHECK-LE-NARROW-NEXT: orr r1, r1, #16646144
; CHECK-LE-NARROW-NEXT: orr r1, r1, #-16777216
; CHECK-LE-NARROW-NEXT: str r1, [r0, #6]
; CHECK-LE-NARROW-NEXT: mov r0, #0
; CHECK-LE-NARROW-NEXT: bx lr
;
; CHECK-BE-NORMAL-LABEL: test:
; CHECK-BE-NORMAL: @ %bb.0: @ %entry
; CHECK-BE-NORMAL-NEXT: ldrh r1, [r0]
; CHECK-BE-NORMAL-NEXT: movw r2, #65534
; CHECK-BE-NORMAL-NEXT: orr r1, r1, r2
; CHECK-BE-NORMAL-NEXT: strh r1, [r0]
; CHECK-BE-NORMAL-NEXT: mov r0, #0
; CHECK-BE-NORMAL-NEXT: bx lr
;
; CHECK-BE-NARROW-LABEL: test:
; CHECK-BE-NARROW: @ %bb.0: @ %entry
; CHECK-BE-NARROW-NEXT: ldr r1, [r0]
; CHECK-BE-NARROW-NEXT: orr r1, r1, #16646144
; CHECK-BE-NARROW-NEXT: orr r1, r1, #-16777216
; CHECK-BE-NARROW-NEXT: str r1, [r0]
; CHECK-BE-NARROW-NEXT: mov r0, #0
; CHECK-BE-NARROW-NEXT: bx lr
entry:
%load = load i80, ptr %p1, align 32
%mask = shl i80 -1, 65
%op = or i80 %load, %mask
store i80 %op, ptr %p1, align 32
ret i16 0
}
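
As a back-of-the-envelope check of the out-of-bounds claim above (standalone C++; the offsets are read off the SDNode dump in the test comment and the CHECK lines, not recomputed from the compiler):

// i80 has a store size of (80 + 7) / 8 = 10 bytes, so the stored object
// occupies bytes [0, 10) from %p1. The faulty combine emitted a 4-byte
// (s32) access at byte offset 8, touching bytes [8, 12):
static_assert((80 + 7) / 8 == 10, "store size of i80 in bytes");
static_assert(8 + 4 > 10, "i32 at offset 8 overruns the 10-byte object");
// With the fix, the forced-narrowing run uses a 4-byte access at offset 6
// (ldr/str [r0, #6] in CHECK-LE-NARROW), i.e. bytes [6, 10), in bounds.
static_assert(6 + 4 <= 10, "i32 at offset 6 stays inside the object");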

4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/apx/or.ll
@@ -906,13 +906,13 @@ entry:
define void @or64mi_legacy(ptr %a) {
; CHECK-LABEL: or64mi_legacy:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: orq $123456, (%rdi) # encoding: [0x48,0x81,0x0f,0x40,0xe2,0x01,0x00]
; CHECK-NEXT: orl $123456, (%rdi) # encoding: [0x81,0x0f,0x40,0xe2,0x01,0x00]
; CHECK-NEXT: # imm = 0x1E240
; CHECK-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: or64mi_legacy:
; NF: # %bb.0: # %entry
; NF-NEXT: orq $123456, (%rdi) # encoding: [0x48,0x81,0x0f,0x40,0xe2,0x01,0x00]
; NF-NEXT: orl $123456, (%rdi) # encoding: [0x81,0x0f,0x40,0xe2,0x01,0x00]
; NF-NEXT: # imm = 0x1E240
; NF-NEXT: retq # encoding: [0xc3]
entry:
18 changes: 4 additions & 14 deletions llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
@@ -6,22 +6,12 @@ define void @i24_or(ptr %a) {
; X86-LABEL: i24_or:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl (%eax), %ecx
; X86-NEXT: movzbl 2(%eax), %edx
; X86-NEXT: shll $16, %edx
; X86-NEXT: orl %ecx, %edx
; X86-NEXT: orl $384, %edx # imm = 0x180
; X86-NEXT: movw %dx, (%eax)
; X86-NEXT: orw $384, (%eax) # imm = 0x180
; X86-NEXT: retl
;
; X64-LABEL: i24_or:
; X64: # %bb.0:
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: movzbl 2(%rdi), %ecx
; X64-NEXT: shll $16, %ecx
; X64-NEXT: orl %eax, %ecx
; X64-NEXT: orl $384, %ecx # imm = 0x180
; X64-NEXT: movw %cx, (%rdi)
; X64-NEXT: orw $384, (%rdi) # imm = 0x180
; X64-NEXT: retq
%aa = load i24, ptr %a, align 1
%b = or i24 %aa, 384
@@ -103,12 +93,12 @@ define void @i56_or(ptr %a) {
; X86-LABEL: i56_or:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: orl $384, (%eax) # imm = 0x180
; X86-NEXT: orw $384, (%eax) # imm = 0x180
; X86-NEXT: retl
;
; X64-LABEL: i56_or:
; X64: # %bb.0:
; X64-NEXT: orl $384, (%rdi) # imm = 0x180
; X64-NEXT: orw $384, (%rdi) # imm = 0x180
; X64-NEXT: retq
%aa = load i56, ptr %a, align 1
%b = or i56 %aa, 384
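
Hand-deriving the i56_or update with the patch's byte-aligned LSB/MSB logic (a sketch under the rules shown in the DAGCombiner.cpp hunk above, not compiler output; nextPow2 as in the earlier sketch):

// or i56 %aa, 384: Imm = 0b1'1000'0000 sets bits [7, 9).
// LSB = 7 & ~7 = 0, MSB = (9 - 1) | 7 = 15, NewBW = nextPow2(15 - 0) = 16,
// so a 16-bit access at byte offset 0 covers every modified bit -- hence
// the orw $384 above instead of the former 32-bit orl.
static_assert((384u >> 7) == 3u, "imm sets exactly bits 7 and 8");
static_assert(384u <= 0xFFFFu, "imm fits in the narrowed i16");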
10 changes: 1 addition & 9 deletions llvm/test/CodeGen/X86/pr35763.ll
@@ -14,15 +14,7 @@ define dso_local void @PR35763() {
; CHECK-NEXT: movzwl z+2(%rip), %ecx
; CHECK-NEXT: orl %eax, %ecx
; CHECK-NEXT: movq %rcx, tf_3_var_136(%rip)
; CHECK-NEXT: movl z+6(%rip), %eax
; CHECK-NEXT: movzbl z+10(%rip), %ecx
; CHECK-NEXT: shlq $32, %rcx
; CHECK-NEXT: orq %rax, %rcx
; CHECK-NEXT: movabsq $1090921758719, %rax # imm = 0xFE0000FFFF
; CHECK-NEXT: andq %rcx, %rax
; CHECK-NEXT: movl %eax, z+6(%rip)
; CHECK-NEXT: shrq $32, %rax
; CHECK-NEXT: movb %al, z+10(%rip)
; CHECK-NEXT: andl $-33554177, z+7(%rip) # imm = 0xFE0000FF
; CHECK-NEXT: retq
entry:
%0 = load i16, ptr @z, align 8
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/store_op_load_fold.ll
@@ -23,7 +23,7 @@ define void @test2() nounwind uwtable ssp {
; CHECK-LABEL: test2:
; CHECK: ## %bb.0:
; CHECK-NEXT: movl L_s2$non_lazy_ptr, %eax
; CHECK-NEXT: andl $-262144, 20(%eax) ## imm = 0xFFFC0000
; CHECK-NEXT: andl $-67108609, 19(%eax) ## imm = 0xFC0000FF
; CHECK-NEXT: retl
%bf.load35 = load i56, ptr getelementptr inbounds (%struct.S2, ptr @s2, i32 0, i32 5), align 16
%bf.clear36 = and i56 %bf.load35, -1125895611875329
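
And the same exercise for the AND case above, where the combine first inverts the immediate (hand-computed as a consistency check against the updated CHECK line, not compiler output):

// and i56 %x, -1125895611875329: the bits being cleared are the active
// bits of the inverted mask, ~Imm = 2^50 - 2^32, i.e. bits [32, 50).
static_assert((~(-1125895611875329LL) & ((1LL << 56) - 1)) ==
                  (1LL << 50) - (1LL << 32),
              "cleared bits are [32, 50)");
// LSB = 32, MSB = (50 - 1) | 7 = 55, NewBW = nextPow2(55 - 32) = 32. The
// only byte-aligned 32-bit window inside the 7-byte store that covers bit
// 55 is ShAmt = 24, so PtrOff = 3; added to the field offset 16 that is
// the 19(%eax) operand, and re-inverting the shifted mask gives 0xFC0000FF.
static_assert((0xFC0000FFu ^ 0xFFFFFFFFu) == (0x3FFFFu << 8),
              "narrowed immediate clears bits [8, 26)");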