Skip to content

Commit 3ad2399

Browse files
authored
[DAGCombiner] Refactor and improve ReduceLoadOpStoreWidth (#119564)
This patch make a couple of improvements to ReduceLoadOpStoreWidth. When determining the minimum size of "NewBW" we now take byte boundaries into account. If we for example touch bits 6-10 we shouldn't accept NewBW=8, because we would fail later when detecting that we can't access bits from two different bytes in memory using a single load. Instead we make sure to align LSB/MSB according to byte size boundaries up front before searching for a viable "NewBW". In the past we only tried to find a "ShAmt" that was a multiple of "NewBW", but now we use a sliding window technique to scan for a viable "ShAmt" that is a multiple of the byte size. This can help out finding more opportunities for optimization (specially if the original type isn't byte sized, and for big-endian targets when the original load/store is aligned on the most significant bit).
1 parent 6414d61 commit 3ad2399

File tree

6 files changed

+83
-99
lines changed

6 files changed

+83
-99
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 67 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -20342,17 +20342,21 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
2034220342
ST->getPointerInfo().getAddrSpace())
2034320343
return SDValue();
2034420344

20345-
// Find the type to narrow it the load / op / store to.
20345+
// Find the type NewVT to narrow the load / op / store to.
2034620346
SDValue N1 = Value.getOperand(1);
2034720347
unsigned BitWidth = N1.getValueSizeInBits();
2034820348
APInt Imm = N1->getAsAPIntVal();
2034920349
if (Opc == ISD::AND)
20350-
Imm ^= APInt::getAllOnes(BitWidth);
20350+
Imm.flipAllBits();
2035120351
if (Imm == 0 || Imm.isAllOnes())
2035220352
return SDValue();
20353-
unsigned ShAmt = Imm.countr_zero();
20354-
unsigned MSB = BitWidth - Imm.countl_zero() - 1;
20355-
unsigned NewBW = NextPowerOf2(MSB - ShAmt);
20353+
// Find least/most significant bit that need to be part of the narrowed
20354+
// operation. We assume target will need to address/access full bytes, so
20355+
// we make sure to align LSB and MSB at byte boundaries.
20356+
unsigned BitsPerByteMask = 7u;
20357+
unsigned LSB = Imm.countr_zero() & ~BitsPerByteMask;
20358+
unsigned MSB = (Imm.getActiveBits() - 1) | BitsPerByteMask;
20359+
unsigned NewBW = NextPowerOf2(MSB - LSB);
2035620360
EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
2035720361
// The narrowing should be profitable, the load/store operation should be
2035820362
// legal (or custom) and the store size should be equal to the NewVT width.
@@ -20367,68 +20371,69 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
2036720371
if (NewBW >= BitWidth)
2036820372
return SDValue();
2036920373

20370-
// TODO: For big-endian we probably want to align given the most significant
20371-
// bit being modified instead of adjusting ShAmt based on least significant
20372-
// bits. This to reduce the risk of failing on the alignment check below. If
20373-
// for example VT.getStoreSize()==5 and Imm is 0x0000ffff00, then we want to
20374-
// find NewBW=16, and we want to load/store with a PtrOff set to 2. But then
20375-
// ShAmt should be set to 8, which isn't a multiple of NewBW. But given
20376-
// that isNarrowingProfitable doesn't seem to be overridden for any in-tree
20377-
// big-endian target, then the support for big-endian here isn't covered by
20378-
// any in-tree lit tests, so it is unfortunately not highly optimized
20379-
// either. It should be possible to improve that by using
20380-
// ReduceLoadOpStoreWidthForceNarrowingProfitable.
20381-
20382-
// If the lsb that is modified does not start at the type bitwidth boundary,
20383-
// align to start at the previous boundary.
20384-
ShAmt = ShAmt - (ShAmt % NewBW);
20385-
20386-
// Make sure we do not access memory outside the memory touched by the
20387-
// original load/store.
20388-
if (ShAmt + NewBW > VT.getStoreSizeInBits())
20389-
return SDValue();
20374+
// If we come this far NewVT/NewBW reflect a power-of-2 sized type that is
20375+
// large enough to cover all bits that should be modified. This type might
20376+
// however be larger than really needed (such as i32 while we actually only
20377+
// need to modify one byte). Now we need to find our how to align the memory
20378+
// accesses to satisfy preferred alignments as well as avoiding to access
20379+
// memory outside the store size of the orignal access.
20380+
20381+
unsigned VTStoreSize = VT.getStoreSizeInBits().getFixedValue();
20382+
20383+
// Let ShAmt denote amount of bits to skip, counted from the least
20384+
// significant bits of Imm. And let PtrOff how much the pointer needs to be
20385+
// offsetted (in bytes) for the new access.
20386+
unsigned ShAmt = 0;
20387+
uint64_t PtrOff = 0;
20388+
for (; ShAmt + NewBW <= VTStoreSize; ShAmt += 8) {
20389+
// Make sure the range [ShAmt, ShAmt+NewBW) cover both LSB and MSB.
20390+
if (ShAmt > LSB)
20391+
return SDValue();
20392+
if (ShAmt + NewBW < MSB)
20393+
continue;
2039020394

20391-
APInt Mask = APInt::getBitsSet(BitWidth, ShAmt,
20392-
std::min(BitWidth, ShAmt + NewBW));
20393-
if ((Imm & Mask) == Imm) {
20394-
APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW);
20395-
if (Opc == ISD::AND)
20396-
NewImm ^= APInt::getAllOnes(NewBW);
20397-
uint64_t PtrOff = ShAmt / 8;
20398-
// For big endian targets, we need to adjust the offset to the pointer to
20399-
// load the correct bytes.
20400-
if (DAG.getDataLayout().isBigEndian())
20401-
PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff;
20395+
// Calculate PtrOff.
20396+
unsigned PtrAdjustmentInBits = DAG.getDataLayout().isBigEndian()
20397+
? VTStoreSize - NewBW - ShAmt
20398+
: ShAmt;
20399+
PtrOff = PtrAdjustmentInBits / 8;
2040220400

20401+
// Now check if narrow access is allowed and fast, considering alignments.
2040320402
unsigned IsFast = 0;
2040420403
Align NewAlign = commonAlignment(LD->getAlign(), PtrOff);
20405-
if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), NewVT,
20406-
LD->getAddressSpace(), NewAlign,
20407-
LD->getMemOperand()->getFlags(), &IsFast) ||
20408-
!IsFast)
20409-
return SDValue();
20410-
20411-
SDValue NewPtr =
20412-
DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(PtrOff), SDLoc(LD));
20413-
SDValue NewLD =
20414-
DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr,
20415-
LD->getPointerInfo().getWithOffset(PtrOff), NewAlign,
20416-
LD->getMemOperand()->getFlags(), LD->getAAInfo());
20417-
SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
20418-
DAG.getConstant(NewImm, SDLoc(Value),
20419-
NewVT));
20420-
SDValue NewST =
20421-
DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr,
20422-
ST->getPointerInfo().getWithOffset(PtrOff), NewAlign);
20423-
20424-
AddToWorklist(NewPtr.getNode());
20425-
AddToWorklist(NewLD.getNode());
20426-
AddToWorklist(NewVal.getNode());
20427-
WorklistRemover DeadNodes(*this);
20428-
DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
20429-
++OpsNarrowed;
20430-
return NewST;
20404+
if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), NewVT,
20405+
LD->getAddressSpace(), NewAlign,
20406+
LD->getMemOperand()->getFlags(), &IsFast) &&
20407+
IsFast)
20408+
break;
2043120409
}
20410+
// If loop above did not find any accepted ShAmt we need to exit here.
20411+
if (ShAmt + NewBW > VTStoreSize)
20412+
return SDValue();
20413+
20414+
APInt NewImm = Imm.lshr(ShAmt).trunc(NewBW);
20415+
if (Opc == ISD::AND)
20416+
NewImm.flipAllBits();
20417+
Align NewAlign = commonAlignment(LD->getAlign(), PtrOff);
20418+
SDValue NewPtr =
20419+
DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(PtrOff), SDLoc(LD));
20420+
SDValue NewLD =
20421+
DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr,
20422+
LD->getPointerInfo().getWithOffset(PtrOff), NewAlign,
20423+
LD->getMemOperand()->getFlags(), LD->getAAInfo());
20424+
SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
20425+
DAG.getConstant(NewImm, SDLoc(Value), NewVT));
20426+
SDValue NewST =
20427+
DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr,
20428+
ST->getPointerInfo().getWithOffset(PtrOff), NewAlign);
20429+
20430+
AddToWorklist(NewPtr.getNode());
20431+
AddToWorklist(NewLD.getNode());
20432+
AddToWorklist(NewVal.getNode());
20433+
WorklistRemover DeadNodes(*this);
20434+
DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
20435+
++OpsNarrowed;
20436+
return NewST;
2043220437
}
2043320438

2043420439
return SDValue();

llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,10 @@ define i16 @test(ptr %p1) {
3232
;
3333
; CHECK-LE-NARROW-LABEL: test:
3434
; CHECK-LE-NARROW: @ %bb.0: @ %entry
35-
; CHECK-LE-NARROW-NEXT: ldrh r1, [r0, #8]
36-
; CHECK-LE-NARROW-NEXT: movw r2, #65534
37-
; CHECK-LE-NARROW-NEXT: orr r1, r1, r2
38-
; CHECK-LE-NARROW-NEXT: strh r1, [r0, #8]
35+
; CHECK-LE-NARROW-NEXT: ldr r1, [r0, #6]
36+
; CHECK-LE-NARROW-NEXT: orr r1, r1, #16646144
37+
; CHECK-LE-NARROW-NEXT: orr r1, r1, #-16777216
38+
; CHECK-LE-NARROW-NEXT: str r1, [r0, #6]
3939
; CHECK-LE-NARROW-NEXT: mov r0, #0
4040
; CHECK-LE-NARROW-NEXT: bx lr
4141
;
@@ -50,10 +50,10 @@ define i16 @test(ptr %p1) {
5050
;
5151
; CHECK-BE-NARROW-LABEL: test:
5252
; CHECK-BE-NARROW: @ %bb.0: @ %entry
53-
; CHECK-BE-NARROW-NEXT: ldrh r1, [r0]
54-
; CHECK-BE-NARROW-NEXT: movw r2, #65534
55-
; CHECK-BE-NARROW-NEXT: orr r1, r1, r2
56-
; CHECK-BE-NARROW-NEXT: strh r1, [r0]
53+
; CHECK-BE-NARROW-NEXT: ldr r1, [r0]
54+
; CHECK-BE-NARROW-NEXT: orr r1, r1, #16646144
55+
; CHECK-BE-NARROW-NEXT: orr r1, r1, #-16777216
56+
; CHECK-BE-NARROW-NEXT: str r1, [r0]
5757
; CHECK-BE-NARROW-NEXT: mov r0, #0
5858
; CHECK-BE-NARROW-NEXT: bx lr
5959
entry:

llvm/test/CodeGen/X86/apx/or.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -906,13 +906,13 @@ entry:
906906
define void @or64mi_legacy(ptr %a) {
907907
; CHECK-LABEL: or64mi_legacy:
908908
; CHECK: # %bb.0: # %entry
909-
; CHECK-NEXT: orq $123456, (%rdi) # encoding: [0x48,0x81,0x0f,0x40,0xe2,0x01,0x00]
909+
; CHECK-NEXT: orl $123456, (%rdi) # encoding: [0x81,0x0f,0x40,0xe2,0x01,0x00]
910910
; CHECK-NEXT: # imm = 0x1E240
911911
; CHECK-NEXT: retq # encoding: [0xc3]
912912
;
913913
; NF-LABEL: or64mi_legacy:
914914
; NF: # %bb.0: # %entry
915-
; NF-NEXT: orq $123456, (%rdi) # encoding: [0x48,0x81,0x0f,0x40,0xe2,0x01,0x00]
915+
; NF-NEXT: orl $123456, (%rdi) # encoding: [0x81,0x0f,0x40,0xe2,0x01,0x00]
916916
; NF-NEXT: # imm = 0x1E240
917917
; NF-NEXT: retq # encoding: [0xc3]
918918
entry:

llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,22 +6,12 @@ define void @i24_or(ptr %a) {
66
; X86-LABEL: i24_or:
77
; X86: # %bb.0:
88
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
9-
; X86-NEXT: movzwl (%eax), %ecx
10-
; X86-NEXT: movzbl 2(%eax), %edx
11-
; X86-NEXT: shll $16, %edx
12-
; X86-NEXT: orl %ecx, %edx
13-
; X86-NEXT: orl $384, %edx # imm = 0x180
14-
; X86-NEXT: movw %dx, (%eax)
9+
; X86-NEXT: orw $384, (%eax) # imm = 0x180
1510
; X86-NEXT: retl
1611
;
1712
; X64-LABEL: i24_or:
1813
; X64: # %bb.0:
19-
; X64-NEXT: movzwl (%rdi), %eax
20-
; X64-NEXT: movzbl 2(%rdi), %ecx
21-
; X64-NEXT: shll $16, %ecx
22-
; X64-NEXT: orl %eax, %ecx
23-
; X64-NEXT: orl $384, %ecx # imm = 0x180
24-
; X64-NEXT: movw %cx, (%rdi)
14+
; X64-NEXT: orw $384, (%rdi) # imm = 0x180
2515
; X64-NEXT: retq
2616
%aa = load i24, ptr %a, align 1
2717
%b = or i24 %aa, 384
@@ -103,12 +93,12 @@ define void @i56_or(ptr %a) {
10393
; X86-LABEL: i56_or:
10494
; X86: # %bb.0:
10595
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
106-
; X86-NEXT: orl $384, (%eax) # imm = 0x180
96+
; X86-NEXT: orw $384, (%eax) # imm = 0x180
10797
; X86-NEXT: retl
10898
;
10999
; X64-LABEL: i56_or:
110100
; X64: # %bb.0:
111-
; X64-NEXT: orl $384, (%rdi) # imm = 0x180
101+
; X64-NEXT: orw $384, (%rdi) # imm = 0x180
112102
; X64-NEXT: retq
113103
%aa = load i56, ptr %a, align 1
114104
%b = or i56 %aa, 384

llvm/test/CodeGen/X86/pr35763.ll

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,7 @@ define dso_local void @PR35763() {
1414
; CHECK-NEXT: movzwl z+2(%rip), %ecx
1515
; CHECK-NEXT: orl %eax, %ecx
1616
; CHECK-NEXT: movq %rcx, tf_3_var_136(%rip)
17-
; CHECK-NEXT: movl z+6(%rip), %eax
18-
; CHECK-NEXT: movzbl z+10(%rip), %ecx
19-
; CHECK-NEXT: shlq $32, %rcx
20-
; CHECK-NEXT: orq %rax, %rcx
21-
; CHECK-NEXT: movabsq $1090921758719, %rax # imm = 0xFE0000FFFF
22-
; CHECK-NEXT: andq %rcx, %rax
23-
; CHECK-NEXT: movl %eax, z+6(%rip)
24-
; CHECK-NEXT: shrq $32, %rax
25-
; CHECK-NEXT: movb %al, z+10(%rip)
17+
; CHECK-NEXT: andl $-33554177, z+7(%rip) # imm = 0xFE0000FF
2618
; CHECK-NEXT: retq
2719
entry:
2820
%0 = load i16, ptr @z, align 8

llvm/test/CodeGen/X86/store_op_load_fold.ll

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,7 @@ define void @test2() nounwind uwtable ssp {
2323
; CHECK-LABEL: test2:
2424
; CHECK: ## %bb.0:
2525
; CHECK-NEXT: movl L_s2$non_lazy_ptr, %eax
26-
; CHECK-NEXT: movzbl 22(%eax), %ecx
27-
; CHECK-NEXT: andl $-4, %ecx
28-
; CHECK-NEXT: movb %cl, 22(%eax)
29-
; CHECK-NEXT: movw $0, 20(%eax)
26+
; CHECK-NEXT: andl $-67108609, 19(%eax) ## imm = 0xFC0000FF
3027
; CHECK-NEXT: retl
3128
%bf.load35 = load i56, ptr getelementptr inbounds (%struct.S2, ptr @s2, i32 0, i32 5), align 16
3229
%bf.clear36 = and i56 %bf.load35, -1125895611875329

0 commit comments

Comments
 (0)