Skip to content

Commit 33e6b48

Browse files
authored
[SelectionDAG] Fix and improve TargetLowering::SimplifySetCC (#87646)
The load narrowing part of TargetLowering::SimplifySetCC is updated according to this: 1) The offset calculation (for big endian) did not work properly for non byte-sized types. This is basically solved by an early exit if the memory type isn't byte-sized. But the code is also corrected to use the store size when calculating the offset. 2) To still allow some optimizations for non-byte-sized types the TargetLowering::isPaddedAtMostSignificantBitsWhenStored hook is added. By default it assumes that scalar integer types are padded starting at the most significant bits, if the type needs padding when being stored to memory. 3) Allow optimizing when isPaddedAtMostSignificantBitsWhenStored is true, as that hook makes it possible for TargetLowering to know how the non byte-sized value is aligned in memory. 4) Update the algorithm to always search for a narrowed load with a power-of-2 byte-sized type. In the past the algorithm started with the width of the original load, and then divided it by two for each iteration. But for a type such as i48 that would just end up trying to narrow the load into an i24 or i12 load, and then we would fail sooner or later due to not finding a newVT that fulfilled newVT.isRound(). With this new approach we can narrow the i48 load into either an i8, i16 or i32 load. By checking if such a load is allowed (e.g. alignment wise) for any "multiple of 8 offset", then we can find more opportunities for the optimization to trigger. So even for a byte-sized type such as i32 we may now end up narrowing the load into loading the 16 bits starting at offset 8 (if that is allowed by the target). The old algorithm did not even consider that case. 5) Also start using getObjectPtrOffset instead of getMemBasePlusOffset when creating the new ptr. This way we get "nsw" on the add.
1 parent bcf047a commit 33e6b48

File tree

4 files changed

+79
-67
lines changed

4 files changed

+79
-67
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1804,6 +1804,13 @@ class TargetLoweringBase {
18041804
/// where the sext is redundant, and use x directly.
18051805
virtual bool shouldRemoveRedundantExtend(SDValue Op) const { return true; }
18061806

1807+
/// Indicates if any padding is guaranteed to go at the most significant bits
1808+
/// when storing the type to memory and the type size isn't equal to the store
1809+
/// size.
1810+
bool isPaddedAtMostSignificantBitsWhenStored(EVT VT) const {
1811+
return VT.isScalarInteger() && !VT.isByteSized();
1812+
}
1813+
18071814
/// When splitting a value of the specified type into parts, does the Lo
18081815
/// or Hi part come first? This usually follows the endianness, except
18091816
/// for ppcf128, where the Hi part always comes first.

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 41 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -4621,48 +4621,62 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
46214621
LoadSDNode *Lod = cast<LoadSDNode>(N0.getOperand(0));
46224622
APInt bestMask;
46234623
unsigned bestWidth = 0, bestOffset = 0;
4624-
if (Lod->isSimple() && Lod->isUnindexed()) {
4624+
if (Lod->isSimple() && Lod->isUnindexed() &&
4625+
(Lod->getMemoryVT().isByteSized() ||
4626+
isPaddedAtMostSignificantBitsWhenStored(Lod->getMemoryVT()))) {
4627+
unsigned memWidth = Lod->getMemoryVT().getStoreSizeInBits();
46254628
unsigned origWidth = N0.getValueSizeInBits();
46264629
unsigned maskWidth = origWidth;
46274630
// We can narrow (e.g.) 16-bit extending loads on 32-bit target to
46284631
// 8 bits, but have to be careful...
46294632
if (Lod->getExtensionType() != ISD::NON_EXTLOAD)
46304633
origWidth = Lod->getMemoryVT().getSizeInBits();
46314634
const APInt &Mask = N0.getConstantOperandAPInt(1);
4632-
for (unsigned width = origWidth / 2; width>=8; width /= 2) {
4635+
// Only consider power-of-2 widths (and at least one byte) as candidates
4636+
// for the narrowed load.
4637+
for (unsigned width = 8; width < origWidth; width *= 2) {
4638+
EVT newVT = EVT::getIntegerVT(*DAG.getContext(), width);
4639+
if (!shouldReduceLoadWidth(Lod, ISD::NON_EXTLOAD, newVT))
4640+
continue;
46334641
APInt newMask = APInt::getLowBitsSet(maskWidth, width);
4634-
for (unsigned offset=0; offset<origWidth/width; offset++) {
4642+
// Avoid accessing any padding here for now (we could use memWidth
4643+
// instead of origWidth here otherwise).
4644+
unsigned maxOffset = origWidth - width;
4645+
for (unsigned offset = 0; offset <= maxOffset; offset += 8) {
46354646
if (Mask.isSubsetOf(newMask)) {
4636-
if (Layout.isLittleEndian())
4637-
bestOffset = (uint64_t)offset * (width/8);
4638-
else
4639-
bestOffset = (origWidth/width - offset - 1) * (width/8);
4640-
bestMask = Mask.lshr(offset * (width/8) * 8);
4641-
bestWidth = width;
4642-
break;
4647+
unsigned ptrOffset =
4648+
Layout.isLittleEndian() ? offset : memWidth - width - offset;
4649+
unsigned IsFast = 0;
4650+
Align NewAlign = commonAlignment(Lod->getAlign(), ptrOffset / 8);
4651+
if (allowsMemoryAccess(
4652+
*DAG.getContext(), Layout, newVT, Lod->getAddressSpace(),
4653+
NewAlign, Lod->getMemOperand()->getFlags(), &IsFast) &&
4654+
IsFast) {
4655+
bestOffset = ptrOffset / 8;
4656+
bestMask = Mask.lshr(offset);
4657+
bestWidth = width;
4658+
break;
4659+
}
46434660
}
4644-
newMask <<= width;
4661+
newMask <<= 8;
46454662
}
4663+
if (bestWidth)
4664+
break;
46464665
}
46474666
}
46484667
if (bestWidth) {
46494668
EVT newVT = EVT::getIntegerVT(*DAG.getContext(), bestWidth);
4650-
if (newVT.isRound() &&
4651-
shouldReduceLoadWidth(Lod, ISD::NON_EXTLOAD, newVT)) {
4652-
SDValue Ptr = Lod->getBasePtr();
4653-
if (bestOffset != 0)
4654-
Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(bestOffset),
4655-
dl);
4656-
SDValue NewLoad =
4657-
DAG.getLoad(newVT, dl, Lod->getChain(), Ptr,
4658-
Lod->getPointerInfo().getWithOffset(bestOffset),
4659-
Lod->getOriginalAlign());
4660-
return DAG.getSetCC(dl, VT,
4661-
DAG.getNode(ISD::AND, dl, newVT, NewLoad,
4662-
DAG.getConstant(bestMask.trunc(bestWidth),
4663-
dl, newVT)),
4664-
DAG.getConstant(0LL, dl, newVT), Cond);
4665-
}
4669+
SDValue Ptr = Lod->getBasePtr();
4670+
if (bestOffset != 0)
4671+
Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(bestOffset));
4672+
SDValue NewLoad =
4673+
DAG.getLoad(newVT, dl, Lod->getChain(), Ptr,
4674+
Lod->getPointerInfo().getWithOffset(bestOffset),
4675+
Lod->getOriginalAlign());
4676+
SDValue And =
4677+
DAG.getNode(ISD::AND, dl, newVT, NewLoad,
4678+
DAG.getConstant(bestMask.trunc(bestWidth), dl, newVT));
4679+
return DAG.getSetCC(dl, VT, And, DAG.getConstant(0LL, dl, newVT), Cond);
46664680
}
46674681
}
46684682

llvm/test/CodeGen/ARM/simplifysetcc_narrow_load.ll

Lines changed: 16 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,9 @@ define i1 @test_129_15_0(ptr %y) {
4040
;
4141
; CHECK-BE-LABEL: test_129_15_0:
4242
; CHECK-BE: @ %bb.0:
43-
; CHECK-BE-NEXT: ldrh r0, [r0, #14]
43+
; CHECK-BE-NEXT: ldr r1, [r0, #12]
44+
; CHECK-BE-NEXT: ldrb r0, [r0, #16]
45+
; CHECK-BE-NEXT: orr r0, r0, r1, lsl #8
4446
; CHECK-BE-NEXT: mov r1, #255
4547
; CHECK-BE-NEXT: orr r1, r1, #32512
4648
; CHECK-BE-NEXT: ands r0, r0, r1
@@ -49,7 +51,7 @@ define i1 @test_129_15_0(ptr %y) {
4951
;
5052
; CHECK-V7-BE-LABEL: test_129_15_0:
5153
; CHECK-V7-BE: @ %bb.0:
52-
; CHECK-V7-BE-NEXT: ldrh r0, [r0, #14]
54+
; CHECK-V7-BE-NEXT: ldrh r0, [r0, #15]
5355
; CHECK-V7-BE-NEXT: bfc r0, #15, #17
5456
; CHECK-V7-BE-NEXT: cmp r0, #0
5557
; CHECK-V7-BE-NEXT: movwne r0, #1
@@ -119,14 +121,14 @@ define i1 @test_33_8_0(ptr %y) {
119121
;
120122
; CHECK-BE-LABEL: test_33_8_0:
121123
; CHECK-BE: @ %bb.0:
122-
; CHECK-BE-NEXT: ldrb r0, [r0, #3]
124+
; CHECK-BE-NEXT: ldrb r0, [r0, #4]
123125
; CHECK-BE-NEXT: cmp r0, #0
124126
; CHECK-BE-NEXT: movne r0, #1
125127
; CHECK-BE-NEXT: mov pc, lr
126128
;
127129
; CHECK-V7-BE-LABEL: test_33_8_0:
128130
; CHECK-V7-BE: @ %bb.0:
129-
; CHECK-V7-BE-NEXT: ldrb r0, [r0, #3]
131+
; CHECK-V7-BE-NEXT: ldrb r0, [r0, #4]
130132
; CHECK-V7-BE-NEXT: cmp r0, #0
131133
; CHECK-V7-BE-NEXT: movwne r0, #1
132134
; CHECK-V7-BE-NEXT: bx lr
@@ -179,13 +181,13 @@ define i1 @test_33_1_31(ptr %y) {
179181
;
180182
; CHECK-BE-LABEL: test_33_1_31:
181183
; CHECK-BE: @ %bb.0:
182-
; CHECK-BE-NEXT: ldrb r0, [r0]
184+
; CHECK-BE-NEXT: ldrb r0, [r0, #1]
183185
; CHECK-BE-NEXT: lsr r0, r0, #7
184186
; CHECK-BE-NEXT: mov pc, lr
185187
;
186188
; CHECK-V7-BE-LABEL: test_33_1_31:
187189
; CHECK-V7-BE: @ %bb.0:
188-
; CHECK-V7-BE-NEXT: ldrb r0, [r0]
190+
; CHECK-V7-BE-NEXT: ldrb r0, [r0, #1]
189191
; CHECK-V7-BE-NEXT: lsr r0, r0, #7
190192
; CHECK-V7-BE-NEXT: bx lr
191193
%a = load i33, ptr %y
@@ -209,13 +211,13 @@ define i1 @test_33_1_0(ptr %y) {
209211
;
210212
; CHECK-BE-LABEL: test_33_1_0:
211213
; CHECK-BE: @ %bb.0:
212-
; CHECK-BE-NEXT: ldrb r0, [r0, #3]
214+
; CHECK-BE-NEXT: ldrb r0, [r0, #4]
213215
; CHECK-BE-NEXT: and r0, r0, #1
214216
; CHECK-BE-NEXT: mov pc, lr
215217
;
216218
; CHECK-V7-BE-LABEL: test_33_1_0:
217219
; CHECK-V7-BE: @ %bb.0:
218-
; CHECK-V7-BE-NEXT: ldrb r0, [r0, #3]
220+
; CHECK-V7-BE-NEXT: ldrb r0, [r0, #4]
219221
; CHECK-V7-BE-NEXT: and r0, r0, #1
220222
; CHECK-V7-BE-NEXT: bx lr
221223
%a = load i33, ptr %y
@@ -309,7 +311,7 @@ define i1 @test_48_16_8(ptr %y) {
309311
; CHECK-LE-LABEL: test_48_16_8:
310312
; CHECK-LE: @ %bb.0:
311313
; CHECK-LE-NEXT: ldrh r0, [r0, #1]
312-
; CHECK-LE-NEXT: cmp r0, #0
314+
; CHECK-LE-NEXT: lsls r0, r0, #8
313315
; CHECK-LE-NEXT: movne r0, #1
314316
; CHECK-LE-NEXT: mov pc, lr
315317
;
@@ -444,9 +446,7 @@ define i1 @test_48_17_0(ptr %y) {
444446
;
445447
; CHECK-V7-BE-LABEL: test_48_17_0:
446448
; CHECK-V7-BE: @ %bb.0:
447-
; CHECK-V7-BE-NEXT: ldr r1, [r0]
448-
; CHECK-V7-BE-NEXT: ldrh r0, [r0, #4]
449-
; CHECK-V7-BE-NEXT: orr r0, r0, r1, lsl #16
449+
; CHECK-V7-BE-NEXT: ldr r0, [r0, #2]
450450
; CHECK-V7-BE-NEXT: bfc r0, #17, #15
451451
; CHECK-V7-BE-NEXT: cmp r0, #0
452452
; CHECK-V7-BE-NEXT: movwne r0, #1
@@ -506,15 +506,14 @@ define i1 @test_40_1_32(ptr %y) {
506506
;
507507
; CHECK-BE-LABEL: test_40_1_32:
508508
; CHECK-BE: @ %bb.0:
509-
; CHECK-BE-NEXT: ldr r0, [r0]
510-
; CHECK-BE-NEXT: mov r1, #1
511-
; CHECK-BE-NEXT: and r0, r1, r0, lsr #24
509+
; CHECK-BE-NEXT: ldrb r0, [r0]
510+
; CHECK-BE-NEXT: and r0, r0, #1
512511
; CHECK-BE-NEXT: mov pc, lr
513512
;
514513
; CHECK-V7-BE-LABEL: test_40_1_32:
515514
; CHECK-V7-BE: @ %bb.0:
516-
; CHECK-V7-BE-NEXT: ldr r0, [r0]
517-
; CHECK-V7-BE-NEXT: ubfx r0, r0, #24, #1
515+
; CHECK-V7-BE-NEXT: ldrb r0, [r0]
516+
; CHECK-V7-BE-NEXT: and r0, r0, #1
518517
; CHECK-V7-BE-NEXT: bx lr
519518
%a = load i40, ptr %y
520519
%b = and i40 %a, u0x100000000

llvm/test/CodeGen/PowerPC/simplifysetcc_narrow_load.ll

Lines changed: 15 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ define i1 @test_129_15_0(ptr %y) {
2626
;
2727
; CHECK-BE-LABEL: test_129_15_0:
2828
; CHECK-BE: # %bb.0:
29-
; CHECK-BE-NEXT: lhz 3, 14(3)
29+
; CHECK-BE-NEXT: lhz 3, 15(3)
3030
; CHECK-BE-NEXT: clrlwi 3, 3, 17
3131
; CHECK-BE-NEXT: addic 4, 3, -1
3232
; CHECK-BE-NEXT: subfe 3, 4, 3
@@ -69,7 +69,7 @@ define i1 @test_33_8_0(ptr %y) {
6969
;
7070
; CHECK-BE-LABEL: test_33_8_0:
7171
; CHECK-BE: # %bb.0:
72-
; CHECK-BE-NEXT: lbz 3, 3(3)
72+
; CHECK-BE-NEXT: lbz 3, 4(3)
7373
; CHECK-BE-NEXT: addic 4, 3, -1
7474
; CHECK-BE-NEXT: subfe 3, 4, 3
7575
; CHECK-BE-NEXT: blr
@@ -105,7 +105,7 @@ define i1 @test_33_1_31(ptr %y) {
105105
;
106106
; CHECK-BE-LABEL: test_33_1_31:
107107
; CHECK-BE: # %bb.0:
108-
; CHECK-BE-NEXT: lbz 3, 0(3)
108+
; CHECK-BE-NEXT: lbz 3, 1(3)
109109
; CHECK-BE-NEXT: srwi 3, 3, 7
110110
; CHECK-BE-NEXT: blr
111111
%a = load i33, ptr %y
@@ -123,7 +123,7 @@ define i1 @test_33_1_0(ptr %y) {
123123
;
124124
; CHECK-BE-LABEL: test_33_1_0:
125125
; CHECK-BE: # %bb.0:
126-
; CHECK-BE-NEXT: lbz 3, 3(3)
126+
; CHECK-BE-NEXT: lbz 3, 4(3)
127127
; CHECK-BE-NEXT: clrlwi 3, 3, 31
128128
; CHECK-BE-NEXT: blr
129129
%a = load i33, ptr %y
@@ -250,12 +250,10 @@ define i1 @test_48_17_0(ptr %y) {
250250
;
251251
; CHECK-BE-LABEL: test_48_17_0:
252252
; CHECK-BE: # %bb.0:
253-
; CHECK-BE-NEXT: lhz 4, 4(3)
254-
; CHECK-BE-NEXT: lwz 3, 0(3)
255-
; CHECK-BE-NEXT: clrlwi 4, 4, 16
256-
; CHECK-BE-NEXT: rlwimi 4, 3, 16, 15, 15
257-
; CHECK-BE-NEXT: addic 3, 4, -1
258-
; CHECK-BE-NEXT: subfe 3, 3, 4
253+
; CHECK-BE-NEXT: lwz 3, 2(3)
254+
; CHECK-BE-NEXT: clrlwi 3, 3, 15
255+
; CHECK-BE-NEXT: addic 4, 3, -1
256+
; CHECK-BE-NEXT: subfe 3, 4, 3
259257
; CHECK-BE-NEXT: blr
260258
%a = load i48, ptr %y
261259
%b = and i48 %a, u0x1ffff
@@ -292,8 +290,8 @@ define i1 @test_40_1_32(ptr %y) {
292290
;
293291
; CHECK-BE-LABEL: test_40_1_32:
294292
; CHECK-BE: # %bb.0:
295-
; CHECK-BE-NEXT: lwz 3, 0(3)
296-
; CHECK-BE-NEXT: rlwinm 3, 3, 8, 31, 31
293+
; CHECK-BE-NEXT: lbz 3, 0(3)
294+
; CHECK-BE-NEXT: clrlwi 3, 3, 31
297295
; CHECK-BE-NEXT: blr
298296
%a = load i40, ptr %y
299297
%b = and i40 %a, u0x100000000
@@ -325,15 +323,13 @@ define i1 @test_24_8_8(ptr %y) {
325323
; CHECK-LE-LABEL: test_24_8_8:
326324
; CHECK-LE: # %bb.0:
327325
; CHECK-LE-NEXT: lbz 3, 1(3)
328-
; CHECK-LE-NEXT: slwi 3, 3, 8
329326
; CHECK-LE-NEXT: addic 4, 3, -1
330327
; CHECK-LE-NEXT: subfe 3, 4, 3
331328
; CHECK-LE-NEXT: blr
332329
;
333330
; CHECK-BE-LABEL: test_24_8_8:
334331
; CHECK-BE: # %bb.0:
335332
; CHECK-BE-NEXT: lbz 3, 1(3)
336-
; CHECK-BE-NEXT: slwi 3, 3, 8
337333
; CHECK-BE-NEXT: addic 4, 3, -1
338334
; CHECK-BE-NEXT: subfe 3, 4, 3
339335
; CHECK-BE-NEXT: blr
@@ -346,18 +342,16 @@ define i1 @test_24_8_8(ptr %y) {
346342
define i1 @test_24_8_12(ptr %y) {
347343
; CHECK-LE-LABEL: test_24_8_12:
348344
; CHECK-LE: # %bb.0:
349-
; CHECK-LE-NEXT: lhz 4, 0(3)
350-
; CHECK-LE-NEXT: lbz 3, 2(3)
351-
; CHECK-LE-NEXT: rlwinm 4, 4, 0, 16, 19
352-
; CHECK-LE-NEXT: rlwimi 4, 3, 16, 12, 15
353-
; CHECK-LE-NEXT: addic 3, 4, -1
354-
; CHECK-LE-NEXT: subfe 3, 3, 4
345+
; CHECK-LE-NEXT: lhz 3, 1(3)
346+
; CHECK-LE-NEXT: rlwinm 3, 3, 0, 20, 27
347+
; CHECK-LE-NEXT: addic 4, 3, -1
348+
; CHECK-LE-NEXT: subfe 3, 4, 3
355349
; CHECK-LE-NEXT: blr
356350
;
357351
; CHECK-BE-LABEL: test_24_8_12:
358352
; CHECK-BE: # %bb.0:
359353
; CHECK-BE-NEXT: lhz 3, 0(3)
360-
; CHECK-BE-NEXT: rlwinm 3, 3, 8, 12, 19
354+
; CHECK-BE-NEXT: rlwinm 3, 3, 0, 20, 27
361355
; CHECK-BE-NEXT: addic 4, 3, -1
362356
; CHECK-BE-NEXT: subfe 3, 4, 3
363357
; CHECK-BE-NEXT: blr
@@ -371,15 +365,13 @@ define i1 @test_24_8_16(ptr %y) {
371365
; CHECK-LE-LABEL: test_24_8_16:
372366
; CHECK-LE: # %bb.0:
373367
; CHECK-LE-NEXT: lbz 3, 2(3)
374-
; CHECK-LE-NEXT: slwi 3, 3, 16
375368
; CHECK-LE-NEXT: addic 4, 3, -1
376369
; CHECK-LE-NEXT: subfe 3, 4, 3
377370
; CHECK-LE-NEXT: blr
378371
;
379372
; CHECK-BE-LABEL: test_24_8_16:
380373
; CHECK-BE: # %bb.0:
381374
; CHECK-BE-NEXT: lbz 3, 0(3)
382-
; CHECK-BE-NEXT: slwi 3, 3, 16
383375
; CHECK-BE-NEXT: addic 4, 3, -1
384376
; CHECK-BE-NEXT: subfe 3, 4, 3
385377
; CHECK-BE-NEXT: blr

0 commit comments

Comments
 (0)