@@ -20342,17 +20342,21 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
             ST->getPointerInfo().getAddrSpace())
       return SDValue();
 
-    // Find the type to narrow it the load / op / store to.
+    // Find the type NewVT to narrow the load / op / store to.
     SDValue N1 = Value.getOperand(1);
     unsigned BitWidth = N1.getValueSizeInBits();
     APInt Imm = N1->getAsAPIntVal();
     if (Opc == ISD::AND)
-      Imm ^= APInt::getAllOnes(BitWidth);
+      Imm.flipAllBits();
     if (Imm == 0 || Imm.isAllOnes())
       return SDValue();
-    unsigned ShAmt = Imm.countr_zero();
-    unsigned MSB = BitWidth - Imm.countl_zero() - 1;
-    unsigned NewBW = NextPowerOf2(MSB - ShAmt);
+    // Find the least/most significant bits that need to be part of the
+    // narrowed operation. We assume the target will need to address/access
+    // full bytes, so we make sure to align LSB and MSB at byte boundaries.
+    unsigned BitsPerByteMask = 7u;
+    unsigned LSB = Imm.countr_zero() & ~BitsPerByteMask;
+    unsigned MSB = (Imm.getActiveBits() - 1) | BitsPerByteMask;
+    unsigned NewBW = NextPowerOf2(MSB - LSB);
     EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
     // The narrowing should be profitable, the load/store operation should be
     // legal (or custom) and the store size should be equal to the NewVT width.
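To make the byte-aligned LSB/MSB computation in the hunk above concrete, here is a minimal standalone C++ sketch (not part of the patch), assuming a nonzero immediate that fits in 64 bits: the GCC/Clang builtins __builtin_ctzll and __builtin_clzll stand in for APInt::countr_zero and APInt::getActiveBits, and the local NextPowerOf2 helper stands in for llvm::NextPowerOf2.

#include <cstdint>
#include <cstdio>

// Next power of two strictly greater than A (stand-in for llvm::NextPowerOf2).
static unsigned NextPowerOf2(unsigned A) {
  unsigned P = 1;
  while (P <= A)
    P <<= 1;
  return P;
}

int main() {
  // Example: i32 "store (or (load p), 0x4400), p" -- only bits 10 and 14 of
  // the stored value actually change.
  uint64_t Imm = 0x4400;
  unsigned BitsPerByteMask = 7u;
  unsigned LSB = __builtin_ctzll(Imm) & ~BitsPerByteMask;  // countr_zero -> 8
  unsigned ActiveBits = 64 - __builtin_clzll(Imm);         // getActiveBits -> 15
  unsigned MSB = (ActiveBits - 1) | BitsPerByteMask;       // byte-aligned -> 15
  unsigned NewBW = NextPowerOf2(MSB - LSB);                // -> 8
  printf("LSB=%u MSB=%u NewBW=%u\n", LSB, MSB, NewBW);
  return 0;
}

For Imm = 0x4400 this prints LSB=8 MSB=15 NewBW=8: a 32-bit load/op/store that only touches bits 10 and 14 can be narrowed to a single byte access, at byte offset 1 on a little-endian target.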
@@ -20367,68 +20371,69 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
     if (NewBW >= BitWidth)
       return SDValue();
 
-    // TODO: For big-endian we probably want to align given the most significant
-    // bit being modified instead of adjusting ShAmt based on least significant
-    // bits. This to reduce the risk of failing on the alignment check below. If
-    // for example VT.getStoreSize()==5 and Imm is 0x0000ffff00, then we want to
-    // find NewBW=16, and we want to load/store with a PtrOff set to 2. But then
-    // ShAmt should be set to 8, which isn't a multiple of NewBW. But given
-    // that isNarrowingProfitable doesn't seem to be overridden for any in-tree
-    // big-endian target, then the support for big-endian here isn't covered by
-    // any in-tree lit tests, so it is unfortunately not highly optimized
-    // either. It should be possible to improve that by using
-    // ReduceLoadOpStoreWidthForceNarrowingProfitable.
-
-    // If the lsb that is modified does not start at the type bitwidth boundary,
-    // align to start at the previous boundary.
-    ShAmt = ShAmt - (ShAmt % NewBW);
-
-    // Make sure we do not access memory outside the memory touched by the
-    // original load/store.
-    if (ShAmt + NewBW > VT.getStoreSizeInBits())
-      return SDValue();
+    // If we come this far, NewVT/NewBW reflect a power-of-2 sized type that is
+    // large enough to cover all bits that should be modified. This type might
+    // however be larger than really needed (such as i32 while we actually only
+    // need to modify one byte). Now we need to find out how to align the
+    // memory accesses to satisfy preferred alignments as well as to avoid
+    // accessing memory outside the store size of the original access.
+
+    unsigned VTStoreSize = VT.getStoreSizeInBits().getFixedValue();
+
+    // Let ShAmt denote the number of bits to skip, counted from the least
+    // significant bits of Imm. And let PtrOff denote how much the pointer
+    // needs to be offset (in bytes) for the new access.
+    unsigned ShAmt = 0;
+    uint64_t PtrOff = 0;
+    for (; ShAmt + NewBW <= VTStoreSize; ShAmt += 8) {
+      // Make sure the range [ShAmt, ShAmt+NewBW) covers both LSB and MSB.
+      if (ShAmt > LSB)
+        return SDValue();
+      if (ShAmt + NewBW < MSB)
+        continue;
 
-    APInt Mask = APInt::getBitsSet(BitWidth, ShAmt,
-                                   std::min(BitWidth, ShAmt + NewBW));
-    if ((Imm & Mask) == Imm) {
-      APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW);
-      if (Opc == ISD::AND)
-        NewImm ^= APInt::getAllOnes(NewBW);
-      uint64_t PtrOff = ShAmt / 8;
-      // For big endian targets, we need to adjust the offset to the pointer to
-      // load the correct bytes.
-      if (DAG.getDataLayout().isBigEndian())
-        PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff;
+      // Calculate PtrOff.
+      unsigned PtrAdjustmentInBits = DAG.getDataLayout().isBigEndian()
+                                         ? VTStoreSize - NewBW - ShAmt
+                                         : ShAmt;
+      PtrOff = PtrAdjustmentInBits / 8;
 
+      // Now check if narrow access is allowed and fast, considering alignments.
       unsigned IsFast = 0;
       Align NewAlign = commonAlignment(LD->getAlign(), PtrOff);
-      if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), NewVT,
-                                  LD->getAddressSpace(), NewAlign,
-                                  LD->getMemOperand()->getFlags(), &IsFast) ||
-          !IsFast)
-        return SDValue();
-
-      SDValue NewPtr =
-          DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(PtrOff), SDLoc(LD));
-      SDValue NewLD =
-          DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr,
-                      LD->getPointerInfo().getWithOffset(PtrOff), NewAlign,
-                      LD->getMemOperand()->getFlags(), LD->getAAInfo());
-      SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
-                                   DAG.getConstant(NewImm, SDLoc(Value),
-                                                   NewVT));
-      SDValue NewST =
-          DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr,
-                       ST->getPointerInfo().getWithOffset(PtrOff), NewAlign);
-
-      AddToWorklist(NewPtr.getNode());
-      AddToWorklist(NewLD.getNode());
-      AddToWorklist(NewVal.getNode());
-      WorklistRemover DeadNodes(*this);
-      DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
-      ++OpsNarrowed;
-      return NewST;
+      if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), NewVT,
+                                 LD->getAddressSpace(), NewAlign,
+                                 LD->getMemOperand()->getFlags(), &IsFast) &&
+          IsFast)
+        break;
     }
+    // If the loop above did not find any accepted ShAmt we need to exit here.
+    if (ShAmt + NewBW > VTStoreSize)
+      return SDValue();
+
+    APInt NewImm = Imm.lshr(ShAmt).trunc(NewBW);
+    if (Opc == ISD::AND)
+      NewImm.flipAllBits();
+    Align NewAlign = commonAlignment(LD->getAlign(), PtrOff);
+    SDValue NewPtr =
+        DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(PtrOff), SDLoc(LD));
+    SDValue NewLD =
+        DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr,
+                    LD->getPointerInfo().getWithOffset(PtrOff), NewAlign,
+                    LD->getMemOperand()->getFlags(), LD->getAAInfo());
+    SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
+                                 DAG.getConstant(NewImm, SDLoc(Value), NewVT));
+    SDValue NewST =
+        DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr,
+                     ST->getPointerInfo().getWithOffset(PtrOff), NewAlign);
+
+    AddToWorklist(NewPtr.getNode());
+    AddToWorklist(NewLD.getNode());
+    AddToWorklist(NewVal.getNode());
+    WorklistRemover DeadNodes(*this);
+    DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
+    ++OpsNarrowed;
+    return NewST;
   }
 
   return SDValue();
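The removed TODO about big-endian targets is addressed by the new ShAmt/PtrOff search loop: ShAmt now only advances in whole bytes rather than multiples of NewBW, and the pointer adjustment is mirrored for big-endian layouts. Below is a standalone C++ sketch (not part of the patch) of that search; the findNarrowing helper, the Narrowing struct and the always-true AccessOk placeholder (standing in for the TLI.allowsMemoryAccess/IsFast check) are invented for illustration. The example inputs correspond to the i40 store with Imm == 0x0000ffff00 mentioned in the removed comment, which gives LSB=8, MSB=23, NewBW=16.

#include <cstdint>
#include <cstdio>

struct Narrowing {
  bool Valid;
  unsigned ShAmt;   // bits skipped, counted from the LSB of Imm
  uint64_t PtrOff;  // pointer adjustment in bytes
};

// Mirrors the ShAmt/PtrOff search loop above. AccessOk always succeeds here;
// the real code asks TLI.allowsMemoryAccess and also requires IsFast.
static Narrowing findNarrowing(unsigned LSB, unsigned MSB, unsigned NewBW,
                               unsigned VTStoreSize, bool BigEndian) {
  auto AccessOk = [](uint64_t /*PtrOff*/) { return true; };
  for (unsigned ShAmt = 0; ShAmt + NewBW <= VTStoreSize; ShAmt += 8) {
    if (ShAmt > LSB)           // window starts above the lowest modified bit
      return {false, 0, 0};
    if (ShAmt + NewBW < MSB)   // window ends below the highest modified bit
      continue;
    uint64_t PtrOff = (BigEndian ? VTStoreSize - NewBW - ShAmt : ShAmt) / 8;
    if (AccessOk(PtrOff))
      return {true, ShAmt, PtrOff};
  }
  return {false, 0, 0};        // no byte offset gave an acceptable access
}

int main() {
  // i40 store, Imm == 0x0000ffff00 -> LSB=8, MSB=23, NewBW=16 (see hunks above).
  Narrowing LE = findNarrowing(8, 23, 16, 40, /*BigEndian=*/false);
  Narrowing BE = findNarrowing(8, 23, 16, 40, /*BigEndian=*/true);
  printf("LE: ShAmt=%u PtrOff=%llu\n", LE.ShAmt, (unsigned long long)LE.PtrOff);
  printf("BE: ShAmt=%u PtrOff=%llu\n", BE.ShAmt, (unsigned long long)BE.PtrOff);
  return 0;
}

With these inputs it reports ShAmt=8 with PtrOff=1 for little-endian and PtrOff=2 for big-endian, i.e. the NewBW=16, PtrOff=2, ShAmt=8 combination that the removed TODO comment described as not being formed before, since 8 is not a multiple of 16.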