[DAGCombiner] Refactor and improve ReduceLoadOpStoreWidth (#119564)

bjope · web-flow · commit 3ad239914895 · 2024-12-16T12:15:11.000+01:00
This patch make a couple of improvements to ReduceLoadOpStoreWidth.

When determining the minimum size of "NewBW" we now take byte boundaries
into account. If we for example touch bits 6-10 we shouldn't accept
NewBW=8, because we would fail later when detecting that we can't access
bits from two different bytes in memory using a single load. Instead we
make sure to align LSB/MSB according to byte size boundaries up front
before searching for a viable "NewBW".

In the past we only tried to find a "ShAmt" that was a multiple of
"NewBW", but now we use a sliding window technique to scan for a viable
"ShAmt" that is a multiple of the byte size. This can help out finding
more opportunities for optimization (specially if the original type
isn't byte sized, and for big-endian targets when the original
load/store is aligned on the most significant bit).
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -20342,17 +20342,21 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
         ST->getPointerInfo().getAddrSpace())
       return SDValue();
 
-    // Find the type to narrow it the load / op / store to.
+    // Find the type NewVT to narrow the load / op / store to.
     SDValue N1 = Value.getOperand(1);
     unsigned BitWidth = N1.getValueSizeInBits();
     APInt Imm = N1->getAsAPIntVal();
     if (Opc == ISD::AND)
-      Imm ^= APInt::getAllOnes(BitWidth);
+      Imm.flipAllBits();
     if (Imm == 0 || Imm.isAllOnes())
       return SDValue();
-    unsigned ShAmt = Imm.countr_zero();
-    unsigned MSB = BitWidth - Imm.countl_zero() - 1;
-    unsigned NewBW = NextPowerOf2(MSB - ShAmt);
+    // Find least/most significant bit that need to be part of the narrowed
+    // operation. We assume target will need to address/access full bytes, so
+    // we make sure to align LSB and MSB at byte boundaries.
+    unsigned BitsPerByteMask = 7u;
+    unsigned LSB = Imm.countr_zero() & ~BitsPerByteMask;
+    unsigned MSB = (Imm.getActiveBits() - 1) | BitsPerByteMask;
+    unsigned NewBW = NextPowerOf2(MSB - LSB);
     EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
     // The narrowing should be profitable, the load/store operation should be
     // legal (or custom) and the store size should be equal to the NewVT width.
@@ -20367,68 +20371,69 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
     if (NewBW >= BitWidth)
       return SDValue();
 
-    // TODO: For big-endian we probably want to align given the most significant
-    // bit being modified instead of adjusting ShAmt based on least significant
-    // bits. This to reduce the risk of failing on the alignment check below. If
-    // for example VT.getStoreSize()==5 and Imm is 0x0000ffff00, then we want to
-    // find NewBW=16, and we want to load/store with a PtrOff set to 2. But then
-    // ShAmt should be set to 8, which isn't a multiple of NewBW.  But given
-    // that isNarrowingProfitable doesn't seem to be overridden for any in-tree
-    // big-endian target, then the support for big-endian here isn't covered by
-    // any in-tree lit tests, so it is unfortunately not highly optimized
-    // either. It should be possible to improve that by using
-    // ReduceLoadOpStoreWidthForceNarrowingProfitable.
-
-    // If the lsb that is modified does not start at the type bitwidth boundary,
-    // align to start at the previous boundary.
-    ShAmt = ShAmt - (ShAmt % NewBW);
-
-    // Make sure we do not access memory outside the memory touched by the
-    // original load/store.
-    if (ShAmt + NewBW > VT.getStoreSizeInBits())
-      return SDValue();
+    // If we come this far NewVT/NewBW reflect a power-of-2 sized type that is
+    // large enough to cover all bits that should be modified. This type might
+    // however be larger than really needed (such as i32 while we actually only
+    // need to modify one byte). Now we need to find our how to align the memory
+    // accesses to satisfy preferred alignments as well as avoiding to access
+    // memory outside the store size of the orignal access.
+
+    unsigned VTStoreSize = VT.getStoreSizeInBits().getFixedValue();
+
+    // Let ShAmt denote amount of bits to skip, counted from the least
+    // significant bits of Imm. And let PtrOff how much the pointer needs to be
+    // offsetted (in bytes) for the new access.
+    unsigned ShAmt = 0;
+    uint64_t PtrOff = 0;
+    for (; ShAmt + NewBW <= VTStoreSize; ShAmt += 8) {
+      // Make sure the range [ShAmt, ShAmt+NewBW) cover both LSB and MSB.
+      if (ShAmt > LSB)
+        return SDValue();
+      if (ShAmt + NewBW < MSB)
+        continue;
 
-    APInt Mask = APInt::getBitsSet(BitWidth, ShAmt,
-                                   std::min(BitWidth, ShAmt + NewBW));
-    if ((Imm & Mask) == Imm) {
-      APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW);
-      if (Opc == ISD::AND)
-        NewImm ^= APInt::getAllOnes(NewBW);
-      uint64_t PtrOff = ShAmt / 8;
-      // For big endian targets, we need to adjust the offset to the pointer to
-      // load the correct bytes.
-      if (DAG.getDataLayout().isBigEndian())
-        PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff;
+      // Calculate PtrOff.
+      unsigned PtrAdjustmentInBits = DAG.getDataLayout().isBigEndian()
+                                         ? VTStoreSize - NewBW - ShAmt
+                                         : ShAmt;
+      PtrOff = PtrAdjustmentInBits / 8;
 
+      // Now check if narrow access is allowed and fast, considering alignments.
       unsigned IsFast = 0;
       Align NewAlign = commonAlignment(LD->getAlign(), PtrOff);
-      if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), NewVT,
-                                  LD->getAddressSpace(), NewAlign,
-                                  LD->getMemOperand()->getFlags(), &IsFast) ||
-          !IsFast)
-        return SDValue();
-
-      SDValue NewPtr =
-          DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(PtrOff), SDLoc(LD));
-      SDValue NewLD =
-          DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr,
-                      LD->getPointerInfo().getWithOffset(PtrOff), NewAlign,
-                      LD->getMemOperand()->getFlags(), LD->getAAInfo());
-      SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
-                                   DAG.getConstant(NewImm, SDLoc(Value),
-                                                   NewVT));
-      SDValue NewST =
-          DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr,
-                       ST->getPointerInfo().getWithOffset(PtrOff), NewAlign);
-
-      AddToWorklist(NewPtr.getNode());
-      AddToWorklist(NewLD.getNode());
-      AddToWorklist(NewVal.getNode());
-      WorklistRemover DeadNodes(*this);
-      DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
-      ++OpsNarrowed;
-      return NewST;
+      if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), NewVT,
+                                 LD->getAddressSpace(), NewAlign,
+                                 LD->getMemOperand()->getFlags(), &IsFast) &&
+          IsFast)
+        break;
     }
+    // If loop above did not find any accepted ShAmt we need to exit here.
+    if (ShAmt + NewBW > VTStoreSize)
+      return SDValue();
+
+    APInt NewImm = Imm.lshr(ShAmt).trunc(NewBW);
+    if (Opc == ISD::AND)
+      NewImm.flipAllBits();
+    Align NewAlign = commonAlignment(LD->getAlign(), PtrOff);
+    SDValue NewPtr =
+        DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(PtrOff), SDLoc(LD));
+    SDValue NewLD =
+        DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr,
+                    LD->getPointerInfo().getWithOffset(PtrOff), NewAlign,
+                    LD->getMemOperand()->getFlags(), LD->getAAInfo());
+    SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
+                                 DAG.getConstant(NewImm, SDLoc(Value), NewVT));
+    SDValue NewST =
+        DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr,
+                     ST->getPointerInfo().getWithOffset(PtrOff), NewAlign);
+
+    AddToWorklist(NewPtr.getNode());
+    AddToWorklist(NewLD.getNode());
+    AddToWorklist(NewVal.getNode());
+    WorklistRemover DeadNodes(*this);
+    DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
+    ++OpsNarrowed;
+    return NewST;
   }
 
   return SDValue();
diff --git a/llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll b/llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll
@@ -32,10 +32,10 @@ define i16 @test(ptr %p1) {
 ;
 ; CHECK-LE-NARROW-LABEL: test:
 ; CHECK-LE-NARROW:       @ %bb.0: @ %entry
-; CHECK-LE-NARROW-NEXT:    ldrh r1, [r0, #8]
-; CHECK-LE-NARROW-NEXT:    movw r2, #65534
-; CHECK-LE-NARROW-NEXT:    orr r1, r1, r2
-; CHECK-LE-NARROW-NEXT:    strh r1, [r0, #8]
+; CHECK-LE-NARROW-NEXT:    ldr r1, [r0, #6]
+; CHECK-LE-NARROW-NEXT:    orr r1, r1, #16646144
+; CHECK-LE-NARROW-NEXT:    orr r1, r1, #-16777216
+; CHECK-LE-NARROW-NEXT:    str r1, [r0, #6]
 ; CHECK-LE-NARROW-NEXT:    mov r0, #0
 ; CHECK-LE-NARROW-NEXT:    bx lr
 ;
@@ -50,10 +50,10 @@ define i16 @test(ptr %p1) {
 ;
 ; CHECK-BE-NARROW-LABEL: test:
 ; CHECK-BE-NARROW:       @ %bb.0: @ %entry
-; CHECK-BE-NARROW-NEXT:    ldrh r1, [r0]
-; CHECK-BE-NARROW-NEXT:    movw r2, #65534
-; CHECK-BE-NARROW-NEXT:    orr r1, r1, r2
-; CHECK-BE-NARROW-NEXT:    strh r1, [r0]
+; CHECK-BE-NARROW-NEXT:    ldr r1, [r0]
+; CHECK-BE-NARROW-NEXT:    orr r1, r1, #16646144
+; CHECK-BE-NARROW-NEXT:    orr r1, r1, #-16777216
+; CHECK-BE-NARROW-NEXT:    str r1, [r0]
 ; CHECK-BE-NARROW-NEXT:    mov r0, #0
 ; CHECK-BE-NARROW-NEXT:    bx lr
 entry:
diff --git a/llvm/test/CodeGen/X86/apx/or.ll b/llvm/test/CodeGen/X86/apx/or.ll
@@ -906,13 +906,13 @@ entry:
 define void @or64mi_legacy(ptr %a) {
 ; CHECK-LABEL: or64mi_legacy:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    orq $123456, (%rdi) # encoding: [0x48,0x81,0x0f,0x40,0xe2,0x01,0x00]
+; CHECK-NEXT:    orl $123456, (%rdi) # encoding: [0x81,0x0f,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 ;
 ; NF-LABEL: or64mi_legacy:
 ; NF:       # %bb.0: # %entry
-; NF-NEXT:    orq $123456, (%rdi) # encoding: [0x48,0x81,0x0f,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    orl $123456, (%rdi) # encoding: [0x81,0x0f,0x40,0xe2,0x01,0x00]
 ; NF-NEXT:    # imm = 0x1E240
 ; NF-NEXT:    retq # encoding: [0xc3]
 entry:
diff --git a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
@@ -6,22 +6,12 @@ define void @i24_or(ptr %a) {
 ; X86-LABEL: i24_or:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %ecx
-; X86-NEXT:    movzbl 2(%eax), %edx
-; X86-NEXT:    shll $16, %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    orl $384, %edx # imm = 0x180
-; X86-NEXT:    movw %dx, (%eax)
+; X86-NEXT:    orw $384, (%eax) # imm = 0x180
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: i24_or:
 ; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    movzbl 2(%rdi), %ecx
-; X64-NEXT:    shll $16, %ecx
-; X64-NEXT:    orl %eax, %ecx
-; X64-NEXT:    orl $384, %ecx # imm = 0x180
-; X64-NEXT:    movw %cx, (%rdi)
+; X64-NEXT:    orw $384, (%rdi) # imm = 0x180
 ; X64-NEXT:    retq
   %aa = load i24, ptr %a, align 1
   %b = or i24 %aa, 384
@@ -103,12 +93,12 @@ define void @i56_or(ptr %a) {
 ; X86-LABEL: i56_or:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    orl $384, (%eax) # imm = 0x180
+; X86-NEXT:    orw $384, (%eax) # imm = 0x180
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: i56_or:
 ; X64:       # %bb.0:
-; X64-NEXT:    orl $384, (%rdi) # imm = 0x180
+; X64-NEXT:    orw $384, (%rdi) # imm = 0x180
 ; X64-NEXT:    retq
   %aa = load i56, ptr %a, align 1
   %b = or i56 %aa, 384
diff --git a/llvm/test/CodeGen/X86/pr35763.ll b/llvm/test/CodeGen/X86/pr35763.ll
@@ -14,15 +14,7 @@ define dso_local void @PR35763() {
 ; CHECK-NEXT:    movzwl z+2(%rip), %ecx
 ; CHECK-NEXT:    orl %eax, %ecx
 ; CHECK-NEXT:    movq %rcx, tf_3_var_136(%rip)
-; CHECK-NEXT:    movl z+6(%rip), %eax
-; CHECK-NEXT:    movzbl z+10(%rip), %ecx
-; CHECK-NEXT:    shlq $32, %rcx
-; CHECK-NEXT:    orq %rax, %rcx
-; CHECK-NEXT:    movabsq $1090921758719, %rax # imm = 0xFE0000FFFF
-; CHECK-NEXT:    andq %rcx, %rax
-; CHECK-NEXT:    movl %eax, z+6(%rip)
-; CHECK-NEXT:    shrq $32, %rax
-; CHECK-NEXT:    movb %al, z+10(%rip)
+; CHECK-NEXT:    andl $-33554177, z+7(%rip) # imm = 0xFE0000FF
 ; CHECK-NEXT:    retq
 entry:
   %0 = load i16, ptr @z, align 8
diff --git a/llvm/test/CodeGen/X86/store_op_load_fold.ll b/llvm/test/CodeGen/X86/store_op_load_fold.ll
@@ -23,10 +23,7 @@ define void @test2() nounwind uwtable ssp {
 ; CHECK-LABEL: test2:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl L_s2$non_lazy_ptr, %eax
-; CHECK-NEXT:    movzbl 22(%eax), %ecx
-; CHECK-NEXT:    andl $-4, %ecx
-; CHECK-NEXT:    movb %cl, 22(%eax)
-; CHECK-NEXT:    movw $0, 20(%eax)
+; CHECK-NEXT:    andl $-67108609, 19(%eax) ## imm = 0xFC0000FF
 ; CHECK-NEXT:    retl
   %bf.load35 = load i56, ptr getelementptr inbounds (%struct.S2, ptr @s2, i32 0, i32 5), align 16
   %bf.clear36 = and i56 %bf.load35, -1125895611875329