[Hexagon] Undo shift folding where it could simplify addressing mode

Krzysztof Parzyszek · Krzysztof Parzyszek · commit cdc2ace6920c · 2017-02-24T23:34:24.000Z
For example, avoid (single shift):
  r0 = and(##536870908,lsr(r0,#3))
  r0 = memw(r1+r0<<#0)
in favor of (two shifts):
  r0 = lsr(r0,#5)
  r0 = memw(r1+r0<<#2)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@296196 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -1023,8 +1023,8 @@ void HexagonDAGToDAGISel::PreprocessISelDAG() {
     }
   }
 
-  // Transform: (store ch addr (add x (add (shl y c) e)))
-  //        to: (store ch addr (add x (shl (add y d) c))),
+  // Transform: (store ch val (add x (add (shl y c) e)))
+  //        to: (store ch val (add x (shl (add y d) c))),
   // where e = (shl d c) for some integer d.
   // The purpose of this is to enable generation of loads/stores with
   // shifted addressing mode, i.e. mem(x+y<<#c). For that, the shift
@@ -1033,7 +1033,7 @@ void HexagonDAGToDAGISel::PreprocessISelDAG() {
     if (I->getOpcode() != ISD::STORE)
       continue;
 
-    // I matched: (store ch addr Off)
+    // I matched: (store ch val Off)
     SDValue Off = I->getOperand(2);
     // Off needs to match: (add x (add (shl y c) (shl d c))))
     if (Off.getOpcode() != ISD::ADD)
@@ -1076,6 +1076,78 @@ void HexagonDAGToDAGISel::PreprocessISelDAG() {
     ReplaceNode(T0.getNode(), NewShl.getNode());
   }
 
+  // Transform: (load ch (add x (and (srl y c) Mask)))
+  //        to: (load ch (add x (shl (srl y d) d-c)))
+  // where
+  // Mask = 00..0 111..1 0.0
+  //          |     |     +-- d-c 0s, and d-c is 0, 1 or 2.
+  //          |     +-------- 1s
+  //          +-------------- at most c 0s
+  // Motivating example:
+  // DAG combiner optimizes (add x (shl (srl y 5) 2))
+  //                     to (add x (and (srl y 3) 1FFFFFFC))
+  // which results in a constant-extended and(##...,lsr). This transformation
+  // undoes this simplification for loads and stores where the shl can be
+  // folded into an addressing mode.
+  for (SDNode *N : Nodes) {
+    unsigned Opc = N->getOpcode();
+    if (Opc != ISD::LOAD && Opc != ISD::STORE)
+      continue;
+    SDValue Addr = Opc == ISD::LOAD ? N->getOperand(1) : N->getOperand(2);
+    // Addr must match: (add x T0)
+    if (Addr.getOpcode() != ISD::ADD)
+      continue;
+    SDValue T0 = Addr.getOperand(1);
+    // T0 must match: (and T1 Mask)
+    if (T0.getOpcode() != ISD::AND)
+      continue;
+
+    // We have an AND.
+    //
+    // Check the first operand. It must be: (srl y c).
+    SDValue S = T0.getOperand(0);
+    if (S.getOpcode() != ISD::SRL)
+      continue;
+    ConstantSDNode *SN = dyn_cast<ConstantSDNode>(S.getOperand(1).getNode());
+    if (SN == nullptr)
+      continue;
+    if (SN->getAPIntValue().getBitWidth() != 32)
+      continue;
+    uint32_t CV = SN->getZExtValue();
+
+    // Check the second operand: the supposed mask.
+    ConstantSDNode *MN = dyn_cast<ConstantSDNode>(T0.getOperand(1).getNode());
+    if (MN == nullptr)
+      continue;
+    if (MN->getAPIntValue().getBitWidth() != 32)
+      continue;
+    uint32_t Mask = MN->getZExtValue();
+    // Examine the mask. NOTE(review): Mask == 0 makes the shift by TZ UB.
+    uint32_t TZ = countTrailingZeros(Mask);
+    uint32_t M1 = countTrailingOnes(Mask >> TZ);
+    uint32_t LZ = countLeadingZeros(Mask);
+    // Trailing zeros + middle ones + leading zeros must equal the width.
+    if (TZ + M1 + LZ != 32)
+      continue;
+    // The number of trailing zeros will be encoded in the addressing mode.
+    if (TZ > 2)
+      continue;
+    // The number of leading zeros must be at most c.
+    if (LZ > CV)
+      continue;
+
+    // All looks good.
+    SDValue Y = S.getOperand(0);
+    EVT VT = Addr.getValueType();
+    SDLoc dl(S);
+    // TZ = D-C, so D = TZ+C.
+    SDValue D = DAG.getConstant(TZ+CV, dl, VT);
+    SDValue DC = DAG.getConstant(TZ, dl, VT);
+    SDValue NewSrl = DAG.getNode(ISD::SRL, dl, VT, Y, D);
+    SDValue NewShl = DAG.getNode(ISD::SHL, dl, VT, NewSrl, DC);
+    ReplaceNode(T0.getNode(), NewShl.getNode());
+  }
+
   if (EnableAddressRebalancing) {
     rebalanceAddressTrees();
 
diff --git a/test/CodeGen/Hexagon/undo-dag-shift.ll b/test/CodeGen/Hexagon/undo-dag-shift.ll
@@ -0,0 +1,59 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; DAG combiner folds sequences of shifts, which can sometimes obscure
+; optimization opportunities. For example
+;
+;   unsigned int c(unsigned int b, unsigned int *a) {
+;     unsigned int bitidx = b >> 5;
+;     return a[bitidx];
+;   }
+;
+; produces
+;   (add x (shl (srl y 5) 2))
+; which is then folded into
+;   (add x (and (srl y 3) 1FFFFFFC))
+;
+; That results in a constant-extended and:
+;   r0 = and(##536870908,lsr(r0,#3))
+;   r0 = memw(r1+r0<<#0)
+; whereas
+;   r0 = lsr(r0,#5)
+;   r0 = memw(r1+r0<<#2)
+; is more desirable.
+
+target triple = "hexagon"
+
+; CHECK-LABEL: load_0
+; CHECK: memw(r{{[0-9]+}}+r{{[0-9]+}}<<#2)
+define i32 @load_0(i32 %b, i32* nocapture readonly %a) #0 {
+entry:
+  %shr = lshr i32 %b, 5 ; word index = b >> 5
+  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %shr ; i32 elements: index scaled by 4
+  %0 = load i32, i32* %arrayidx, align 4
+  ret i32 %0
+}
+
+; This would require r0<<#3, which is not legal.
+; CHECK-LABEL: load_1
+; CHECK: memw(r{{[0-9]+}}+r{{[0-9]+}}<<#0)
+define i32 @load_1(i32 %b, [3 x i32]* nocapture readonly %a) #0 {
+entry:
+  %shr = lshr i32 %b, 5 ; array index = b >> 5
+  %arrayidx = getelementptr inbounds [3 x i32], [3 x i32]* %a, i32 %shr, i32 0 ; first i32 of the selected [3 x i32]
+  %0 = load i32, i32* %arrayidx, align 4
+  ret i32 %0
+}
+
+; CHECK-LABEL: store_0
+; CHECK: memw(r{{[0-9]+}}+r{{[0-9]+}}<<#2)
+define void @store_0(i32 %b, i32* nocapture %a, i32 %v) #1 {
+entry:
+  %shr = lshr i32 %b, 5 ; word index = b >> 5
+  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %shr ; i32 elements: index scaled by 4
+  store i32 %v, i32* %arrayidx, align 4
+  ret void
+}
+
+attributes #0 = { norecurse nounwind readonly "target-cpu"="hexagonv60" "target-features"="-hvx,-hvx-double,-long-calls" }
+attributes #1 = { norecurse nounwind "target-cpu"="hexagonv60" "target-features"="-hvx,-hvx-double,-long-calls" }
+