Skip to content
This repository was archived by the owner on Mar 28, 2020. It is now read-only.

Commit cdc2ace

Browse files
author
Krzysztof Parzyszek
committed
[Hexagon] Undo shift folding where it could simplify addressing mode
For example, avoid (single shift):
  r0 = and(##536870908,lsr(r0,#3))
  r0 = memw(r1+r0<<#0)
in favor of (two shifts):
  r0 = lsr(r0,#5)
  r0 = memw(r1+r0<<#2)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@296196 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 53ff96a commit cdc2ace

File tree

2 files changed

+134
-3
lines changed

2 files changed

+134
-3
lines changed

lib/Target/Hexagon/HexagonISelDAGToDAG.cpp

Lines changed: 75 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1023,8 +1023,8 @@ void HexagonDAGToDAGISel::PreprocessISelDAG() {
10231023
}
10241024
}
10251025

1026-
// Transform: (store ch addr (add x (add (shl y c) e)))
1027-
// to: (store ch addr (add x (shl (add y d) c))),
1026+
// Transform: (store ch val (add x (add (shl y c) e)))
1027+
// to: (store ch val (add x (shl (add y d) c))),
10281028
// where e = (shl d c) for some integer d.
10291029
// The purpose of this is to enable generation of loads/stores with
10301030
// shifted addressing mode, i.e. mem(x+y<<#c). For that, the shift
@@ -1033,7 +1033,7 @@ void HexagonDAGToDAGISel::PreprocessISelDAG() {
10331033
if (I->getOpcode() != ISD::STORE)
10341034
continue;
10351035

1036-
// I matched: (store ch addr Off)
1036+
// I matched: (store ch val Off)
10371037
SDValue Off = I->getOperand(2);
10381038
// Off needs to match: (add x (add (shl y c) (shl d c)))
10391039
if (Off.getOpcode() != ISD::ADD)
@@ -1076,6 +1076,78 @@ void HexagonDAGToDAGISel::PreprocessISelDAG() {
10761076
ReplaceNode(T0.getNode(), NewShl.getNode());
10771077
}
10781078

1079+
// Transform (shown for a load; a store's address operand is handled the same way): (load ch (add x (and (srl y c) Mask)))
1080+
// to: (load ch (add x (shl (srl y d) d-c)))
1081+
// where
1082+
// Mask = 00..0 111..1 0.0
1083+
// | | +-- d-c 0s, and d-c is 0, 1 or 2.
1084+
// | +-------- 1s
1085+
// +-------------- at most c 0s
1086+
// Motivating example:
1087+
// DAG combiner optimizes (add x (shl (srl y 5) 2))
1088+
// to (add x (and (srl y 3) 1FFFFFFC))
1089+
// which results in a constant-extended and(##...,lsr). This transformation
1090+
// undoes this simplification for cases where the shl can be folded into
1091+
// an addressing mode.
1092+
for (SDNode *N : Nodes) {
1093+
unsigned Opc = N->getOpcode();
1094+
if (Opc != ISD::LOAD && Opc != ISD::STORE)
1095+
continue;
1096+
SDValue Addr = Opc == ISD::LOAD ? N->getOperand(1) : N->getOperand(2);
1097+
// Addr must match: (add x T0)
1098+
if (Addr.getOpcode() != ISD::ADD)
1099+
continue;
1100+
SDValue T0 = Addr.getOperand(1);
1101+
// T0 must match: (and T1 Mask)
1102+
if (T0.getOpcode() != ISD::AND)
1103+
continue;
1104+
1105+
// We have an AND.
1106+
//
1107+
// Check the first operand. It must be: (srl y c).
1108+
SDValue S = T0.getOperand(0);
1109+
if (S.getOpcode() != ISD::SRL)
1110+
continue;
1111+
ConstantSDNode *SN = dyn_cast<ConstantSDNode>(S.getOperand(1).getNode());
1112+
if (SN == nullptr)
1113+
continue;
1114+
if (SN->getAPIntValue().getBitWidth() != 32)
1115+
continue;
1116+
uint32_t CV = SN->getZExtValue();
1117+
1118+
// Check the second operand: the supposed mask.
1119+
ConstantSDNode *MN = dyn_cast<ConstantSDNode>(T0.getOperand(1).getNode());
1120+
if (MN == nullptr)
1121+
continue;
1122+
if (MN->getAPIntValue().getBitWidth() != 32)
1123+
continue;
1124+
uint32_t Mask = MN->getZExtValue();
1125+
// Examine the mask.
1126+
uint32_t TZ = countTrailingZeros(Mask);
1127+
uint32_t M1 = countTrailingOnes(Mask >> TZ);
1128+
uint32_t LZ = countLeadingZeros(Mask);
1129+
// Trailing zeros + middle ones + leading zeros must equal the width.
1130+
if (TZ + M1 + LZ != 32)
1131+
continue;
1132+
// The number of trailing zeros will be encoded in the addressing mode.
1133+
if (TZ > 2)
1134+
continue;
1135+
// The number of leading zeros must be at most c.
1136+
if (LZ > CV)
1137+
continue;
1138+
1139+
// All looks good.
1140+
SDValue Y = S.getOperand(0);
1141+
EVT VT = Addr.getValueType();
1142+
SDLoc dl(S);
1143+
// TZ = D-C, so D = TZ+C.
1144+
SDValue D = DAG.getConstant(TZ+CV, dl, VT);
1145+
SDValue DC = DAG.getConstant(TZ, dl, VT);
1146+
SDValue NewSrl = DAG.getNode(ISD::SRL, dl, VT, Y, D);
1147+
SDValue NewShl = DAG.getNode(ISD::SHL, dl, VT, NewSrl, DC);
1148+
ReplaceNode(T0.getNode(), NewShl.getNode());
1149+
}
1150+
10791151
if (EnableAddressRebalancing) {
10801152
rebalanceAddressTrees();
10811153

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
; RUN: llc -march=hexagon < %s | FileCheck %s
2+
3+
; DAG combiner folds sequences of shifts, which can sometimes obscure
4+
; optimization opportunities. For example
5+
;
6+
; unsigned int c(unsigned int b, unsigned int *a) {
7+
; unsigned int bitidx = b >> 5;
8+
; return a[bitidx];
9+
; }
10+
;
11+
; produces
12+
; (add x (shl (srl y 5) 2))
13+
; which is then folded into
14+
; (add x (and (srl y 3) 1FFFFFFC))
15+
;
16+
; That results in a constant-extended and:
17+
; r0 = and(##536870908,lsr(r0,#3))
18+
; r0 = memw(r1+r0<<#0)
19+
; whereas
20+
; r0 = lsr(r0,#5)
21+
; r0 = memw(r1+r0<<#2)
22+
; is more desirable.
23+
24+
target triple = "hexagon"
25+
26+
; CHECK-LABEL: load_0
27+
; CHECK: memw(r{{[0-9]+}}+r{{[0-9]}}<<#2)
28+
define i32 @load_0(i32 %b, i32* nocapture readonly %a) #0 {
29+
entry:
30+
%shr = lshr i32 %b, 5
31+
%arrayidx = getelementptr inbounds i32, i32* %a, i32 %shr
32+
%0 = load i32, i32* %arrayidx, align 4
33+
ret i32 %0
34+
}
35+
36+
; This would require r0<<#3, which is not legal.
37+
; CHECK-LABEL: load_1
38+
; CHECK: memw(r{{[0-9]+}}+r{{[0-9]}}<<#0)
39+
define i32 @load_1(i32 %b, [3 x i32]* nocapture readonly %a) #0 {
40+
entry:
41+
%shr = lshr i32 %b, 5
42+
%arrayidx = getelementptr inbounds [3 x i32], [3 x i32]* %a, i32 %shr, i32 0
43+
%0 = load i32, i32* %arrayidx, align 4
44+
ret i32 %0
45+
}
46+
47+
; CHECK-LABEL: store_0
48+
; CHECK: memw(r{{[0-9]+}}+r{{[0-9]}}<<#2)
49+
define void @store_0(i32 %b, i32* nocapture %a, i32 %v) #1 {
50+
entry:
51+
%shr = lshr i32 %b, 5
52+
%arrayidx = getelementptr inbounds i32, i32* %a, i32 %shr
53+
store i32 %v, i32* %arrayidx, align 4
54+
ret void
55+
}
56+
57+
attributes #0 = { norecurse nounwind readonly "target-cpu"="hexagonv60" "target-features"="-hvx,-hvx-double,-long-calls" }
58+
attributes #1 = { norecurse nounwind "target-cpu"="hexagonv60" "target-features"="-hvx,-hvx-double,-long-calls" }
59+

0 commit comments

Comments
 (0)