Commit a415b7f
[WebAssembly] Add more lowerings for wide-arithmetic (#132430)
This commit is the result of investigation and discussion on WebAssembly/wide-arithmetic#6, where alternatives to the `i64.add128` instruction were discussed but ultimately deferred to a future proposal. In the meantime, I wanted to apply a few minor changes to the LLVM backend when `wide-arithmetic` is enabled:

* A lowering for the `ISD::UADDO` node is added which uses `add128` when the upper bits of the two operands are constant zeros; the result of the 128-bit addition is the result of the overflowing addition.
* The high bits of an `I64_ADD128` node are now flagged as "known zero" if the upper bits of the inputs are also zero, assisting the `UADDO` lowering by ensuring the backend knows the carry result is a 1-bit value.

A few tests were then added to showcase lowerings for various operations that can be done with wide-arithmetic. They don't all optimize especially well at this time, but I wanted to include them as a reference, to have them on hand for future evaluations if necessary.
1 parent 5145367 commit a415b7f
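To make the `UADDO` lowering concrete, here is a minimal standalone C++ sketch (my own model for this page, not code from the commit): when both operands are zero-extended into the four i64 halves consumed by `i64.add128`, the low half of the result is the wrapping 64-bit sum and the high half is exactly the 1-bit carry.

#include <cassert>
#include <cstdint>

// Standalone model of lowering a 64-bit overflowing add through a 128-bit
// addition whose upper operand halves are zero. `unsigned __int128` (a
// Clang/GCC extension) stands in for the wasm `i64.add128` result pair.
struct UAddO {
  uint64_t sum;   // low half of the 128-bit result: the wrapping sum
  uint64_t carry; // high half of the 128-bit result: always 0 or 1
};

UAddO uaddo_via_add128(uint64_t lhs, uint64_t rhs) {
  unsigned __int128 wide = (unsigned __int128)lhs + (unsigned __int128)rhs;
  return UAddO{(uint64_t)wide, (uint64_t)(wide >> 64)};
}

int main() {
  assert(uaddo_via_add128(~0ull, 1).carry == 1);  // wraps to 0, carry set
  assert(uaddo_via_add128(1, 2).carry == 0);      // 3, no carry
  assert(uaddo_via_add128(~0ull, ~0ull).sum == ~0ull - 1);
  return 0;
}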

File tree

3 files changed: +176 -3 lines changed

llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp

Lines changed: 41 additions & 3 deletions
@@ -170,6 +170,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
     setOperationAction(ISD::SUB, MVT::i128, Custom);
     setOperationAction(ISD::SMUL_LOHI, MVT::i64, Custom);
     setOperationAction(ISD::UMUL_LOHI, MVT::i64, Custom);
+    setOperationAction(ISD::UADDO, MVT::i64, Custom);
   }
 
   if (Subtarget->hasNontrappingFPToInt())
@@ -1109,6 +1110,18 @@ void WebAssemblyTargetLowering::computeKnownBitsForTargetNode(
       }
     }
   }
+
+  // For 128-bit addition if the upper bits are all zero then it's known that
+  // the upper bits of the result will have all bits guaranteed zero except the
+  // first.
+  case WebAssemblyISD::I64_ADD128:
+    if (Op.getResNo() == 1) {
+      SDValue LHS_HI = Op.getOperand(1);
+      SDValue RHS_HI = Op.getOperand(3);
+      if (isNullConstant(LHS_HI) && isNullConstant(RHS_HI))
+        Known.Zero.setBitsFrom(1);
+    }
+    break;
   }
 }
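The `Known.Zero.setBitsFrom(1)` claim is justified by simple range arithmetic: with both upper halves zero, the largest possible sum is (2^64 - 1) + (2^64 - 1) = 2^65 - 2, so the high half of the 128-bit result can only be 0 or 1. A quick standalone C++ check of that worst case (again my own sketch, not commit code):

#include <cassert>
#include <cstdint>

int main() {
  // Worst case for two u64 addends with zero upper halves:
  // (2^64 - 1) + (2^64 - 1) = 2^65 - 2, whose high 64 bits equal 1.
  // Hence every bit of the carry result above bit 0 is known zero.
  uint64_t max64 = ~0ull;
  unsigned __int128 worst = (unsigned __int128)max64 + max64;
  assert((uint64_t)(worst >> 64) == 1);
  return 0;
}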

@@ -1678,6 +1691,8 @@ SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op,
   case ISD::SMUL_LOHI:
   case ISD::UMUL_LOHI:
     return LowerMUL_LOHI(Op, DAG);
+  case ISD::UADDO:
+    return LowerUADDO(Op, DAG);
   }
 }

@@ -1794,10 +1809,33 @@ SDValue WebAssemblyTargetLowering::LowerMUL_LOHI(SDValue Op,
   }
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
-  SDValue Hi =
+  SDValue Lo =
       DAG.getNode(Opcode, DL, DAG.getVTList(MVT::i64, MVT::i64), LHS, RHS);
-  SDValue Lo(Hi.getNode(), 1);
-  SDValue Ops[] = {Hi, Lo};
+  SDValue Hi(Lo.getNode(), 1);
+  SDValue Ops[] = {Lo, Hi};
+  return DAG.getMergeValues(Ops, DL);
+}
+
+// Lowers `UADDO` intrinsics to an `i64.add128` instruction when it's enabled.
+//
+// This enables generating a single wasm instruction for this operation where
+// the upper half of both operands are constant zeros. The upper half of the
+// result is then whether the overflow happened.
+SDValue WebAssemblyTargetLowering::LowerUADDO(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  assert(Subtarget->hasWideArithmetic());
+  assert(Op.getValueType() == MVT::i64);
+  assert(Op.getOpcode() == ISD::UADDO);
+  SDLoc DL(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+  SDValue Result =
+      DAG.getNode(WebAssemblyISD::I64_ADD128, DL,
+                  DAG.getVTList(MVT::i64, MVT::i64), LHS, Zero, RHS, Zero);
+  SDValue CarryI64(Result.getNode(), 1);
+  SDValue CarryI32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, CarryI64);
+  SDValue Ops[] = {Result, CarryI32};
   return DAG.getMergeValues(Ops, DL);
 }

llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h

Lines changed: 1 addition & 0 deletions
@@ -133,6 +133,7 @@ class WebAssemblyTargetLowering final : public TargetLowering {
   SDValue LowerStore(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
   SDValue Replace128Op(SDNode *N, SelectionDAG &DAG) const;
+  SDValue LowerUADDO(SDValue Op, SelectionDAG &DAG) const;
 
   // Custom DAG combine hooks
   SDValue

llvm/test/CodeGen/WebAssembly/wide-arithmetic.ll

Lines changed: 134 additions & 0 deletions
@@ -130,3 +130,137 @@ define i64 @mul_i128_only_lo(i128 %a, i128 %b) {
   %d = trunc i128 %c to i64
   ret i64 %d
 }
+
+declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64)
+declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64)
+
+; This is a codegen test to see the effect of overflowing adds on signed
+; integers with wide-arithmetic enabled. At this time it doesn't actually
+; generate anything differently than without wide-arithmetic but this has also
+; been useful for evaluating the proposal.
+define { i64, i1 } @add_wide_s(i64 %a, i64 %b) {
+; CHECK-LABEL: add_wide_s:
+; CHECK:         .functype add_wide_s (i32, i64, i64) -> ()
+; CHECK-NEXT:    .local i64
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i64.add
+; CHECK-NEXT:    local.tee 3
+; CHECK-NEXT:    i64.store 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    i64.lt_s
+; CHECK-NEXT:    local.get 3
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i64.lt_s
+; CHECK-NEXT:    i32.xor
+; CHECK-NEXT:    i32.store8 8
+; CHECK-NEXT:  # fallthrough-return
+  %pair = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b)
+  ret { i64, i1 } %pair
+}
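The xor of the two `i64.lt_s` results above is the standard branch-free signed-overflow test: for wrapping addition, overflow occurred iff (b <s 0) differs from (a + b <s a). A standalone C++ model of that check (my sketch, not part of the commit):

#include <cassert>
#include <cstdint>

// Branch-free signed-overflow check matching the generated wasm: two signed
// comparisons (i64.lt_s) combined with an xor.
bool sadd_overflows(int64_t a, int64_t b) {
  // Compute the wrapping sum via unsigned math to avoid signed-overflow UB.
  int64_t sum = (int64_t)((uint64_t)a + (uint64_t)b);
  return (b < 0) != (sum < a);
}

int main() {
  assert(sadd_overflows(INT64_MAX, 1));   // positive overflow
  assert(sadd_overflows(INT64_MIN, -1));  // negative overflow
  assert(!sadd_overflows(-1, -1));        // -2, fits
  return 0;
}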
+
+define { i64, i1 } @add_wide_u(i64 %a, i64 %b) {
+; CHECK-LABEL: add_wide_u:
+; CHECK:         .functype add_wide_u (i32, i64, i64) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    i64.add128
+; CHECK-NEXT:    local.set 1
+; CHECK-NEXT:    local.set 2
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i64.store8 8
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i64.store 0
+; CHECK-NEXT:  # fallthrough-return
+  %pair = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
+  ret { i64, i1 } %pair
+}
+
+; This is a model of a hypothetical `i64.add_wide3_u` instruction using LLVM
+; intrinsics. In theory this should optimize better (to the equivalent below)
+; but it doesn't currently.
+define { i64, i64 } @add_wide3_u_via_intrinsics(i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: add_wide3_u_via_intrinsics:
+; CHECK:         .functype add_wide3_u_via_intrinsics (i32, i64, i64, i64) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    i64.add128
+; CHECK-NEXT:    local.set 2
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    local.get 3
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    i64.add128
+; CHECK-NEXT:    local.set 1
+; CHECK-NEXT:    i64.store 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i64.add
+; CHECK-NEXT:    i64.store 8
+; CHECK-NEXT:  # fallthrough-return
+  %pair = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
+  %t0 = extractvalue { i64, i1 } %pair, 0
+  %carry1 = extractvalue { i64, i1 } %pair, 1
+
+  %pair2 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %t0, i64 %c)
+  %ret1 = extractvalue { i64, i1 } %pair2, 0
+  %carry2 = extractvalue { i64, i1 } %pair2, 1
+
+  %carry1_64 = zext i1 %carry1 to i64
+  %carry2_64 = zext i1 %carry2 to i64
+  %ret2 = add i64 %carry1_64, %carry2_64
+
+  %r0 = insertvalue { i64, i64 } poison, i64 %ret1, 0
+  %r1 = insertvalue { i64, i64 } %r0, i64 %ret2, 1
+  ret { i64, i64 } %r1
+}
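As a reference model for the hypothetical instruction (my own standalone C++ sketch mirroring the IR above, not anything real): two chained overflowing adds whose carry bits are summed, so the high result is 0, 1, or 2.

#include <cassert>
#include <cstdint>

struct Wide3 {
  uint64_t lo; // low 64 bits of a + b + c
  uint64_t hi; // sum of the two carries: 0, 1, or 2
};

// Mirrors @add_wide3_u_via_intrinsics using the Clang/GCC builtin
// __builtin_add_overflow in place of @llvm.uadd.with.overflow.i64.
Wide3 add_wide3_u(uint64_t a, uint64_t b, uint64_t c) {
  uint64_t t0, ret1;
  bool carry1 = __builtin_add_overflow(a, b, &t0);
  bool carry2 = __builtin_add_overflow(t0, c, &ret1);
  return Wide3{ret1, (uint64_t)carry1 + (uint64_t)carry2};
}

int main() {
  // Maximal inputs: 3 * (2^64 - 1) needs a high half of 2.
  assert(add_wide3_u(~0ull, ~0ull, ~0ull).hi == 2);
  assert(add_wide3_u(1, 2, 3).lo == 6);
  return 0;
}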
+
+; This is a model of a hypothetical `i64.add_wide3_u` instruction using 128-bit
+; integer addition. This optimizes better than the above currently.
+define { i64, i64 } @add_wide3_u_via_i128(i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: add_wide3_u_via_i128:
+; CHECK:         .functype add_wide3_u_via_i128 (i32, i64, i64, i64) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    i64.add128
+; CHECK-NEXT:    local.get 3
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    i64.add128
+; CHECK-NEXT:    local.set 1
+; CHECK-NEXT:    local.set 2
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i64.store 8
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i64.store 0
+; CHECK-NEXT:  # fallthrough-return
+  %a128 = zext i64 %a to i128
+  %b128 = zext i64 %b to i128
+  %c128 = zext i64 %c to i128
+  %t0 = add i128 %a128, %b128
+  %t1 = add i128 %t0, %c128
+  %result = trunc i128 %t1 to i64
+  %t2 = lshr i128 %t1, 64
+  %carry = trunc i128 %t2 to i64
+
+  %ret0 = insertvalue { i64, i64 } poison, i64 %result, 0
+  %ret1 = insertvalue { i64, i64 } %ret0, i64 %carry, 1
+  ret { i64, i64 } %ret1
+}
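The same computation written the way `@add_wide3_u_via_i128` does it, as one more standalone C++ sketch: zero-extend to 128 bits, add twice, then split the sum into halves. Both formulations agree on all inputs, which is why the two tests should ideally compile to the same code.

#include <cassert>
#include <cstdint>

// Mirrors @add_wide3_u_via_i128: widen all three operands, add, then split
// the 128-bit sum into its low and high 64-bit halves.
void add_wide3_u_i128(uint64_t a, uint64_t b, uint64_t c,
                      uint64_t *lo, uint64_t *hi) {
  unsigned __int128 t =
      (unsigned __int128)a + (unsigned __int128)b + (unsigned __int128)c;
  *lo = (uint64_t)t;
  *hi = (uint64_t)(t >> 64);
}

int main() {
  uint64_t lo, hi;
  add_wide3_u_i128(~0ull, ~0ull, ~0ull, &lo, &hi);
  assert(hi == 2 && lo == ~0ull - 2); // 3 * (2^64 - 1), split into halves
  return 0;
}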
