-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[WebAssembly] Add more lowerings for wide-arithmetic #132430
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-webassembly Author: Alex Crichton (alexcrichton) Changes: This commit is the result of investigation and discussion on WebAssembly/wide-arithmetic#6 where alternatives to the `i64.add128` instruction were discussed but ultimately deferred to a future proposal.
A few tests were then added to showcase various lowerings for various operations that can be done with wide-arithmetic. They don't all optimize super well at this time but I wanted to add them as a reference here regardless to have them on-hand for future evaluations if necessary. Full diff: https://github.com/llvm/llvm-project/pull/132430.diff 3 Files Affected:
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 9ae46e709d823..3a720af1de770 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -170,6 +170,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
setOperationAction(ISD::SUB, MVT::i128, Custom);
setOperationAction(ISD::SMUL_LOHI, MVT::i64, Custom);
setOperationAction(ISD::UMUL_LOHI, MVT::i64, Custom);
+ setOperationAction(ISD::UADDO, MVT::i64, Custom);
}
if (Subtarget->hasNontrappingFPToInt())
@@ -1109,6 +1110,18 @@ void WebAssemblyTargetLowering::computeKnownBitsForTargetNode(
}
}
}
+
+ // For 128-bit addition if the upper bits are all zero then it's known that
+ // the upper bits of the result will have all bits guaranteed zero except the
+ // first.
+ case WebAssemblyISD::I64_ADD128:
+ if (Op.getResNo() == 1) {
+ SDValue LHS_HI = Op.getOperand(1);
+ SDValue RHS_HI = Op.getOperand(3);
+ if (isNullConstant(LHS_HI) && isNullConstant(RHS_HI))
+ Known.Zero.setBitsFrom(1);
+ }
+ break;
}
}
@@ -1678,6 +1691,8 @@ SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op,
case ISD::SMUL_LOHI:
case ISD::UMUL_LOHI:
return LowerMUL_LOHI(Op, DAG);
+ case ISD::UADDO:
+ return LowerUADDO(Op, DAG);
}
}
@@ -1794,10 +1809,32 @@ SDValue WebAssemblyTargetLowering::LowerMUL_LOHI(SDValue Op,
}
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
- SDValue Hi =
+ SDValue Lo =
DAG.getNode(Opcode, DL, DAG.getVTList(MVT::i64, MVT::i64), LHS, RHS);
- SDValue Lo(Hi.getNode(), 1);
- SDValue Ops[] = {Hi, Lo};
+ SDValue Hi(Lo.getNode(), 1);
+ SDValue Ops[] = {Lo, Hi};
+ return DAG.getMergeValues(Ops, DL);
+}
+
+// Lowers `UADDO` intrinsics to an `i64.add128` instruction when it's enabled.
+//
+// This enables generating a single wasm instruction for this operation where
+// the upper bits of both operands are constant zeros. The upper bits of the
+// result are whether the overflow happened.
+SDValue WebAssemblyTargetLowering::LowerUADDO(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Subtarget->hasWideArithmetic());
+ assert(Op.getValueType() == MVT::i64);
+ assert(Op.getOpcode() == ISD::UADDO);
+ SDLoc DL(Op);
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+ SDValue Result =
+ DAG.getNode(WebAssemblyISD::I64_ADD128, DL, DAG.getVTList(MVT::i64, MVT::i64), LHS, Zero, RHS, Zero);
+ SDValue CarryI64(Result.getNode(), 1);
+ SDValue CarryI32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, CarryI64);
+ SDValue Ops[] = {Result, CarryI32};
return DAG.getMergeValues(Ops, DL);
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index 90d31e38a7076..72401a7a259c0 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -133,6 +133,7 @@ class WebAssemblyTargetLowering final : public TargetLowering {
SDValue LowerStore(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
SDValue Replace128Op(SDNode *N, SelectionDAG &DAG) const;
+ SDValue LowerUADDO(SDValue Op, SelectionDAG &DAG) const;
// Custom DAG combine hooks
SDValue
diff --git a/llvm/test/CodeGen/WebAssembly/wide-arithmetic.ll b/llvm/test/CodeGen/WebAssembly/wide-arithmetic.ll
index deff551d0eabd..71974b012a2b6 100644
--- a/llvm/test/CodeGen/WebAssembly/wide-arithmetic.ll
+++ b/llvm/test/CodeGen/WebAssembly/wide-arithmetic.ll
@@ -130,3 +130,137 @@ define i64 @mul_i128_only_lo(i128 %a, i128 %b) {
%d = trunc i128 %c to i64
ret i64 %d
}
+
+declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64)
+declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64)
+
+; This is a codegen test to see the effect of overflowing adds on signed
+; integers with wide-arithmetic enabled. At this time it doesn't actually
+; generate anything differently than without wide-arithmetic but this has also
+; been useful for evaluating the proposal.
+define { i64, i1 } @add_wide_s(i64 %a, i64 %b) {
+; CHECK-LABEL: add_wide_s:
+; CHECK: .functype add_wide_s (i32, i64, i64) -> ()
+; CHECK-NEXT: .local i64
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i64.add
+; CHECK-NEXT: local.tee 3
+; CHECK-NEXT: i64.store 0
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i64.const 0
+; CHECK-NEXT: i64.lt_s
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i64.lt_s
+; CHECK-NEXT: i32.xor
+; CHECK-NEXT: i32.store8 8
+; CHECK-NEXT: # fallthrough-return
+ %pair = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b)
+ ret { i64, i1 } %pair
+}
+
+define { i64, i1 } @add_wide_u(i64 %a, i64 %b) {
+; CHECK-LABEL: add_wide_u:
+; CHECK: .functype add_wide_u (i32, i64, i64) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i64.const 0
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i64.const 0
+; CHECK-NEXT: i64.add128
+; CHECK-NEXT: local.set 1
+; CHECK-NEXT: local.set 2
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i64.store8 8
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i64.store 0
+; CHECK-NEXT: # fallthrough-return
+ %pair = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
+ ret { i64, i1 } %pair
+}
+
+; This is a model of a hypothetical `i64.add_wide3_u` instruction using LLVM
+; intrinsics. In theory this should optimize better (to the equivalent below)
+; but it doesn't currently.
+define { i64, i64 } @add_wide3_u_via_intrinsics(i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: add_wide3_u_via_intrinsics:
+; CHECK: .functype add_wide3_u_via_intrinsics (i32, i64, i64, i64) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i64.const 0
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i64.const 0
+; CHECK-NEXT: i64.add128
+; CHECK-NEXT: local.set 2
+; CHECK-NEXT: i64.const 0
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: i64.const 0
+; CHECK-NEXT: i64.add128
+; CHECK-NEXT: local.set 1
+; CHECK-NEXT: i64.store 0
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i64.add
+; CHECK-NEXT: i64.store 8
+; CHECK-NEXT: # fallthrough-return
+ %pair = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
+ %t0 = extractvalue { i64, i1 } %pair, 0
+ %carry1 = extractvalue { i64, i1 } %pair, 1
+
+ %pair2 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %t0, i64 %c)
+ %ret1 = extractvalue { i64, i1 } %pair2, 0
+ %carry2 = extractvalue { i64, i1 } %pair2, 1
+
+ %carry1_64 = zext i1 %carry1 to i64
+ %carry2_64 = zext i1 %carry2 to i64
+ %ret2 = add i64 %carry1_64, %carry2_64
+
+ %r0 = insertvalue { i64, i64 } poison, i64 %ret1, 0
+ %r1 = insertvalue { i64, i64 } %r0, i64 %ret2, 1
+ ret { i64, i64 } %r1
+}
+
+; This is a model of a hypothetical `i64.add_wide3_u` instruction using 128-bit
+; integer addition. This optimizes better than the above currently.
+define { i64, i64 } @add_wide3_u_via_i128(i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: add_wide3_u_via_i128:
+; CHECK: .functype add_wide3_u_via_i128 (i32, i64, i64, i64) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i64.const 0
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i64.const 0
+; CHECK-NEXT: i64.add128
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: i64.const 0
+; CHECK-NEXT: i64.add128
+; CHECK-NEXT: local.set 1
+; CHECK-NEXT: local.set 2
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i64.store 8
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i64.store 0
+; CHECK-NEXT: # fallthrough-return
+ %a128 = zext i64 %a to i128
+ %b128 = zext i64 %b to i128
+ %c128 = zext i64 %c to i128
+ %t0 = add i128 %a128, %b128
+ %t1 = add i128 %t0, %c128
+ %result = trunc i128 %t1 to i64
+ %t2 = lshr i128 %t1, 64
+ %carry = trunc i128 %t2 to i64
+
+ %ret0 = insertvalue { i64, i64 } poison, i64 %result, 0
+ %ret1 = insertvalue { i64, i64 } %ret0, i64 %carry, 1
+ ret { i64, i64 } %ret1
+}
|
-  SDValue Hi =
+  SDValue Lo =
       DAG.getNode(Opcode, DL, DAG.getVTList(MVT::i64, MVT::i64), LHS, RHS);
-  SDValue Lo(Hi.getNode(), 1);
-  SDValue Ops[] = {Hi, Lo};
+  SDValue Hi(Lo.getNode(), 1);
+  SDValue Ops[] = {Lo, Hi};
   return DAG.getMergeValues(Ops, DL);
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'll note that this change is unrelated to this commit but is a drive-by fix for the names here since the Hi/Lo variable names were wrong, despite construction here actually being correct. I swapped them for future clarity.
✅ With the latest revision this PR passed the C/C++ code formatter.
9c1219c
to
6254941
Compare
// Lowers `UADDO` intrinsics to an `i64.add128` instruction when it's enabled.
//
// This enables generating a single wasm instruction for this operation where
// the upper bits of both operands are constant zeros. The upper bits of the
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this comment might be clearer if you called it the "upper half" rather than "upper bits" since when I first read this I was thinking in terms of the most significant bits (and as then briefly confused since it's of course the least significant bit of the upper half that might be set). But maybe that's just me, feel free to leave it if you disagree.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good point! I've updated the comment here
This commit is the result of investigation and discussion on WebAssembly/wide-arithmetic#6 where alternatives to the `i64.add128` instruction were discussed but ultimately deferred to a future proposal. In spite of this though I wanted to apply a few changes to the LLVM backend here with `wide-arithmetic` enabled for a few minor changes: * A lowering for the `ISD::UADDO` node is added which uses `add128` where the upper bits of the two operands are constant zeros and the result of the 128-bit addition is the result of the overflowing addition. * The high bits of a `I64_ADD128` node are now flagged as "known zero" if the upper bits of the inputs are also zero, assisting this `UADDO` lowering to ensure the backend knows that the carry result is a 1-bit result. A few tests were then added to showcase various lowerings for various operations that can be done with wide-arithmetic. They don't all optimize super well at this time but I wanted to add them as a reference here regardless to have them on-hand for future evaluations if necessary.
6254941
to
e42d0bc
Compare
Oh also, I'm unable to merge myself, so if this could be merged for me that'd be appreciated! |
// For 128-bit addition if the upper bits are all zero then it's known that
// the upper bits of the result will have all bits guaranteed zero except the
// first.
case WebAssemblyISD::I64_ADD128:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This case triggers an "unannotated fall-through between switch labels" warning. Is it supposed to fall through from the previous case or is the previous case supposed to break?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh the previous case is supposed to break, it's not supposed to fall through. Sorry about that!
I've sent #133783 to fix this
This fixes an issue introduced in llvm#132430 where a `break;` statement was accidentally missing causing unintended fall-through.
This fixes an issue introduced in #132430 where a `break;` statement was accidentally missing causing unintended fall-through.
This commit is the result of investigation and discussion on WebAssembly/wide-arithmetic#6 where alternatives to the
`i64.add128` instruction were discussed but ultimately deferred to a future proposal. In spite of this though I wanted to apply a few changes to the LLVM backend here with `wide-arithmetic` enabled for a few minor changes:

* A lowering for the `ISD::UADDO` node is added which uses `add128` where the upper bits of the two operands are constant zeros and the result of the 128-bit addition is the result of the overflowing addition.
* The high bits of an `I64_ADD128` node are now flagged as "known zero" if the upper bits of the inputs are also zero, assisting this `UADDO` lowering to ensure the backend knows that the carry result is a 1-bit result.

A few tests were then added to showcase various lowerings for various operations that can be done with wide-arithmetic. They don't all optimize super well at this time but I wanted to add them as a reference here regardless to have them on-hand for future evaluations if necessary.