
[WebAssembly] Add more lowerings for wide-arithmetic #132430


Merged: 1 commit, Mar 31, 2025
44 changes: 41 additions & 3 deletions llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -170,6 +170,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
setOperationAction(ISD::SUB, MVT::i128, Custom);
setOperationAction(ISD::SMUL_LOHI, MVT::i64, Custom);
setOperationAction(ISD::UMUL_LOHI, MVT::i64, Custom);
setOperationAction(ISD::UADDO, MVT::i64, Custom);
}

if (Subtarget->hasNontrappingFPToInt())
@@ -1109,6 +1110,18 @@ void WebAssemblyTargetLowering::computeKnownBitsForTargetNode(
}
}
}

// For 128-bit addition, if the upper halves of both operands are all zeros
// then it's known that the upper half of the result has all bits zero
// except possibly the first.
case WebAssemblyISD::I64_ADD128:
Collaborator:

This case triggers an "unannotated fall-through between switch labels" warning. Is it supposed to fall through from the previous case or is the previous case supposed to break?

Contributor Author:

Oh, the previous case is supposed to break; it's not supposed to fall through. Sorry about that!

I've sent #133783 to fix this.
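
For readers unfamiliar with the warning, here is a minimal standalone sketch of the pattern Clang's -Wimplicit-fallthrough diagnoses; all names are illustrative, not the actual code from this patch:

// fallthrough.cpp: minimal illustration of the diagnostic discussed above.
// Compile with: clang++ -Wimplicit-fallthrough fallthrough.cpp
int classify(int opcode) {
  int result = 0;
  switch (opcode) {
  case 0:
    result += 1;
    // Omitting this `break` makes control fall into `case 1`, and Clang
    // reports "unannotated fall-through between switch labels". Ending
    // the case explicitly, as below, is the shape of the fix in #133783.
    break;
  case 1:
    result += 2;
    break;
  }
  return result;
}

int main() { return classify(0) == 1 ? 0 : 1; }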

if (Op.getResNo() == 1) {
SDValue LHS_HI = Op.getOperand(1);
SDValue RHS_HI = Op.getOperand(3);
if (isNullConstant(LHS_HI) && isNullConstant(RHS_HI))
Known.Zero.setBitsFrom(1);
}
break;
}
}
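
The claim this case encodes is easy to sanity-check outside of LLVM: when both operand high halves are zero, the high half of the 128-bit result is 0 + 0 + carry, which is at most 1. A standalone sketch in plain C++ (not the LLVM KnownBits API):

#include <cassert>
#include <cstdint>

int main() {
  // With both operand high halves zero, the high half of the 128-bit sum
  // is 0 + 0 + carry, where carry is the carry-out of the low halves.
  const uint64_t carries[] = {0, 1};
  for (uint64_t carry : carries) {
    uint64_t hi = 0 + 0 + carry;
    // Every bit above bit 0 is provably zero -- the same fact that
    // Known.Zero.setBitsFrom(1) records for the node's second result.
    assert((hi & ~UINT64_C(1)) == 0);
  }
  return 0;
}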

@@ -1678,6 +1691,8 @@ SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op,
case ISD::SMUL_LOHI:
case ISD::UMUL_LOHI:
return LowerMUL_LOHI(Op, DAG);
case ISD::UADDO:
return LowerUADDO(Op, DAG);
}
}

@@ -1794,10 +1809,33 @@ SDValue WebAssemblyTargetLowering::LowerMUL_LOHI(SDValue Op,
}
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
-  SDValue Hi =
+  SDValue Lo =
       DAG.getNode(Opcode, DL, DAG.getVTList(MVT::i64, MVT::i64), LHS, RHS);
-  SDValue Lo(Hi.getNode(), 1);
-  SDValue Ops[] = {Hi, Lo};
+  SDValue Hi(Lo.getNode(), 1);
+  SDValue Ops[] = {Lo, Hi};
return DAG.getMergeValues(Ops, DL);
Comment on lines -1797 to +1816
Contributor Author:

I'll note that this change is unrelated to this commit; it's a drive-by fix for the variable names here. The Hi/Lo names were swapped even though the construction itself was correct, so I renamed them for future clarity.

}

// Lowers `UADDO` nodes to an `i64.add128` instruction when wide-arithmetic
// is enabled.
//
// This generates a single wasm instruction for the operation, with the
// upper halves of both operands set to constant zeros. The upper half of
// the result then indicates whether overflow happened.
SDValue WebAssemblyTargetLowering::LowerUADDO(SDValue Op,
SelectionDAG &DAG) const {
assert(Subtarget->hasWideArithmetic());
assert(Op.getValueType() == MVT::i64);
assert(Op.getOpcode() == ISD::UADDO);
SDLoc DL(Op);
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
SDValue Result =
DAG.getNode(WebAssemblyISD::I64_ADD128, DL,
DAG.getVTList(MVT::i64, MVT::i64), LHS, Zero, RHS, Zero);
SDValue CarryI64(Result.getNode(), 1);
SDValue CarryI32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, CarryI64);
SDValue Ops[] = {Result, CarryI32};
return DAG.getMergeValues(Ops, DL);
}
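
The identity the lowering relies on can be checked standalone in plain C++ (a sketch using the Clang/GCC unsigned __int128 extension, not LLVM code): with zero upper halves, the high 64 bits of the 128-bit sum are exactly the unsigned-overflow flag.

#include <cassert>
#include <cstdint>

// Models the lowering above: i64.add128 with zeroed upper halves computes
// a 128-bit sum; the low half is the UADDO result and the high half is the
// carry (0 or 1), which LowerUADDO then truncates to the i1 overflow flag.
static void check(uint64_t a, uint64_t b) {
  unsigned __int128 wide = (unsigned __int128)a + (unsigned __int128)b;
  uint64_t lo = (uint64_t)wide;
  uint64_t carry = (uint64_t)(wide >> 64);

  uint64_t sum = a + b;     // wrapping 64-bit add
  bool overflow = sum < a;  // standard unsigned-overflow test
  assert(lo == sum);
  assert(carry == (overflow ? 1 : 0));
}

int main() {
  check(0, 0);                                  // carry == 0
  check(~UINT64_C(0), 1);                       // wraps, carry == 1
  check(UINT64_C(1) << 63, UINT64_C(1) << 63);  // wraps, carry == 1
  check(123, 456);                              // carry == 0
  return 0;
}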

1 change: 1 addition & 0 deletions llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -133,6 +133,7 @@ class WebAssemblyTargetLowering final : public TargetLowering {
SDValue LowerStore(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
SDValue Replace128Op(SDNode *N, SelectionDAG &DAG) const;
SDValue LowerUADDO(SDValue Op, SelectionDAG &DAG) const;

// Custom DAG combine hooks
SDValue
134 changes: 134 additions & 0 deletions llvm/test/CodeGen/WebAssembly/wide-arithmetic.ll
@@ -130,3 +130,137 @@ define i64 @mul_i128_only_lo(i128 %a, i128 %b) {
%d = trunc i128 %c to i64
ret i64 %d
}

declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64)
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64)

; This is a codegen test to see the effect of overflowing adds on signed
; integers with wide-arithmetic enabled. At this time it doesn't generate
; anything different than without wide-arithmetic, but it has also been
; useful for evaluating the proposal.
define { i64, i1 } @add_wide_s(i64 %a, i64 %b) {
; CHECK-LABEL: add_wide_s:
; CHECK: .functype add_wide_s (i32, i64, i64) -> ()
; CHECK-NEXT: .local i64
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 1
; CHECK-NEXT: local.get 2
; CHECK-NEXT: i64.add
; CHECK-NEXT: local.tee 3
; CHECK-NEXT: i64.store 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 2
; CHECK-NEXT: i64.const 0
; CHECK-NEXT: i64.lt_s
; CHECK-NEXT: local.get 3
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i64.lt_s
; CHECK-NEXT: i32.xor
; CHECK-NEXT: i32.store8 8
; CHECK-NEXT: # fallthrough-return
%pair = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b)
ret { i64, i1 } %pair
}
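
The sequence above is the usual branch-free signed-overflow test: for the wrapping sum s = a + b, overflow happened iff (b < 0) != (s < a), which is what the two i64.lt_s results combined with i32.xor compute. A standalone check of that identity in plain C++ (__builtin_add_overflow is a Clang/GCC builtin, used here only as the reference):

#include <cassert>
#include <cstdint>

// The xor-of-comparisons identity used by the generated wasm above.
static bool overflow_identity(int64_t a, int64_t b) {
  // Wrapping add, performed in unsigned arithmetic to avoid C++ UB.
  int64_t s = (int64_t)((uint64_t)a + (uint64_t)b);
  return (b < 0) != (s < a);  // i64.lt_s, i64.lt_s, i32.xor
}

int main() {
  const int64_t cases[] = {0, 1, -1, 42, -42, INT64_MAX, INT64_MIN};
  for (int64_t a : cases)
    for (int64_t b : cases) {
      int64_t unused;
      bool reference = __builtin_add_overflow(a, b, &unused);
      assert(overflow_identity(a, b) == reference);
    }
  return 0;
}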

define { i64, i1 } @add_wide_u(i64 %a, i64 %b) {
; CHECK-LABEL: add_wide_u:
; CHECK: .functype add_wide_u (i32, i64, i64) -> ()
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i64.const 0
; CHECK-NEXT: local.get 2
; CHECK-NEXT: i64.const 0
; CHECK-NEXT: i64.add128
; CHECK-NEXT: local.set 1
; CHECK-NEXT: local.set 2
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i64.store8 8
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 2
; CHECK-NEXT: i64.store 0
; CHECK-NEXT: # fallthrough-return
%pair = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
ret { i64, i1 } %pair
}

; This is a model of a hypothetical `i64.add_wide3_u` instruction using LLVM
; intrinsics. In theory this should optimize better (to the equivalent below)
; but it doesn't currently.
define { i64, i64 } @add_wide3_u_via_intrinsics(i64 %a, i64 %b, i64 %c) {
; CHECK-LABEL: add_wide3_u_via_intrinsics:
; CHECK: .functype add_wide3_u_via_intrinsics (i32, i64, i64, i64) -> ()
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i64.const 0
; CHECK-NEXT: local.get 2
; CHECK-NEXT: i64.const 0
; CHECK-NEXT: i64.add128
; CHECK-NEXT: local.set 2
; CHECK-NEXT: i64.const 0
; CHECK-NEXT: local.get 3
; CHECK-NEXT: i64.const 0
; CHECK-NEXT: i64.add128
; CHECK-NEXT: local.set 1
; CHECK-NEXT: i64.store 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 2
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i64.add
; CHECK-NEXT: i64.store 8
; CHECK-NEXT: # fallthrough-return
%pair = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
%t0 = extractvalue { i64, i1 } %pair, 0
%carry1 = extractvalue { i64, i1 } %pair, 1

%pair2 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %t0, i64 %c)
%ret1 = extractvalue { i64, i1 } %pair2, 0
%carry2 = extractvalue { i64, i1 } %pair2, 1

%carry1_64 = zext i1 %carry1 to i64
%carry2_64 = zext i1 %carry2 to i64
%ret2 = add i64 %carry1_64, %carry2_64

%r0 = insertvalue { i64, i64 } poison, i64 %ret1, 0
%r1 = insertvalue { i64, i64 } %r0, i64 %ret2, 1
ret { i64, i64 } %r1
}

; This is a model of a hypothetical `i64.add_wide3_u` instruction using 128-bit
; integer addition. This optimizes better than the above currently.
define { i64, i64 } @add_wide3_u_via_i128(i64 %a, i64 %b, i64 %c) {
; CHECK-LABEL: add_wide3_u_via_i128:
; CHECK: .functype add_wide3_u_via_i128 (i32, i64, i64, i64) -> ()
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i64.const 0
; CHECK-NEXT: local.get 2
; CHECK-NEXT: i64.const 0
; CHECK-NEXT: i64.add128
; CHECK-NEXT: local.get 3
; CHECK-NEXT: i64.const 0
; CHECK-NEXT: i64.add128
; CHECK-NEXT: local.set 1
; CHECK-NEXT: local.set 2
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i64.store 8
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 2
; CHECK-NEXT: i64.store 0
; CHECK-NEXT: # fallthrough-return
%a128 = zext i64 %a to i128
%b128 = zext i64 %b to i128
%c128 = zext i64 %c to i128
%t0 = add i128 %a128, %b128
%t1 = add i128 %t0, %c128
%result = trunc i128 %t1 to i64
%t2 = lshr i128 %t1, 64
%carry = trunc i128 %t2 to i64

%ret0 = insertvalue { i64, i64 } poison, i64 %result, 0
%ret1 = insertvalue { i64, i64 } %ret0, i64 %carry, 1
ret { i64, i64 } %ret1
}
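
The two formulations above compute the same pair; here is a standalone plain-C++ check (again using the Clang/GCC unsigned __int128 extension) that the chained-carry model agrees with the 128-bit model:

#include <cassert>
#include <cstdint>

struct Wide3 { uint64_t lo, hi; };

// Model of add_wide3_u_via_intrinsics: two overflowing adds, carries summed.
static Wide3 via_carries(uint64_t a, uint64_t b, uint64_t c) {
  uint64_t t0 = a + b;
  uint64_t carry1 = t0 < a;
  uint64_t r = t0 + c;
  uint64_t carry2 = r < t0;
  return {r, carry1 + carry2};
}

// Model of add_wide3_u_via_i128: zero-extend and add as 128-bit values.
static Wide3 via_i128(uint64_t a, uint64_t b, uint64_t c) {
  unsigned __int128 t = (unsigned __int128)a + b + c;
  return {(uint64_t)t, (uint64_t)(t >> 64)};
}

int main() {
  const uint64_t cases[] = {0, 1, 123, UINT64_C(1) << 63, ~UINT64_C(0)};
  for (uint64_t a : cases)
    for (uint64_t b : cases)
      for (uint64_t c : cases) {
        Wide3 x = via_carries(a, b, c);
        Wide3 y = via_i128(a, b, c);
        assert(x.lo == y.lo && x.hi == y.hi);
      }
  return 0;
}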