Commit a415b7f
[WebAssembly] Add more lowerings for wide-arithmetic (#132430)
This commit is the result of investigation and discussion on WebAssembly/wide-arithmetic#6, where alternatives to the `i64.add128` instruction were discussed but ultimately deferred to a future proposal. In the meantime, I wanted to apply a few minor changes to the LLVM backend when `wide-arithmetic` is enabled:

* A lowering for the `ISD::UADDO` node is added which uses `add128` when the upper bits of the two operands are constant zeros; the result of the 128-bit addition is the result of the overflowing addition.
* The high bits of an `I64_ADD128` node are now flagged as "known zero" if the upper bits of the inputs are also zero, assisting the `UADDO` lowering by ensuring the backend knows the carry result is a 1-bit value.

A few tests were then added to showcase lowerings for various operations that can be done with wide-arithmetic. They don't all optimize especially well at this time, but I wanted to include them as a reference, to have them on hand for future evaluations if necessary.
1 parent 5145367 commit a415b7f
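To make the `UADDO` lowering concrete, here is a minimal standalone C++ sketch (my own model for this page, not code from the commit): when both operands are zero-extended into the four i64 halves consumed by `i64.add128`, the low half of the result is the wrapping 64-bit sum and the high half is exactly the 1-bit carry.

#include <cassert>
#include <cstdint>

// Standalone model of lowering a 64-bit overflowing add through a 128-bit
// addition whose upper operand halves are zero. `unsigned __int128` (a
// Clang/GCC extension) stands in for the wasm `i64.add128` result pair.
struct UAddO {
  uint64_t sum;   // low half of the 128-bit result: the wrapping sum
  uint64_t carry; // high half of the 128-bit result: always 0 or 1
};

UAddO uaddo_via_add128(uint64_t lhs, uint64_t rhs) {
  unsigned __int128 wide = (unsigned __int128)lhs + (unsigned __int128)rhs;
  return UAddO{(uint64_t)wide, (uint64_t)(wide >> 64)};
}

int main() {
  assert(uaddo_via_add128(~0ull, 1).carry == 1);  // wraps to 0, carry set
  assert(uaddo_via_add128(1, 2).carry == 0);      // 3, no carry
  assert(uaddo_via_add128(~0ull, ~0ull).sum == ~0ull - 1);
  return 0;
}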

File tree

3 files changed: +176 -3 lines changed

llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp

Lines changed: 41 additions & 3 deletions
@@ -170,6 +170,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
     setOperationAction(ISD::SUB, MVT::i128, Custom);
     setOperationAction(ISD::SMUL_LOHI, MVT::i64, Custom);
     setOperationAction(ISD::UMUL_LOHI, MVT::i64, Custom);
+    setOperationAction(ISD::UADDO, MVT::i64, Custom);
   }
 
   if (Subtarget->hasNontrappingFPToInt())
@@ -1109,6 +1110,18 @@ void WebAssemblyTargetLowering::computeKnownBitsForTargetNode(
       }
     }
   }
+
+  // For 128-bit addition if the upper bits are all zero then it's known that
+  // the upper bits of the result will have all bits guaranteed zero except the
+  // first.
+  case WebAssemblyISD::I64_ADD128:
+    if (Op.getResNo() == 1) {
+      SDValue LHS_HI = Op.getOperand(1);
+      SDValue RHS_HI = Op.getOperand(3);
+      if (isNullConstant(LHS_HI) && isNullConstant(RHS_HI))
+        Known.Zero.setBitsFrom(1);
+    }
+    break;
   }
 }
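The `Known.Zero.setBitsFrom(1)` claim is justified by simple range arithmetic: with both upper halves zero, the largest possible sum is (2^64 - 1) + (2^64 - 1) = 2^65 - 2, so the high half of the 128-bit result can only be 0 or 1. A quick standalone C++ check of that worst case (again my own sketch, not commit code):

#include <cassert>
#include <cstdint>

int main() {
  // Worst case for two u64 addends with zero upper halves:
  // (2^64 - 1) + (2^64 - 1) = 2^65 - 2, whose high 64 bits equal 1.
  // Hence every bit of the carry result above bit 0 is known zero.
  uint64_t max64 = ~0ull;
  unsigned __int128 worst = (unsigned __int128)max64 + max64;
  assert((uint64_t)(worst >> 64) == 1);
  return 0;
}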

@@ -1678,6 +1691,8 @@ SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op,
   case ISD::SMUL_LOHI:
   case ISD::UMUL_LOHI:
     return LowerMUL_LOHI(Op, DAG);
+  case ISD::UADDO:
+    return LowerUADDO(Op, DAG);
   }
 }

@@ -1794,10 +1809,33 @@ SDValue WebAssemblyTargetLowering::LowerMUL_LOHI(SDValue Op,
   }
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
-  SDValue Hi =
+  SDValue Lo =
       DAG.getNode(Opcode, DL, DAG.getVTList(MVT::i64, MVT::i64), LHS, RHS);
-  SDValue Lo(Hi.getNode(), 1);
-  SDValue Ops[] = {Hi, Lo};
+  SDValue Hi(Lo.getNode(), 1);
+  SDValue Ops[] = {Lo, Hi};
+  return DAG.getMergeValues(Ops, DL);
+}
+
+// Lowers `UADDO` intrinsics to an `i64.add128` instruction when it's enabled.
+//
+// This enables generating a single wasm instruction for this operation where
+// the upper half of both operands are constant zeros. The upper half of the
+// result is then whether the overflow happened.
+SDValue WebAssemblyTargetLowering::LowerUADDO(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  assert(Subtarget->hasWideArithmetic());
+  assert(Op.getValueType() == MVT::i64);
+  assert(Op.getOpcode() == ISD::UADDO);
+  SDLoc DL(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+  SDValue Result =
+      DAG.getNode(WebAssemblyISD::I64_ADD128, DL,
+                  DAG.getVTList(MVT::i64, MVT::i64), LHS, Zero, RHS, Zero);
+  SDValue CarryI64(Result.getNode(), 1);
+  SDValue CarryI32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, CarryI64);
+  SDValue Ops[] = {Result, CarryI32};
   return DAG.getMergeValues(Ops, DL);
 }

llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h

Lines changed: 1 addition & 0 deletions
@@ -133,6 +133,7 @@ class WebAssemblyTargetLowering final : public TargetLowering {
   SDValue LowerStore(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
   SDValue Replace128Op(SDNode *N, SelectionDAG &DAG) const;
+  SDValue LowerUADDO(SDValue Op, SelectionDAG &DAG) const;
 
   // Custom DAG combine hooks
   SDValue

llvm/test/CodeGen/WebAssembly/wide-arithmetic.ll

Lines changed: 134 additions & 0 deletions
@@ -130,3 +130,137 @@ define i64 @mul_i128_only_lo(i128 %a, i128 %b) {
   %d = trunc i128 %c to i64
   ret i64 %d
 }
+
+declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64)
+declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64)
+
+; This is a codegen test to see the effect of overflowing adds on signed
+; integers with wide-arithmetic enabled. At this time it doesn't actually
+; generate anything differently than without wide-arithmetic but this has also
+; been useful for evaluating the proposal.
+define { i64, i1 } @add_wide_s(i64 %a, i64 %b) {
+; CHECK-LABEL: add_wide_s:
+; CHECK:         .functype add_wide_s (i32, i64, i64) -> ()
+; CHECK-NEXT:    .local i64
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i64.add
+; CHECK-NEXT:    local.tee 3
+; CHECK-NEXT:    i64.store 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    i64.lt_s
+; CHECK-NEXT:    local.get 3
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i64.lt_s
+; CHECK-NEXT:    i32.xor
+; CHECK-NEXT:    i32.store8 8
+; CHECK-NEXT:  # fallthrough-return
+  %pair = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b)
+  ret { i64, i1 } %pair
+}
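The xor of the two `i64.lt_s` results above is the standard branch-free signed-overflow test: for wrapping addition, overflow occurred iff (b <s 0) differs from (a + b <s a). A standalone C++ model of that check (my sketch, not part of the commit):

#include <cassert>
#include <cstdint>

// Branch-free signed-overflow check matching the generated wasm: two signed
// comparisons (i64.lt_s) combined with an xor.
bool sadd_overflows(int64_t a, int64_t b) {
  // Compute the wrapping sum via unsigned math to avoid signed-overflow UB.
  int64_t sum = (int64_t)((uint64_t)a + (uint64_t)b);
  return (b < 0) != (sum < a);
}

int main() {
  assert(sadd_overflows(INT64_MAX, 1));   // positive overflow
  assert(sadd_overflows(INT64_MIN, -1));  // negative overflow
  assert(!sadd_overflows(-1, -1));        // -2, fits
  return 0;
}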
+
+define { i64, i1 } @add_wide_u(i64 %a, i64 %b) {
+; CHECK-LABEL: add_wide_u:
+; CHECK:         .functype add_wide_u (i32, i64, i64) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    i64.add128
+; CHECK-NEXT:    local.set 1
+; CHECK-NEXT:    local.set 2
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i64.store8 8
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i64.store 0
+; CHECK-NEXT:  # fallthrough-return
+  %pair = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
+  ret { i64, i1 } %pair
+}
+
+; This is a model of a hypothetical `i64.add_wide3_u` instruction using LLVM
+; intrinsics. In theory this should optimize better (to the equivalent below)
+; but it doesn't currently.
+define { i64, i64 } @add_wide3_u_via_intrinsics(i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: add_wide3_u_via_intrinsics:
+; CHECK:         .functype add_wide3_u_via_intrinsics (i32, i64, i64, i64) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    i64.add128
+; CHECK-NEXT:    local.set 2
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    local.get 3
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    i64.add128
+; CHECK-NEXT:    local.set 1
+; CHECK-NEXT:    i64.store 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i64.add
+; CHECK-NEXT:    i64.store 8
+; CHECK-NEXT:  # fallthrough-return
+  %pair = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
+  %t0 = extractvalue { i64, i1 } %pair, 0
+  %carry1 = extractvalue { i64, i1 } %pair, 1
+
+  %pair2 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %t0, i64 %c)
+  %ret1 = extractvalue { i64, i1 } %pair2, 0
+  %carry2 = extractvalue { i64, i1 } %pair2, 1
+
+  %carry1_64 = zext i1 %carry1 to i64
+  %carry2_64 = zext i1 %carry2 to i64
+  %ret2 = add i64 %carry1_64, %carry2_64
+
+  %r0 = insertvalue { i64, i64 } poison, i64 %ret1, 0
+  %r1 = insertvalue { i64, i64 } %r0, i64 %ret2, 1
+  ret { i64, i64 } %r1
+}
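As a reference model for the hypothetical instruction (my own standalone C++ sketch mirroring the IR above, not anything real): two chained overflowing adds whose carry bits are summed, so the high result is 0, 1, or 2.

#include <cassert>
#include <cstdint>

struct Wide3 {
  uint64_t lo; // low 64 bits of a + b + c
  uint64_t hi; // sum of the two carries: 0, 1, or 2
};

// Mirrors @add_wide3_u_via_intrinsics using the Clang/GCC builtin
// __builtin_add_overflow in place of @llvm.uadd.with.overflow.i64.
Wide3 add_wide3_u(uint64_t a, uint64_t b, uint64_t c) {
  uint64_t t0, ret1;
  bool carry1 = __builtin_add_overflow(a, b, &t0);
  bool carry2 = __builtin_add_overflow(t0, c, &ret1);
  return Wide3{ret1, (uint64_t)carry1 + (uint64_t)carry2};
}

int main() {
  // Maximal inputs: 3 * (2^64 - 1) needs a high half of 2.
  assert(add_wide3_u(~0ull, ~0ull, ~0ull).hi == 2);
  assert(add_wide3_u(1, 2, 3).lo == 6);
  return 0;
}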
+
+; This is a model of a hypothetical `i64.add_wide3_u` instruction using 128-bit
+; integer addition. This optimizes better than the above currently.
+define { i64, i64 } @add_wide3_u_via_i128(i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: add_wide3_u_via_i128:
+; CHECK:         .functype add_wide3_u_via_i128 (i32, i64, i64, i64) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    i64.add128
+; CHECK-NEXT:    local.get 3
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    i64.add128
+; CHECK-NEXT:    local.set 1
+; CHECK-NEXT:    local.set 2
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i64.store 8
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i64.store 0
+; CHECK-NEXT:  # fallthrough-return
+  %a128 = zext i64 %a to i128
+  %b128 = zext i64 %b to i128
+  %c128 = zext i64 %c to i128
+  %t0 = add i128 %a128, %b128
+  %t1 = add i128 %t0, %c128
+  %result = trunc i128 %t1 to i64
+  %t2 = lshr i128 %t1, 64
+  %carry = trunc i128 %t2 to i64
+
+  %ret0 = insertvalue { i64, i64 } poison, i64 %result, 0
+  %ret1 = insertvalue { i64, i64 } %ret0, i64 %carry, 1
+  ret { i64, i64 } %ret1
+}
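The same computation written the way `@add_wide3_u_via_i128` does it, as one more standalone C++ sketch: zero-extend to 128 bits, add twice, then split the sum into halves. Both formulations agree on all inputs, which is why the two tests should ideally compile to the same code.

#include <cassert>
#include <cstdint>

// Mirrors @add_wide3_u_via_i128: widen all three operands, add, then split
// the 128-bit sum into its low and high 64-bit halves.
void add_wide3_u_i128(uint64_t a, uint64_t b, uint64_t c,
                      uint64_t *lo, uint64_t *hi) {
  unsigned __int128 t =
      (unsigned __int128)a + (unsigned __int128)b + (unsigned __int128)c;
  *lo = (uint64_t)t;
  *hi = (uint64_t)(t >> 64);
}

int main() {
  uint64_t lo, hi;
  add_wide3_u_i128(~0ull, ~0ull, ~0ull, &lo, &hi);
  assert(hi == 2 && lo == ~0ull - 2); // 3 * (2^64 - 1), split into halves
  return 0;
}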
