[WebAssembly] Add intrinsics to wasm_simd128.h for all FP16 instructions #106465

Merged
brendandahl merged 2 commits into llvm:main on Aug 30, 2024

Conversation

brendandahl
Contributor

Getting this to work required a few additional changes:

  • Add builtins for any instructions that can't currently be expressed in plain C.
  • Add support for the saturating versions of fp_to_<s,u>_I16x8. Other vector sizes already supported this.
  • Support bitcast of f16x8 to v128, needed to return a __f16x8 as a v128_t.
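
For orientation, here is a minimal usage sketch (not part of the patch) of what these additions enable. The flags mirror the test update below, and the clamping described in the comments is the standard semantics of the trunc_sat instructions:

// Build sketch, assuming a clang that includes this patch:
//   clang --target=wasm32 -msimd128 -mfp16 -O2 -S example.c
#include <wasm_simd128.h>

v128_t scale_and_saturate(v128_t x) {
  v128_t two = wasm_f16x8_splat(2.0f); // f16x8.splat: broadcast to 8 f16 lanes
  v128_t y = wasm_f16x8_mul(x, two);   // f16x8.mul: per-lane multiply
  // i16x8.trunc_sat_f16x8_s: lanes outside the int16_t range clamp to the
  // nearest representable value instead of being undefined.
  return wasm_i16x8_trunc_sat_f16x8(y);
}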

[WebAssembly] Add intrinsics to wasm_simd128.h for all FP16 instructions

Getting this to work required a few additional changes:
 - Add builtins for any instructions that can't currently be expressed in
   plain C.
 - Add support for the saturating versions of fp_to_<s,u>_I16x8. Other
   vector sizes already supported this.
 - Support bitcast of f16x8 to v128. Needed to return a __f16x8 as v128_t.
@brendandahl brendandahl requested review from dschuff and aheejin August 28, 2024 22:58
@llvmbot llvmbot added clang Clang issues not falling into any other category backend:WebAssembly backend:X86 clang:frontend Language frontend issues, e.g. anything involving "Sema" clang:headers Headers provided by Clang, e.g. for intrinsics clang:codegen IR generation bugs: mangling, exceptions, etc. labels Aug 28, 2024
@llvmbot
Member

llvmbot commented Aug 28, 2024

@llvm/pr-subscribers-clang
@llvm/pr-subscribers-backend-webassembly

@llvm/pr-subscribers-backend-x86

Author: Brendan Dahl (brendandahl)

Changes

Getting this to work required a few additional changes:

  • Add builtins for any instructions that can't currently be expressed in plain C.
  • Add support for the saturating versions of fp_to_<s,u>_I16x8. Other vector sizes already supported this.
  • Support bitcast of f16x8 to v128, needed to return a __f16x8 as a v128_t.

Patch is 23.86 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/106465.diff

7 Files Affected:

  • (modified) clang/include/clang/Basic/BuiltinsWebAssembly.def (+9)
  • (modified) clang/lib/CodeGen/CGBuiltin.cpp (+12)
  • (modified) clang/lib/Headers/wasm_simd128.h (+147)
  • (modified) cross-project-tests/intrinsic-header-tests/wasm_simd128.c (+137-1)
  • (modified) llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp (+8-1)
  • (modified) llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td (+17-11)
  • (modified) llvm/test/CodeGen/WebAssembly/half-precision.ll (+18)
diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def
index 034d32c6291b3d..2e80eef2c8b9bc 100644
--- a/clang/include/clang/Basic/BuiltinsWebAssembly.def
+++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def
@@ -124,6 +124,7 @@ TARGET_BUILTIN(__builtin_wasm_bitmask_i16x8, "UiV8s", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_bitmask_i32x4, "UiV4i", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_bitmask_i64x2, "UiV2LLi", "nc", "simd128")
 
+TARGET_BUILTIN(__builtin_wasm_abs_f16x8, "V8hV8h", "nc", "fp16")
 TARGET_BUILTIN(__builtin_wasm_abs_f32x4, "V4fV4f", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_abs_f64x2, "V2dV2d", "nc", "simd128")
 
@@ -140,6 +141,10 @@ TARGET_BUILTIN(__builtin_wasm_max_f16x8, "V8hV8hV8h", "nc", "fp16")
 TARGET_BUILTIN(__builtin_wasm_pmin_f16x8, "V8hV8hV8h", "nc", "fp16")
 TARGET_BUILTIN(__builtin_wasm_pmax_f16x8, "V8hV8hV8h", "nc", "fp16")
 
+TARGET_BUILTIN(__builtin_wasm_ceil_f16x8, "V8hV8h", "nc", "fp16")
+TARGET_BUILTIN(__builtin_wasm_floor_f16x8, "V8hV8h", "nc", "fp16")
+TARGET_BUILTIN(__builtin_wasm_trunc_f16x8, "V8hV8h", "nc", "fp16")
+TARGET_BUILTIN(__builtin_wasm_nearest_f16x8, "V8hV8h", "nc", "fp16")
 TARGET_BUILTIN(__builtin_wasm_ceil_f32x4, "V4fV4f", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_floor_f32x4, "V4fV4f", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_trunc_f32x4, "V4fV4f", "nc", "simd128")
@@ -151,9 +156,13 @@ TARGET_BUILTIN(__builtin_wasm_nearest_f64x2, "V2dV2d", "nc", "simd128")
 
 TARGET_BUILTIN(__builtin_wasm_dot_s_i32x4_i16x8, "V4iV8sV8s", "nc", "simd128")
 
+TARGET_BUILTIN(__builtin_wasm_sqrt_f16x8, "V8hV8h", "nc", "fp16")
 TARGET_BUILTIN(__builtin_wasm_sqrt_f32x4, "V4fV4f", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_sqrt_f64x2, "V2dV2d", "nc", "simd128")
 
+TARGET_BUILTIN(__builtin_wasm_trunc_saturate_s_i16x8_f16x8, "V8sV8h", "nc", "simd128")
+TARGET_BUILTIN(__builtin_wasm_trunc_saturate_u_i16x8_f16x8, "V8sV8h", "nc", "simd128")
+
 TARGET_BUILTIN(__builtin_wasm_trunc_saturate_s_i32x4_f32x4, "V4iV4f", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_trunc_saturate_u_i32x4_f32x4, "V4iV4f", "nc", "simd128")
 
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 2a733e4d834cfa..bb5367c29b1c3a 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -21208,6 +21208,7 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
   case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i32_f64:
   case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i64_f32:
   case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i64_f64:
+  case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i16x8_f16x8:
   case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i32x4_f32x4: {
     Value *Src = EmitScalarExpr(E->getArg(0));
     llvm::Type *ResT = ConvertType(E->getType());
@@ -21219,6 +21220,7 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
   case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i32_f64:
   case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i64_f32:
   case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i64_f64:
+  case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i16x8_f16x8:
   case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i32x4_f32x4: {
     Value *Src = EmitScalarExpr(E->getArg(0));
     llvm::Type *ResT = ConvertType(E->getType());
@@ -21266,6 +21268,10 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
         CGM.getIntrinsic(Intrinsic::wasm_pmax, ConvertType(E->getType()));
     return Builder.CreateCall(Callee, {LHS, RHS});
   }
+  case WebAssembly::BI__builtin_wasm_ceil_f16x8:
+  case WebAssembly::BI__builtin_wasm_floor_f16x8:
+  case WebAssembly::BI__builtin_wasm_trunc_f16x8:
+  case WebAssembly::BI__builtin_wasm_nearest_f16x8:
   case WebAssembly::BI__builtin_wasm_ceil_f32x4:
   case WebAssembly::BI__builtin_wasm_floor_f32x4:
   case WebAssembly::BI__builtin_wasm_trunc_f32x4:
@@ -21276,18 +21282,22 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
   case WebAssembly::BI__builtin_wasm_nearest_f64x2: {
     unsigned IntNo;
     switch (BuiltinID) {
+    case WebAssembly::BI__builtin_wasm_ceil_f16x8:
     case WebAssembly::BI__builtin_wasm_ceil_f32x4:
     case WebAssembly::BI__builtin_wasm_ceil_f64x2:
       IntNo = Intrinsic::ceil;
       break;
+    case WebAssembly::BI__builtin_wasm_floor_f16x8:
     case WebAssembly::BI__builtin_wasm_floor_f32x4:
     case WebAssembly::BI__builtin_wasm_floor_f64x2:
       IntNo = Intrinsic::floor;
       break;
+    case WebAssembly::BI__builtin_wasm_trunc_f16x8:
     case WebAssembly::BI__builtin_wasm_trunc_f32x4:
     case WebAssembly::BI__builtin_wasm_trunc_f64x2:
       IntNo = Intrinsic::trunc;
       break;
+    case WebAssembly::BI__builtin_wasm_nearest_f16x8:
     case WebAssembly::BI__builtin_wasm_nearest_f32x4:
     case WebAssembly::BI__builtin_wasm_nearest_f64x2:
       IntNo = Intrinsic::nearbyint;
@@ -21486,12 +21496,14 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
         CGM.getIntrinsic(Intrinsic::wasm_bitmask, Vec->getType());
     return Builder.CreateCall(Callee, {Vec});
   }
+  case WebAssembly::BI__builtin_wasm_abs_f16x8:
   case WebAssembly::BI__builtin_wasm_abs_f32x4:
   case WebAssembly::BI__builtin_wasm_abs_f64x2: {
     Value *Vec = EmitScalarExpr(E->getArg(0));
     Function *Callee = CGM.getIntrinsic(Intrinsic::fabs, Vec->getType());
     return Builder.CreateCall(Callee, {Vec});
   }
+  case WebAssembly::BI__builtin_wasm_sqrt_f16x8:
   case WebAssembly::BI__builtin_wasm_sqrt_f32x4:
   case WebAssembly::BI__builtin_wasm_sqrt_f64x2: {
     Value *Vec = EmitScalarExpr(E->getArg(0));
diff --git a/clang/lib/Headers/wasm_simd128.h b/clang/lib/Headers/wasm_simd128.h
index 2327bec52522d2..8d19609bf2168e 100644
--- a/clang/lib/Headers/wasm_simd128.h
+++ b/clang/lib/Headers/wasm_simd128.h
@@ -33,6 +33,7 @@ typedef unsigned long long __u64x2
     __attribute__((__vector_size__(16), __aligned__(16)));
 typedef float __f32x4 __attribute__((__vector_size__(16), __aligned__(16)));
 typedef double __f64x2 __attribute__((__vector_size__(16), __aligned__(16)));
+typedef __fp16 __f16x8 __attribute__((__vector_size__(16), __aligned__(16)));
 
 typedef signed char __i8x8 __attribute__((__vector_size__(8), __aligned__(8)));
 typedef unsigned char __u8x8
@@ -1878,6 +1879,152 @@ wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v128_t __a, v128_t __b, v128_t __c) {
       (__i8x16)__a, (__i8x16)__b, (__i32x4)__c);
 }
 
+// FP16 intrinsics
+#define __FP16_FN_ATTRS                                                        \
+  __attribute__((__always_inline__, __nodebug__, __target__("fp16"),           \
+                 __min_vector_width__(128)))
+
+static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_splat(float __a) {
+  return (v128_t)__builtin_wasm_splat_f16x8(__a);
+}
+
+static __inline__ float __FP16_FN_ATTRS wasm_f16x8_extract_lane(v128_t __a,
+                                                                int __i)
+    __REQUIRE_CONSTANT(__i) {
+  return __builtin_wasm_extract_lane_f16x8(__a, __i);
+}
+
+static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_replace_lane(v128_t __a,
+                                                                 int __i,
+                                                                 float __b)
+    __REQUIRE_CONSTANT(__i) {
+  return (v128_t)__builtin_wasm_replace_lane_f16x8(__a, __i, __b);
+}
+
+static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_abs(v128_t __a) {
+  return (v128_t)__builtin_wasm_abs_f16x8((__f16x8)__a);
+}
+
+static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_neg(v128_t __a) {
+  return (v128_t)(-(__f16x8)__a);
+}
+
+static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_sqrt(v128_t __a) {
+  return (v128_t)__builtin_wasm_sqrt_f16x8((__f16x8)__a);
+}
+
+static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_ceil(v128_t __a) {
+  return (v128_t)__builtin_wasm_ceil_f16x8((__f16x8)__a);
+}
+
+static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_floor(v128_t __a) {
+  return (v128_t)__builtin_wasm_floor_f16x8((__f16x8)__a);
+}
+
+static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_trunc(v128_t __a) {
+  return (v128_t)__builtin_wasm_trunc_f16x8((__f16x8)__a);
+}
+
+static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_nearest(v128_t __a) {
+  return (v128_t)__builtin_wasm_nearest_f16x8((__f16x8)__a);
+}
+
+static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_eq(v128_t __a, v128_t __b) {
+  return (v128_t)((__f16x8)__a == (__f16x8)__b);
+}
+
+static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_ne(v128_t __a, v128_t __b) {
+  return (v128_t)((__f16x8)__a != (__f16x8)__b);
+}
+
+static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_lt(v128_t __a, v128_t __b) {
+  return (v128_t)((__f16x8)__a < (__f16x8)__b);
+}
+
+static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_gt(v128_t __a, v128_t __b) {
+  return (v128_t)((__f16x8)__a > (__f16x8)__b);
+}
+
+static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_le(v128_t __a, v128_t __b) {
+  return (v128_t)((__f16x8)__a <= (__f16x8)__b);
+}
+
+static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_ge(v128_t __a, v128_t __b) {
+  return (v128_t)((__f16x8)__a >= (__f16x8)__b);
+}
+
+static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_add(v128_t __a,
+                                                        v128_t __b) {
+  return (v128_t)((__f16x8)__a + (__f16x8)__b);
+}
+
+static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_sub(v128_t __a,
+                                                        v128_t __b) {
+  return (v128_t)((__f16x8)__a - (__f16x8)__b);
+}
+
+static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_mul(v128_t __a,
+                                                        v128_t __b) {
+  return (v128_t)((__f16x8)__a * (__f16x8)__b);
+}
+
+static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_div(v128_t __a,
+                                                        v128_t __b) {
+  return (v128_t)((__f16x8)__a / (__f16x8)__b);
+}
+
+static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_min(v128_t __a,
+                                                        v128_t __b) {
+  return (v128_t)__builtin_wasm_min_f16x8((__f16x8)__a, (__f16x8)__b);
+}
+
+static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_max(v128_t __a,
+                                                        v128_t __b) {
+  return (v128_t)__builtin_wasm_max_f16x8((__f16x8)__a, (__f16x8)__b);
+}
+
+static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_pmin(v128_t __a,
+                                                         v128_t __b) {
+  return (v128_t)__builtin_wasm_pmin_f16x8((__f16x8)__a, (__f16x8)__b);
+}
+
+static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_pmax(v128_t __a,
+                                                         v128_t __b) {
+  return (v128_t)__builtin_wasm_pmax_f16x8((__f16x8)__a, (__f16x8)__b);
+}
+
+static __inline__ v128_t __FP16_FN_ATTRS
+wasm_i16x8_trunc_sat_f16x8(v128_t __a) {
+  return (v128_t)__builtin_wasm_trunc_saturate_s_i16x8_f16x8((__f16x8)__a);
+}
+
+static __inline__ v128_t __FP16_FN_ATTRS
+wasm_u16x8_trunc_sat_f16x8(v128_t __a) {
+  return (v128_t)__builtin_wasm_trunc_saturate_u_i16x8_f16x8((__f16x8)__a);
+}
+
+static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_convert_i16x8(v128_t __a) {
+  return (v128_t) __builtin_convertvector((__i16x8)__a, __f16x8);
+}
+
+static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_convert_u16x8(v128_t __a) {
+  return (v128_t) __builtin_convertvector((__u16x8)__a, __f16x8);
+}
+
+static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_relaxed_madd(v128_t __a,
+                                                                 v128_t __b,
+                                                                 v128_t __c) {
+  return (v128_t)__builtin_wasm_relaxed_madd_f16x8((__f16x8)__a, (__f16x8)__b,
+                                                   (__f16x8)__c);
+}
+
+static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_relaxed_nmadd(v128_t __a,
+                                                                  v128_t __b,
+                                                                  v128_t __c) {
+  return (v128_t)__builtin_wasm_relaxed_nmadd_f16x8((__f16x8)__a, (__f16x8)__b,
+                                                    (__f16x8)__c);
+}
+
 // Deprecated intrinsics
 
 static __inline__ v128_t __DEPRECATED_FN_ATTRS("wasm_i8x16_swizzle")
diff --git a/cross-project-tests/intrinsic-header-tests/wasm_simd128.c b/cross-project-tests/intrinsic-header-tests/wasm_simd128.c
index fb15e0143d3653..b601d90cfcc927 100644
--- a/cross-project-tests/intrinsic-header-tests/wasm_simd128.c
+++ b/cross-project-tests/intrinsic-header-tests/wasm_simd128.c
@@ -2,7 +2,7 @@
 // expected-no-diagnostics
 
 // RUN: %clang %s -O2 -S -o - -target wasm32-unknown-unknown \
-// RUN: -msimd128 -mrelaxed-simd -Wcast-qual -Werror | FileCheck %s
+// RUN: -msimd128 -mrelaxed-simd -mfp16 -Wcast-qual -Werror | FileCheck %s
 
 #include <wasm_simd128.h>
 
@@ -1385,3 +1385,139 @@ v128_t test_i16x8_relaxed_dot_i8x16_i7x16(v128_t a, v128_t b) {
 v128_t test_i32x4_relaxed_dot_i8x16_i7x16_add(v128_t a, v128_t b, v128_t c) {
   return wasm_i32x4_relaxed_dot_i8x16_i7x16_add(a, b, c);
 }
+
+// CHECK-LABEL: test_f16x8_splat:
+// CHECK: f16x8.splat{{$}}
+v128_t test_f16x8_splat(float a) { return wasm_f16x8_splat(a); }
+
+// CHECK-LABEL: test_f16x8_extract_lane:
+// CHECK: f16x8.extract_lane 7{{$}}
+int16_t test_f16x8_extract_lane(v128_t a) {
+  return wasm_f16x8_extract_lane(a, 7);
+}
+
+// CHECK-LABEL: test_f16x8_replace_lane:
+// CHECK: f16x8.replace_lane 7{{$}}
+v128_t test_f16x8_replace_lane(v128_t a, float b) {
+  return wasm_f16x8_replace_lane(a, 7, b);
+}
+
+// CHECK-LABEL: test_f16x8_abs:
+// CHECK: f16x8.abs{{$}}
+v128_t test_f16x8_abs(v128_t a) { return wasm_f16x8_abs(a); }
+
+// CHECK-LABEL: test_f16x8_neg:
+// CHECK: f16x8.neg{{$}}
+v128_t test_f16x8_neg(v128_t a) { return wasm_f16x8_neg(a); }
+
+// CHECK-LABEL: test_f16x8_sqrt:
+// CHECK: f16x8.sqrt{{$}}
+v128_t test_f16x8_sqrt(v128_t a) { return wasm_f16x8_sqrt(a); }
+
+// CHECK-LABEL: test_f16x8_ceil:
+// CHECK: f16x8.ceil{{$}}
+v128_t test_f16x8_ceil(v128_t a) { return wasm_f16x8_ceil(a); }
+
+// CHECK-LABEL: test_f16x8_floor:
+// CHECK: f16x8.floor{{$}}
+v128_t test_f16x8_floor(v128_t a) { return wasm_f16x8_floor(a); }
+
+// CHECK-LABEL: test_f16x8_trunc:
+// CHECK: f16x8.trunc{{$}}
+v128_t test_f16x8_trunc(v128_t a) { return wasm_f16x8_trunc(a); }
+
+// CHECK-LABEL: test_f16x8_nearest:
+// CHECK: f16x8.nearest{{$}}
+v128_t test_f16x8_nearest(v128_t a) { return wasm_f16x8_nearest(a); }
+
+// CHECK-LABEL: test_f16x8_add:
+// CHECK: f16x8.add{{$}}
+v128_t test_f16x8_add(v128_t a, v128_t b) { return wasm_f16x8_add(a, b); }
+
+// CHECK-LABEL: test_f16x8_sub:
+// CHECK: f16x8.sub{{$}}
+v128_t test_f16x8_sub(v128_t a, v128_t b) { return wasm_f16x8_sub(a, b); }
+
+// CHECK-LABEL: test_f16x8_mul:
+// CHECK: f16x8.mul{{$}}
+v128_t test_f16x8_mul(v128_t a, v128_t b) { return wasm_f16x8_mul(a, b); }
+
+// CHECK-LABEL: test_f16x8_div:
+// CHECK: f16x8.div{{$}}
+v128_t test_f16x8_div(v128_t a, v128_t b) { return wasm_f16x8_div(a, b); }
+
+// CHECK-LABEL: test_f16x8_min:
+// CHECK: f16x8.min{{$}}
+v128_t test_f16x8_min(v128_t a, v128_t b) { return wasm_f16x8_min(a, b); }
+
+// CHECK-LABEL: test_f16x8_max:
+// CHECK: f16x8.max{{$}}
+v128_t test_f16x8_max(v128_t a, v128_t b) { return wasm_f16x8_max(a, b); }
+
+// CHECK-LABEL: test_f16x8_pmin:
+// CHECK: f16x8.pmin{{$}}
+v128_t test_f16x8_pmin(v128_t a, v128_t b) { return wasm_f16x8_pmin(a, b); }
+
+// CHECK-LABEL: test_f16x8_pmax:
+// CHECK: f16x8.pmax{{$}}
+v128_t test_f16x8_pmax(v128_t a, v128_t b) { return wasm_f16x8_pmax(a, b); }
+
+// CHECK-LABEL: test_f16x8_eq:
+// CHECK: f16x8.eq{{$}}
+v128_t test_f16x8_eq(v128_t a, v128_t b) { return wasm_f16x8_eq(a, b); }
+
+// CHECK-LABEL: test_f16x8_ne:
+// CHECK: f16x8.ne{{$}}
+v128_t test_f16x8_ne(v128_t a, v128_t b) { return wasm_f16x8_ne(a, b); }
+
+// CHECK-LABEL: test_f16x8_lt:
+// CHECK: f16x8.lt{{$}}
+v128_t test_f16x8_lt(v128_t a, v128_t b) { return wasm_f16x8_lt(a, b); }
+
+// CHECK-LABEL: test_f16x8_gt:
+// CHECK: f16x8.gt{{$}}
+v128_t test_f16x8_gt(v128_t a, v128_t b) { return wasm_f16x8_gt(a, b); }
+
+// CHECK-LABEL: test_f16x8_le:
+// CHECK: f16x8.le{{$}}
+v128_t test_f16x8_le(v128_t a, v128_t b) { return wasm_f16x8_le(a, b); }
+
+// CHECK-LABEL: test_f16x8_ge:
+// CHECK: f16x8.ge{{$}}
+v128_t test_f16x8_ge(v128_t a, v128_t b) { return wasm_f16x8_ge(a, b); }
+
+// CHECK-LABEL: test_i16x8_trunc_sat_f16x8:
+// CHECK: i16x8.trunc_sat_f16x8_s{{$}}
+v128_t test_i16x8_trunc_sat_f16x8(v128_t a) {
+  return wasm_i16x8_trunc_sat_f16x8(a);
+}
+
+// CHECK-LABEL: test_u16x8_trunc_sat_f16x8:
+// CHECK: i16x8.trunc_sat_f16x8_u{{$}}
+v128_t test_u16x8_trunc_sat_f16x8(v128_t a) {
+  return wasm_u16x8_trunc_sat_f16x8(a);
+}
+
+// CHECK-LABEL: test_f16x8_convert_i16x8:
+// CHECK: f16x8.convert_i16x8_s{{$}}
+v128_t test_f16x8_convert_i16x8(v128_t a) {
+  return wasm_f16x8_convert_i16x8(a);
+}
+
+// CHECK-LABEL: test_f16x8_convert_u16x8:
+// CHECK: f16x8.convert_i16x8_u{{$}}
+v128_t test_f16x8_convert_u16x8(v128_t a) {
+  return wasm_f16x8_convert_u16x8(a);
+}
+
+// CHECK-LABEL: test_f16x8_relaxed_madd:
+// CHECK: f16x8.relaxed_madd{{$}}
+v128_t test_f16x8_relaxed_madd(v128_t a, v128_t b, v128_t c) {
+  return wasm_f16x8_relaxed_madd(a, b, c);
+}
+
+// CHECK-LABEL: test_f16x8_relaxed_nmadd:
+// CHECK: f16x8.relaxed_nmadd{{$}}
+v128_t test_f16x8_relaxed_nmadd(v128_t a, v128_t b, v128_t c) {
+  return wasm_f16x8_relaxed_nmadd(a, b, c);
+}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 4578ff7f715146..5cc084f3ab1387 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -275,8 +275,12 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
         setOperationAction(Op, T, Expand);
 
     // But saturating fp_to_int converstions are
-    for (auto Op : {ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT})
+    for (auto Op : {ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}) {
       setOperationAction(Op, MVT::v4i32, Custom);
+      if (Subtarget->hasFP16()) {
+        setOperationAction(Op, MVT::v8i16, Custom);
+      }
+    }
 
     // Support vector extending
     for (auto T : MVT::integer_fixedlen_vector_valuetypes()) {
@@ -2475,6 +2479,9 @@ SDValue WebAssemblyTargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
   if (ResT == MVT::v4i32 && SatVT == MVT::i32)
     return Op;
 
+  if (ResT == MVT::v8i16 && SatVT == MVT::i16)
+    return Op;
+
   return SDValue();
 }
 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 887278e9c12ef3..da4b8d228f627d 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -165,8 +165,9 @@ def F16x8 : Vec {
  let prefix = "f16x8";
 }
 
-// TODO: Include F16x8 here when half precision is better supported.
-defvar AllVecs = [I8x16, I16x8, I32x4, I64x2, F32x4, F64x2];
+// TODO: Remove StdVecs when the F16x8 works every where StdVecs is used.
+defvar StdVecs = [I8x16, I16x8, I32x4, I64x2, F32x4, F64x2];
+defvar AllVecs = !listconcat(StdVecs, [F16x8]);
 defvar IntVecs = [I8x16, I16x8, I32x4, I64x2];
 
 //===----------------------------------------------------------------------===//
@@ -188,7 +189,7 @@ defm LOAD_V128_A64 :
 }
 
 // Def load patterns from WebAssemblyInstrMemory.td for vector types
-foreach vec = AllVecs in {
+foreach vec = StdVecs in {
 defm : LoadPat<vec.vt, load, "LOAD_V128">;
 }
 
@@ -217,7 +218,7 @@ defm "" : SIMDLoadSplat<16, 8>;
 defm "" : SIMDLoadSplat<32, 9>;
 defm "" : SIMDLoadSplat<64, 10>;
 
-foreach vec = AllVecs in {
+foreach vec = StdVecs in {
   defvar inst = "LOAD"#vec.lane_bits#"_SPLAT";
   defm : LoadPat<vec.vt,
                  PatFrag<(ops node:$addr), (splat_vector (vec.lane_vt (vec.lane_load node:$addr)))>,
@@ -389,7 +390,7 @@ defm STORE_V128_A64 :
 }
 
 // Def store patterns from WebAssemblyInstrMemory.td for vector types
-foreach vec = AllVecs in {
+foreach vec = StdVecs in {
 defm : StorePat<vec.vt, store, "STORE_V128">;
 }
 
@@ -513,7 +514,7 @@ defm "" : ConstVec<F64x2,
                   "$i0, $i1">;
 
 // Match splat(x) -> const.v128(x, ..., x)
-foreach vec = AllVecs in {
+for...
[truncated]

@@ -165,8 +165,9 @@ def F16x8 : Vec {
  let prefix = "f16x8";
 }

-// TODO: Include F16x8 here when half precision is better supported.
-defvar AllVecs = [I8x16, I16x8, I32x4, I64x2, F32x4, F64x2];
+// TODO: Remove StdVecs when the F16x8 works every where StdVecs is used.
brendandahl (Contributor Author)

It's not obvious from this patch, but now AllVecs is only used in one place for bitcast (which means it now works for f16x8 vectors too).

Alternatively, I can leave AllVecs alone and just concat F16x8 down where bitcast is supported.
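
For context, the bitcast support discussed here is what lets the header round-trip half-precision vectors through the generic v128_t type. A minimal sketch of the cast pattern, with the typedefs inlined so it stands alone (they mirror, in simplified form, the ones in wasm_simd128.h):

// Simplified mirrors of the typedefs in wasm_simd128.h.
typedef int v128_t __attribute__((__vector_size__(16), __aligned__(16)));
typedef __fp16 __f16x8 __attribute__((__vector_size__(16), __aligned__(16)));

// The (__f16x8) and (v128_t) casts are pure bitcasts; the v8f16 <-> v128
// patterns added in this patch are what allow them to fold to nothing.
static inline v128_t f16x8_double(v128_t a) {
  return (v128_t)((__f16x8)a + (__f16x8)a);
}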

@sunfishcode
Member

Would it make sense to put these declarations behind #ifdef __wasm_fp16__ so that they aren't declared if fp16 support isn't enabled?

@brendandahl
Contributor Author

Would it make sense to put these declarations behind #ifdef __wasm_fp16__ so that they aren't declared if fp16 support isn't enabled?

I could do that, if that's preferred. I followed what the relaxed instructions did and used the target attribute __target__("fp16").

@sunfishcode
Member

Oh, I missed that. In that case, that seems sufficient. Thanks!
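
For readers following this exchange, a short sketch contrasting the two guarding strategies (the macro name __wasm_fp16__ comes from the suggestion above; the attribute macro is copied from the patch):

typedef int v128_t __attribute__((__vector_size__(16), __aligned__(16)));

/* Suggested alternative: hide the declarations entirely unless fp16 is on. */
#ifdef __wasm_fp16__
v128_t some_fp16_intrinsic(v128_t a); /* hypothetical declaration */
#endif

/* What the patch does, following the relaxed-SIMD intrinsics: declare
   unconditionally but tag each function with __target__("fp16"). Because
   these functions are also __always_inline__, calling one without -mfp16
   is rejected at compile time at the call site. */
#define __FP16_FN_ATTRS                                                        \
  __attribute__((__always_inline__, __nodebug__, __target__("fp16"),           \
                 __min_vector_width__(128)))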

@aheejin
Member

aheejin commented Aug 29, 2024

Looks good to me, but I'm not an expert here. Maybe @tlively can take a look?

@tlively tlively (Collaborator) left a comment

Looks great! Glad you found the cross-project-tests.

@brendandahl brendandahl merged commit 5703d85 into llvm:main Aug 30, 2024
8 checks passed
@llvm-ci
Collaborator

llvm-ci commented Aug 30, 2024

LLVM Buildbot has detected a new failure on builder mlir-nvidia-gcc7 running on mlir-nvidia while building clang,cross-project-tests,llvm at step 6 "test-build-check-mlir-build-only-check-mlir".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/116/builds/2900

Here is the relevant piece of the build log for reference:
Step 6 (test-build-check-mlir-build-only-check-mlir) failure: test (failure)
******************** TEST 'MLIR :: Integration/GPU/CUDA/async.mlir' FAILED ********************
Exit Code: 1

Command Output (stdout):
--
# RUN: at line 1
/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -gpu-kernel-outlining  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)'  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -gpu-async-region -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary="format=fatbin"  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -async-to-async-runtime -async-runtime-ref-counting  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -convert-async-to-llvm -convert-func-to-llvm  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-cpu-runner    --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_cuda_runtime.so    --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_async_runtime.so    --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_runner_utils.so    --entry-point-result=void -O0  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/FileCheck /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -gpu-kernel-outlining
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt '-pass-pipeline=builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)'
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -gpu-async-region -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary=format=fatbin
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -async-to-async-runtime -async-runtime-ref-counting
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -convert-async-to-llvm -convert-func-to-llvm
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-cpu-runner --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_cuda_runtime.so --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_async_runtime.so --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_runner_utils.so --entry-point-result=void -O0
# .---command stderr------------
# | 'cuStreamWaitEvent(stream, event, 0)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuStreamWaitEvent(stream, event, 0)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuStreamWaitEvent(stream, event, 0)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuStreamWaitEvent(stream, event, 0)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventSynchronize(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# `-----------------------------
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/FileCheck /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir
# .---command stderr------------
# | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir:68:12: error: CHECK: expected string not found in input
# |  // CHECK: [84, 84]
# |            ^
# | <stdin>:1:1: note: scanning from here
# | Unranked Memref base@ = 0x58d295c0f420 rank = 1 offset = 0 sizes = [2] strides = [1] data = 
# | ^
# | <stdin>:2:1: note: possible intended match here
# | [0, 0]
# | ^
# | 
# | Input file: <stdin>
# | Check file: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir
# | 
# | -dump-input=help explains the following input dump.
# | 
# | Input was:
# | <<<<<<
# |             1: Unranked Memref base@ = 0x58d295c0f420 rank = 1 offset = 0 sizes = [2] strides = [1] data =  
# | check:68'0     X~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ error: no match found
# |             2: [0, 0] 
# | check:68'0     ~~~~~~~
# | check:68'1     ?       possible intended match
...
