[WebAssembly] Custom combines for f64x2.promote_low_f32x4

tlively · tlively · commit e5220104d070 · 2021-07-09T18:59:29.000-07:00
Replace the clang builtin function and LLVM intrinsic previously used to select the f64x2.promote_low_f32x4 instruction with custom combines from standard SelectionDAG nodes. Implement the new combines to share code with the similar combines for f64x2.convert_low_i32x4_{s,u}. Resolves PR50232. Differential Revision: https://reviews.llvm.org/D105675
diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def
@@ -192,7 +192,6 @@ TARGET_BUILTIN(__builtin_wasm_narrow_u_i16x8_i32x4, "V8UsV4iV4i", "nc", "simd128
 TARGET_BUILTIN(__builtin_wasm_trunc_sat_zero_s_f64x2_i32x4, "V4iV2d", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_trunc_sat_zero_u_f64x2_i32x4, "V4UiV2d", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_demote_zero_f64x2_f32x4, "V4fV2d", "nc", "simd128")
-TARGET_BUILTIN(__builtin_wasm_promote_low_f32x4_f64x2, "V2dV4f", "nc", "simd128")
 
 TARGET_BUILTIN(__builtin_wasm_load32_zero, "V4iiC*", "n", "simd128")
 TARGET_BUILTIN(__builtin_wasm_load64_zero, "V2LLiLLiC*", "n", "simd128")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -17728,11 +17728,6 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_demote_zero);
     return Builder.CreateCall(Callee, Vec);
   }
-  case WebAssembly::BI__builtin_wasm_promote_low_f32x4_f64x2: {
-    Value *Vec = EmitScalarExpr(E->getArg(0));
-    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_promote_low);
-    return Builder.CreateCall(Callee, Vec);
-  }
   case WebAssembly::BI__builtin_wasm_load32_zero: {
     Value *Ptr = EmitScalarExpr(E->getArg(0));
     Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_load32_zero);
diff --git a/clang/lib/Headers/wasm_simd128.h b/clang/lib/Headers/wasm_simd128.h
@@ -43,6 +43,7 @@ typedef unsigned short __u16x4
 typedef int __i32x2 __attribute__((__vector_size__(8), __aligned__(8)));
 typedef unsigned int __u32x2
     __attribute__((__vector_size__(8), __aligned__(8)));
+typedef float __f32x2 __attribute__((__vector_size__(8), __aligned__(8)));
 
 #define __DEFAULT_FN_ATTRS                                                     \
   __attribute__((__always_inline__, __nodebug__, __target__("simd128"),        \
@@ -1155,7 +1156,8 @@ wasm_f32x4_demote_f64x2_zero(v128_t __a) {
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS
 wasm_f64x2_promote_low_f32x4(v128_t __a) {
-  return (v128_t)__builtin_wasm_promote_low_f32x4_f64x2((__f32x4)__a);
+  return (v128_t) __builtin_convertvector(
+      (__f32x2){((__f32x4)__a)[0], ((__f32x4)__a)[1]}, __f64x2);
 }
 
 #define wasm_i8x16_shuffle(__a, __b, __c0, __c1, __c2, __c3, __c4, __c5, __c6, \
diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c
@@ -898,12 +898,6 @@ f32x4 wasm_demote_zero_f64x2_f32x4(f64x2 x) {
   // WEBASSEMBLY: ret
 }
 
-f64x2 wasm_promote_low_f32x4_f64x2(f32x4 x) {
-  return __builtin_wasm_promote_low_f32x4_f64x2(x);
-  // WEBASSEMBLY: call <2 x double> @llvm.wasm.promote.low(<4 x float> %x)
-  // WEBASSEMBLY: ret
-}
-
 i32x4 load32_zero(const int *p) {
   return __builtin_wasm_load32_zero(p);
   // WEBASSEMBLY: call <4 x i32> @llvm.wasm.load32.zero(i32* %p)
diff --git a/clang/test/Headers/wasm.c b/clang/test/Headers/wasm.c
diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -263,13 +263,10 @@ def int_wasm_extadd_pairwise_unsigned :
             [LLVMSubdivide2VectorType<0>],
             [IntrNoMem, IntrSpeculatable]>;
 
-// TODO: Remove these if possible if they are merged to the spec.
+// TODO: Remove this if possible.
 def int_wasm_demote_zero :
   Intrinsic<[llvm_v4f32_ty], [llvm_v2f64_ty],
             [IntrNoMem, IntrSpeculatable]>;
-def int_wasm_promote_low :
-  Intrinsic<[llvm_v2f64_ty], [llvm_v4f32_ty],
-            [IntrNoMem, IntrSpeculatable]>;
 
 //===----------------------------------------------------------------------===//
 // Thread-local storage intrinsics
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
@@ -37,6 +37,7 @@ HANDLE_NODETYPE(EXTEND_HIGH_S)
 HANDLE_NODETYPE(EXTEND_HIGH_U)
 HANDLE_NODETYPE(CONVERT_LOW_S)
 HANDLE_NODETYPE(CONVERT_LOW_U)
+HANDLE_NODETYPE(PROMOTE_LOW)
 HANDLE_NODETYPE(TRUNC_SAT_ZERO_S)
 HANDLE_NODETYPE(TRUNC_SAT_ZERO_U)
 HANDLE_NODETYPE(THROW)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -149,9 +149,11 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
     setTargetDAGCombine(ISD::SIGN_EXTEND);
     setTargetDAGCombine(ISD::ZERO_EXTEND);
 
-    // Combine int_to_fp of extract_vectors and vice versa into conversions ops
+    // Combine int_to_fp or fp_extend of extract_vectors and vice versa into
+    // conversions ops
     setTargetDAGCombine(ISD::SINT_TO_FP);
     setTargetDAGCombine(ISD::UINT_TO_FP);
+    setTargetDAGCombine(ISD::FP_EXTEND);
     setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
 
     // Combine concat of {s,u}int_to_fp_sat to i32x4.trunc_sat_f64x2_zero_{s,u}
@@ -2186,60 +2188,109 @@ performVectorConvertLowCombine(SDNode *N,
   if (ResVT != MVT::v2f64)
     return SDValue();
 
-  if (N->getOpcode() == ISD::SINT_TO_FP || N->getOpcode() == ISD::UINT_TO_FP) {
-    // Combine this:
-    //
-    //   (v2f64 ({s,u}int_to_fp
-    //     (v2i32 (extract_subvector (v4i32 $x), 0))))
-    //
-    // into (f64x2.convert_low_i32x4_{s,u} $x).
-    auto Extract = N->getOperand(0);
-    if (Extract.getOpcode() != ISD::EXTRACT_SUBVECTOR)
-      return SDValue();
-    if (Extract.getValueType() != MVT::v2i32)
-      return SDValue();
-    auto Source = Extract.getOperand(0);
-    if (Source.getValueType() != MVT::v4i32)
-      return SDValue();
-    auto *IndexNode = dyn_cast<ConstantSDNode>(Extract.getOperand(1));
-    if (IndexNode == nullptr || IndexNode->getZExtValue() != 0)
-      return SDValue();
-
-    unsigned Op = N->getOpcode() == ISD::SINT_TO_FP
-                      ? WebAssemblyISD::CONVERT_LOW_S
-                      : WebAssemblyISD::CONVERT_LOW_U;
-
-    return DAG.getNode(Op, SDLoc(N), ResVT, Source);
+  auto GetWasmConversionOp = [](unsigned Op) {
+    switch (Op) {
+    case ISD::SINT_TO_FP:
+      return WebAssemblyISD::CONVERT_LOW_S;
+    case ISD::UINT_TO_FP:
+      return WebAssemblyISD::CONVERT_LOW_U;
+    case ISD::FP_EXTEND:
+      return WebAssemblyISD::PROMOTE_LOW;
+    }
+    llvm_unreachable("unexpected op");
+  };
 
-  } else if (N->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+  if (N->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
     // Combine this:
     //
     //   (v2f64 (extract_subvector
     //     (v4f64 ({s,u}int_to_fp (v4i32 $x))), 0))
     //
     // into (f64x2.convert_low_i32x4_{s,u} $x).
-    auto IntToFP = N->getOperand(0);
-    if (IntToFP.getOpcode() != ISD::SINT_TO_FP &&
-        IntToFP.getOpcode() != ISD::UINT_TO_FP)
+    //
+    // Or this:
+    //
+    //  (v2f64 (extract_subvector
+    //    (v4f64 (fp_extend (v4f32 $x))), 0))
+    //
+    // into (f64x2.promote_low_f32x4 $x).
+    auto Conversion = N->getOperand(0);
+    auto ConversionOp = Conversion.getOpcode();
+    MVT ExpectedSourceType;
+    switch (ConversionOp) {
+    case ISD::SINT_TO_FP:
+    case ISD::UINT_TO_FP:
+      ExpectedSourceType = MVT::v4i32;
+      break;
+    case ISD::FP_EXTEND:
+      ExpectedSourceType = MVT::v4f32;
+      break;
+    default:
       return SDValue();
-    if (IntToFP.getValueType() != MVT::v4f64)
+    }
+
+    if (Conversion.getValueType() != MVT::v4f64)
       return SDValue();
-    auto Source = IntToFP.getOperand(0);
-    if (Source.getValueType() != MVT::v4i32)
+
+    auto Source = Conversion.getOperand(0);
+    if (Source.getValueType() != ExpectedSourceType)
       return SDValue();
+
     auto IndexNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
     if (IndexNode == nullptr || IndexNode->getZExtValue() != 0)
       return SDValue();
 
-    unsigned Op = IntToFP->getOpcode() == ISD::SINT_TO_FP
-                      ? WebAssemblyISD::CONVERT_LOW_S
-                      : WebAssemblyISD::CONVERT_LOW_U;
-
+    auto Op = GetWasmConversionOp(ConversionOp);
     return DAG.getNode(Op, SDLoc(N), ResVT, Source);
+  }
 
-  } else {
+  // Combine this:
+  //
+  //   (v2f64 ({s,u}int_to_fp
+  //     (v2i32 (extract_subvector (v4i32 $x), 0))))
+  //
+  // into (f64x2.convert_low_i32x4_{s,u} $x).
+  //
+  // Or this:
+  //
+  //   (v2f64 (fp_extend
+  //     (v2f32 (extract_subvector (v4f32 $x), 0))))
+  //
+  // into (f64x2.promote_low_f32x4 $x).
+  auto ConversionOp = N->getOpcode();
+  MVT ExpectedExtractType;
+  MVT ExpectedSourceType;
+  switch (ConversionOp) {
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+    ExpectedExtractType = MVT::v2i32;
+    ExpectedSourceType = MVT::v4i32;
+    break;
+  case ISD::FP_EXTEND:
+    ExpectedExtractType = MVT::v2f32;
+    ExpectedSourceType = MVT::v4f32;
+    break;
+  default:
     llvm_unreachable("unexpected opcode");
   }
+
+  auto Extract = N->getOperand(0);
+  if (Extract.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+    return SDValue();
+
+  if (Extract.getValueType() != ExpectedExtractType)
+    return SDValue();
+
+  auto Source = Extract.getOperand(0);
+  if (Source.getValueType() != ExpectedSourceType)
+    return SDValue();
+
+  auto *IndexNode = dyn_cast<ConstantSDNode>(Extract.getOperand(1));
+  if (IndexNode == nullptr || IndexNode->getZExtValue() != 0)
+    return SDValue();
+
+  unsigned Op = GetWasmConversionOp(ConversionOp);
+  return DAG.getNode(Op, SDLoc(N), ResVT, Source);
 }
 
 static SDValue
@@ -2298,6 +2349,7 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
     return performVectorExtendCombine(N, DCI);
   case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:
+  case ISD::FP_EXTEND:
   case ISD::EXTRACT_SUBVECTOR:
     return performVectorConvertLowCombine(N, DCI);
   case ISD::CONCAT_VECTORS:
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1288,11 +1288,13 @@ defm "" : SIMDConvert<I32x4, I16x8, int_wasm_extadd_pairwise_signed,
 defm "" : SIMDConvert<I32x4, I16x8, int_wasm_extadd_pairwise_unsigned,
                       "extadd_pairwise_i16x8_u", 0x7f>;
 
-// Prototype f64x2 conversions
+// f64x2 <-> f32x4 conversions
 defm "" : SIMDConvert<F32x4, F64x2, int_wasm_demote_zero,
                       "demote_zero_f64x2", 0x5e>;
-defm "" : SIMDConvert<F64x2, F32x4, int_wasm_promote_low,
-                      "promote_low_f32x4", 0x5f>;
+
+def promote_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>;
+def promote_low : SDNode<"WebAssemblyISD::PROMOTE_LOW", promote_t>;
+defm "" : SIMDConvert<F64x2, F32x4, promote_low, "promote_low_f32x4", 0x5f>;
 
 //===----------------------------------------------------------------------===//
 // Saturating Rounding Q-Format Multiplication
diff --git a/llvm/test/CodeGen/WebAssembly/simd-conversions.ll b/llvm/test/CodeGen/WebAssembly/simd-conversions.ll
@@ -126,3 +126,25 @@ define <2 x double> @convert_low_u_v2f64_2(<4 x i32> %x) {
   %a = shufflevector <4 x double> %v, <4 x double> undef, <2 x i32> <i32 0, i32 1>
   ret <2 x double> %a
 }
+
+; CHECK-LABEL: promote_low_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-NEXT: .functype promote_low_v2f64 (v128) -> (v128){{$}}
+; SIMD128-NEXT: f64x2.promote_low_f32x4 $push[[R:[0-9]+]]=, $0
+; SIMD128-NEXT: return $pop[[R]]
+define <2 x double> @promote_low_v2f64(<4 x float> %x) {
+  %v = shufflevector <4 x float> %x, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %a = fpext <2 x float> %v to <2 x double>
+  ret <2 x double> %a
+}
+
+; CHECK-LABEL: promote_low_v2f64_2:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-NEXT: .functype promote_low_v2f64_2 (v128) -> (v128){{$}}
+; SIMD128-NEXT: f64x2.promote_low_f32x4 $push[[R:[0-9]+]]=, $0
+; SIMD128-NEXT: return $pop[[R]]
+define <2 x double> @promote_low_v2f64_2(<4 x float> %x) {
+  %v = fpext <4 x float> %x to <4 x double>
+  %a = shufflevector <4 x double> %v, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x double> %a
+}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
@@ -806,13 +806,3 @@ define <2 x double> @nearest_v2f64(<2 x double> %a) {
   %v = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a)
   ret <2 x double> %v
 }
-
-; CHECK-LABEL: promote_low_v2f64:
-; CHECK-NEXT: .functype promote_low_v2f64 (v128) -> (v128){{$}}
-; CHECK-NEXT: f64x2.promote_low_f32x4 $push[[R:[0-9]+]]=, $0{{$}}
-; CHECK-NEXT: return $pop[[R]]{{$}}
-declare <2 x double> @llvm.wasm.promote.low(<4 x float>)
-define <2 x double> @promote_low_v2f64(<4 x float> %a) {
-  %v = call <2 x double> @llvm.wasm.promote.low(<4 x float> %a)
-  ret <2 x double> %v
-}