
Commit 4eaa070

SC llvm team authored and committed
Merged main:a4fd3dba6e285734bc635b0651a30dfeffedeada into amd-gfx:7dad81725749
Local branch amd-gfx 7dad817: Merged main:5d4a0d54b5269bad1410e6db957836fe98634069 into amd-gfx:2577f0f72239
Remote branch main a4fd3db: [AMDGPU] Use wider loop lowering type for LowerMemIntrinsics (llvm#112332)
2 parents 7dad817 + a4fd3db commit 4eaa070

35 files changed: +16820 -415 lines

clang/test/Driver/print-supported-extensions-riscv.c

Lines changed: 1 addition & 0 deletions

@@ -110,6 +110,7 @@
 // CHECK-NEXT: zvl8192b 1.0 'Zvl' (Minimum Vector Length) 8192
 // CHECK-NEXT: zhinx 1.0 'Zhinx' (Half Float in Integer)
 // CHECK-NEXT: zhinxmin 1.0 'Zhinxmin' (Half Float in Integer Minimal)
+// CHECK-NEXT: sha 1.0 'Sha' (Augmented Hypervisor)
 // CHECK-NEXT: shcounterenw 1.0 'Shcounterenw' (Support writeable hcounteren enable bit for any hpmcounter that is not read-only zero)
 // CHECK-NEXT: shgatpa 1.0 'Sgatpa' (SvNNx4 mode supported for all modes supported by satp, as well as Bare)
 // CHECK-NEXT: shtvala 1.0 'Shtvala' (htval provides all needed values)

clang/test/Preprocessor/riscv-target-features.c

Lines changed: 9 additions & 0 deletions

@@ -20,6 +20,7 @@
 // CHECK-NOT: __riscv_m {{.*$}}
 // CHECK-NOT: __riscv_mul {{.*$}}
 // CHECK-NOT: __riscv_muldiv {{.*$}}
+// CHECK-NOT: __riscv_sha {{.*$}}
 // CHECK-NOT: __riscv_shcounterenw {{.*$}}
 // CHECK-NOT: __riscv_shgatpa {{.*$}}
 // CHECK-NOT: __riscv_shtvala {{.*$}}
@@ -323,6 +324,14 @@
 // CHECK-M-EXT: __riscv_mul 1
 // CHECK-M-EXT: __riscv_muldiv 1
 
+// RUN: %clang --target=riscv32-unknown-linux-gnu \
+// RUN: -march=rv32isha -E -dM %s \
+// RUN: -o - | FileCheck --check-prefix=CHECK-SHCOUNTERENW-EXT %s
+// RUN: %clang --target=riscv64-unknown-linux-gnu \
+// RUN: -march=rv64isha -E -dM %s \
+// RUN: -o - | FileCheck --check-prefix=CHECK-SHCOUNTERENW-EXT %s
+// CHECK-SHA-EXT: __riscv_sha 1000000{{$}}
+
 // RUN: %clang --target=riscv32-unknown-linux-gnu \
 // RUN: -march=rv32ishcounterenw -E -dM %s \
 // RUN: -o - | FileCheck --check-prefix=CHECK-SHCOUNTERENW-EXT %s

llvm/docs/RISCVUsage.rst

Lines changed: 1 addition & 0 deletions

@@ -119,6 +119,7 @@ on support follow.
 ``E`` Supported (`See note <#riscv-rve-note>`__)
 ``H`` Assembly Support
 ``M`` Supported
+``Sha`` Supported
 ``Shcounterenw`` Assembly Support (`See note <#riscv-profiles-extensions-note>`__)
 ``Shgatpa`` Assembly Support (`See note <#riscv-profiles-extensions-note>`__)
 ``Shtvala`` Assembly Support (`See note <#riscv-profiles-extensions-note>`__)

llvm/docs/ReleaseNotes.md

Lines changed: 1 addition & 0 deletions

@@ -183,6 +183,7 @@ Changes to the RISC-V Backend
 * The `Zacas` extension is no longer marked as experimental.
 * The `Smmpm`, `Smnpm`, `Ssnpm`, `Supm`, and `Sspm` pointer masking extensions
 are no longer marked as experimental.
+* The `Sha` extension is now supported.
 
 Changes to the WebAssembly Backend
 ----------------------------------

llvm/include/llvm/ADT/ArrayRef.h

Lines changed: 9 additions & 6 deletions

@@ -70,15 +70,16 @@ namespace llvm {
 /*implicit*/ ArrayRef(std::nullopt_t) {}
 
 /// Construct an ArrayRef from a single element.
-/*implicit*/ ArrayRef(const T &OneElt)
-: Data(&OneElt), Length(1) {}
+/*implicit*/ ArrayRef(const T &OneElt LLVM_LIFETIME_BOUND)
+: Data(&OneElt), Length(1) {}
 
 /// Construct an ArrayRef from a pointer and length.
-constexpr /*implicit*/ ArrayRef(const T *data, size_t length)
+constexpr /*implicit*/ ArrayRef(const T *data LLVM_LIFETIME_BOUND,
+size_t length)
 : Data(data), Length(length) {}
 
 /// Construct an ArrayRef from a range.
-constexpr ArrayRef(const T *begin, const T *end)
+constexpr ArrayRef(const T *begin LLVM_LIFETIME_BOUND, const T *end)
 : Data(begin), Length(end - begin) {
 assert(begin <= end);
 }
@@ -103,7 +104,8 @@ namespace llvm {
 
 /// Construct an ArrayRef from a C array.
 template <size_t N>
-/*implicit*/ constexpr ArrayRef(const T (&Arr)[N]) : Data(Arr), Length(N) {}
+/*implicit*/ constexpr ArrayRef(const T (&Arr LLVM_LIFETIME_BOUND)[N])
+: Data(Arr), Length(N) {}
 
 /// Construct an ArrayRef from a std::initializer_list.
 #if LLVM_GNUC_PREREQ(9, 0, 0)
@@ -113,7 +115,8 @@ namespace llvm {
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Winit-list-lifetime"
 #endif
-constexpr /*implicit*/ ArrayRef(std::initializer_list<T> Vec)
+constexpr /*implicit*/ ArrayRef(
+std::initializer_list<T> Vec LLVM_LIFETIME_BOUND)
 : Data(Vec.begin() == Vec.end() ? (T *)nullptr : Vec.begin()),
 Length(Vec.size()) {}
 #if LLVM_GNUC_PREREQ(9, 0, 0)

llvm/include/llvm/Config/llvm-config.h.cmake

Lines changed: 1 addition & 1 deletion

@@ -16,7 +16,7 @@
 
 /* Indicate that this is LLVM compiled from the amd-gfx branch. */
 #define LLVM_HAVE_BRANCH_AMD_GFX
-#define LLVM_MAIN_REVISION 516212
+#define LLVM_MAIN_REVISION 516218
 
 /* Define if LLVM_ENABLE_DUMP is enabled */
 #cmakedefine LLVM_ENABLE_DUMP

llvm/include/llvm/Support/Compiler.h

Lines changed: 6 additions & 0 deletions

@@ -413,6 +413,12 @@
 #define LLVM_GSL_POINTER
 #endif
 
+#if LLVM_HAS_CPP_ATTRIBUTE(clang::lifetimebound)
+#define LLVM_LIFETIME_BOUND [[clang::lifetimebound]]
+#else
+#define LLVM_LIFETIME_BOUND
+#endif
+
 #if LLVM_HAS_CPP_ATTRIBUTE(nodiscard) >= 201907L
 #define LLVM_CTOR_NODISCARD [[nodiscard]]
 #else
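
For context, a minimal sketch (not part of this commit; the function name and the exact warning flags mentioned in the comments are illustrative assumptions) of the kind of bug that LLVM_LIFETIME_BOUND on the ArrayRef constructors above lets clang diagnose:

// Minimal sketch, assuming LLVM headers are available: with
// [[clang::lifetimebound]] on ArrayRef's single-element constructor, clang
// can warn that the returned ArrayRef refers to a local that is about to go
// out of scope (e.g. -Wreturn-stack-address / -Wdangling style diagnostics).
#include "llvm/ADT/ArrayRef.h"

llvm::ArrayRef<int> makeDanglingRef() {
  int Local = 42;
  // ArrayRef(const T &OneElt LLVM_LIFETIME_BOUND) ties the ArrayRef's
  // lifetime to 'Local', so returning it escapes a dead stack slot.
  return llvm::ArrayRef<int>(Local);
}

int main() { return makeDanglingRef().empty() ? 0 : 1; }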

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 29 additions & 9 deletions

@@ -75,6 +75,13 @@ static cl::opt<size_t> InlineMaxBB(
 cl::desc("Maximum number of BBs allowed in a function after inlining"
 " (compile time constraint)"));
 
+// This default unroll factor is based on microbenchmarks on gfx1030.
+static cl::opt<unsigned> MemcpyLoopUnroll(
+"amdgpu-memcpy-loop-unroll",
+cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
+"operations when lowering memcpy as a loop"),
+cl::init(16), cl::Hidden);
+
 static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
 unsigned Depth = 0) {
 const Instruction *I = dyn_cast<Instruction>(Cond);
@@ -409,13 +416,8 @@ int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
 return 1024;
 }
 
-// FIXME: Really we would like to issue multiple 128-bit loads and stores per
-// iteration. Should we report a larger size and let it legalize?
-//
 // FIXME: Should we use narrower types for local/region, or account for when
 // unaligned access is legal?
-//
-// FIXME: This could use fine tuning and microbenchmarks.
 Type *GCNTTIImpl::getMemcpyLoopLoweringType(
 LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
 unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
@@ -442,17 +444,29 @@
 return FixedVectorType::get(Type::getInt32Ty(Context), 2);
 }
 
-// Global memory works best with 16-byte accesses. Private memory will also
-// hit this, although they'll be decomposed.
-return FixedVectorType::get(Type::getInt32Ty(Context), 4);
+// Global memory works best with 16-byte accesses.
+// If the operation has a fixed known length that is large enough, it is
+// worthwhile to return an even wider type and let legalization lower it into
+// multiple accesses, effectively unrolling the memcpy loop. Private memory
+// also hits this, although accesses may be decomposed.
+//
+// Don't unroll if Length is not a constant, since unrolling leads to worse
+// performance for length values that are smaller or slightly larger than the
+// total size of the type returned here. Mitigating that would require a more
+// complex lowering for variable-length memcpy and memmove.
+unsigned I32EltsInVector = 4;
+if (MemcpyLoopUnroll > 0 && isa<ConstantInt>(Length))
+return FixedVectorType::get(Type::getInt32Ty(Context),
+MemcpyLoopUnroll * I32EltsInVector);
+
+return FixedVectorType::get(Type::getInt32Ty(Context), I32EltsInVector);
 }
 
 void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
 Align SrcAlign, Align DestAlign,
 std::optional<uint32_t> AtomicCpySize) const {
-assert(RemainingBytes < 16);
 
 if (AtomicCpySize)
 BaseT::getMemcpyLoopResidualLoweringType(
@@ -462,6 +476,12 @@ void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
 Align MinAlign = std::min(SrcAlign, DestAlign);
 
 if (MinAlign != Align(2)) {
+Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
+while (RemainingBytes >= 16) {
+OpsOut.push_back(I32x4Ty);
+RemainingBytes -= 16;
+}
+
 Type *I64Ty = Type::getInt64Ty(Context);
 while (RemainingBytes >= 8) {
 OpsOut.push_back(I64Ty);
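
To make the effect of the new default concrete, here is a small standalone sketch (plain C++, not LLVM API; the variable names simply mirror the cl::opt default of 16 and the 4 x i32 base vector in the diff above) of the arithmetic behind the widened loop lowering type:

// Minimal sketch: with the default unroll factor of 16 and a 4 x i32
// (16-byte) base vector, a memcpy with a constant length gets a <64 x i32>
// loop lowering type, i.e. 256 bytes per loop iteration; non-constant
// lengths keep the plain <4 x i32> type.
#include <cstdio>

int main() {
  const unsigned MemcpyLoopUnroll = 16; // default of -amdgpu-memcpy-loop-unroll
  const unsigned I32EltsInVector = 4;   // 16-byte accesses suit global memory
  const unsigned BytesPerElt = 4;

  const unsigned EltsPerIter = MemcpyLoopUnroll * I32EltsInVector;
  std::printf("loop lowering type: <%u x i32>, %u bytes per iteration\n",
              EltsPerIter, EltsPerIter * BytesPerElt);
  return 0;
}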

llvm/lib/Target/RISCV/RISCVFeatures.td

Lines changed: 7 additions & 0 deletions

@@ -1029,6 +1029,13 @@ def FeatureStdExtSvpbmt
 : RISCVExtension<"svpbmt", 1, 0,
 "'Svpbmt' (Page-Based Memory Types)">;
 
+def FeatureStdExtSha
+: RISCVExtension<"sha", 1, 0,
+"'Sha' (Augmented Hypervisor)",
+[FeatureStdExtH, FeatureStdExtSsstateen, FeatureStdExtShcounterenw,
+FeatureStdExtShvstvala, FeatureStdExtShtvala, FeatureStdExtShvstvecd,
+FeatureStdExtShvsatpa, FeatureStdExtShgatpa]>;
+
 // Pointer Masking extensions
 
 // A supervisor-level extension that provides pointer masking for the next lower

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 2 additions & 0 deletions

@@ -2393,6 +2393,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::bf16, Custom);
 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
 setF16Action(VT, Expand);
+if (!Subtarget.hasBF16())
+setOperationAction(ISD::VSELECT, VT, Custom);
 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);

llvm/lib/Transforms/InstCombine/InstructionCombining.cpp

Lines changed: 3 additions & 1 deletion

@@ -3753,7 +3753,9 @@ Instruction *InstCombinerImpl::visitBranchInst(BranchInst &BI) {
 }
 
 // Replace all dominated uses of the condition with true/false
-if (BI.getSuccessor(0) != BI.getSuccessor(1)) {
+// Ignore constant expressions to avoid iterating over uses on other
+// functions.
+if (!isa<Constant>(Cond) && BI.getSuccessor(0) != BI.getSuccessor(1)) {
 for (auto &U : make_early_inc_range(Cond->uses())) {
 BasicBlockEdge Edge0(BI.getParent(), BI.getSuccessor(0));
 if (DT.dominates(Edge0, U)) {
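
For context, a small sketch against LLVM's C++ API (an assumption of this note, not code from the commit) of why the new isa<Constant> guard matters: constants are uniqued per LLVMContext, so the use list of "i1 true" reaches branch instructions in every function that uses it, and walking it from visitBranchInst would touch instructions outside the current function.

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include <cstdio>

using namespace llvm;

// Build a void function whose entry block conditionally branches on the
// context-wide "i1 true" constant.
static void makeFunc(Module &M, StringRef Name) {
  LLVMContext &Ctx = M.getContext();
  auto *FTy = FunctionType::get(Type::getVoidTy(Ctx), /*isVarArg=*/false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, Name, M);
  BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F);
  BasicBlock *A = BasicBlock::Create(Ctx, "a", F);
  BasicBlock *B = BasicBlock::Create(Ctx, "b", F);
  IRBuilder<> Builder(Entry);
  Builder.CreateCondBr(ConstantInt::getTrue(Ctx), A, B);
  Builder.SetInsertPoint(A);
  Builder.CreateRetVoid();
  Builder.SetInsertPoint(B);
  Builder.CreateRetVoid();
}

int main() {
  LLVMContext Ctx;
  Module M("demo", Ctx);
  makeFunc(M, "f");
  makeFunc(M, "g");
  // The single uniqued i1 true is used by the branch in *both* functions,
  // so iterating its uses from inside one function would visit the other.
  std::printf("uses of i1 true: %u\n",
              ConstantInt::getTrue(Ctx)->getNumUses());
  return 0;
}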

llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll

Lines changed: 16 additions & 16 deletions

@@ -131,7 +131,7 @@ define double @t1_strict(double %x) #0 {
 ; CHECK-NEXT: ret
 entry:
 %conv = call i64 @llvm.experimental.constrained.fptosi.i64.f64(double %x, metadata !"fpexcept.strict") #0
-%conv1 = call double @llvm.experimental.constrained.sitofp.i64.f64(i64 %conv, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+%conv1 = call double @llvm.experimental.constrained.sitofp.f64.i64(i64 %conv, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
 ret double %conv1
 }
 
@@ -143,7 +143,7 @@ define float @t2_strict(float %x) #0 {
 ; CHECK-NEXT: ret
 entry:
 %conv = call i32 @llvm.experimental.constrained.fptosi.i32.f32(float %x, metadata !"fpexcept.strict") #0
-%conv1 = call float @llvm.experimental.constrained.sitofp.i32.f32(i32 %conv, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+%conv1 = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %conv, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
 ret float %conv1
 }
 
@@ -155,7 +155,7 @@ define half @t3_strict(half %x) #0 {
 ; CHECK-NEXT: ret
 entry:
 %conv = call i32 @llvm.experimental.constrained.fptosi.i32.f16(half %x, metadata !"fpexcept.strict") #0
-%conv1 = call half @llvm.experimental.constrained.sitofp.i32.f16(i32 %conv, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+%conv1 = call half @llvm.experimental.constrained.sitofp.f16.i32(i32 %conv, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
 ret half %conv1
 }
 
@@ -167,7 +167,7 @@ define double @t4_strict(double %x) #0 {
 ; CHECK-NEXT: ret
 entry:
 %conv = call i64 @llvm.experimental.constrained.fptoui.i64.f64(double %x, metadata !"fpexcept.strict") #0
-%conv1 = call double @llvm.experimental.constrained.uitofp.i64.f64(i64 %conv, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+%conv1 = call double @llvm.experimental.constrained.uitofp.f64.i64(i64 %conv, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
 ret double %conv1
 }
 
@@ -179,7 +179,7 @@ define float @t5_strict(float %x) #0 {
 ; CHECK-NEXT: ret
 entry:
 %conv = call i32 @llvm.experimental.constrained.fptoui.i32.f32(float %x, metadata !"fpexcept.strict") #0
-%conv1 = call float @llvm.experimental.constrained.uitofp.i32.f32(i32 %conv, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+%conv1 = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 %conv, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
 ret float %conv1
 }
 
@@ -191,7 +191,7 @@ define half @t6_strict(half %x) #0 {
 ; CHECK-NEXT: ret
 entry:
 %conv = call i32 @llvm.experimental.constrained.fptoui.i32.f16(half %x, metadata !"fpexcept.strict") #0
-%conv1 = call half @llvm.experimental.constrained.uitofp.i32.f16(i32 %conv, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+%conv1 = call half @llvm.experimental.constrained.uitofp.f16.i32(i32 %conv, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
 ret half %conv1
 }
 
@@ -216,7 +216,7 @@ define bfloat @t7_strict(bfloat %x) #0 {
 ; CHECK-NEXT: ret
 entry:
 %conv = call i32 @llvm.experimental.constrained.fptosi.i32.bf16(bfloat %x, metadata !"fpexcept.strict") #0
-%conv1 = call bfloat @llvm.experimental.constrained.sitofp.i32.bf16(i32 %conv, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+%conv1 = call bfloat @llvm.experimental.constrained.sitofp.bf16.i32(i32 %conv, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
 ret bfloat %conv1
 }
 
@@ -241,7 +241,7 @@ define bfloat @t8_strict(bfloat %x) #0 {
 ; CHECK-NEXT: ret
 entry:
 %conv = call i32 @llvm.experimental.constrained.fptoui.i32.bf16(bfloat %x, metadata !"fpexcept.strict") #0
-%conv1 = call bfloat @llvm.experimental.constrained.uitofp.i32.bf16(i32 %conv, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+%conv1 = call bfloat @llvm.experimental.constrained.uitofp.bf16.i32(i32 %conv, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
 ret bfloat %conv1
 }
 
@@ -255,11 +255,11 @@ declare i32 @llvm.experimental.constrained.fptosi.i32.f32(float, metadata)
 declare i32 @llvm.experimental.constrained.fptoui.i32.f32(float, metadata)
 declare i64 @llvm.experimental.constrained.fptosi.i64.f64(double, metadata)
 declare i64 @llvm.experimental.constrained.fptoui.i64.f64(double, metadata)
-declare bfloat @llvm.experimental.constrained.sitofp.i32.bf16(i32, metadata, metadata)
-declare bfloat @llvm.experimental.constrained.uitofp.i32.bf16(i32, metadata, metadata)
-declare half @llvm.experimental.constrained.sitofp.i32.f16(i32, metadata, metadata)
-declare half @llvm.experimental.constrained.uitofp.i32.f16(i32, metadata, metadata)
-declare float @llvm.experimental.constrained.sitofp.i32.f32(i32, metadata, metadata)
-declare float @llvm.experimental.constrained.uitofp.i32.f32(i32, metadata, metadata)
-declare double @llvm.experimental.constrained.sitofp.i64.f64(i64, metadata, metadata)
-declare double @llvm.experimental.constrained.uitofp.i64.f64(i64, metadata, metadata)
+declare bfloat @llvm.experimental.constrained.sitofp.bf16.i32(i32, metadata, metadata)
+declare bfloat @llvm.experimental.constrained.uitofp.bf16.i32(i32, metadata, metadata)
+declare half @llvm.experimental.constrained.sitofp.f16.i32(i32, metadata, metadata)
+declare half @llvm.experimental.constrained.uitofp.f16.i32(i32, metadata, metadata)
+declare float @llvm.experimental.constrained.sitofp.f32.i32(i32, metadata, metadata)
+declare float @llvm.experimental.constrained.uitofp.f32.i32(i32, metadata, metadata)
+declare double @llvm.experimental.constrained.sitofp.f64.i64(i64, metadata, metadata)
+declare double @llvm.experimental.constrained.uitofp.f64.i64(i64, metadata, metadata)
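
The updates above (and in the next two test files) follow the mangling rule for overloaded constrained intrinsics: the overloaded result type is appended to the name before the overloaded operand type, so sitofp from i64 to double is @llvm.experimental.constrained.sitofp.f64.i64 and lround from f16 to i32 is @llvm.experimental.constrained.lround.i32.f16. A small sketch against LLVM's C++ API (an assumption of this note, not code from the commit) that prints the mangled name:

#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include <cstdio>

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("demo", Ctx);
  // Overloaded types are passed in order of appearance: result, then operand.
  // Newer LLVM spells this Intrinsic::getOrInsertDeclaration.
  Function *F = Intrinsic::getDeclaration(
      &M, Intrinsic::experimental_constrained_sitofp,
      {Type::getDoubleTy(Ctx), Type::getInt64Ty(Ctx)});
  // Expected to print "llvm.experimental.constrained.sitofp.f64.i64".
  std::printf("%s\n", F->getName().str().c_str());
  return 0;
}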

llvm/test/CodeGen/AArch64/fp-intrinsics-fp16.ll

Lines changed: 8 additions & 8 deletions

@@ -595,7 +595,7 @@ define i32 @lrint_f16(half %x) #0 {
 ; CHECK-FP16-NEXT: frintx h0, h0
 ; CHECK-FP16-NEXT: fcvtzs w0, h0
 ; CHECK-FP16-NEXT: ret
-%val = call i32 @llvm.experimental.constrained.lrint.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+%val = call i32 @llvm.experimental.constrained.lrint.i32.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
 ret i32 %val
 }
 
@@ -612,7 +612,7 @@ define i64 @llrint_f16(half %x) #0 {
 ; CHECK-FP16-NEXT: frintx h0, h0
 ; CHECK-FP16-NEXT: fcvtzs x0, h0
 ; CHECK-FP16-NEXT: ret
-%val = call i64 @llvm.experimental.constrained.llrint.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+%val = call i64 @llvm.experimental.constrained.llrint.i64.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
 ret i64 %val
 }
 
@@ -693,7 +693,7 @@ define i32 @lround_f16(half %x) #0 {
 ; CHECK-FP16: // %bb.0:
 ; CHECK-FP16-NEXT: fcvtas w0, h0
 ; CHECK-FP16-NEXT: ret
-%val = call i32 @llvm.experimental.constrained.lround.f16(half %x, metadata !"fpexcept.strict") #0
+%val = call i32 @llvm.experimental.constrained.lround.i32.f16(half %x, metadata !"fpexcept.strict") #0
 ret i32 %val
 }
 
@@ -708,7 +708,7 @@ define i64 @llround_f16(half %x) #0 {
 ; CHECK-FP16: // %bb.0:
 ; CHECK-FP16-NEXT: fcvtas x0, h0
 ; CHECK-FP16-NEXT: ret
-%val = call i64 @llvm.experimental.constrained.llround.f16(half %x, metadata !"fpexcept.strict") #0
+%val = call i64 @llvm.experimental.constrained.llround.i64.f16(half %x, metadata !"fpexcept.strict") #0
 ret i64 %val
 }
 
@@ -1277,14 +1277,14 @@ declare half @llvm.experimental.constrained.exp.f16(half, metadata, metadata)
 declare half @llvm.experimental.constrained.exp2.f16(half, metadata, metadata)
 declare half @llvm.experimental.constrained.rint.f16(half, metadata, metadata)
 declare half @llvm.experimental.constrained.nearbyint.f16(half, metadata, metadata)
-declare i32 @llvm.experimental.constrained.lrint.f16(half, metadata, metadata)
-declare i64 @llvm.experimental.constrained.llrint.f16(half, metadata, metadata)
+declare i32 @llvm.experimental.constrained.lrint.i32.f16(half, metadata, metadata)
+declare i64 @llvm.experimental.constrained.llrint.i64.f16(half, metadata, metadata)
 declare half @llvm.experimental.constrained.maxnum.f16(half, half, metadata)
 declare half @llvm.experimental.constrained.minnum.f16(half, half, metadata)
 declare half @llvm.experimental.constrained.ceil.f16(half, metadata)
 declare half @llvm.experimental.constrained.floor.f16(half, metadata)
-declare i32 @llvm.experimental.constrained.lround.f16(half, metadata)
-declare i64 @llvm.experimental.constrained.llround.f16(half, metadata)
+declare i32 @llvm.experimental.constrained.lround.i32.f16(half, metadata)
+declare i64 @llvm.experimental.constrained.llround.i64.f16(half, metadata)
 declare half @llvm.experimental.constrained.round.f16(half, metadata)
 declare half @llvm.experimental.constrained.roundeven.f16(half, metadata)
 declare half @llvm.experimental.constrained.trunc.f16(half, metadata)

llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll

Lines changed: 3 additions & 3 deletions

@@ -279,7 +279,7 @@ define <4 x i1> @fcmps_v4f32(<4 x float> %x, <4 x float> %y) #0 {
 ; CHECK-NEXT: xtn v0.4h, v4.4s
 ; CHECK-NEXT: ret
 entry:
-%val = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f64(<4 x float> %x, <4 x float> %y, metadata !"oeq", metadata !"fpexcept.strict")
+%val = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float> %x, <4 x float> %y, metadata !"oeq", metadata !"fpexcept.strict")
 ret <4 x i1> %val
 }
 
@@ -825,8 +825,8 @@ declare <4 x float> @llvm.experimental.constrained.floor.v4f32(<4 x float>, meta
 declare <4 x float> @llvm.experimental.constrained.round.v4f32(<4 x float>, metadata)
 declare <4 x float> @llvm.experimental.constrained.roundeven.v4f32(<4 x float>, metadata)
 declare <4 x float> @llvm.experimental.constrained.trunc.v4f32(<4 x float>, metadata)
-declare <4 x i1> @llvm.experimental.constrained.fcmp.v4f64(<4 x float>, <4 x float>, metadata, metadata)
-declare <4 x i1> @llvm.experimental.constrained.fcmps.v4f64(<4 x float>, <4 x float>, metadata, metadata)
+declare <4 x i1> @llvm.experimental.constrained.fcmp.v4f32(<4 x float>, <4 x float>, metadata, metadata)
+declare <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float>, <4 x float>, metadata, metadata)
 
 declare <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double>, <2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double>, <2 x double>, metadata, metadata)
