llvm · lukel97 · Nov 6, 2024 · Nov 5, 2024 · Nov 5, 2024 · Nov 5, 2024
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1339,9 +1339,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
                             ISD::VECTOR_SHUFFLE, ISD::VECTOR_COMPRESS},
                            VT, Custom);
 
-        // FIXME: mload, mstore, mgather, mscatter, vp_gather/scatter can be
+        // FIXME: mload, mstore, vp_gather/scatter can be
         // hoisted to here.
-        setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom);
+        setOperationAction({ISD::LOAD, ISD::STORE, ISD::MGATHER, ISD::MSCATTER},
+                           VT, Custom);
         setOperationAction({ISD::VP_LOAD, ISD::VP_STORE,
                             ISD::EXPERIMENTAL_VP_STRIDED_LOAD,
                             ISD::EXPERIMENTAL_VP_STRIDED_STORE},
@@ -1408,8 +1409,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
         setOperationAction({ISD::BUILD_VECTOR, ISD::SCALAR_TO_VECTOR}, VT,
                            Custom);
 
-        setOperationAction(
-            {ISD::MLOAD, ISD::MSTORE, ISD::MGATHER, ISD::MSCATTER}, VT, Custom);
+        setOperationAction({ISD::MLOAD, ISD::MSTORE}, VT, Custom);
 
         setOperationAction({ISD::VP_GATHER, ISD::VP_SCATTER}, VT, Custom);
 

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -270,7 +270,12 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
     if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize())
       return false;
 
-    return TLI->isLegalElementTypeForRVV(ElemType);
+    // TODO: Move bf16/f16 support into isLegalElementTypeForRVV
+    return TLI->isLegalElementTypeForRVV(ElemType) ||
+           (DataTypeVT.getVectorElementType() == MVT::bf16 &&
+            ST->hasVInstructionsBF16Minimal()) ||
+           (DataTypeVT.getVectorElementType() == MVT::f16 &&
+            ST->hasVInstructionsF16Minimal());
   }
 
   bool isLegalMaskedGather(Type *DataType, Align Alignment) {

diff --git a/llvm/test/Analysis/CostModel/RISCV/fixed-vector-gather.ll b/llvm/test/Analysis/CostModel/RISCV/fixed-vector-gather.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh -riscv-v-fixed-length-vector-lmul-max=1 < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh,+zvfbfmin -riscv-v-fixed-length-vector-lmul-max=1 < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfhmin,+zvfbfmin -riscv-v-fixed-length-vector-lmul-max=1 < %s | FileCheck %s
 ; Check that we don't crash querying costs when vectors are not enabled.
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64
 
@@ -14,6 +15,12 @@ define i32 @masked_gather() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1F32 = call <1 x float> @llvm.masked.gather.v1f32.v1p0(<1 x ptr> undef, i32 4, <1 x i1> undef, <1 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32BF16 = call <32 x bfloat> @llvm.masked.gather.v32bf16.v32p0(<32 x ptr> undef, i32 2, <32 x i1> undef, <32 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16BF16 = call <16 x bfloat> @llvm.masked.gather.v16bf16.v16p0(<16 x ptr> undef, i32 2, <16 x i1> undef, <16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8BF16 = call <8 x bfloat> @llvm.masked.gather.v8bf16.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4BF16 = call <4 x bfloat> @llvm.masked.gather.v4bf16.v4p0(<4 x ptr> undef, i32 2, <4 x i1> undef, <4 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2BF16 = call <2 x bfloat> @llvm.masked.gather.v2bf16.v2p0(<2 x ptr> undef, i32 2, <2 x i1> undef, <2 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1BF16 = call <1 x bfloat> @llvm.masked.gather.v1bf16.v1p0(<1 x ptr> undef, i32 2, <1 x i1> undef, <1 x bfloat> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32F16 = call <32 x half> @llvm.masked.gather.v32f16.v32p0(<32 x ptr> undef, i32 2, <32 x i1> undef, <32 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F16 = call <16 x half> @llvm.masked.gather.v16f16.v16p0(<16 x ptr> undef, i32 2, <16 x i1> undef, <16 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F16 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x half> undef)
@@ -51,6 +58,12 @@ define i32 @masked_gather() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4F32.u = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 2, <4 x i1> undef, <4 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F32.u = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 2, <2 x i1> undef, <2 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32.u = call <1 x float> @llvm.masked.gather.v1f32.v1p0(<1 x ptr> undef, i32 2, <1 x i1> undef, <1 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32BF16.u = call <32 x bfloat> @llvm.masked.gather.v32bf16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16BF16.u = call <16 x bfloat> @llvm.masked.gather.v16bf16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8BF16.u = call <8 x bfloat> @llvm.masked.gather.v8bf16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4BF16.u = call <4 x bfloat> @llvm.masked.gather.v4bf16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2BF16.u = call <2 x bfloat> @llvm.masked.gather.v2bf16.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1BF16.u = call <1 x bfloat> @llvm.masked.gather.v1bf16.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x bfloat> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32F16.u = call <32 x half> @llvm.masked.gather.v32f16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16F16.u = call <16 x half> @llvm.masked.gather.v16f16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8F16.u = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x half> undef)
@@ -85,6 +98,13 @@ define i32 @masked_gather() {
   %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x float> undef)
   %V1F32 = call <1 x float> @llvm.masked.gather.v1f32.v1p0(<1 x ptr> undef, i32 4, <1 x i1> undef, <1 x float> undef)
 
+  %V32BF16 = call <32 x bfloat> @llvm.masked.gather.v32bf16.v32p0(<32 x ptr> undef, i32 2, <32 x i1> undef, <32 x bfloat> undef)
+  %V16BF16 = call <16 x bfloat> @llvm.masked.gather.v16bf16.v16p0(<16 x ptr> undef, i32 2, <16 x i1> undef, <16 x bfloat> undef)
+  %V8BF16 = call <8 x bfloat> @llvm.masked.gather.v8bf16.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x bfloat> undef)
+  %V4BF16 = call <4 x bfloat> @llvm.masked.gather.v4bf16.v4p0(<4 x ptr> undef, i32 2, <4 x i1> undef, <4 x bfloat> undef)
+  %V2BF16 = call <2 x bfloat> @llvm.masked.gather.v2bf16.v2p0(<2 x ptr> undef, i32 2, <2 x i1> undef, <2 x bfloat> undef)
+  %V1BF16 = call <1 x bfloat> @llvm.masked.gather.v1bf16.v1p0(<1 x ptr> undef, i32 2, <1 x i1> undef, <1 x bfloat> undef)
+
   %V32F16 = call <32 x half> @llvm.masked.gather.v32f16.v32p0(<32 x ptr> undef, i32 2, <32 x i1> undef, <32 x half> undef)
   %V16F16 = call <16 x half> @llvm.masked.gather.v16f16.v16p0(<16 x ptr> undef, i32 2, <16 x i1> undef, <16 x half> undef)
   %V8F16 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x half> undef)
@@ -130,6 +150,13 @@ define i32 @masked_gather() {
   %V2F32.u = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 2, <2 x i1> undef, <2 x float> undef)
   %V1F32.u = call <1 x float> @llvm.masked.gather.v1f32.v1p0(<1 x ptr> undef, i32 2, <1 x i1> undef, <1 x float> undef)
 
+  %V32BF16.u = call <32 x bfloat> @llvm.masked.gather.v32bf16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x bfloat> undef)
+  %V16BF16.u = call <16 x bfloat> @llvm.masked.gather.v16bf16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x bfloat> undef)
+  %V8BF16.u = call <8 x bfloat> @llvm.masked.gather.v8bf16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x bfloat> undef)
+  %V4BF16.u = call <4 x bfloat> @llvm.masked.gather.v4bf16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x bfloat> undef)
+  %V2BF16.u = call <2 x bfloat> @llvm.masked.gather.v2bf16.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x bfloat> undef)
+  %V1BF16.u = call <1 x bfloat> @llvm.masked.gather.v1bf16.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x bfloat> undef)
+
   %V32F16.u = call <32 x half> @llvm.masked.gather.v32f16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x half> undef)
   %V16F16.u = call <16 x half> @llvm.masked.gather.v16f16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x half> undef)
   %V8F16.u = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x half> undef)

diff --git a/llvm/test/Analysis/CostModel/RISCV/fixed-vector-scatter.ll b/llvm/test/Analysis/CostModel/RISCV/fixed-vector-scatter.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh -riscv-v-fixed-length-vector-lmul-max=1 < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh,+zvfbfmin -riscv-v-fixed-length-vector-lmul-max=1 < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfhmin,+zvfbfmin -riscv-v-fixed-length-vector-lmul-max=1 < %s | FileCheck %s
 ; Check that we don't crash querying costs when vectors are not enabled.
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64
 
@@ -14,6 +15,12 @@ define i32 @masked_scatter() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 4, <4 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 4, <2 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f32.v1p0(<1 x float> undef, <1 x ptr> undef, i32 4, <1 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v32bf16.v32p0(<32 x bfloat> undef, <32 x ptr> undef, i32 2, <32 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v16bf16.v16p0(<16 x bfloat> undef, <16 x ptr> undef, i32 2, <16 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8bf16.v8p0(<8 x bfloat> undef, <8 x ptr> undef, i32 2, <8 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4bf16.v4p0(<4 x bfloat> undef, <4 x ptr> undef, i32 2, <4 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2bf16.v2p0(<2 x bfloat> undef, <2 x ptr> undef, i32 2, <2 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1bf16.v1p0(<1 x bfloat> undef, <1 x ptr> undef, i32 2, <1 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v32f16.v32p0(<32 x half> undef, <32 x ptr> undef, i32 2, <32 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v16f16.v16p0(<16 x half> undef, <16 x ptr> undef, i32 2, <16 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> undef, <8 x ptr> undef, i32 2, <8 x i1> undef)
@@ -51,6 +58,12 @@ define i32 @masked_scatter() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 2, <4 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 2, <2 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f32.v1p0(<1 x float> undef, <1 x ptr> undef, i32 2, <1 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: call void @llvm.masked.scatter.v32bf16.v32p0(<32 x bfloat> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16bf16.v16p0(<16 x bfloat> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8bf16.v8p0(<8 x bfloat> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4bf16.v4p0(<4 x bfloat> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2bf16.v2p0(<2 x bfloat> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1bf16.v1p0(<1 x bfloat> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: call void @llvm.masked.scatter.v32f16.v32p0(<32 x half> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16f16.v16p0(<16 x half> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
@@ -85,6 +98,13 @@ define i32 @masked_scatter() {
   call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 4, <2 x i1> undef)
   call void @llvm.masked.scatter.v1f32.v1p0(<1 x float> undef, <1 x ptr> undef, i32 4, <1 x i1> undef)
 
+  call void @llvm.masked.scatter.v32bf16.v32p0(<32 x bfloat> undef, <32 x ptr> undef, i32 2, <32 x i1> undef)
+  call void @llvm.masked.scatter.v16bf16.v16p0(<16 x bfloat> undef, <16 x ptr> undef, i32 2, <16 x i1> undef)
+  call void @llvm.masked.scatter.v8bf16.v8p0(<8 x bfloat> undef, <8 x ptr> undef, i32 2, <8 x i1> undef)
+  call void @llvm.masked.scatter.v4bf16.v4p0(<4 x bfloat> undef, <4 x ptr> undef, i32 2, <4 x i1> undef)
+  call void @llvm.masked.scatter.v2bf16.v2p0(<2 x bfloat> undef, <2 x ptr> undef, i32 2, <2 x i1> undef)
+  call void @llvm.masked.scatter.v1bf16.v1p0(<1 x bfloat> undef, <1 x ptr> undef, i32 2, <1 x i1> undef)
+
   call void @llvm.masked.scatter.v32f16.v32p0(<32 x half> undef, <32 x ptr> undef, i32 2, <32 x i1> undef)
   call void @llvm.masked.scatter.v16f16.v16p0(<16 x half> undef, <16 x ptr> undef, i32 2, <16 x i1> undef)
   call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> undef, <8 x ptr> undef, i32 2, <8 x i1> undef)
@@ -130,6 +150,13 @@ define i32 @masked_scatter() {
   call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 2, <2 x i1> undef)
   call void @llvm.masked.scatter.v1f32.v1p0(<1 x float> undef, <1 x ptr> undef, i32 2, <1 x i1> undef)
 
+  call void @llvm.masked.scatter.v32bf16.v32p0(<32 x bfloat> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+  call void @llvm.masked.scatter.v16bf16.v16p0(<16 x bfloat> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+  call void @llvm.masked.scatter.v8bf16.v8p0(<8 x bfloat> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+  call void @llvm.masked.scatter.v4bf16.v4p0(<4 x bfloat> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+  call void @llvm.masked.scatter.v2bf16.v2p0(<2 x bfloat> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+  call void @llvm.masked.scatter.v1bf16.v1p0(<1 x bfloat> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+
   call void @llvm.masked.scatter.v32f16.v32p0(<32 x half> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
   call void @llvm.masked.scatter.v16f16.v16p0(<16 x half> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
   call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)