Revert "[mlir][x86vector] AVX Convert/Broadcast BF16 to F32 instructions" (#136781)

jplehr · web-flow · commit 7e86afa6526d · 2025-04-23T00:46:40.000+02:00
Reverts #135143. This broke multiple bots; see the PR for details.
diff --git a/mlir/include/mlir/Dialect/X86Vector/X86Vector.td b/mlir/include/mlir/Dialect/X86Vector/X86Vector.td
@@ -83,7 +83,7 @@ def MaskCompressOp : AVX512_Op<"mask.compress", [Pure,
     }
   }];
   let extraClassDeclaration = [{
-    SmallVector<Value> getIntrinsicOperands(::mlir::RewriterBase&, const LLVMTypeConverter&);
+    SmallVector<Value> getIntrinsicOperands(::mlir::RewriterBase&);
   }];
 }
 
@@ -404,127 +404,8 @@ def DotOp : AVX_LowOp<"dot", [Pure,
     }
   }];
   let extraClassDeclaration = [{
-    SmallVector<Value> getIntrinsicOperands(::mlir::RewriterBase&, const LLVMTypeConverter&);
+    SmallVector<Value> getIntrinsicOperands(::mlir::RewriterBase&);
   }];
 }
 
-
-//----------------------------------------------------------------------------//
-// AVX: Convert packed BF16 even-indexed/odd-indexed elements into packed F32
-//----------------------------------------------------------------------------//
-
-def CvtPackedEvenIndexedBF16ToF32Op : AVX_Op<"cvt.packed.even.indexed.bf16_to_f32", [MemoryEffects<[MemRead]>, 
-  DeclareOpInterfaceMethods<OneToOneIntrinsicOpInterface>]> {
-  let summary = "AVX: Convert packed BF16 even-indexed elements into packed F32 Data.";
-  let description = [{
-    #### From the Intel Intrinsics Guide:
-
-    Convert packed BF16 (16-bit) floating-point even-indexed elements stored at
-    memory locations starting at location `__A` to packed single-precision
-    (32-bit) floating-point elements, and store the results in `dst`.
-
-    Example:
-    ```mlir
-    %dst = x86vector.avx.cvt.packed.even.indexed.bf16_to_f32 %a : memref<16xbf16> -> vector<8xf32>
-    ```
-  }];
-  let arguments = (ins AnyMemRef:$a);
-  let results = (outs VectorOfLengthAndType<[4, 8], [F32]>:$dst);
-  let assemblyFormat =
-    "$a  attr-dict`:` type($a)`->` type($dst)";
-
-  let extraClassDefinition = [{
-    std::string $cppClass::getIntrinsicName() {
-      std::string intr = "llvm.x86.vcvtneebf162ps";
-      VectorType vecType = getDst().getType();
-      unsigned elemBitWidth = vecType.getElementTypeBitWidth();
-      unsigned opBitWidth = vecType.getShape()[0] * elemBitWidth;
-      intr += std::to_string(opBitWidth);
-      return intr;
-    }
-  }];
-
-  let extraClassDeclaration = [{
-        SmallVector<Value> getIntrinsicOperands(::mlir::RewriterBase&, const LLVMTypeConverter&);
-  }];
-}
-
-def CvtPackedOddIndexedBF16ToF32Op : AVX_Op<"cvt.packed.odd.indexed.bf16_to_f32", [MemoryEffects<[MemRead]>, 
-  DeclareOpInterfaceMethods<OneToOneIntrinsicOpInterface>]> {
-  let summary = "AVX: Convert packed BF16 odd-indexed elements into packed F32 Data.";
-  let description = [{
-    #### From the Intel Intrinsics Guide:
-
-    Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at
-    memory locations starting at location `__A` to packed single-precision
-    (32-bit) floating-point elements, and store the results in `dst`.
-
-    Example:
-    ```mlir
-    %dst = x86vector.avx.cvt.packed.odd.indexed.bf16_to_f32 %a : memref<16xbf16> -> vector<8xf32>
-    ```
-  }];
-  let arguments = (ins AnyMemRef:$a);
-  let results = (outs VectorOfLengthAndType<[4, 8], [F32]>:$dst);
-  let assemblyFormat =
-    "$a  attr-dict`:` type($a)`->` type($dst)";
-
-  let extraClassDefinition = [{
-    std::string $cppClass::getIntrinsicName() {
-      std::string intr = "llvm.x86.vcvtneobf162ps";
-      VectorType vecType = getDst().getType();
-      unsigned elemBitWidth = vecType.getElementTypeBitWidth();
-      unsigned opBitWidth = vecType.getShape()[0] * elemBitWidth;
-      intr += std::to_string(opBitWidth);
-      return intr;
-    }
-  }];
-
-  let extraClassDeclaration = [{
-        SmallVector<Value> getIntrinsicOperands(::mlir::RewriterBase&, const LLVMTypeConverter&);
-  }];
-}
-
-//----------------------------------------------------------------------------//
-// AVX: Convert BF16 to F32 and broadcast into packed F32
-//----------------------------------------------------------------------------//
-
-def BcstBF16ToPackedF32Op : AVX_Op<"bcst.bf16_to_f32.packed", [MemoryEffects<[MemRead]>,
-  DeclareOpInterfaceMethods<OneToOneIntrinsicOpInterface>]> {
-  let summary = "AVX: Broadcasts BF16 into packed F32 Data.";
-  let description = [{
-    #### From the Intel Intrinsics Guide:
-
-    Convert scalar BF16 (16-bit) floating-point element stored at memory locations
-    starting at location `__A` to a single-precision (32-bit) floating-point,
-    broadcast it to packed single-precision (32-bit) floating-point elements,
-    and store the results in `dst`.
-
-    Example:
-    ```mlir
-    %dst = x86vector.avx.bcst.bf16_to_f32.packed %a : memref<1xbf16> -> vector<8xf32>
-    ```
-  }];
-  let arguments = (ins AnyMemRef:$a);
-  let results = (outs VectorOfLengthAndType<[4, 8], [F32]>:$dst);
-  let assemblyFormat =
-    "$a  attr-dict`:` type($a)`->` type($dst)";
-
-  let extraClassDefinition = [{
-    std::string $cppClass::getIntrinsicName() {
-      std::string intr = "llvm.x86.vbcstnebf162ps";
-      VectorType vecType = getDst().getType();
-      unsigned elemBitWidth = vecType.getElementTypeBitWidth();
-      unsigned opBitWidth = vecType.getShape()[0] * elemBitWidth;
-      intr += std::to_string(opBitWidth);
-      return intr;
-    }
-  }];
-
-    let extraClassDeclaration = [{
-        SmallVector<Value> getIntrinsicOperands(::mlir::RewriterBase&, const LLVMTypeConverter&);
-  }];
-
-}
-
 #endif // X86VECTOR_OPS
diff --git a/mlir/include/mlir/Dialect/X86Vector/X86VectorDialect.h b/mlir/include/mlir/Dialect/X86Vector/X86VectorDialect.h
@@ -14,8 +14,6 @@
 #define MLIR_DIALECT_X86VECTOR_X86VECTORDIALECT_H_
 
 #include "mlir/Bytecode/BytecodeOpInterface.h"
-#include "mlir/Conversion/LLVMCommon/Pattern.h"
-#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/OpDefinition.h"
diff --git a/mlir/include/mlir/Dialect/X86Vector/X86VectorInterfaces.td b/mlir/include/mlir/Dialect/X86Vector/X86VectorInterfaces.td
@@ -58,7 +58,7 @@ def OneToOneIntrinsicOpInterface : OpInterface<"OneToOneIntrinsicOp"> {
       }],
       /*retType=*/"SmallVector<Value>",
       /*methodName=*/"getIntrinsicOperands",
-      /*args=*/(ins "::mlir::RewriterBase &":$rewriter, "const LLVMTypeConverter &":$typeConverter),
+      /*args=*/(ins "::mlir::RewriterBase &":$rewriter),
       /*methodBody=*/"",
       /*defaultImplementation=*/"return SmallVector<Value>($_op->getOperands());"
     >,
diff --git a/mlir/lib/Dialect/X86Vector/IR/X86VectorDialect.cpp b/mlir/lib/Dialect/X86Vector/IR/X86VectorDialect.cpp
@@ -31,26 +31,6 @@ void x86vector::X86VectorDialect::initialize() {
       >();
 }
 
-static SmallVector<Value>
-getMemrefBuffPtr(Location loc, ::mlir::TypedValue<::mlir::MemRefType> memrefVal,
-                 RewriterBase &rewriter,
-                 const LLVMTypeConverter &typeConverter) {
-  SmallVector<Value> operands;
-  auto opType = memrefVal.getType();
-
-  Type llvmStructType = typeConverter.convertType(opType);
-  Value llvmStruct =
-      rewriter
-          .create<UnrealizedConversionCastOp>(loc, llvmStructType, memrefVal)
-          .getResult(0);
-  MemRefDescriptor memRefDescriptor(llvmStruct);
-
-  Value ptr = memRefDescriptor.bufferPtr(rewriter, loc, typeConverter, opType);
-  operands.push_back(ptr);
-
-  return operands;
-}
-
 LogicalResult x86vector::MaskCompressOp::verify() {
   if (getSrc() && getConstantSrc())
     return emitError("cannot use both src and constant_src");
@@ -65,8 +45,8 @@ LogicalResult x86vector::MaskCompressOp::verify() {
   return success();
 }
 
-SmallVector<Value> x86vector::MaskCompressOp::getIntrinsicOperands(
-    RewriterBase &rewriter, const LLVMTypeConverter &typeConverter) {
+SmallVector<Value>
+x86vector::MaskCompressOp::getIntrinsicOperands(RewriterBase &rewriter) {
   auto loc = getLoc();
 
   auto opType = getA().getType();
@@ -84,8 +64,7 @@ SmallVector<Value> x86vector::MaskCompressOp::getIntrinsicOperands(
 }
 
 SmallVector<Value>
-x86vector::DotOp::getIntrinsicOperands(RewriterBase &rewriter,
-                                       const LLVMTypeConverter &typeConverter) {
+x86vector::DotOp::getIntrinsicOperands(RewriterBase &rewriter) {
   SmallVector<Value> operands(getOperands());
   // Dot product of all elements, broadcasted to all elements.
   Value scale =
@@ -95,22 +74,5 @@ x86vector::DotOp::getIntrinsicOperands(RewriterBase &rewriter,
   return operands;
 }
 
-SmallVector<Value> x86vector::BcstBF16ToPackedF32Op::getIntrinsicOperands(
-    RewriterBase &rewriter, const LLVMTypeConverter &typeConverter) {
-  return getMemrefBuffPtr(getLoc(), getA(), rewriter, typeConverter);
-}
-
-SmallVector<Value>
-x86vector::CvtPackedOddIndexedBF16ToF32Op::getIntrinsicOperands(
-    RewriterBase &rewriter, const LLVMTypeConverter &typeConverter) {
-  return getMemrefBuffPtr(getLoc(), getA(), rewriter, typeConverter);
-}
-
-SmallVector<Value>
-x86vector::CvtPackedEvenIndexedBF16ToF32Op::getIntrinsicOperands(
-    RewriterBase &rewriter, const LLVMTypeConverter &typeConverter) {
-  return getMemrefBuffPtr(getLoc(), getA(), rewriter, typeConverter);
-}
-
 #define GET_OP_CLASSES
 #include "mlir/Dialect/X86Vector/X86Vector.cpp.inc"
diff --git a/mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp b/mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp
@@ -96,8 +96,8 @@ struct OneToOneIntrinsicOpConversion
   LogicalResult matchAndRewrite(x86vector::OneToOneIntrinsicOp op,
                                 PatternRewriter &rewriter) const override {
     return intrinsicRewrite(op, rewriter.getStringAttr(op.getIntrinsicName()),
-                            op.getIntrinsicOperands(rewriter, typeConverter),
-                            typeConverter, rewriter);
+                            op.getIntrinsicOperands(rewriter), typeConverter,
+                            rewriter);
   }
 
 private:
@@ -114,8 +114,7 @@ void mlir::populateX86VectorLegalizeForLLVMExportPatterns(
 
 void mlir::configureX86VectorLegalizeForExportTarget(
     LLVMConversionTarget &target) {
-  target.addIllegalOp<
-      MaskCompressOp, MaskRndScaleOp, MaskScaleFOp, Vp2IntersectOp, DotBF16Op,
-      CvtPackedF32ToBF16Op, CvtPackedEvenIndexedBF16ToF32Op,
-      CvtPackedOddIndexedBF16ToF32Op, BcstBF16ToPackedF32Op, RsqrtOp, DotOp>();
+  target.addIllegalOp<MaskCompressOp, MaskRndScaleOp, MaskScaleFOp,
+                      Vp2IntersectOp, DotBF16Op, CvtPackedF32ToBF16Op, RsqrtOp,
+                      DotOp>();
 }
diff --git a/mlir/test/Dialect/X86Vector/legalize-for-llvm.mlir b/mlir/test/Dialect/X86Vector/legalize-for-llvm.mlir
@@ -95,60 +95,6 @@ func.func @avx512bf16_cvt_packed_f32_to_bf16_512(
   return %0 : vector<16xbf16>
 }
 
-// CHECK-LABEL: func @avxbf16_cvt_packed_even_indexed_bf16_to_f32_128
-func.func @avxbf16_cvt_packed_even_indexed_bf16_to_f32_128(
-  %a: memref<8xbf16>) -> vector<4xf32>
-{
-  // CHECK: llvm.call_intrinsic "llvm.x86.vcvtneebf162ps128"
-  %0 = x86vector.avx.cvt.packed.even.indexed.bf16_to_f32 %a : memref<8xbf16> -> vector<4xf32>
-  return %0 : vector<4xf32>
-}
-
-// CHECK-LABEL: func @avxbf16_cvt_packed_even_indexed_bf16_to_f32_256
-func.func @avxbf16_cvt_packed_even_indexed_bf16_to_f32_256(
-  %a: memref<16xbf16>) -> vector<8xf32>
-{
-  // CHECK: llvm.call_intrinsic "llvm.x86.vcvtneebf162ps256"
-  %0 = x86vector.avx.cvt.packed.even.indexed.bf16_to_f32 %a : memref<16xbf16> -> vector<8xf32>
-  return %0 : vector<8xf32>
-}
-
-// CHECK-LABEL: func @avxbf16_cvt_packed_odd_indexed_bf16_to_f32_128
-func.func @avxbf16_cvt_packed_odd_indexed_bf16_to_f32_128(
-  %a: memref<8xbf16>) -> vector<4xf32>
-{
-  // CHECK: llvm.call_intrinsic "llvm.x86.vcvtneobf162ps128"
-  %0 = x86vector.avx.cvt.packed.odd.indexed.bf16_to_f32 %a : memref<8xbf16> -> vector<4xf32>
-  return %0 : vector<4xf32>
-}
-
-// CHECK-LABEL: func @avxbf16_cvt_packed_odd_indexed_bf16_to_f32_256
-func.func @avxbf16_cvt_packed_odd_indexed_bf16_to_f32_256(
-  %a: memref<16xbf16>) -> vector<8xf32>
-{
-  // CHECK: llvm.call_intrinsic "llvm.x86.vcvtneobf162ps256"
-  %0 = x86vector.avx.cvt.packed.odd.indexed.bf16_to_f32 %a : memref<16xbf16> -> vector<8xf32>
-  return %0 : vector<8xf32>
-}
-
-// CHECK-LABEL: func @avxbf16_bsct_bf16_to_f32_packed_128
-func.func @avxbf16_bsct_bf16_to_f32_packed_128(
-  %a: memref<1xbf16>) -> vector<4xf32>
-{
-  // CHECK: llvm.call_intrinsic "llvm.x86.vbcstnebf162ps128"
-  %0 = x86vector.avx.bcst.bf16_to_f32.packed %a : memref<1xbf16> -> vector<4xf32>
-  return %0 : vector<4xf32>
-}
-
-// CHECK-LABEL: func @avxbf16_bsct_bf16_to_f32_packed_256
-func.func @avxbf16_bsct_bf16_to_f32_packed_256(
-  %a: memref<1xbf16>) -> vector<8xf32>
-{
-  // CHECK: llvm.call_intrinsic "llvm.x86.vbcstnebf162ps256"
-  %0 = x86vector.avx.bcst.bf16_to_f32.packed %a : memref<1xbf16> -> vector<8xf32>
-  return %0 : vector<8xf32>
-}
-
 // CHECK-LABEL: func @avx_rsqrt
 func.func @avx_rsqrt(%a: vector<8xf32>) -> (vector<8xf32>)
 {
diff --git a/mlir/test/Dialect/X86Vector/roundtrip.mlir b/mlir/test/Dialect/X86Vector/roundtrip.mlir
@@ -94,66 +94,6 @@ func.func @avx512bf16_cvt_packed_f32_to_bf16_512(
   return %0 : vector<16xbf16>
 }
 
-// CHECK-LABEL: func @avxbf16_cvt_packed_even_indexed_bf16_to_f32_128
-func.func @avxbf16_cvt_packed_even_indexed_bf16_to_f32_128(
-  %a: memref<8xbf16>) -> vector<4xf32>
-{
-  // CHECK: x86vector.avx.cvt.packed.even.indexed.bf16_to_f32 {{.*}} :
-  // CHECK-SAME: memref<8xbf16> -> vector<4xf32>
-  %0 = x86vector.avx.cvt.packed.even.indexed.bf16_to_f32 %a : memref<8xbf16> -> vector<4xf32>
-  return %0 : vector<4xf32>
-}
-
-// CHECK-LABEL: func @avxbf16_cvt_packed_even_indexed_bf16_to_f32_256
-func.func @avxbf16_cvt_packed_even_indexed_bf16_to_f32_256(
-  %a: memref<16xbf16>) -> vector<8xf32>
-{
-  // CHECK: x86vector.avx.cvt.packed.even.indexed.bf16_to_f32 {{.*}} :
-  // CHECK-SAME: memref<16xbf16> -> vector<8xf32>
-  %0 = x86vector.avx.cvt.packed.even.indexed.bf16_to_f32 %a : memref<16xbf16> -> vector<8xf32>
-  return %0 : vector<8xf32>
-}
-
-// CHECK-LABEL: func @avxbf16_cvt_packed_odd_indexed_bf16_to_f32_128
-func.func @avxbf16_cvt_packed_odd_indexed_bf16_to_f32_128(
-  %a: memref<8xbf16>) -> vector<4xf32>
-{
-  // CHECK: x86vector.avx.cvt.packed.odd.indexed.bf16_to_f32 {{.*}} :
-  // CHECK-SAME: memref<8xbf16> -> vector<4xf32>
-  %0 = x86vector.avx.cvt.packed.odd.indexed.bf16_to_f32 %a : memref<8xbf16> -> vector<4xf32>
-  return %0 : vector<4xf32>
-}
-
-// CHECK-LABEL: func @avxbf16_cvt_packed_odd_indexed_bf16_to_f32_256
-func.func @avxbf16_cvt_packed_odd_indexed_bf16_to_f32_256(
-  %a: memref<16xbf16>) -> vector<8xf32>
-{
-  // CHECK: x86vector.avx.cvt.packed.odd.indexed.bf16_to_f32 {{.*}} :
-  // CHECK-SAME: memref<16xbf16> -> vector<8xf32>
-  %0 = x86vector.avx.cvt.packed.odd.indexed.bf16_to_f32 %a : memref<16xbf16> -> vector<8xf32>
-  return %0 : vector<8xf32>
-}
-
-// CHECK-LABEL: func @avxbf16_bcst_bf16_to_f32_128
-func.func @avxbf16_bcst_bf16_to_f32_128(
-  %a: memref<1xbf16>) -> vector<4xf32>
-{
-  // CHECK: x86vector.avx.bcst.bf16_to_f32.packed {{.*}} :
-  // CHECK-SAME: memref<1xbf16> -> vector<4xf32>
-  %0 = x86vector.avx.bcst.bf16_to_f32.packed %a : memref<1xbf16> -> vector<4xf32>
-  return %0 : vector<4xf32>
-}
-
-// CHECK-LABEL: func @avxbf16_bcst_bf16_to_f32_256
-func.func @avxbf16_bcst_bf16_to_f32_256(
-  %a: memref<1xbf16>) -> vector<8xf32>
-{
-  // CHECK: x86vector.avx.bcst.bf16_to_f32.packed {{.*}} :
-  // CHECK-SAME: memref<1xbf16> -> vector<8xf32>
-  %0 = x86vector.avx.bcst.bf16_to_f32.packed %a : memref<1xbf16> -> vector<8xf32>
-  return %0 : vector<8xf32>
-}
-
 // CHECK-LABEL: func @avx_rsqrt
 func.func @avx_rsqrt(%a: vector<8xf32>) -> (vector<8xf32>)
 {
diff --git a/mlir/test/Target/LLVMIR/x86vector.mlir b/mlir/test/Target/LLVMIR/x86vector.mlir