Replace arm_sve.intr.zip1 with the target-agnostic interleave2 intrinsic

c-rhodes · c-rhodes · commit 26f705d17db8 · 2024-01-29T13:23:42.000Z
diff --git a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEOps.td b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEOps.td
@@ -958,13 +958,8 @@ def FMopaWide2WayOp
     product as follows:
 
 	```mlir
-    %undef = llvm.mlir.undef : vector<[8]xf16>
-    %a0_ins = vector.scalable.insert %a0, %undef[0] : vector<[4]xf16> into vector<[8]xf16>
-    %a1_ins = vector.scalable.insert %a1, %undef[0] : vector<[4]xf16> into vector<[8]xf16>
-    %a_packed = "arm_sve.intr.zip1"(%a0_ins, %a1_ins) : (vector<[8]xf16>, vector<[8]xf16>) -> vector<[8]xf16>
-    %b0_ins = vector.scalable.insert %b0, %undef[0] : vector<[4]xf16> into vector<[8]xf16>
-    %b1_ins = vector.scalable.insert %b1, %undef[0] : vector<[4]xf16> into vector<[8]xf16>
-    %b_packed = "arm_sve.intr.zip1"(%b0_ins, %b1_ins) : (vector<[8]xf16>, vector<[8]xf16>) -> vector<[8]xf16>
+    %a_packed = "llvm.intr.experimental.vector.interleave2"(%a0, %a1) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
+    %b_packed = "llvm.intr.experimental.vector.interleave2"(%b0, %b1) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
     %0 = arm_sme.fmopa_wide_2way %a_packed, %b_packed : vector<[8]xf16>, vector<[8]xf16> into vector<[4]x[4]xf32>
 	```
 
diff --git a/mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.td b/mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.td
@@ -143,13 +143,8 @@ def OuterProductWidening
     Becomes:
 
     ```mlir
-    %undef = llvm.mlir.undef : vector<[8]xf16>
-    %a0_ins = vector.scalable.insert %a0, %undef[0] : vector<[4]xf16> into vector<[8]xf16>
-    %a1_ins = vector.scalable.insert %a1, %undef[0] : vector<[4]xf16> into vector<[8]xf16>
-    %a_packed = "arm_sve.intr.zip1"(%a0_ins, %a1_ins) : (vector<[8]xf16>, vector<[8]xf16>) -> vector<[8]xf16>
-    %b0_ins = vector.scalable.insert %b0, %undef[0] : vector<[4]xf16> into vector<[8]xf16>
-    %b1_ins = vector.scalable.insert %b1, %undef[0] : vector<[4]xf16> into vector<[8]xf16>
-    %b_packed = "arm_sve.intr.zip1"(%b0_ins, %b1_ins) : (vector<[8]xf16>, vector<[8]xf16>) -> vector<[8]xf16>
+    %a_packed = "llvm.intr.experimental.vector.interleave2"(%a0, %a1) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
+    %b_packed = "llvm.intr.experimental.vector.interleave2"(%b0, %b1) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
     %0 = arm_sme.fmopa_wide_2way %a_packed, %b_packed : vector<[8]xf16>, vector<[8]xf16> into vector<[4]x[4]xf32>
     ```
 
diff --git a/mlir/lib/Dialect/ArmSME/Transforms/OuterProductWidening.cpp b/mlir/lib/Dialect/ArmSME/Transforms/OuterProductWidening.cpp
@@ -48,10 +48,12 @@ namespace {
 //
 // Becomes:
 //
-//  %a_packed = arm_sve.zip %a0, %a1 : vector<[8]xf16> to vector<[8]xf16>
-//  %b_packed = arm_sve.zip %b0, %b1 : vector<[8]xf16> to vector<[8]xf16>
-//  %0 = arm_sme.fmopa_wide_2way %a_packed, %b_packed : vector<[8]xf16>,
-//                                                      vector<[4]xf32>
+//  %a_packed = "llvm.intr.experimental.vector.interleave2"(%a0, %a1)
+//    : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
+//  %b_packed = "llvm.intr.experimental.vector.interleave2"(%b0, %b1)
+//    : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
+//  %0 = arm_sme.fmopa_wide_2way %a_packed, %b_packed
+//    : vector<[8]xf16>, vector<[8]xf16> into vector<[4]x[4]xf32>
 class OuterProduct2WayWidening
     : public OpRewritePattern<arm_sme::OuterProductOp> {
 public:
@@ -113,15 +115,9 @@ class OuterProduct2WayWidening
 
     auto loc = op.getLoc();
 
-    // zip(lhs, rhs)
     auto packInputs = [&](VectorType type, Value lhs, Value rhs) {
-      auto undef = rewriter.create<LLVM::UndefOp>(loc, type);
-      auto insertLHS =
-          rewriter.create<vector::ScalableInsertOp>(loc, lhs, undef, 0);
-      auto insertRHS =
-          rewriter.create<vector::ScalableInsertOp>(loc, rhs, undef, 0);
-      return rewriter.create<arm_sve::Zip1IntrOp>(loc, type, insertLHS,
-                                                  insertRHS);
+      return rewriter.create<LLVM::experimental_vector_interleave2>(loc, type,
+                                                                    lhs, rhs);
     };
 
     auto extOp = op.getLhs().getDefiningOp();
diff --git a/mlir/test/Dialect/ArmSME/outer-product-widening.mlir b/mlir/test/Dialect/ArmSME/outer-product-widening.mlir
@@ -4,20 +4,10 @@
 // CHECK-SAME:    %[[A0:.*]]: vector<[4]xf16>, %[[B0:.*]]: vector<[4]xf16>, %[[A1:.*]]: vector<[4]xf16>, %[[B1:.*]]: vector<[4]xf16>,
 // CHECK-SAME:    %[[A0_MASK:.*]]: vector<[4]xi1>, %[[B0_MASK:.*]]: vector<[4]xi1>, %[[A1_MASK:.*]]: vector<[4]xi1>, %[[B1_MASK:.*]]: vector<[4]xi1>
 // CHECK-DAG: %[[ACC:.*]] = arith.constant dense<0.000000e+00> : vector<[4]x[4]xf32>
-// CHECK-DAG: %[[VEC_UNDEF:.*]] = llvm.mlir.undef : vector<[8]xf16>
-// CHECK-DAG: %[[A0_INSERT:.*]] = vector.scalable.insert %[[A0]], %[[VEC_UNDEF]][0] : vector<[4]xf16> into vector<[8]xf16>
-// CHECK-DAG: %[[B0_INSERT:.*]] = vector.scalable.insert %[[B0]], %[[VEC_UNDEF]][0] : vector<[4]xf16> into vector<[8]xf16>
-// CHECK-DAG: %[[A1_INSERT:.*]] = vector.scalable.insert %[[A1]], %[[VEC_UNDEF]][0] : vector<[4]xf16> into vector<[8]xf16>
-// CHECK-DAG: %[[B1_INSERT:.*]] = vector.scalable.insert %[[B1]], %[[VEC_UNDEF]][0] : vector<[4]xf16> into vector<[8]xf16>
-// CHECK-DAG: %[[LHS:.*]] = "arm_sve.intr.zip1"(%[[A0_INSERT]], %[[A1_INSERT]]) : (vector<[8]xf16>, vector<[8]xf16>) -> vector<[8]xf16>
-// CHECK-DAG: %[[RHS:.*]] = "arm_sve.intr.zip1"(%[[B0_INSERT]], %[[B1_INSERT]]) : (vector<[8]xf16>, vector<[8]xf16>) -> vector<[8]xf16>
-// CHECK-DAG: %[[MASK_UNDEF:.*]] = llvm.mlir.undef : vector<[8]xi1>
-// CHECK-DAG: %[[A0_MASK_INSERT:.*]] = vector.scalable.insert %[[A0_MASK]], %[[MASK_UNDEF]][0] : vector<[4]xi1> into vector<[8]xi1>
-// CHECK-DAG: %[[B0_MASK_INSERT:.*]] = vector.scalable.insert %[[B0_MASK]], %[[MASK_UNDEF]][0] : vector<[4]xi1> into vector<[8]xi1>
-// CHECK-DAG: %[[A1_MASK_INSERT:.*]] = vector.scalable.insert %[[A1_MASK]], %[[MASK_UNDEF]][0] : vector<[4]xi1> into vector<[8]xi1>
-// CHECK-DAG: %[[B1_MASK_INSERT:.*]] = vector.scalable.insert %[[B1_MASK]], %[[MASK_UNDEF]][0] : vector<[4]xi1> into vector<[8]xi1>
-// CHECK-DAG: %[[LHS_MASK:.*]] = "arm_sve.intr.zip1"(%[[A0_MASK_INSERT]], %[[A1_MASK_INSERT]]) : (vector<[8]xi1>, vector<[8]xi1>) -> vector<[8]xi1>
-// CHECK-DAG: %[[RHS_MASK:.*]] = "arm_sve.intr.zip1"(%[[B0_MASK_INSERT]], %[[B1_MASK_INSERT]]) : (vector<[8]xi1>, vector<[8]xi1>) -> vector<[8]xi1>
+// CHECK-DAG: %[[LHS:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[A0]], %[[A1]]) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
+// CHECK-DAG: %[[RHS:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[B0]], %[[B1]]) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
+// CHECK-DAG: %[[LHS_MASK:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[A0_MASK]], %[[A1_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
+// CHECK-DAG: %[[RHS_MASK:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[B0_MASK]], %[[B1_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
 // CHECK-DAG: arm_sme.fmopa_wide_2way %[[LHS]], %[[RHS]] acc(%[[ACC]]) masks(%[[LHS_MASK]], %[[RHS_MASK]]) : vector<[8]xf16>, vector<[8]xf16> into vector<[4]x[4]xf32>
 func.func @outerproduct_add_widening_2way_f16f16f32(
     %a0 : vector<[4]xf16>, %b0 : vector<[4]xf16>,