[fixup] Check we are generating the expected number and kind of LLVM intrinsics

momchil-velikov · momchil-velikov · commit a34bec555dc2 · 2025-06-04T17:00:17.000Z
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/contraction-smmla-4x8x4.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/contraction-smmla-4x8x4.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void  --march=aarch64 --mattr="+sve,+i8mm" \
 // DEFINE:    -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%native_mlir_arm_runner_utils
 
-// RUN: rm -f %t && %{compile} && %{run} | FileCheck %s
+// RUN: rm -f %t && %{compile} && FileCheck %s --input-file=%t -check-prefix CHECK-IR && %{run} | FileCheck %s
 
 #packed_maps = [
   affine_map<(d0, d1, d2) -> (d0, d2)>,
@@ -20,6 +20,45 @@
 
 func.func private @setArmVLBits(%bits : i32)
 
+func.func private @prepareAccTestData(%in: vector<4x4xi32>) -> vector<4x[4]xi32> {
+  %c0 = arith.constant 0 : index  // Start index for every transfer op below.
+  %c0_i32 = arith.constant 0 : i32  // Padding operand of the transfer_read.
+  // Turn the fixed-size 4x4 accumulator into vector<4x[4]xi32> via memory: store it to a stack buffer first.
+  %mem = memref.alloca() : memref<4x4xi32>
+  vector.transfer_write %in, %mem[%c0, %c0] : vector<4x4xi32>, memref<4x4xi32>
+  // View the buffer as 16 contiguous elements, read them as one scalable vector, then reshape into 4 rows.
+  %flat_mem = memref.collapse_shape %mem [[0, 1]] : memref<4x4xi32> into memref<16xi32>
+  %flat_vec = vector.transfer_read %flat_mem[%c0], %c0_i32 {in_bounds = [true]} : memref<16xi32>, vector<[16]xi32>
+  %out = vector.shape_cast %flat_vec : vector<[16]xi32> to vector<4x[4]xi32>
+  // NOTE(review): the in_bounds [16]-element read of a 16-element buffer presumably relies on vscale == 1 - confirm.
+  return %out : vector<4x[4]xi32>
+}
+
+func.func private @prepareLHSTestData(%in: vector<4x8xi8>) -> vector<4x8xi8> {
+  %c0 = arith.constant 0 : index  // Start index for both transfer ops below.
+  %c0_i8 = arith.constant 0 : i8  // Padding operand of the transfer_read.
+  // Round-trip the LHS through a stack buffer; the result has the same type as the input.
+  %mem = memref.alloca() : memref<4x8xi8>
+  vector.transfer_write %in, %mem[%c0, %c0] : vector<4x8xi8>, memref<4x8xi8>
+  // NOTE(review): presumably done so the constant operand is not folded away before lowering - confirm.
+  %out = vector.transfer_read %mem[%c0, %c0], %c0_i8 : memref<4x8xi8>, vector<4x8xi8>
+
+  return %out :  vector<4x8xi8>
+}
+
+func.func private @prepareRHSTestData(%in: vector<4x8xi8>) -> vector<[32]xi8> {
+  %c0 = arith.constant 0 : index  // Start index for every transfer op below.
+  %c0_i8 = arith.constant 0 : i8  // Padding operand of the transfer_read.
+  // Turn the fixed-size 4x8 RHS into a flat scalable vector via memory: store it to a stack buffer first.
+  %mem = memref.alloca() : memref<4x8xi8>
+  vector.transfer_write %in, %mem[%c0, %c0] : vector<4x8xi8>, memref<4x8xi8>
+  // View the buffer as 32 contiguous elements and read them back as a single vector<[32]xi8>.
+  %flat_mem = memref.collapse_shape %mem [[0, 1]] : memref<4x8xi8> into memref<32xi8>
+  %flat_vec = vector.transfer_read %flat_mem[%c0], %c0_i8 {in_bounds = [true]} : memref<32xi8>, vector<[32]xi8>
+  // NOTE(review): the in_bounds [32]-element read of a 32-element buffer presumably relies on vscale == 1 - confirm.
+  return %flat_vec : vector<[32]xi8>
+}
+
 func.func @main() {
   %c128 = arith.constant 128 : i32
   func.call @setArmVLBits(%c128) : (i32) -> ()
@@ -28,68 +67,32 @@ func.func @main() {
   %c0_i32 = arith.constant 0 : i32
   %c0_i8 = arith.constant 0 : i8
 
-// Accumulator test data
+  // Accumulator test data
   %acc_cst = arith.constant dense<[[-44,  20,  44, -46],
                                    [ -8,  25, -34,  26],
                                    [-20, -36,  -3,  39],
                                    [-48, -31, -25, -21]]> : vector<4x4xi32>
-  %acc_m = memref.alloca() : memref<4x4xi32>
-  vector.transfer_write %acc_cst, %acc_m[%c0, %c0] : vector<4x4xi32>, memref<4x4xi32>
-
-  %acc_m1 = memref.collapse_shape %acc_m [[0, 1]] : memref<4x4xi32> into memref<16xi32>
-  %acc_flat = vector.transfer_read %acc_m1[%c0], %c0_i32 {in_bounds = [true]} : memref<16xi32>, vector<[16]xi32>
-  %acc = vector.shape_cast %acc_flat : vector<[16]xi32> to vector<4x[4]xi32>
-
-  vector.print str "ACC:\n"
-  %acc0 = vector.extract %acc[0] : vector<[4]xi32> from vector<4x[4]xi32>
-  %acc1 = vector.extract %acc[1] : vector<[4]xi32> from vector<4x[4]xi32>
-  %acc2 = vector.extract %acc[2] : vector<[4]xi32> from vector<4x[4]xi32>
-  %acc3 = vector.extract %acc[3] : vector<[4]xi32> from vector<4x[4]xi32>
-  vector.print %acc0 : vector<[4]xi32>
-  vector.print %acc1 : vector<[4]xi32>
-  vector.print %acc2 : vector<[4]xi32>
-  vector.print %acc3 : vector<[4]xi32>
+
+  %acc = func.call @prepareAccTestData(%acc_cst) : (vector<4x4xi32>) -> vector<4x[4]xi32>
 
   // LHS test data
   %lhs_cst = arith.constant dense<[[-35, -27, -36, -31,  23, -34,  -8, -33],
-                                   [-20,  17, -32, -47,  37,  22,  -7, -21],
-                                   [ -7, -35,  20,  -4,  39,  46, -23,  40],
-                                   [ 40,  27,  37,  43,  38,  -6,  37,  49]]> : vector<4x8xi8>
-
-  %lhs_m = memref.alloca() : memref<4x8xi8>
-  vector.transfer_write %lhs_cst, %lhs_m[%c0, %c0] : vector<4x8xi8>, memref<4x8xi8>
-  %lhs = vector.transfer_read %lhs_m[%c0, %c0], %c0_i8 : memref<4x8xi8>, vector<4x8xi8>
-
-  vector.print str "LHS:\n"
-  %lhs0 = vector.extract %lhs[0] : vector<8xi8> from vector<4x8xi8>
-  %lhs1 = vector.extract %lhs[1] : vector<8xi8> from vector<4x8xi8>
-  %lhs2 = vector.extract %lhs[2] : vector<8xi8> from vector<4x8xi8>
-  %lhs3 = vector.extract %lhs[3] : vector<8xi8> from vector<4x8xi8>
-  vector.print %lhs0 : vector<8xi8>
-  vector.print %lhs1 : vector<8xi8>
-  vector.print %lhs2 : vector<8xi8>
-  vector.print %lhs3 : vector<8xi8>
+                               [-20,  17, -32, -47,  37,  22,  -7, -21],
+                               [ -7, -35,  20,  -4,  39,  46, -23,  40],
+                               [ 40,  27,  37,  43,  38,  -6,  37,  49]]> : vector<4x8xi8>
+
+  %lhs = func.call @prepareLHSTestData(%lhs_cst) : (vector<4x8xi8>) -> vector<4x8xi8>
 
   // RHS test data
   %rhs_cst = arith.constant dense<[[-17, -50,  -1,  48, -13,  22,  39,  33],
                                    [-35, -24,  37, -32,  33,  30, -11, -17],
                                    [-28,  31,   3, -44, -15, -27,  22,  35],
                                    [-23,  39,  48,  26, -23,  32, -39, -38]]> : vector<4x8xi8>
-
-  %rhs_m = memref.alloca() : memref<4x8xi8>
-  vector.transfer_write %rhs_cst, %rhs_m[%c0, %c0] : vector<4x8xi8>, memref<4x8xi8>
-
-  %rhs_m1 = memref.collapse_shape %rhs_m [[0, 1]] : memref<4x8xi8> into memref<32xi8>
-  %rhs_flat = vector.transfer_read %rhs_m1[%c0], %c0_i8 {in_bounds = [true]} : memref<32xi8>, vector<[32]xi8>
-
-  vector.print str "RHS:\n"
-  %rhs0 = vector.scalable.extract %rhs_flat[0] : vector<[16]xi8> from vector<[32]xi8>
-  %rhs1 = vector.scalable.extract %rhs_flat[16] : vector<[16]xi8> from vector<[32]xi8>
-  vector.print %rhs0 : vector<[16]xi8>
-  vector.print %rhs1 : vector<[16]xi8>
-
+  %rhs_flat = func.call @prepareRHSTestData(%rhs_cst) : (vector<4x8xi8>) -> vector<[32]xi8>
   %rhs = vector.shape_cast %rhs_flat : vector<[32]xi8> to vector<[4]x8xi8>
 
+// CHECK-IR-COUNT-4: arm_sve.intr.smmla
+
   // Matrix multiplication
   %0 = arith.extsi %lhs : vector<4x8xi8> to vector<4x8xi32>
   %1 = arith.extsi %rhs : vector<[4]x8xi8> to vector<[4]x8xi32>
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/contraction-smmla-8x8x8-vs2.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/contraction-smmla-8x8x8-vs2.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void  --march=aarch64 --mattr="+sve,+i8mm" \
 // DEFINE:    -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%native_mlir_arm_runner_utils
 
-// RUN: rm -f %t && %{compile} && %{run} | FileCheck %s
+// RUN: rm -f %t && %{compile} && FileCheck %s --input-file=%t -check-prefix CHECK-IR && %{run} | FileCheck %s
 
 #packed_maps = [
   affine_map<(d0, d1, d2) -> (d0, d2)>,
@@ -28,7 +28,6 @@ func.func @main() {
   %c0_i32 = arith.constant 0 : i32
   %c0_i8 = arith.constant 0 : i8
 
-
   // Accumulator test data
   %acc_cst = arith.constant dense<[[-44,  20,  44, -46,  -8,  25, -34,  26],
                                    [-20, -36,  -3,  39, -48, -31, -25, -21],
@@ -119,6 +118,8 @@ func.func @main() {
 
   %rhs = vector.shape_cast %rhs_flat : vector<[32]xi8> to vector<[4]x8xi8>
 
+// CHECK-IR-COUNT-8: arm_sve.intr.smmla
+
   // Matrix multiplication
   %0 = arith.extsi %lhs : vector<8x8xi8> to vector<8x8xi32>
   %1 = arith.extsi %rhs : vector<[4]x8xi8> to vector<[4]x8xi32>
@@ -146,7 +147,6 @@ func.func @main() {
   vector.print %u6 : vector<[4]xi32>
   vector.print %u7 : vector<[4]xi32>
 
-
 // CHECK: ( -2294, -1282,  2728,  -410, -1328,   882, -5498,   732 )
 // CHECK: (  1012, -4237,  4154,  2624,  5225, -2338,  2011,  1374 )
 // CHECK: (    -8, -1611,  2905,    -1, -1068, -3155, -2428,   153 )
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/contraction-summla-4x8x4.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/contraction-summla-4x8x4.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void  --march=aarch64 --mattr="+sve,+i8mm" \
 // DEFINE:    -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%native_mlir_arm_runner_utils
 
-// RUN: rm -f %t && %{compile} && %{run} | FileCheck %s
+// RUN: rm -f %t && %{compile} && FileCheck %s --input-file=%t -check-prefix CHECK-IR && %{run} | FileCheck %s
 
 #packed_maps = [
   affine_map<(d0, d1, d2) -> (d0, d2)>,
@@ -28,7 +28,7 @@ func.func @main() {
   %c0_i32 = arith.constant 0 : i32
   %c0_i8 = arith.constant 0 : i8
 
-// Accumulator test data
+  // Accumulator test data
   %acc_cst = arith.constant dense<[[-44,  20,  44, -46],
                                    [ -8,  25, -34,  26],
                                    [-20, -36,  -3,  39],
@@ -90,6 +90,8 @@ func.func @main() {
 
   %rhs = vector.shape_cast %rhs_flat : vector<[32]xi8> to vector<[4]x8xi8>
 
+// CHECK-IR-COUNT-4: arm_sve.intr.usmmla
+
   // Matrix multiplication
   %0 = arith.extsi %lhs : vector<4x8xi8> to vector<4x8xi32>
   %1 = arith.extui %rhs : vector<[4]x8xi8> to vector<[4]x8xi32>
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/contraction-ummla-4x8x4.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/contraction-ummla-4x8x4.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void  --march=aarch64 --mattr="+sve,+i8mm" \
 // DEFINE:    -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%native_mlir_arm_runner_utils
 
-// RUN: rm -f %t && %{compile} && %{run} | FileCheck %s
+// RUN: rm -f %t && %{compile} && FileCheck %s --input-file=%t -check-prefix CHECK-IR && %{run} | FileCheck %s
 
 #packed_maps = [
   affine_map<(d0, d1, d2) -> (d0, d2)>,
@@ -29,8 +29,7 @@ func.func @main() {
   %c0_i32 = arith.constant 0 : i32
   %c0_i8 = arith.constant 0 : i8
 
-
-// Accumulator test data
+  // Accumulator test data
   %acc_cst = arith.constant dense<[[16, 16, 48, 40],
                                    [40, 24, 35, 12],
                                    [33, 24, 29, 19],
@@ -92,6 +91,8 @@ func.func @main() {
 
   %rhs = vector.shape_cast %rhs_flat : vector<[32]xi8> to vector<[4]x8xi8>
 
+// CHECK-IR-COUNT-4: arm_sve.intr.ummla
+
   // Matrix multiplication
   %0 = arith.extui %lhs : vector<4x8xi8> to vector<4x8xi32>
   %1 = arith.extui %rhs : vector<[4]x8xi8> to vector<[4]x8xi32>
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/contraction-usmmla-4x8x4.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/contraction-usmmla-4x8x4.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void  --march=aarch64 --mattr="+sve,+i8mm" \
 // DEFINE:    -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%native_mlir_arm_runner_utils
 
-// RUN: rm -f %t && %{compile} && %{run} | FileCheck %s
+// RUN: rm -f %t && %{compile} && FileCheck %s --input-file=%t -check-prefix CHECK-IR && %{run} | FileCheck %s
 
 #packed_maps = [
   affine_map<(d0, d1, d2) -> (d0, d2)>,
@@ -28,7 +28,7 @@ func.func @main() {
   %c0_i32 = arith.constant 0 : i32
   %c0_i8 = arith.constant 0 : i8
 
-// Accumulator test data
+  // Accumulator test data
   %acc_cst = arith.constant dense<[[-44,  20,  44, -46],
                                    [ -8,  25, -34,  26],
                                    [-20, -36,  -3,  39],
@@ -90,6 +90,8 @@ func.func @main() {
 
   %rhs = vector.shape_cast %rhs_flat : vector<[32]xi8> to vector<[4]x8xi8>
 
+// CHECK-IR-COUNT-4: arm_sve.intr.usmmla
+
   // Matrix multiplication
   %0 = arith.extui %lhs : vector<4x8xi8> to vector<4x8xi32>
   %1 = arith.extsi %rhs : vector<[4]x8xi8> to vector<[4]x8xi32>
@@ -109,9 +111,9 @@ func.func @main() {
   vector.print %u2 : vector<[4]xi32>
   vector.print %u3 : vector<[4]xi32>
 
- // CHECK: ( 28403,  445,  -2759, -11409 )
- // CHECK: ( 34908, 1047,    142,  -7274 )
- // CHECK: ( 31032, 6807,  -2378,   7382 )
- // CHECK: ( 44217, 6396, -10930,    623 )
+// CHECK: ( 28403,  445,  -2759, -11409 )
+// CHECK: ( 34908, 1047,    142,  -7274 )
+// CHECK: ( 31032, 6807,  -2378,   7382 )
+// CHECK: ( 44217, 6396, -10930,    623 )
   return
 }