|
| 1 | +// RUN: mlir-opt %s -test-linalg-transform-patterns="test-pad-pattern pack-paddings=1,1,0 hoist-paddings=2,1,0" -cse -canonicalize -split-input-file | FileCheck %s |
| 2 | +// RUN: mlir-opt %s -test-linalg-transform-patterns="test-pad-pattern pack-paddings=1,1,0 hoist-paddings=4,3,0" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-DOUBLE |
| 3 | + |
| 4 | +// CHECK-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<(d0) -> (5, -d0 + 24)> |
| 5 | +// CHECK-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<(d0) -> (8, -d0 + 12)> |
| 6 | +// CHECK-DAG: #[[DIV6:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 6)> |
| 7 | +#map0 = affine_map<(d0) -> (5, -d0 + 24)> |
| 8 | +#map1 = affine_map<(d0) -> (8, -d0 + 12)> |
| 9 | +#map2 = affine_map<(d0) -> (7, -d0 + 25)> |
| 10 | + |
| 11 | +// CHECK: single_tiling |
| 12 | +// CHECK-DOUBLE: single_tiling |
| 13 | + |
| 14 | +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32> |
| 15 | +// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32> |
| 16 | +// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32> |
| 17 | +func @single_tiling(%arg0: tensor<24x12xf32>, |
| 18 | + %arg1: tensor<12x25xf32>, |
| 19 | + %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> { |
| 20 | + // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index |
| 21 | + // CHECK-DAG: %[[C5:.*]] = arith.constant 5 |
| 22 | + // CHECK-DAG: %[[C8:.*]] = arith.constant 8 |
| 23 | + %c0 = arith.constant 0 : index |
| 24 | + %c12 = arith.constant 12 : index |
| 25 | + %c25 = arith.constant 25 : index |
| 26 | + %c24 = arith.constant 24 : index |
| 27 | + %c6 = arith.constant 6 : index |
| 28 | + %c7 = arith.constant 7 : index |
| 29 | + %c5 = arith.constant 5 : index |
| 30 | + |
| 31 | + // CHECK: scf.for %[[IV0:[0-9a-zA-Z]*]] = |
| 32 | + %0 = scf.for %arg3 = %c0 to %c24 step %c5 iter_args(%arg4 = %arg2) -> (tensor<24x25xf32>) { |
| 33 | + |
| 34 | + // Pack the first input operand for all values of IV2; the packed tensor has one padded 5x6 tile per IV2 iteration (2x5x6). |
| 35 | + // CHECK: = linalg.init_tensor [2, 5, 6] |
| 36 | + // CHECK: %[[PT0:.*]] = scf.for %[[P0IV2:[0-9a-z]+]] = |
| 37 | + // CHECK: %[[PIDX0:.*]] = affine.apply #[[DIV6]](%[[P0IV2]]) |
| 38 | + // CHECK: %[[TS0:.*]] = affine.min #[[MAP0]](%[[IV0]]) |
| 39 | + // CHECK: %[[T0:.*]] = tensor.extract_slice %[[ARG0]] |
| 40 | + // CHECK-SAME: %[[IV0]], %[[P0IV2]] |
| 41 | + // CHECK-SAME: %[[TS0]], 6 |
| 42 | + // CHECK: %[[V0:.*]] = arith.subi %[[C5]], %[[TS0]] |
| 43 | + // CHECK: %[[T1:.*]] = linalg.pad_tensor %[[T0]] nofold {{.*}} high[%[[V0]] |
| 44 | + // CHECK: %[[T2:.*]] = tensor.insert_slice %[[T1]] into %{{.*}}[%[[PIDX0]], 0, 0] |
| 45 | + // CHECK: scf.yield %[[T2]] |
| 46 | + |
| 47 | + // CHECK: scf.for %[[IV1:[0-9a-zA-Z]*]] = |
| 48 | + %1 = scf.for %arg5 = %c0 to %c25 step %c7 iter_args(%arg6 = %arg4) -> (tensor<24x25xf32>) { |
| 49 | + |
| 50 | + // Pack the second input operand for all values of IV2; the packed tensor has one padded 6x8 tile per IV2 iteration (2x6x8). |
| 51 | + // CHECK: = linalg.init_tensor [2, 6, 8] |
| 52 | + // CHECK: %[[PT1:.*]] = scf.for %[[P1IV2:[0-9a-z]+]] = |
| 53 | + // CHECK: %[[PIDX1:.*]] = affine.apply #[[DIV6]](%[[P1IV2]]) |
| 54 | + // CHECK: %[[TS1:.*]] = affine.min #[[MAP1]](%[[IV1]]) |
| 55 | + // CHECK: %[[T3:.*]] = tensor.extract_slice %[[ARG1]] |
| 56 | + // CHECK-SAME: %[[P1IV2]], %[[IV1]] |
| 57 | + // CHECK-SAME: 6, %[[TS1]] |
| 58 | + // CHECK: %[[V1:.*]] = arith.subi %[[C8]], %[[TS1]] |
| 59 | + // CHECK: %[[T4:.*]] = linalg.pad_tensor %[[T3]] nofold {{.*}} high[%[[C0]], %[[V1]] |
| 60 | + // CHECK: %[[T5:.*]] = tensor.insert_slice %[[T4]] into %{{.*}}[%[[PIDX1]], 0, 0] |
| 61 | + // CHECK: scf.yield %[[T5]] |
| 62 | + |
| 63 | + // CHECK: scf.for %[[IV2:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG4:.*]] = |
| 64 | + %2 = scf.for %arg7 = %c0 to %c12 step %c6 iter_args(%arg8 = %arg6) -> (tensor<24x25xf32>) { |
| 65 | + %3 = affine.min #map0(%arg3) |
| 66 | + // Index the packed operands. |
| 67 | + // CHECK-DAG: %[[IDX:.*]] = affine.apply #[[DIV6]](%[[IV2]]) |
| 68 | + // CHECK-DAG: %[[T6:.*]] = tensor.extract_slice %[[PT0]][%[[IDX]] |
| 69 | + // CHECK-DAG: %[[T7:.*]] = tensor.extract_slice %[[PT1]][%[[IDX]] |
| 70 | + %4 = tensor.extract_slice %arg0[%arg3, %arg7] [%3, 6] [1, 1] : tensor<24x12xf32> to tensor<?x6xf32> |
| 71 | + %5 = affine.min #map1(%arg5) |
| 72 | + %6 = tensor.extract_slice %arg1[%arg7, %arg5] [6, %5] [1, 1] : tensor<12x25xf32> to tensor<6x?xf32> |
| 73 | + |
| 74 | + // Pad the output operand without setting the nofold attribute. |
| 75 | + // CHECK-DAG: %[[T8:.*]] = tensor.extract_slice %[[ARG4]][%[[IV0]], %[[IV1]] |
| 76 | + // CHECK: %[[T9:.*]] = linalg.pad_tensor %[[T8]] low |
| 77 | + %7 = tensor.extract_slice %arg8[%arg3, %arg5] [%3, %5] [1, 1] : tensor<24x25xf32> to tensor<?x?xf32> |
| 78 | + |
| 79 | + // Check matmul uses the packed input operands and the padded output operand. |
| 80 | + // CHECK: = linalg.matmul ins(%[[T6]], %[[T7]]{{.*}} outs(%[[T9]] |
| 81 | + %8 = linalg.matmul {__internal_linalg_transform__ = "pad"} ins(%4, %6 : tensor<?x6xf32>, tensor<6x?xf32>) outs(%7 : tensor<?x?xf32>) -> tensor<?x?xf32> |
| 82 | + %9 = tensor.insert_slice %8 into %arg8[%arg3, %arg5] [%3, %5] [1, 1] : tensor<?x?xf32> into tensor<24x25xf32> |
| 83 | + scf.yield %9 : tensor<24x25xf32> |
| 84 | + } |
| 85 | + scf.yield %2 : tensor<24x25xf32> |
| 86 | + } |
| 87 | + scf.yield %1 : tensor<24x25xf32> |
| 88 | + } |
| 89 | + return %0 : tensor<24x25xf32> |
| 90 | +} |
| 91 | + |
| 92 | +// ----- |
| 93 | + |
| 94 | +#map0 = affine_map<(d0) -> (15, -d0 + 24)> |
| 95 | +#map1 = affine_map<(d0) -> (16, -d0 + 25)> |
| 96 | +#map2 = affine_map<(d0, d1) -> (5, -d0 + d1)> |
| 97 | +#map3 = affine_map<(d0, d1) -> (d0 + d1)> |
| 98 | +#map4 = affine_map<(d0, d1) -> (6, -d0 + d1)> |
| 99 | + |
| 100 | +// CHECK: double_tiling |
| 101 | +// CHECK-DOUBLE: double_tiling |
| 102 | + |
| 103 | +// CHECK-DOUBLE-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32> |
| 104 | +// CHECK-DOUBLE-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32> |
| 105 | +// CHECK-DOUBLE-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32> |
| 106 | +func @double_tiling(%arg0: tensor<24x12xf32>, |
| 107 | + %arg1: tensor<12x25xf32>, |
| 108 | + %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> { |
| 109 | + %c15 = arith.constant 15 : index |
| 110 | + %c16 = arith.constant 16 : index |
| 111 | + %c24 = arith.constant 24 : index |
| 112 | + %c25 = arith.constant 25 : index |
| 113 | + %c0 = arith.constant 0 : index |
| 114 | + %c5 = arith.constant 5 : index |
| 115 | + %c6 = arith.constant 6 : index |
| 116 | + |
| 117 | + // Packing of the first input operand is expected before the outermost loop (IV0). |
| 118 | + // CHECK-DOUBLE: = linalg.init_tensor |
| 119 | + // CHECK-DOUBLE: = linalg.pad_tensor {{.*}} nofold |
| 120 | + |
| 121 | + // CHECK-DOUBLE: scf.for %[[IV0:[0-9a-zA-Z]*]] = |
| 122 | + %0 = scf.for %arg3 = %c0 to %c24 step %c15 iter_args(%arg4 = %arg2) -> (tensor<24x25xf32>) { |
| 123 | + |
| 124 | + // Packing of the second input operand is expected inside the IV0 loop, before the IV1 loop. |
| 125 | + // CHECK-DOUBLE: = linalg.init_tensor |
| 126 | + // CHECK-DOUBLE: = linalg.pad_tensor {{.*}} nofold |
| 127 | + |
| 128 | + // CHECK-DOUBLE: scf.for %[[IV1:[0-9a-zA-Z]*]] = |
| 129 | + %1 = scf.for %arg5 = %c0 to %c25 step %c16 iter_args(%arg6 = %arg4) -> (tensor<24x25xf32>) { |
| 130 | + %2 = affine.min #map0(%arg3) |
| 131 | + %3 = affine.min #map1(%arg5) |
| 132 | + %4 = tensor.extract_slice %arg6[%arg3, %arg5] [%2, %3] [1, 1] : tensor<24x25xf32> to tensor<?x?xf32> |
| 133 | + |
| 134 | + // CHECK-DOUBLE: scf.for %[[IV2:[0-9a-zA-Z]*]] = |
| 135 | + %5 = scf.for %arg7 = %c0 to %2 step %c5 iter_args(%arg8 = %4) -> (tensor<?x?xf32>) { |
| 136 | + |
| 137 | + // CHECK-DOUBLE: scf.for %[[IV3:[0-9a-zA-Z]*]] = |
| 138 | + %7 = scf.for %arg9 = %c0 to %3 step %c6 iter_args(%arg10 = %arg8) -> (tensor<?x?xf32>) { |
| 139 | + %8 = affine.min #map2(%arg7, %2) |
| 140 | + %9 = affine.apply #map3(%arg7, %arg3) |
| 141 | + %10 = tensor.extract_slice %arg0[%9, 0] [%8, 12] [1, 1] : tensor<24x12xf32> to tensor<?x12xf32> |
| 142 | + %11 = affine.min #map4(%arg9, %3) |
| 143 | + %12 = affine.apply #map3(%arg9, %arg5) |
| 144 | + %13 = tensor.extract_slice %arg1[0, %12] [12, %11] [1, 1] : tensor<12x25xf32> to tensor<12x?xf32> |
| 145 | + %14 = affine.min #map2(%arg7, %2) |
| 146 | + %15 = affine.min #map4(%arg9, %3) |
| 147 | + %16 = tensor.extract_slice %arg10[%arg7, %arg9] [%14, %15] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> |
| 148 | + |
| 149 | + // The output operand is padded and the multiplication performed after the innermost loop (IV3) header; its padding is not checked for nofold. |
| 150 | + // CHECK-DOUBLE: = linalg.pad_tensor |
| 151 | + // CHECK-DOUBLE: = linalg.matmul |
| 152 | + %17 = linalg.matmul {__internal_linalg_transform__ = "pad"} ins(%10, %13 : tensor<?x12xf32>, tensor<12x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32> |
| 153 | + %18 = tensor.insert_slice %17 into %arg10[%arg7, %arg9] [%14, %15] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32> |
| 154 | + scf.yield %18 : tensor<?x?xf32> |
| 155 | + } |
| 156 | + scf.yield %7 : tensor<?x?xf32> |
| 157 | + } |
| 158 | + %6 = tensor.insert_slice %5 into %arg6[%arg3, %arg5] [%2, %3] [1, 1] : tensor<?x?xf32> into tensor<24x25xf32> |
| 159 | + scf.yield %6 : tensor<24x25xf32> |
| 160 | + } |
| 161 | + scf.yield %1 : tensor<24x25xf32> |
| 162 | + } |
| 163 | + return %0 : tensor<24x25xf32> |
| 164 | +} |
0 commit comments