@@ -2,34 +2,9 @@
 // RUN: -transform-preload-library='transform-library-paths=%p/td/vectorize-with-patterns.mlir' \
 // RUN: -transform-interpreter=entry-point=vectorize_with_patterns %s | FileCheck %s

-#map0 = affine_map<(d0, d1, d2, d3) -> (d0, d2)>
-#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
-func.func @vectorize_1d_tensor_extract(%arg0: tensor<3xf32>, %arg1: tensor<4x3xi32>, %arg2: tensor<4x7x3x2xf32>) -> tensor<4x7x3x2xf32> {
-  %1 = linalg.generic {
-    indexing_maps = [#map0, #map1],
-    iterator_types = ["parallel", "parallel", "parallel", "parallel"]
-  } ins(%arg1 : tensor<4x3xi32>) outs(%arg2 : tensor<4x7x3x2xf32>) {
-  ^bb0(%arg3: i32, %arg4: f32):
-    %2 = arith.index_cast %arg3 : i32 to index
-    %3 = tensor.extract %arg0[%2] : tensor<3xf32>
-    linalg.yield %3 : f32
-  } -> tensor<4x7x3x2xf32>
-  return %1 : tensor<4x7x3x2xf32>
-}
-// CHECK-LABEL: func.func @vectorize_1d_tensor_extract
-// CHECK-SAME: %[[ARG0:.*]]: tensor<3xf32>
-// CHECK-SAME: %[[ARG1:.*]]: tensor<4x3xi32>
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[MASK:.*]] = arith.constant dense<true> : vector<4x7x3x2xi1>
-// CHECK-DAG: %[[PASSTHRU:.*]] = arith.constant dense<0.000000e+00> : vector<4x7x3x2xf32>
-// CHECK: %[[V0:.*]] = vector.transfer_read %[[ARG1]]
-// CHECK: %[[CAST:.*]] = arith.index_cast %[[V0]]
-// CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[CAST]]
-// CHECK: %[[INDICES:.*]] = vector.transpose %[[BROADCAST]]
-// CHECK: %[[GATHER:.*]] = vector.gather %[[ARG0]][%[[C0]]] [%[[INDICES]]], %[[MASK]], %[[PASSTHRU]]
-// CHECK: vector.transfer_write %[[GATHER]]
-
-// -----
+//===----------------------------------------------------------------------===//
+// Contiguous load
+//===----------------------------------------------------------------------===//
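+// A `tensor.extract` counts as a contiguous load when its trailing index
+// advances by 1 with the innermost iteration dimension; as the CHECK lines
+// below show, such loads lower to `vector.transfer_read` rather than
+// `vector.gather`.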

 #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 func.func @vectorize_nd_tensor_extract_transfer_read_basic(
@@ -112,6 +87,142 @@ func.func @vectorize_nd_tensor_extract_transfer_read_complex(%6: tensor<45x80x16

 // -----

+// The vectorizer converts `affine.apply` so that the subsequent Ops can be vectorised based on the converted ops. Contiguous load.
+func.func @vectorize_nd_tensor_extract_with_affine_apply_contiguous(%6: tensor<80x16xf32>, %arg0: index, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
+  %c79 = arith.constant 79 : index
+  %1 = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]
+  } outs(%extracted_slice : tensor<1x4xf32>) {
+  ^bb0(%out: f32):
+    %2 = linalg.index 1 : index
+    %3 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%2, %arg0)
+    %extracted = tensor.extract %6[%c79, %3] : tensor<80x16xf32>
+    linalg.yield %extracted : f32
+  } -> tensor<1x4xf32>
+  return %1 : tensor<1x4xf32>
+}
+
+// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_with_affine_apply_contiguous(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<80x16xf32>,
+// CHECK-SAME: %[[VAL_1:.*]]: index,
+// CHECK-SAME: %[[VAL_2:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
+// CHECK-DAG: %[[VAL_3:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
+// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 79 : index
+// CHECK: %[[VAL_8:.*]] = vector.broadcast %[[VAL_1]] : index to vector<4xindex>
+// CHECK: %[[VAL_9:.*]] = arith.addi %[[VAL_8]], %[[VAL_3]] : vector<4xindex>
+// CHECK: %[[VAL_10:.*]] = vector.extract %[[VAL_9]][0] : index from vector<4xindex>
+// CHECK: %[[VAL_11:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_7]], %[[VAL_10]]], %[[VAL_5]] {in_bounds = [true, true]} : tensor<80x16xf32>, vector<1x4xf32>
+// CHECK: %[[VAL_12:.*]] = vector.transfer_write %[[VAL_11]], %[[VAL_2]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
+// CHECK: return %[[VAL_12]] : tensor<1x4xf32>
+// CHECK: }
+
+// -----
+
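+// Two chained `tensor.extract` Ops: the first load is loop-invariant, so it
+// becomes a scalar (0-D vector) read; its result feeds the row index of the
+// second extract, which remains a contiguous load.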
+func.func @vectorize_nd_tensor_extract_with_tensor_extract(%input_1: tensor<1x20xi32>, %input_2: tensor<257x24xf32>, %arg0 : index, %arg1 : index, %arg2 : index, %arg3 : index) -> tensor<1x1x4xf32> {
+  %c0 = arith.constant 0 : index
+  %c256 = arith.constant 256 : index
+  %output = tensor.empty() : tensor<1x1x4xf32>
+  %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} outs(%output : tensor<1x1x4xf32>) {
+  ^bb0(%out: f32):
+    %13 = linalg.index 0 : index
+    %14 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 + d2)>(%arg0, %13, %arg2)
+    %15 = linalg.index 2 : index
+    %16 = linalg.index 1 : index
+    %17 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 * 24 + d2 + d3)>(%arg1, %16, %15, %arg3)
+    %extracted_0 = tensor.extract %input_1[%c0, %14] : tensor<1x20xi32>
+    %18 = arith.index_cast %extracted_0 : i32 to index
+    %19 = arith.maxsi %18, %c0 : index
+    %20 = arith.minsi %19, %c256 : index
+    %extracted_1 = tensor.extract %input_2[%20, %17] : tensor<257x24xf32>
+    linalg.yield %extracted_1 : f32
+  } -> tensor<1x1x4xf32>
+  return %1 : tensor<1x1x4xf32>
+}
+
+// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_with_tensor_extract(
+// CHECK-SAME: %[[INPUT_1:.*]]: tensor<1x20xi32>,
+// CHECK-SAME: %[[INPUT_2:.*]]: tensor<257x24xf32>,
+// CHECK-SAME: %[[INPUT_3:.*]]: index, %[[INPUT_4:.*]]: index, %[[INPUT_5:.*]]: index,
+// CHECK: %[[EXTRACTED_0_IDX_0:.*]] = arith.constant 0 : index
+// CHECK: %[[SCALAR:.*]] = arith.addi %[[INPUT_3]], %[[INPUT_5]] : index
+// First `vector.transfer_read` from the generic Op - loop invariant scalar load.
+// CHECK: vector.transfer_read %[[INPUT_1]][%[[EXTRACTED_0_IDX_0]], %[[SCALAR]]]
+// CHECK-SAME: tensor<1x20xi32>, vector<i32>
+// The following `tensor.extract` from the generic Op is a contiguous load (all Ops used
+// for address calculation also satisfy the required conditions).
+// CHECK: vector.transfer_read %[[INPUT_2]][%{{.*}}, %{{.*}}, %{{.*}} {in_bounds = [true, true]} : tensor<257x24xf32>, vector<1x4xf32>
+
+// -----
+
+// Make sure that non-linear arithmetic operations (e.g. arith.maxsi) are allowed when calculating indices for load operations. Contiguous load.
+func.func @vectorize_nd_tensor_extract_with_maxsi_contiguous(%arg0: tensor<80x16xf32>, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
+  %c16 = arith.constant 16 : index
+  %1 = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]
+  } outs(%extracted_slice : tensor<1x4xf32>) {
+  ^bb0(%out: f32):
+    %2 = linalg.index 0 : index
+    %3 = linalg.index 1 : index
+    %4 = arith.maxsi %2, %c16 : index
+    %extracted = tensor.extract %arg0[%4, %3] : tensor<80x16xf32>
+    linalg.yield %extracted : f32
+  } -> tensor<1x4xf32>
+  return %1 : tensor<1x4xf32>
+}
+
+// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_with_maxsi_contiguous(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<80x16xf32>,
+// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
+// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
+
+// CHECK-DAG: %[[CST_0:.+]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
+// CHECK-DAG: %[[CST_1:.+]] = arith.constant dense<16> : vector<4x1xindex>
+// CHECK-DAG: %[[IDX0:.+]] = vector.extract %[[CST_1]][0, 0] : index from vector<4x1xindex>
+// CHECK-DAG: %[[IDX1:.+]] = vector.extract %[[CST_0]][0] : index from vector<4xindex>
+
+// CHECK: %[[VAL_8:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[IDX0]], %[[IDX1]]], %[[VAL_5]] {in_bounds = [true, true]} : tensor<80x16xf32>, vector<1x4xf32>
+// CHECK: %[[VAL_9:.*]] = vector.transfer_write %[[VAL_8]], %[[VAL_1]]{{\[}}%[[VAL_4]], %[[VAL_4]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
+// CHECK: return %[[VAL_9]] : tensor<1x4xf32>
+// CHECK: }
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// Gather load
+//===----------------------------------------------------------------------===//
+
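+// The extract index below is read from another tensor (data-dependent, not an
+// affine function of the loop indices), so a gather load is required.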
+#map0 = affine_map<(d0, d1, d2, d3) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+func.func @vectorize_1d_tensor_extract(%arg0: tensor<3xf32>, %arg1: tensor<4x3xi32>, %arg2: tensor<4x7x3x2xf32>) -> tensor<4x7x3x2xf32> {
+  %1 = linalg.generic {
+    indexing_maps = [#map0, #map1],
+    iterator_types = ["parallel", "parallel", "parallel", "parallel"]
+  } ins(%arg1 : tensor<4x3xi32>) outs(%arg2 : tensor<4x7x3x2xf32>) {
+  ^bb0(%arg3: i32, %arg4: f32):
+    %2 = arith.index_cast %arg3 : i32 to index
+    %3 = tensor.extract %arg0[%2] : tensor<3xf32>
+    linalg.yield %3 : f32
+  } -> tensor<4x7x3x2xf32>
+  return %1 : tensor<4x7x3x2xf32>
+}
+// CHECK-LABEL: func.func @vectorize_1d_tensor_extract
+// CHECK-SAME: %[[ARG0:.*]]: tensor<3xf32>
+// CHECK-SAME: %[[ARG1:.*]]: tensor<4x3xi32>
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[MASK:.*]] = arith.constant dense<true> : vector<4x7x3x2xi1>
+// CHECK-DAG: %[[PASSTHRU:.*]] = arith.constant dense<0.000000e+00> : vector<4x7x3x2xf32>
+// CHECK: %[[V0:.*]] = vector.transfer_read %[[ARG1]]
+// CHECK: %[[CAST:.*]] = arith.index_cast %[[V0]]
+// CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[CAST]]
+// CHECK: %[[INDICES:.*]] = vector.transpose %[[BROADCAST]]
+// CHECK: %[[GATHER:.*]] = vector.gather %[[ARG0]][%[[C0]]] [%[[INDICES]]], %[[MASK]], %[[PASSTHRU]]
+// CHECK: vector.transfer_write %[[GATHER]]
+
+// -----
+
 #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d2)>
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
@@ -159,7 +270,7 @@ func.func @vectorize_nd_tensor_extract_load_1d_column_vector_using_gather_load(%
   %c0 = arith.constant 0 : index
   %0 = tensor.empty() : tensor<8x1xf32>
   %1 = linalg.generic {
-  indexing_maps = [#map],
+    indexing_maps = [#map],
     iterator_types = ["parallel", "parallel"]
   } outs(%0 : tensor<8x1xf32>) {
   ^bb0(%arg5: f32):
@@ -303,78 +414,6 @@ func.func @vectorize_nd_tensor_extract_contiguous_and_gather(%arg0: tensor<6xf32
 // CHECK: %[[VAL_14:.*]] = vector.transfer_write %[[VAL_13]], %[[VAL_8]]{{\[}}%[[VAL_2]]] {in_bounds = [true]} : vector<5xf32>, tensor<5xf32>
 // CHECK: return %[[VAL_14]] : tensor<5xf32>

-// -----
-
-// The vectorizer converts `affine.apply` so that the subsequent Ops can be vectorised based on the converted ops. Contiguous load.
-func.func @vectorize_nd_tensor_extract_with_affine_apply_contiguous(%6: tensor<80x16xf32>, %arg0: index, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
-  %c79 = arith.constant 79 : index
-  %1 = linalg.generic {
-    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
-    iterator_types = ["parallel", "parallel"]
-  } outs(%extracted_slice : tensor<1x4xf32>) {
-  ^bb0(%out: f32):
-    %2 = linalg.index 1 : index
-    %3 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%2, %arg0)
-    %extracted = tensor.extract %6[%c79, %3] : tensor<80x16xf32>
-    linalg.yield %extracted : f32
-  } -> tensor<1x4xf32>
-  return %1 : tensor<1x4xf32>
-}
-
-// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_with_affine_apply_contiguous(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<80x16xf32>,
-// CHECK-SAME: %[[VAL_1:.*]]: index,
-// CHECK-SAME: %[[VAL_2:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
-// CHECK-DAG: %[[VAL_3:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
-// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 79 : index
-// CHECK: %[[VAL_8:.*]] = vector.broadcast %[[VAL_1]] : index to vector<4xindex>
-// CHECK: %[[VAL_9:.*]] = arith.addi %[[VAL_8]], %[[VAL_3]] : vector<4xindex>
-// CHECK: %[[VAL_10:.*]] = vector.extract %[[VAL_9]][0] : index from vector<4xindex>
-// CHECK: %[[VAL_11:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_7]], %[[VAL_10]]], %[[VAL_5]] {in_bounds = [true, true]} : tensor<80x16xf32>, vector<1x4xf32>
-// CHECK: %[[VAL_12:.*]] = vector.transfer_write %[[VAL_11]], %[[VAL_2]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
-// CHECK: return %[[VAL_12]] : tensor<1x4xf32>
-// CHECK: }
-
-// -----
-
-func.func @vectorize_nd_tensor_extract_with_tensor_extract(%input_1: tensor<1x20xi32>, %input_2: tensor<257x24xf32>, %arg0 : index, %arg1 : index, %arg2 : index, %arg3 : index) -> tensor<1x1x4xf32> {
-  %c0 = arith.constant 0 : index
-  %c256 = arith.constant 256 : index
-  %output = tensor.empty() : tensor<1x1x4xf32>
-  %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} outs(%output : tensor<1x1x4xf32>) {
-  ^bb0(%out: f32):
-    %13 = linalg.index 0 : index
-    %14 = affine.apply affine_map<(d0, d1, d2) -> (d0 + d1 + d2)>(%arg0, %13, %arg2)
-    %15 = linalg.index 2 : index
-    %16 = linalg.index 1 : index
-    %17 = affine.apply affine_map<(d0, d1, d2, d3) -> (d0 + d1 * 24 + d2 + d3)>(%arg1, %16, %15, %arg3)
-    %extracted_0 = tensor.extract %input_1[%c0, %14] : tensor<1x20xi32>
-    %18 = arith.index_cast %extracted_0 : i32 to index
-    %19 = arith.maxsi %18, %c0 : index
-    %20 = arith.minsi %19, %c256 : index
-    %extracted_1 = tensor.extract %input_2[%20, %17] : tensor<257x24xf32>
-    linalg.yield %extracted_1 : f32
-  } -> tensor<1x1x4xf32>
-  return %1 : tensor<1x1x4xf32>
-}
-
-// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_with_tensor_extract(
-// CHECK-SAME: %[[INPUT_1:.*]]: tensor<1x20xi32>,
-// CHECK-SAME: %[[INPUT_2:.*]]: tensor<257x24xf32>,
-// CHECK-SAME: %[[INPUT_3:.*]]: index, %[[INPUT_4:.*]]: index, %[[INPUT_5:.*]]: index,
-// CHECK: %[[EXTRACTED_0_IDX_0:.*]] = arith.constant 0 : index
-// CHECK: %[[SCALAR:.*]] = arith.addi %[[INPUT_3]], %[[INPUT_5]] : index
-// First `vector.transfer_read` from the generic Op - loop invariant scalar load.
-// CHECK: vector.transfer_read %[[INPUT_1]][%[[EXTRACTED_0_IDX_0]], %[[SCALAR]]]
-// CHECK-SAME: tensor<1x20xi32>, vector<i32>
-// The following `tensor.extract` from the generic Op s a contiguous load (all Ops used
-// for address calculation also satisfy the required conditions).
-// CHECK: vector.transfer_read %[[INPUT_2]][%{{.*}}, %{{.*}}, %{{.*}} {in_bounds = [true, true]} : tensor<257x24xf32>, vector<1x4xf32>
-
-// -----
-
 // The vectorizer converts `affine.apply` so that the subsequent Ops can be vectorised based on the converted ops. Gather load.
 func.func @vectorize_nd_tensor_extract_with_affine_apply_gather(%6: tensor<80x16xf32>, %arg0: index, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
   %c16 = arith.constant 16 : index
@@ -410,8 +449,6 @@ func.func @vectorize_nd_tensor_extract_with_affine_apply_gather(%6: tensor<80x16
 // CHECK: return %[[VAL_14]] : tensor<1x4xf32>
 // CHECK: }

-// -----
-
 // Make sure that non-linear arithmetic operations (e.g. arith.maxsi) are allowed when calculating indices for load operations. Gather load.
 func.func @vectorize_nd_tensor_extract_with_maxsi_gather(%arg0: tensor<80x16xf32>, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
   %c79 = arith.constant 79 : index
@@ -445,41 +482,6 @@ func.func @vectorize_nd_tensor_extract_with_maxsi_gather(%arg0: tensor<80x16xf32

 // -----

-// Make sure that non-linear arithmetic operations (e.g. arith.maxsi) are allowed when calculating indices for load operations. Contiguous load.
-func.func @vectorize_nd_tensor_extract_with_maxsi_contiguous(%arg0: tensor<80x16xf32>, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
-  %c16 = arith.constant 16 : index
-  %1 = linalg.generic {
-    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
-    iterator_types = ["parallel", "parallel"]
-  } outs(%extracted_slice : tensor<1x4xf32>) {
-  ^bb0(%out: f32):
-    %2 = linalg.index 0 : index
-    %3 = linalg.index 1 : index
-    %4 = arith.maxsi %2, %c16 : index
-    %extracted = tensor.extract %arg0[%4, %3] : tensor<80x16xf32>
-    linalg.yield %extracted : f32
-  } -> tensor<1x4xf32>
-  return %1 : tensor<1x4xf32>
-}
-
-// CHECK-LABEL: func.func @vectorize_nd_tensor_extract_with_maxsi_contiguous(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<80x16xf32>,
-// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
-// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
-
-// CHECK-DAG: %[[CST_0:.+]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
-// CHECK-DAG: %[[CST_1:.+]] = arith.constant dense<16> : vector<4x1xindex>
-// CHECK-DAG: %[[IDX0:.+]] = vector.extract %[[CST_1]][0, 0] : index from vector<4x1xindex>
-// CHECK-DAG: %[[IDX1:.+]] = vector.extract %[[CST_0]][0] : index from vector<4xindex>
-
-// CHECK: %[[VAL_8:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[IDX0]], %[[IDX1]]], %[[VAL_5]] {in_bounds = [true, true]} : tensor<80x16xf32>, vector<1x4xf32>
-// CHECK: %[[VAL_9:.*]] = vector.transfer_write %[[VAL_8]], %[[VAL_1]]{{\[}}%[[VAL_4]], %[[VAL_4]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
-// CHECK: return %[[VAL_9]] : tensor<1x4xf32>
-// CHECK: }
-
-// -----
-
 // The vectorizer assumes it's a gather load whenever using a block argument to calculate an index.
 #map = affine_map<(d0) -> (d0)>
 func.func @vectorize_nd_tensor_extract_block_arg(%arg0: tensor<5x6xf32>, %arg1: tensor<5xindex>) -> tensor<5xf32> {