1
1
// RUN: mlir-opt %s -transform-interpreter -split-input-file | FileCheck %s
2
2
3
- func.func @masked_static_vectorize_nd_tensor_extract_with_affine_apply_contiguous (%6: tensor <80 x16 xf32 >, %arg0: index , %extracted_slice : tensor <1 x3 xf32 >) -> tensor <1 x3 xf32 > {
3
+ func.func @masked_static_vectorize_nd_tensor_extract_with_affine_apply_contiguous (
4
+ %src: tensor <80 x16 xf32 >,
5
+ %output : tensor <1 x3 xf32 >,
6
+ %idx: index ) -> tensor <1 x3 xf32 > {
7
+
4
8
%c79 = arith.constant 79 : index
5
9
%1 = linalg.generic {
6
10
indexing_maps = [affine_map <(d0 , d1 ) -> (d0 , d1 )>],
7
11
iterator_types = [" parallel" , " parallel" ]
8
- } outs (%extracted_slice : tensor <1 x3 xf32 >) {
12
+ } outs (%output : tensor <1 x3 xf32 >) {
9
13
^bb0 (%out: f32 ):
10
14
%2 = linalg.index 1 : index
11
- %3 = affine.apply affine_map <(d0 , d1 ) -> (d0 + d1 )>(%2 , %arg0 )
12
- %extracted = tensor.extract %6 [%c79 , %3 ] : tensor <80 x16 xf32 >
15
+ %3 = affine.apply affine_map <(d0 , d1 ) -> (d0 + d1 )>(%2 , %idx )
16
+ %extracted = tensor.extract %src [%c79 , %3 ] : tensor <80 x16 xf32 >
13
17
linalg.yield %extracted : f32
14
18
} -> tensor <1 x3 xf32 >
15
19
return %1 : tensor <1 x3 xf32 >
16
20
}
17
21
18
22
// CHECK-LABEL: func.func @masked_static_vectorize_nd_tensor_extract_with_affine_apply_contiguous
19
- // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index
20
- // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 3 : index
21
- // CHECK: %[[VAL_8:.*]] = vector.create_mask %[[VAL_4]], %[[VAL_5]] : vector<1x4xi1>
22
- // CHECK: %[[VAL_9:.*]] = vector.mask %[[VAL_8]] { vector.transfer_read {{.*}} {in_bounds = [true, true]} : tensor<1x3xf32>, vector<1x4xf32> } : vector<1x4xi1> -> vector<1x4xf32>
23
- // CHECK: %[[VAL_11:.*]] = vector.broadcast {{.*}} : index to vector<4xindex>
24
- // CHECK: %[[VAL_12:.*]] = arith.addi {{.*}} : vector<4xindex>
25
- // CHECK: %[[VAL_20:.*]] = vector.mask %[[VAL_8]] { vector.transfer_read {{.*}} {in_bounds = [true, true]} : tensor<80x16xf32>, vector<1x4xf32> } : vector<1x4xi1> -> vector<1x4xf32>
26
- // CHECK: %[[VAL_22:.*]] = vector.mask %[[VAL_8]] { vector.transfer_write {{.*}} {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x3xf32> } : vector<1x4xi1> -> tensor<1x3xf32>
23
+ // CHECK-SAME: %[[SRC:.*]]: tensor<80x16xf32>,
24
+ // CHECK-SAME: %[[OUTPUT:.*]]: tensor<1x3xf32>,
25
+ // CHECK-SAME: %[[IDX_IN:.*]]: index) -> tensor<1x3xf32> {
26
+
27
+ /// Create the mask
28
+ // CHECK-DAG: %[[DIM_0:.*]] = arith.constant 1 : index
29
+ // CHECK-DAG: %[[DIM_1:.*]] = arith.constant 3 : index
30
+ // CHECK-DAG: %[[C79:.*]] = arith.constant 79 : index
31
+ // CHECK: %[[MASK:.*]] = vector.create_mask %[[DIM_0]], %[[DIM_1]] : vector<1x4xi1>
32
+
33
+ /// TODO: This transfer_read is redundant - remove
34
+ // CHECK: vector.mask %[[MASK]] { vector.transfer_read {{.*}} {in_bounds = [true, true]} : tensor<1x3xf32>, vector<1x4xf32> } : vector<1x4xi1> -> vector<1x4xf32>
35
+
36
+ /// Calculate the index vector
37
+ // CHECK: %[[STEP:.*]] = vector.step : vector<4xindex>
38
+ // CHECK: %[[IDX_BC:.*]] = vector.broadcast %[[IDX_IN]] : index to vector<4xindex>
39
+ // CHECK: %[[IDX_VEC:.*]] = arith.addi %[[STEP]], %[[IDX_BC]] : vector<4xindex>
40
+ // CHECK: %[[C0:.*]] = arith.constant 0 : i32
41
+ // CHECK: %[[SC:.*]] = vector.shape_cast %[[IDX_VEC]] : vector<4xindex> to vector<4xindex>
42
+
43
+ /// Extract the starting point from the index vector
44
+ // CHECK: %[[IDX_START:.*]] = vector.extractelement %[[SC]]{{\[}}%[[C0]] : i32] : vector<4xindex>
45
+
46
+ // Final read and write
47
+ // CHECK: %[[READ:.*]] = vector.mask %[[MASK]] { vector.transfer_read %[[SRC]]{{\[}}%[[C79]], %[[IDX_START]]], {{.*}} {in_bounds = [true, true]} : tensor<80x16xf32>, vector<1x4xf32> } : vector<1x4xi1> -> vector<1x4xf32>
48
+ // CHECK: %[[C0_1:.*]] = arith.constant 0 : index
49
+ // CHECK: vector.mask %[[MASK]] { vector.transfer_write %[[READ]], %[[OUTPUT]]{{\[}}%[[C0_1]], %[[C0_1]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x3xf32> } : vector<1x4xi1> -> tensor<1x3xf32>
27
50
28
51
module attributes {transform.with_named_sequence } {
29
52
transform.named_sequence @__transform_main (%arg1: !transform.any_op {transform.readonly }) {
@@ -33,7 +56,69 @@ module attributes {transform.with_named_sequence} {
33
56
}
34
57
}
35
58
36
- // -----
59
+ // -----
60
+
61
+ // Identical to the above, but with scalable vectors.
62
+
63
+ func.func @masked_static_vectorize_nd_tensor_extract_with_affine_apply_contiguous_scalable (
64
+ %src: tensor <80 x16 xf32 >,
65
+ %output : tensor <1 x3 xf32 >,
66
+ %idx: index ) -> tensor <1 x3 xf32 > {
67
+
68
+ %c79 = arith.constant 79 : index
69
+ %1 = linalg.generic {
70
+ indexing_maps = [affine_map <(d0 , d1 ) -> (d0 , d1 )>],
71
+ iterator_types = [" parallel" , " parallel" ]
72
+ } outs (%output : tensor <1 x3 xf32 >) {
73
+ ^bb0 (%out: f32 ):
74
+ %2 = linalg.index 1 : index
75
+ %3 = affine.apply affine_map <(d0 , d1 ) -> (d0 + d1 )>(%2 , %idx )
76
+ %extracted = tensor.extract %src [%c79 , %3 ] : tensor <80 x16 xf32 >
77
+ linalg.yield %extracted : f32
78
+ } -> tensor <1 x3 xf32 >
79
+
80
+ return %1 : tensor <1 x3 xf32 >
81
+ }
82
+
83
+ // CHECK-LABEL: func.func @masked_static_vectorize_nd_tensor_extract_with_affine_apply_contiguous_scalable
84
+ // CHECK-SAME: %[[SRC:.*]]: tensor<80x16xf32>,
85
+ // CHECK-SAME: %[[OUTPUT:.*]]: tensor<1x3xf32>,
86
+ // CHECK-SAME: %[[IDX_IN:.*]]: index) -> tensor<1x3xf32> {
87
+
88
+ /// Create the mask
89
+ // CHECK-DAG: %[[DIM_0:.*]] = arith.constant 1 : index
90
+ // CHECK-DAG: %[[DIM_1:.*]] = arith.constant 3 : index
91
+ // CHECK-DAG: %[[C79:.*]] = arith.constant 79 : index
92
+ // CHECK: %[[MASK:.*]] = vector.create_mask %[[DIM_0]], %[[DIM_1]] : vector<1x[4]xi1>
93
+
94
+ /// TODO: This transfer_read is redundant - remove
95
+ // CHECK: vector.mask %[[MASK]] { vector.transfer_read {{.*}} {in_bounds = [true, true]} : tensor<1x3xf32>, vector<1x[4]xf32> } : vector<1x[4]xi1> -> vector<1x[4]xf32>
96
+
97
+ /// Calculate the index vector
98
+ // CHECK: %[[STEP:.*]] = vector.step : vector<[4]xindex>
99
+ // CHECK: %[[IDX_BC:.*]] = vector.broadcast %[[IDX_IN]] : index to vector<[4]xindex>
100
+ // CHECK: %[[IDX_VEC:.*]] = arith.addi %[[STEP]], %[[IDX_BC]] : vector<[4]xindex>
101
+ // CHECK: %[[C0:.*]] = arith.constant 0 : i32
102
+ // CHECK: %[[SC:.*]] = vector.shape_cast %[[IDX_VEC]] : vector<[4]xindex> to vector<[4]xindex>
103
+
104
+ /// Extract the starting point from the index vector
105
+ // CHECK: %[[IDX_START:.*]] = vector.extractelement %[[SC]]{{\[}}%[[C0]] : i32] : vector<[4]xindex>
106
+
107
+ // Final read and write
108
+ // CHECK: %[[READ:.*]] = vector.mask %[[MASK]] { vector.transfer_read %[[SRC]]{{\[}}%[[C79]], %[[IDX_START]]], {{.*}} {in_bounds = [true, true]} : tensor<80x16xf32>, vector<1x[4]xf32> } : vector<1x[4]xi1> -> vector<1x[4]xf32>
109
+ // CHECK: %[[C0_1:.*]] = arith.constant 0 : index
110
+ // CHECK: vector.mask %[[MASK]] { vector.transfer_write %[[READ]], %[[OUTPUT]]{{\[}}%[[C0_1]], %[[C0_1]]] {in_bounds = [true, true]} : vector<1x[4]xf32>, tensor<1x3xf32> } : vector<1x[4]xi1> -> tensor<1x3xf32>
111
+
112
+
113
+ module attributes {transform.with_named_sequence } {
114
+ transform.named_sequence @__transform_main (%arg1: !transform.any_op {transform.readonly }) {
115
+ %0 = transform.structured.match ops {[" linalg.generic" ]} in %arg1 : (!transform.any_op ) -> !transform.any_op
116
+ transform.structured.vectorize %0 vector_sizes [1 , [4 ]] {vectorize_nd_extract } : !transform.any_op
117
+ transform.yield
118
+ }
119
+ }
120
+
121
+ // -----
37
122
38
123
func.func @masked_dynamic_vectorize_nd_tensor_extract_with_affine_apply_contiguous (%6: tensor <?x?xf32 >, %arg0: index , %extracted_slice : tensor <?x?xf32 >) -> tensor <?x?xf32 > {
39
124
%c79 = arith.constant 79 : index
0 commit comments