Commit 71d9f21

[MLIR][Linalg] Add integration tests of scalable vectorization of reduction
Note: I don't have a setup to run these tests natively (arm64-linux with SVE). I am able to run them under QEMU on an x86_64-linux host by building LLVM with the CMake variables below:
-DARM_EMULATOR_EXECUTABLE="<path_to_qemu_bin>/qemu-aarch64" \
-DARM_EMULATOR_OPTIONS="-L /usr/aarch64-linux-gnu" \
-DARM_EMULATOR_MLIR_CPU_RUNNER_EXECUTABLE="<path_to_llvm_arm64_build>/bin/mlir-cpu-runner-arm64" \
-DARM_EMULATOR_UTILS_LIB_DIR="<path_to_llvm_arm64_build>/lib"
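For context, a complete configure invocation using these variables might look like the sketch below. The source path, generator, build type, and the enabled projects and targets are illustrative assumptions; only the ARM_EMULATOR_* variables come from this commit's notes:

cmake -G Ninja <path_to_llvm_src>/llvm \
  -DCMAKE_BUILD_TYPE=Release \
  -DLLVM_ENABLE_PROJECTS=mlir \
  -DLLVM_TARGETS_TO_BUILD="X86;AArch64" \
  -DMLIR_INCLUDE_INTEGRATION_TESTS=ON \
  -DARM_EMULATOR_EXECUTABLE="<path_to_qemu_bin>/qemu-aarch64" \
  -DARM_EMULATOR_OPTIONS="-L /usr/aarch64-linux-gnu" \
  -DARM_EMULATOR_MLIR_CPU_RUNNER_EXECUTABLE="<path_to_llvm_arm64_build>/bin/mlir-cpu-runner-arm64" \
  -DARM_EMULATOR_UTILS_LIB_DIR="<path_to_llvm_arm64_build>/lib"

MLIR_INCLUDE_INTEGRATION_TESTS=ON enables MLIR's integration tests, which these new tests are part of.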
1 parent 370bab5 commit 71d9f21

3 files changed, +276 -0 lines changed
Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
// DEFINE: %{compile} = mlir-opt %s \
// DEFINE:   -transform-interpreter -test-transform-dialect-erase-schedule \
// DEFINE:   -one-shot-bufferize="bufferize-function-boundaries" -buffer-deallocation-pipeline -cse -canonicalize -convert-vector-to-scf -arm-sve-legalize-vector-storage \
// DEFINE:   -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm -o %t
// DEFINE: %{entry_point} = generic_reduce_2d_f32
// DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void --march=aarch64 --mattr="+sve"\
// DEFINE:   -shared-libs=%mlir_native_utils_lib_dir/libmlir_runner_utils%shlibext,%mlir_native_utils_lib_dir/libmlir_c_runner_utils%shlibext

// RUN: %{compile}

// RUN: %{run} | FileCheck %s --check-prefix=F32

func.func @generic_reduce_2d_f32() {
  // 2-D Tensor
  %M = arith.constant 16 : index
  %N = arith.constant 1000 : index
  %c0_f32 = arith.constant 0.0 : f32

  // Allocate the input and output tensors
  %A_alloc = bufferization.alloc_tensor(%M, %N) : tensor<?x?xf32>
  %C_alloc = bufferization.alloc_tensor(%M) : tensor<?xf32>

  // Initialise the tensors
  %pi = arith.constant 3.1416 : f32
  %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?x?xf32>) -> tensor<?x?xf32>
  %C_in = linalg.fill ins(%c0_f32 : f32) outs(%C_alloc : tensor<?xf32>) -> tensor<?xf32>

  // Reduce
  %C_out = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                                             affine_map<(d0, d1) -> (d0)>],
                            iterator_types = ["parallel", "reduction"] }
    ins(%A_in : tensor<?x?xf32>)
    outs(%C_in : tensor<?xf32>) {
    ^bb(%in: f32, %out: f32) :
      %0 = arith.addf %in, %out : f32
      linalg.yield %0 : f32
    } -> tensor<?xf32>

  // Print and verify the output
  // F32-LABEL: SVE: START OF TEST OUTPUT
  vector.print str "SVE: START OF TEST OUTPUT\n"

  // F32-NEXT: Unranked Memref {{.*}} rank = 1 offset = 0 sizes = [16] strides = [1] data =
  // F32-NEXT: [3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6]

  %xf = tensor.cast %C_out : tensor<?xf32> to tensor<*xf32>
  call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()

  // F32-NEXT: SVE: END OF TEST OUTPUT
  vector.print str "SVE: END OF TEST OUTPUT\n"

  return
}

module attributes {transform.with_named_sequence} {
  // A sequence that will tile and vectorise a Reduce Op
  transform.named_sequence @tile_and_vectorize_reduce(%func
    : !transform.op<"func.func"> {transform.readonly}) {

    // Step 0: Get a handle to the reduce Op
    %reduce = transform.structured.match ops{["linalg.generic"]} in %func
      : (!transform.op<"func.func">) -> !transform.any_op

    // Step 1: Tile
    %tiled_reduce, %loops:2 = transform.structured.tile_using_for %reduce tile_sizes [1, [4]]
      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)

    // Step 2: Vectorize
    transform.structured.vectorize %tiled_reduce vector_sizes [1, [4]] : !transform.any_op

    // Step 3: Lower vector.multi_reduction
    transform.apply_patterns to %func {
      transform.apply_patterns.vector.lower_masked_transfers
      transform.apply_patterns.vector.lower_multi_reduction lowering_strategy = "innerreduction"
    } : !transform.op<"func.func">

    transform.yield
  }

  // A sequence that goes over all functions in this module and applies
  // "tile_and_vectorize_reduce"
  transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
    %funcs = transform.structured.match ops{["func.func"]} in %module
      : (!transform.any_op) -> !transform.op<"func.func">

    transform.foreach %funcs : !transform.op<"func.func"> {
      ^bb2(%func : !transform.op<"func.func">):
        transform.include @tile_and_vectorize_reduce failures(propagate)
          (%func) : (!transform.op<"func.func">) -> ()
    }
    transform.yield
  }
}

func.func private @printMemrefF32(%ptr : tensor<*xf32>)
Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
// DEFINE: %{compile} = mlir-opt %s \
// DEFINE:   -transform-interpreter -test-transform-dialect-erase-schedule \
// DEFINE:   -one-shot-bufferize="bufferize-function-boundaries" -buffer-deallocation-pipeline -cse -canonicalize -convert-vector-to-scf -arm-sve-legalize-vector-storage \
// DEFINE:   -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm -o %t
// DEFINE: %{entry_point} = reduce_1d_f32
// DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void --march=aarch64 --mattr="+sve"\
// DEFINE:   -shared-libs=%mlir_native_utils_lib_dir/libmlir_runner_utils%shlibext,%mlir_native_utils_lib_dir/libmlir_c_runner_utils%shlibext

// RUN: %{compile}

// RUN: %{run} | FileCheck %s --check-prefix=F32

func.func @reduce_1d_f32() {
  // 1-D Tensor
  %N = arith.constant 1000 : index
  %c0_f32 = arith.constant 0.0 : f32

  // Allocate the input and output tensors
  %A_alloc = bufferization.alloc_tensor(%N) : tensor<?xf32>
  %C_alloc = bufferization.alloc_tensor() : tensor<f32>

  // Initialise the tensors
  %pi = arith.constant 3.1416 : f32
  %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?xf32>) -> tensor<?xf32>
  %C_in = tensor.insert %c0_f32 into %C_alloc[] : tensor<f32>

  // Reduce
  %C_out = linalg.reduce ins(%A_in : tensor<?xf32>) outs(%C_in: tensor<f32>) dimensions = [0]
    (%in: f32, %init: f32) {
      %0 = arith.addf %in, %init : f32
      linalg.yield %0 : f32
    }

  // Print and verify the output
  // F32-LABEL: SVE: START OF TEST OUTPUT
  vector.print str "SVE: START OF TEST OUTPUT\n"

  // F32-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data =
  // F32-NEXT: [3141.6]

  %xf = tensor.cast %C_out : tensor<f32> to tensor<*xf32>
  call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()

  // F32-NEXT: SVE: END OF TEST OUTPUT
  vector.print str "SVE: END OF TEST OUTPUT\n"

  return
}

module attributes {transform.with_named_sequence} {
  // A sequence that will tile and vectorise a Reduce Op
  transform.named_sequence @tile_and_vectorize_reduce(%func
    : !transform.op<"func.func"> {transform.readonly}) {

    // Step 0: Get a handle to the reduce Op
    %reduce = transform.structured.match ops{["linalg.reduce"]} in %func
      : (!transform.op<"func.func">) -> !transform.any_op

    // Step 1: Tile
    %tiled_reduce, %loops:1 = transform.structured.tile_using_for %reduce tile_sizes [[4]]
      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

    // Step 2: Vectorize
    transform.structured.vectorize %tiled_reduce vector_sizes [[4]] : !transform.any_op

    // Step 3: Lower vector.multi_reduction
    transform.apply_patterns to %func {
      transform.apply_patterns.vector.lower_masked_transfers
      transform.apply_patterns.vector.lower_multi_reduction lowering_strategy = "innerreduction"
    } : !transform.op<"func.func">

    transform.yield
  }

  // A sequence that goes over all functions in this module and applies
  // "tile_and_vectorize_reduce"
  transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
    %funcs = transform.structured.match ops{["func.func"]} in %module
      : (!transform.any_op) -> !transform.op<"func.func">

    transform.foreach %funcs : !transform.op<"func.func"> {
      ^bb2(%func : !transform.op<"func.func">):
        transform.include @tile_and_vectorize_reduce failures(propagate)
          (%func) : (!transform.op<"func.func">) -> ()
    }
    transform.yield
  }
}

func.func private @printMemrefF32(%ptr : tensor<*xf32>)
Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
// DEFINE: %{compile} = mlir-opt %s \
// DEFINE:   -transform-interpreter -test-transform-dialect-erase-schedule \
// DEFINE:   -one-shot-bufferize="bufferize-function-boundaries" -buffer-deallocation-pipeline -cse -canonicalize -convert-vector-to-scf -arm-sve-legalize-vector-storage \
// DEFINE:   -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm -o %t
// DEFINE: %{entry_point} = reduce_2d_f32
// DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void --march=aarch64 --mattr="+sve"\
// DEFINE:   -shared-libs=%mlir_native_utils_lib_dir/libmlir_runner_utils%shlibext,%mlir_native_utils_lib_dir/libmlir_c_runner_utils%shlibext

// RUN: %{compile}

// RUN: %{run} | FileCheck %s --check-prefix=F32

func.func @reduce_2d_f32() {
  // 2-D Tensor
  %M = arith.constant 16 : index
  %N = arith.constant 1000 : index
  %c0_f32 = arith.constant 0.0 : f32

  // Allocate the input and output tensors
  %A_alloc = bufferization.alloc_tensor(%M, %N) : tensor<?x?xf32>
  %C_alloc = bufferization.alloc_tensor(%M) : tensor<?xf32>

  // Initialise the tensors
  %pi = arith.constant 3.1416 : f32
  %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?x?xf32>) -> tensor<?x?xf32>
  %C_in = linalg.fill ins(%c0_f32 : f32) outs(%C_alloc : tensor<?xf32>) -> tensor<?xf32>

  // Reduce
  %C_out = linalg.reduce ins(%A_in : tensor<?x?xf32>) outs(%C_in: tensor<?xf32>) dimensions = [1]
    (%in: f32, %init: f32) {
      %0 = arith.addf %in, %init : f32
      linalg.yield %0 : f32
    }

  // Print and verify the output
  // F32-LABEL: SVE: START OF TEST OUTPUT
  vector.print str "SVE: START OF TEST OUTPUT\n"

  // F32-NEXT: Unranked Memref {{.*}} rank = 1 offset = 0 sizes = [16] strides = [1] data =
  // F32-NEXT: [3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6]

  %xf = tensor.cast %C_out : tensor<?xf32> to tensor<*xf32>
  call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()

  // F32-NEXT: SVE: END OF TEST OUTPUT
  vector.print str "SVE: END OF TEST OUTPUT\n"

  return
}

module attributes {transform.with_named_sequence} {
  // A sequence that will tile and vectorise a Reduce Op
  transform.named_sequence @tile_and_vectorize_reduce(%func
    : !transform.op<"func.func"> {transform.readonly}) {

    // Step 0: Get a handle to the reduce Op
    %reduce = transform.structured.match ops{["linalg.reduce"]} in %func
      : (!transform.op<"func.func">) -> !transform.any_op

    // Step 1: Tile
    %tiled_reduce, %loops:2 = transform.structured.tile_using_for %reduce tile_sizes [1, [4]]
      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)

    // Step 2: Vectorize
    transform.structured.vectorize %tiled_reduce vector_sizes [1, [4]] : !transform.any_op

    // Step 3: Lower vector.multi_reduction
    transform.apply_patterns to %func {
      transform.apply_patterns.vector.lower_masked_transfers
      transform.apply_patterns.vector.lower_multi_reduction lowering_strategy = "innerreduction"
    } : !transform.op<"func.func">

    transform.yield
  }

  // A sequence that goes over all functions in this module and applies
  // "tile_and_vectorize_reduce"
  transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
    %funcs = transform.structured.match ops{["func.func"]} in %module
      : (!transform.any_op) -> !transform.op<"func.func">

    transform.foreach %funcs : !transform.op<"func.func"> {
      ^bb2(%func : !transform.op<"func.func">):
        transform.include @tile_and_vectorize_reduce failures(propagate)
          (%func) : (!transform.op<"func.func">) -> ()
    }
    transform.yield
  }
}

func.func private @printMemrefF32(%ptr : tensor<*xf32>)
