@@ -45,109 +45,6 @@ namespace linalg {
/// when used on distributed loops with memref semantics!
void hoistRedundantVectorTransfers(func::FuncOp func);

- /// Greedily hoist redundant subset extract/insert operations on tensors outside
- /// of `forOp`. The logic follows:
- ///   1. Look for a write walking back from the `forOp` yield.
- ///   2. Check the uses of the matching block argument and look for a matching
- ///      read (i.e. extract_slice or transfer_read) with matching indices.
- ///   3. In the case of a transfer_write, we can bypass other non-conflicting
- ///      operations and find more hoisting opportunities.
- ///   4. Hoist the read/write pair and update the tensor SSA links.
- ///
- /// Return the unmodified `forOp` if no hoisting occurred.
- /// Return a new scf::ForOp if hoisting on tensors occurred.
- ///
- /// After this transformation the returned scf::ForOp may have unused arguments
- /// that can be removed by application of canonicalization patterns.
- ///
- /// Example:
- /// ========
- /// IR Resembling:
- ///
- /// ```
- /// %0 = scf.for %i = %l to %u step %s iter_args(%a0 = %t0)->(tensor<10xf32>) {
- ///   %1 = scf.for %j = %l to %u step %s iter_args(%a6 = %a0)->(tensor<10xf32>) {
- ///     %e = tensor.extract_slice %a6[%i][%sz][1]: tensor<10xf32> to tensor<?xf32>
- ///     %r = vector.transfer_read %e[%c0], %cst: tensor<?xf32>, vector<4xf32>
- ///     %u = "some_use"(%r) : (vector<4xf32>) -> vector<4xf32>
- ///     %w = vector.transfer_write %u, %e[%c0] : vector<4xf32>, tensor<?xf32>
- ///     %st = tensor.insert_slice %w into %a6[%i][%sz][1]
- ///       : tensor<?xf32> into tensor<10xf32>
- ///     scf.yield %st: tensor<10xf32>
- ///   }
- ///   scf.yield %1: tensor<10xf32>
- /// }
- /// ```
- ///
- /// Progressively hoists to:
- ///
- /// ```
- /// %0 = scf.for %i = %l to %u step %s iter_args(%a0 = %t0) -> (tensor<10xf32>) {
- ///   %e = tensor.extract_slice %a0[%i][%sz][1]: tensor<10xf32> to tensor<?xf32>
- ///   %1:2 = scf.for %j = %l to %u step %s iter_args(%a6 = %a0, %a7 = %e)
- ///       -> (tensor<10xf32>, tensor<?xf32>) {
- ///     %r = vector.transfer_read %a7[%c0], %cst: tensor<?xf32>, vector<4xf32>
- ///     %u = "some_use"(%r) : (vector<4xf32>) -> vector<4xf32>
- ///     %w = vector.transfer_write %u, %a7[%c0] : vector<4xf32>, tensor<?xf32>
- ///     scf.yield %a6, %w: tensor<10xf32>, tensor<?xf32>
- ///   }
- ///   %st = tensor.insert_slice %1#1 into %1#0[%i][%sz][1]
- ///     : tensor<?xf32> into tensor<10xf32>
- ///   scf.yield %st: tensor<10xf32>
- /// }
- /// ```
- ///
- /// and
- ///
- /// ```
- /// %0 = scf.for %i = %l to %u step %s iter_args(%a0 = %t0) -> (tensor<10xf32>) {
- ///   %e = tensor.extract_slice %a0[%i][%sz][1]: tensor<10xf32> to tensor<?xf32>
- ///   %r = vector.transfer_read %e[%c0], %cst: tensor<?xf32>, vector<4xf32>
- ///   %1:3 = scf.for %j = %l to %u step %s iter_args(%a6 = %a0, %a7 = %e, %a8 = %r)
- ///       -> (tensor<10xf32>, tensor<?xf32>, vector<4xf32>) {
- ///     %u = "some_use"(%a8) : (vector<4xf32>) -> vector<4xf32>
- ///     scf.yield %a6, %a7, %u: tensor<10xf32>, tensor<?xf32>, vector<4xf32>
- ///   }
- ///   %w = vector.transfer_write %1#2, %1#1[%c0] : vector<4xf32>, tensor<?xf32>
- ///   %st = tensor.insert_slice %w into %1#0[%i][%sz][1]
- ///     : tensor<?xf32> into tensor<10xf32>
- ///   scf.yield %st: tensor<10xf32>
- /// }
- /// ```
- ///
- /// It can then canonicalize to:
- ///
- /// ```
- /// %0 = scf.for %i = %l to %u step %s iter_args(%a0 = %t0) -> (tensor<10xf32>) {
- ///   %e = tensor.extract_slice %a0[%i][%sz][1]: tensor<10xf32> to tensor<?xf32>
- ///   %r = vector.transfer_read %e[%c0], %cst: tensor<?xf32>, vector<4xf32>
- ///   %1 = scf.for %j = %l to %u step %s iter_args(%a7 = %r)
- ///       -> (vector<4xf32>) {
- ///     %u = "some_use"(%a7) : (vector<4xf32>) -> vector<4xf32>
- ///     scf.yield %u: vector<4xf32>
- ///   }
- ///   %w = vector.transfer_write %1, %e[%c0] : vector<4xf32>, tensor<?xf32>
- ///   %st = tensor.insert_slice %w into %a0[%i][%sz][1]
- ///     : tensor<?xf32> into tensor<10xf32>
- ///   scf.yield %st: tensor<10xf32>
- /// }
- /// ```
- ///
- // TODO: This should be further generalized along a few different axes:
- //   - Other loops than scf.ForOp that operate on tensors (both sequential and
- //     parallel loops).
- //   - Other subset extract/insert pairs than tensor.extract/insert_slice and
- //     vector.transfer_read/write.
- //   - More general areSubsetDisjoint analysis/interface to work across all
- //     subset op types and allow bypassing non-WAW-conflicting operations in
- //     more cases.
- scf::ForOp hoistRedundantSubsetExtractInsert(RewriterBase &rewriter,
-                                              scf::ForOp forOp);
-
- /// Call into `hoistRedundantSubsetExtractInsert` without a RewriterBase.
- // TODO: obsolete and should be retired.
- void hoistRedundantVectorTransfersOnTensor(func::FuncOp func);
-
} // namespace linalg
} // namespace mlir
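For reference, the memref-based entry point kept by this change can still be driven directly from C++. Below is a minimal sketch, assuming the header lives at `mlir/Dialect/Linalg/Transforms/Hoisting.h` as in upstream MLIR; the helper name and the module-walking strategy are illustrative, not part of this patch.

```cpp
// Minimal sketch: apply the surviving hoisting entry point to every function
// in a module. Only the call to hoistRedundantVectorTransfers comes from the
// header touched by this diff; everything else is illustrative scaffolding.
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
#include "mlir/IR/BuiltinOps.h"

// Greedily hoist redundant vector.transfer_read / vector.transfer_write pairs
// out of enclosing loops in each function. Per the doc comment above, this is
// intended for memref-based IR, not the tensor forms removed by this patch.
static void hoistAllVectorTransfers(mlir::ModuleOp module) {
  module.walk([](mlir::func::FuncOp func) {
    mlir::linalg::hoistRedundantVectorTransfers(func);
  });
}
```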