#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>

+ #include <executorch/backends/vulkan/runtime/api/api.h>
+ #include <executorch/backends/vulkan/runtime/graph/Logging.h>
+
+ #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>

namespace vkcompute {

+ using api::utils::ivec3;
+ using api::utils::uvec3;
+
void add_copy_offset_node(
    ComputeGraph& graph,
    const ValueRef in,
-     const api::utils::ivec3& range,
-     const api::utils::ivec3& src_offset,
-     const api::utils::ivec3& dst_offset,
+     const ivec3& range,
+     const ivec3& src_offset,
+     const ivec3& dst_offset,
    const ValueRef out) {
  vTensorPtr t_in = graph.get_tensor(in);
  vTensorPtr t_out = graph.get_tensor(out);

-   VK_CHECK_COND(check_memory_layout_is(*t_in, api::kChannelsPacked));
-   VK_CHECK_COND(check_memory_layout_is(*t_out, api::kChannelsPacked));
-
  std::string kernel_name = "copy_offset";
  kernel_name.reserve(kShaderNameReserve);
  add_dtype_suffix(kernel_name, *t_out);

-   api::utils::uvec3 global_size = api::utils::make_uvec3(range);
-   api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+   uvec3 global_size = api::utils::make_uvec3(range);
+   uvec3 local_size = adaptive_work_group_size(global_size);

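+   // The unusedN fields pad each ivec3 out to 16 bytes so the struct layout
+   // matches the shader's parameter UBO, where a vec3 is aligned like a vec4
+   // (std140/std430 rules).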
  const struct Block final {
-     api::utils::ivec3 range;
+     ivec3 range;
    int32_t unused0;
-     api::utils::ivec3 src_offset;
+     ivec3 src_offset;
    int32_t unused1;
-     api::utils::ivec3 dst_offset;
+     ivec3 dst_offset;
    int32_t unused2;
  } offset_params{
      range,
@@ -58,7 +62,10 @@ void add_copy_offset_node(
      global_size,
      local_size,
      // Inputs and Outputs
-       {{out, api::MemoryAccessType::WRITE}, {in, api::MemoryAccessType::READ}},
+       {
+           {out, api::MemoryAccessType::WRITE},
+           {in, api::MemoryAccessType::READ},
+       },
      // Parameter buffers
      {t_out->texture_limits_ubo(),
       t_in->texture_limits_ubo(),
@@ -67,4 +74,169 @@ void add_copy_offset_node(
      {}));
}

+ void add_copy_channel_offset_node(
+     ComputeGraph& graph,
+     const ValueRef in,
+     int32_t channel_range,
+     int32_t src_channel_offset,
+     int32_t dst_channel_offset,
+     const ValueRef out) {
+   vTensorPtr t_in = graph.get_tensor(in);
+   vTensorPtr t_out = graph.get_tensor(out);
+
+   // Likely need to prepad these numbers.
+   std::vector<int64_t> in_sizes = t_in->sizes();
+   std::vector<int64_t> out_sizes = t_out->sizes();
+
+   VK_CHECK_COND(check_memory_layout_is(*t_in, api::kChannelsPacked));
+   VK_CHECK_COND(check_memory_layout_is(*t_out, api::kChannelsPacked));
+
+   // NOTE: This function should be able to support 1d and 2d tensors when
+   // range=1, src_offset=dst_offset=1.
+   VK_CHECK_COND(t_in->dim() >= 3, "Src dim should be at least 3");
+   VK_CHECK_COND(t_out->dim() >= 3, "Dst dim should be at least 3");
+
+   VK_CHECK_COND(
+       dim_at<Dim4D::Channel>(in_sizes) >= src_channel_offset + channel_range,
+       "Src channel offset plus range should be within the input tensor's channel size");
+   VK_CHECK_COND(
+       dim_at<Dim4D::Channel>(out_sizes) >= dst_channel_offset + channel_range,
+       "Dst channel offset plus range should be within the output tensor's channel size");
+
+   VK_CHECK_COND(channel_range >= 0, "Channel range must be non-negative");
+   VK_CHECK_COND(
+       src_channel_offset >= 0, "Src channel offset must be non-negative");
+   VK_CHECK_COND(
+       dst_channel_offset >= 0, "Dst channel offset must be non-negative");
+
+   std::string kernel_name = "copy_channel_offset";
+   kernel_name.reserve(kShaderNameReserve);
+   add_dtype_suffix(kernel_name, *t_out);
+
+   int32_t out_channels = dim_at<Dim4D::Channel>(out_sizes);
+
+   for (int batch_idx = 0; batch_idx < dim_at<Dim4D::Batch>(in_sizes);
+        batch_idx++) {
+     // Mapping the tensor NCHW coordinates into texture XYZ coordinates.
+     int32_t dst_first_z = dst_channel_offset / 4;
+     int32_t dst_last_z = (dst_channel_offset + channel_range - 1) / 4;
+
+     // We copy the entire width and height dimensions. For the batch
+     // dimension, the global_size variable specifies the range. The shader
+     // combines it with the dst_offset to get the actual coordinate.
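+     // In the channels-packed layout, four consecutive channels share one
+     // texel, so batch b begins at z = b * div_up(C, 4) and the copied
+     // channels occupy texel layers dst_first_z..dst_last_z within it.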
+
+     ivec3 dst_offset{
+         0, 0, dst_first_z + batch_idx * api::utils::div_up(out_channels, 4)};
+
+     uvec3 global_size{
+         dim_at<Dim4D::Width>(in_sizes),
+         dim_at<Dim4D::Height>(in_sizes),
+         api::utils::safe_downcast<uint32_t>(dst_last_z - dst_first_z + 1)};
+
+     uvec3 local_size = adaptive_work_group_size(global_size);
+
+     /*
+     std::cout << "shader channel offset. "
+               << " batch_idx=" << batch_idx
+               << " channel_range=" << channel_range
+               << " src_channel_offset=" << src_channel_offset
+               << " dst_channel_offset=" << dst_channel_offset
+               << " in_size=" << in_sizes
+               << " out_size=" << out_sizes
+               << " dst_offset=" << dst_offset
+               << " global_size=" << global_size
+               << std::endl;
+     */
+
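+     // The unused fields keep each ivec3 on a 16-byte boundary, matching the
+     // shader's UBO layout.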
+     const struct Block final {
+       api::utils::ivec4 out_sizes;
+       api::utils::ivec4 in_sizes;
+       int32_t channel_range;
+       int32_t src_channel_offset;
+       int32_t dst_channel_offset;
+       int32_t unused;
+       ivec3 range;
+       int32_t unused1;
+       ivec3 dst_offset;
+       int32_t unused2;
+     } channel_offset_params{
+         api::utils::make_whcn_ivec4(out_sizes),
+         api::utils::make_whcn_ivec4(in_sizes),
+         channel_range,
+         src_channel_offset,
+         dst_channel_offset,
+         0,
+         api::utils::make_ivec3(global_size),
+         0,
+         dst_offset,
+     };
+
+     graph.execute_nodes().emplace_back(new ExecuteNode(
+         graph,
+         VK_KERNEL_FROM_STR(kernel_name),
+         global_size,
+         local_size,
+         // Inputs and Outputs
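+         // The output is listed as both written and read: a
+         // dst_channel_offset that is not texel-aligned (a multiple of 4)
+         // only partially overwrites boundary texels, which must be read
+         // back and merged.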
+         {
+             {out, api::MemoryAccessType::WRITE},
+             {out, api::MemoryAccessType::READ},
+             {in, api::MemoryAccessType::READ},
+         },
+         // Parameter buffers
+         {t_out->texture_limits_ubo(),
+          t_in->texture_limits_ubo(),
+          graph.create_params_buffer(channel_offset_params)},
+         // Specialization Constants
+         {}));
+   }
+ }
+
+ void add_copy_offset_node(
+     ComputeGraph& graph,
+     ValueRef in,
+     ValueRef range_ref,
+     ValueRef src_offset_ref,
+     ValueRef dst_offset_ref,
+     ValueRef out) {
+   ivec3 range = api::utils::make_ivec3(*graph.get_int_list(range_ref));
+   ivec3 src_offset =
+       api::utils::make_ivec3(*graph.get_int_list(src_offset_ref));
+   ivec3 dst_offset =
+       api::utils::make_ivec3(*graph.get_int_list(dst_offset_ref));
+
+   add_copy_offset_node(graph, in, range, src_offset, dst_offset, out);
+ }
+
+ void copy_offset(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+   add_copy_offset_node(graph, args[0], args[1], args[2], args[3], args[4]);
+ }
+
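+ // Note: the scalar channel arguments are extracted as int64_t and narrowed
+ // to int32_t when forwarded to add_copy_channel_offset_node, which
+ // validates the values.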
+ void copy_channel_offset(
+     ComputeGraph& graph,
+     const std::vector<ValueRef>& args) {
+   ValueRef in = args[0];
+   ValueRef channel_range_ref = args[1];
+   ValueRef src_channel_offset_ref = args[2];
+   ValueRef dst_channel_offset_ref = args[3];
+   ValueRef out = args[4];
+
+   auto channel_range = graph.extract_scalar<int64_t>(channel_range_ref);
+   auto src_channel_offset =
+       graph.extract_scalar<int64_t>(src_channel_offset_ref);
+   auto dst_channel_offset =
+       graph.extract_scalar<int64_t>(dst_channel_offset_ref);
+
+   add_copy_channel_offset_node(
+       graph, in, channel_range, src_channel_offset, dst_channel_offset, out);
+ }
+
+ REGISTER_OPERATORS {
+   // VK_REGISTER_OP(aten.clone.default, add_test_node);
+   VK_REGISTER_OP(etvk.copy_offset, copy_offset);
+   VK_REGISTER_OP(etvk.copy_channel_offset, copy_channel_offset);
+ }
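+ // Illustrative usage (not part of this change): copying 3 channels starting
+ // at source channel 1 into destination channel 0 of `out`:
+ //
+ //   add_copy_channel_offset_node(graph, in, /*channel_range=*/3,
+ //       /*src_channel_offset=*/1, /*dst_channel_offset=*/0, out);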
+
} // namespace vkcompute