#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>

+ #include <executorch/backends/vulkan/runtime/api/api.h>
+ #include <executorch/backends/vulkan/runtime/graph/Logging.h>
+
+ #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>

namespace vkcompute {

+ using api::utils::ivec3;
+ using api::utils::uvec3;
+
void add_copy_offset_node(
    ComputeGraph& graph,
    const ValueRef in,
-     const api::utils::ivec3& range,
-     const api::utils::ivec3& src_offset,
-     const api::utils::ivec3& dst_offset,
+     const ivec3& range,
+     const ivec3& src_offset,
+     const ivec3& dst_offset,
    const ValueRef out) {
  vTensorPtr t_in = graph.get_tensor(in);
  vTensorPtr t_out = graph.get_tensor(out);

-   VK_CHECK_COND(check_memory_layout_is(*t_in, api::kChannelsPacked));
-   VK_CHECK_COND(check_memory_layout_is(*t_out, api::kChannelsPacked));
-
  std::string kernel_name = "copy_offset";
  kernel_name.reserve(kShaderNameReserve);
  add_dtype_suffix(kernel_name, *t_out);

-   api::utils::uvec3 global_size = api::utils::make_uvec3(range);
-   api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+   uvec3 global_size = api::utils::make_uvec3(range);
+   uvec3 local_size = adaptive_work_group_size(global_size);

  const struct Block final {
-     api::utils::ivec3 range;
+     ivec3 range;
    int32_t unused0;
-     api::utils::ivec3 src_offset;
+     ivec3 src_offset;
    int32_t unused1;
-     api::utils::ivec3 dst_offset;
+     ivec3 dst_offset;
    int32_t unused2;
  } offset_params{
      range,
@@ -58,13 +62,166 @@ void add_copy_offset_node(
      global_size,
      local_size,
      // Inputs and Outputs
-       {{out, api::MemoryAccessType::WRITE}, {in, api::MemoryAccessType::READ}},
+       {
+           {out, api::MemoryAccessType::WRITE},
+           {in, api::MemoryAccessType::READ},
+       },
      // Parameter buffers
-       {t_out->texture_limits_ubo(),
-        t_in->texture_limits_ubo(),
-        graph.create_params_buffer(offset_params)},
+       {graph.create_params_buffer(offset_params)},
      // Specialization Constants
      {}));
}

+ void add_copy_channel_offset_node(
+     ComputeGraph& graph,
+     const ValueRef in,
+     int32_t channel_range,
+     int32_t src_channel_offset,
+     int32_t dst_channel_offset,
+     const ValueRef out) {
+   vTensorPtr t_in = graph.get_tensor(in);
+   vTensorPtr t_out = graph.get_tensor(out);
+
+   // Likely need to prepad these numbers.
+   std::vector<int64_t> in_sizes = t_in->sizes();
+   std::vector<int64_t> out_sizes = t_out->sizes();
+
+   VK_CHECK_COND(check_memory_layout_is(*t_in, api::kChannelsPacked));
+   VK_CHECK_COND(check_memory_layout_is(*t_out, api::kChannelsPacked));
+
+   // NOTE: This function should be able to support 1d and 2d tensors when
+   // range=1, src_offset=dst_offset=1.
+   VK_CHECK_COND(t_in->dim() >= 3, "Src dim should be at least 3");
+   VK_CHECK_COND(t_out->dim() >= 3, "Dst dim should be at least 3");
+
+   VK_CHECK_COND(
+       dim_at<Dim4D::Channel>(in_sizes) >= src_channel_offset + channel_range,
+       "Source channel offset plus range should be less than or equal to input tensor's channel size");
+   VK_CHECK_COND(
+       dim_at<Dim4D::Channel>(out_sizes) >= dst_channel_offset + channel_range,
+       "Destination channel offset plus range should be less than or equal to output tensor's channel size");
+
+   VK_CHECK_COND(channel_range >= 0, "Channel range must be non-negative");
+   VK_CHECK_COND(
+       src_channel_offset >= 0, "Src channel offset must be non-negative");
+   VK_CHECK_COND(
+       dst_channel_offset >= 0, "Dst channel offset must be non-negative");
+
+   std::string kernel_name = "copy_channel_offset";
+   kernel_name.reserve(kShaderNameReserve);
+   add_dtype_suffix(kernel_name, *t_out);
+
+   int32_t out_channels = dim_at<Dim4D::Channel>(out_sizes);
+
+   // Copy one batch at a time.
+   for (int batch_idx = 0; batch_idx < dim_at<Dim4D::Batch>(in_sizes);
+        batch_idx++) {
+     // Mapping the tensor NCHW coordinates into texture XYZ coordinates.
+     int32_t dst_first_z = dst_channel_offset / 4;
+     int32_t dst_last_z = (dst_channel_offset + channel_range - 1) / 4;
+
+     // We copy the entire width and height dimensions. For the channel
+     // dimension, we use the z-dimension of the global_size to specify the
+     // texture range. The shader combines the global invocation id and the
+     // dst_offset to get the actual coordinate.
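+     // For example, with out_channels = 10 (div_up(10, 4) = 3 texel planes
+     // per batch), copying channel_range = 4 into dst_channel_offset = 6
+     // touches planes z = 6/4 = 1 through z = (6+4-1)/4 = 2, so global_size.z
+     // is 2 and dst_offset.z is 1 + batch_idx * 3.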
+
+     ivec3 dst_offset{
+         0, 0, dst_first_z + batch_idx * api::utils::div_up(out_channels, 4)};
+
+     uvec3 global_size{
+         dim_at<Dim4D::Width>(in_sizes),
+         dim_at<Dim4D::Height>(in_sizes),
+         api::utils::safe_downcast<uint32_t>(dst_last_z - dst_first_z + 1)};
+
+     uvec3 local_size = adaptive_work_group_size(global_size);
+
+     const struct Block final {
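+       // The int32_t padding members keep each ivec3 (12 bytes) padded out
+       // to 16 bytes, so this struct's layout matches the shader's parameter
+       // UBO, where vec3 members occupy a full vec4 slot.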
+       api::utils::ivec4 out_sizes;
+       api::utils::ivec4 in_sizes;
+       int32_t channel_range;
+       int32_t src_channel_offset;
+       int32_t dst_channel_offset;
+       int32_t unused;
+       ivec3 range;
+       int32_t unused1;
+       ivec3 dst_offset;
+       int32_t unused2;
+
+     } channel_offset_params{
+         api::utils::make_whcn_ivec4(out_sizes),
+         api::utils::make_whcn_ivec4(in_sizes),
+         channel_range,
+         src_channel_offset,
+         dst_channel_offset,
+         0,
+         api::utils::make_ivec3(global_size),
+         0,
+         dst_offset,
+         0,
+     };
+
+     auto shader = VK_KERNEL_FROM_STR(kernel_name);
+
+     graph.execute_nodes().emplace_back(new ExecuteNode(
+         graph,
+         shader,
+         global_size,
+         local_size,
+         // Inputs and Outputs
+         {
+             {out, api::MemoryAccessType::WRITE},
+             {out, api::MemoryAccessType::READ},
+             {in, api::MemoryAccessType::READ},
+         },
+         // Parameter buffers
+         {graph.create_params_buffer(channel_offset_params)},
+         // Specialization Constants
+         {}));
+   }
+ }
+
+ void add_copy_offset_node(
+     ComputeGraph& graph,
+     ValueRef in,
+     ValueRef range_ref,
+     ValueRef src_offset_ref,
+     ValueRef dst_offset_ref,
+     ValueRef out) {
+   ivec3 range = api::utils::make_ivec3(*graph.get_int_list(range_ref));
+   ivec3 src_offset =
+       api::utils::make_ivec3(*graph.get_int_list(src_offset_ref));
+   ivec3 dst_offset =
+       api::utils::make_ivec3(*graph.get_int_list(dst_offset_ref));
+
+   add_copy_offset_node(graph, in, range, src_offset, dst_offset, out);
+ }
+
+ void copy_offset(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+   add_copy_offset_node(graph, args[0], args[1], args[2], args[3], args[4]);
+ }
+
+ void copy_channel_offset(
+     ComputeGraph& graph,
+     const std::vector<ValueRef>& args) {
+   ValueRef in = args[0];
+   ValueRef channel_range_ref = args[1];
+   ValueRef src_channel_offset_ref = args[2];
+   ValueRef dst_channel_offset_ref = args[3];
+   ValueRef out = args[4];
+
+   auto channel_range = graph.extract_scalar<int64_t>(channel_range_ref);
+   auto src_channel_offset =
+       graph.extract_scalar<int64_t>(src_channel_offset_ref);
+   auto dst_channel_offset =
+       graph.extract_scalar<int64_t>(dst_channel_offset_ref);
+
+   add_copy_channel_offset_node(
+       graph, in, channel_range, src_channel_offset, dst_channel_offset, out);
+ }
+
+ REGISTER_OPERATORS {
+   VK_REGISTER_OP(etvk.copy_offset, copy_offset);
+   VK_REGISTER_OP(etvk.copy_channel_offset, copy_channel_offset);
+ }
+
} // namespace vkcompute