 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
 
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
 namespace vkcompute {
 
+using api::utils::ivec3;
+using api::utils::uvec3;
+
 void add_copy_offset_node(
     ComputeGraph& graph,
     const ValueRef in,
-    const api::utils::ivec3& range,
-    const api::utils::ivec3& src_offset,
-    const api::utils::ivec3& dst_offset,
+    const ivec3& range,
+    const ivec3& src_offset,
+    const ivec3& dst_offset,
     const ValueRef out) {
   vTensorPtr t_in = graph.get_tensor(in);
   vTensorPtr t_out = graph.get_tensor(out);
 
-  VK_CHECK_COND(check_memory_layout_is(*t_in, api::kChannelsPacked));
-  VK_CHECK_COND(check_memory_layout_is(*t_out, api::kChannelsPacked));
-
   std::string kernel_name = "copy_offset";
   kernel_name.reserve(kShaderNameReserve);
   add_dtype_suffix(kernel_name, *t_out);
 
-  api::utils::uvec3 global_size = api::utils::make_uvec3(range);
-  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+  uvec3 global_size = api::utils::make_uvec3(range);
+  uvec3 local_size = adaptive_work_group_size(global_size);
 
   const struct Block final {
-    api::utils::ivec3 range;
+    ivec3 range;
     int32_t unused0;
-    api::utils::ivec3 src_offset;
+    ivec3 src_offset;
     int32_t unused1;
-    api::utils::ivec3 dst_offset;
+    ivec3 dst_offset;
     int32_t unused2;
   } offset_params{
       range,
@@ -58,13 +59,166 @@ void add_copy_offset_node(
       global_size,
       local_size,
       // Inputs and Outputs
-      {{out, api::MemoryAccessType::WRITE}, {in, api::MemoryAccessType::READ}},
+      {
+          {out, api::MemoryAccessType::WRITE},
+          {in, api::MemoryAccessType::READ},
+      },
       // Parameter buffers
-      {t_out->texture_limits_ubo(),
-       t_in->texture_limits_ubo(),
-       graph.create_params_buffer(offset_params)},
+      {graph.create_params_buffer(offset_params)},
       // Specialization Constants
       {}));
 }
 
+void add_copy_channel_offset_node(
+    ComputeGraph& graph,
+    const ValueRef in,
+    int32_t channel_range,
+    int32_t src_channel_offset,
+    int32_t dst_channel_offset,
+    const ValueRef out) {
+  vTensorPtr t_in = graph.get_tensor(in);
+  vTensorPtr t_out = graph.get_tensor(out);
+
+  // Likely need to prepad these numbers.
+  std::vector<int64_t> in_sizes = t_in->sizes();
+  std::vector<int64_t> out_sizes = t_out->sizes();
+
+  VK_CHECK_COND(check_memory_layout_is(*t_in, api::kChannelsPacked));
+  VK_CHECK_COND(check_memory_layout_is(*t_out, api::kChannelsPacked));
+
+  // NOTE: This function should be able to support 1d and 2d tensors when
+  // range=1, src_offset=dst_offset=1.
+  VK_CHECK_COND(t_in->dim() >= 3, "Src dim should be at least 3");
+  VK_CHECK_COND(t_out->dim() >= 3, "Dst dim should be at least 3");
+
+  VK_CHECK_COND(
+      dim_at<Dim4D::Channel>(in_sizes) >= src_channel_offset + channel_range,
+      "Source channel offset plus range must not exceed the input tensor's channel size");
+  VK_CHECK_COND(
+      dim_at<Dim4D::Channel>(out_sizes) >= dst_channel_offset + channel_range,
+      "Destination channel offset plus range must not exceed the output tensor's channel size");
+
+  VK_CHECK_COND(channel_range >= 0, "Channel range must be non-negative");
+  VK_CHECK_COND(
+      src_channel_offset >= 0, "Src channel offset must be non-negative");
+  VK_CHECK_COND(
+      dst_channel_offset >= 0, "Dst channel offset must be non-negative");
+
+  std::string kernel_name = "copy_channel_offset";
+  kernel_name.reserve(kShaderNameReserve);
+  add_dtype_suffix(kernel_name, *t_out);
+
+  int32_t out_channels = dim_at<Dim4D::Channel>(out_sizes);
+
+  // Copy one batch at a time.
+  for (int batch_idx = 0; batch_idx < dim_at<Dim4D::Batch>(in_sizes);
+       batch_idx++) {
+    // Map the tensor NCHW coordinates into texture XYZ coordinates.
+    int32_t dst_first_z = dst_channel_offset / 4;
+    int32_t dst_last_z = (dst_channel_offset + channel_range - 1) / 4;
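+    // For example, with dst_channel_offset = 6 and channel_range = 5,
+    // channels 6..10 are written; with channels packed four to a texel
+    // layer, dst_first_z = 6 / 4 = 1 and dst_last_z = (6 + 5 - 1) / 4 = 2,
+    // so the copy spans two texel layers in z.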
+
+    // We copy the entire width and height dimensions. For the channel
+    // dimension, we use the z-dimension of the global_size to specify the
+    // texture range. The shader combines the global invocation id and the
+    // dst_offset to get the actual coordinate.
+
+    ivec3 dst_offset{
+        0, 0, dst_first_z + batch_idx * api::utils::div_up(out_channels, 4)};
+
+    uvec3 global_size{
+        dim_at<Dim4D::Width>(in_sizes),
+        dim_at<Dim4D::Height>(in_sizes),
+        api::utils::safe_downcast<uint32_t>(dst_last_z - dst_first_z + 1)};
+
+    uvec3 local_size = adaptive_work_group_size(global_size);
+
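+    // The int32_t padding members in the Block below are assumed to match
+    // the shader's uniform layout, where an ivec3 occupies a 16-byte slot;
+    // the zero values passed for them are never read by the shader.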
+    const struct Block final {
+      api::utils::ivec4 out_sizes;
+      api::utils::ivec4 in_sizes;
+      int32_t channel_range;
+      int32_t src_channel_offset;
+      int32_t dst_channel_offset;
+      int32_t unused;
+      ivec3 range;
+      int32_t unused1;
+      ivec3 dst_offset;
+      int32_t unused2;
+    } channel_offset_params{
+        api::utils::make_whcn_ivec4(out_sizes),
+        api::utils::make_whcn_ivec4(in_sizes),
+        channel_range,
+        src_channel_offset,
+        dst_channel_offset,
+        0,
+        api::utils::make_ivec3(global_size),
+        0,
+        dst_offset,
+        0,
+    };
+
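+    // Note: the output is bound for both WRITE and READ below. This appears
+    // to be because, with channels packed four to a texel, a boundary texel
+    // can hold channels both inside and outside the copied range, so the
+    // shader must read back the existing destination values for the
+    // components it leaves untouched.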
+    graph.execute_nodes().emplace_back(new ExecuteNode(
+        graph,
+        VK_KERNEL_FROM_STR(kernel_name),
+        global_size,
+        local_size,
+        // Inputs and Outputs
+        {
+            {out, api::MemoryAccessType::WRITE},
+            {out, api::MemoryAccessType::READ},
+            {in, api::MemoryAccessType::READ},
+        },
+        // Parameter buffers
+        {graph.create_params_buffer(channel_offset_params)},
+        // Specialization Constants
+        {}));
+  }
+}
+
+void add_copy_offset_node(
+    ComputeGraph& graph,
+    ValueRef in,
+    ValueRef range_ref,
+    ValueRef src_offset_ref,
+    ValueRef dst_offset_ref,
+    ValueRef out) {
+  ivec3 range = api::utils::make_ivec3(*graph.get_int_list(range_ref));
+  ivec3 src_offset =
+      api::utils::make_ivec3(*graph.get_int_list(src_offset_ref));
+  ivec3 dst_offset =
+      api::utils::make_ivec3(*graph.get_int_list(dst_offset_ref));
+
+  add_copy_offset_node(graph, in, range, src_offset, dst_offset, out);
+}
+
+void copy_offset(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  add_copy_offset_node(graph, args[0], args[1], args[2], args[3], args[4]);
+}
+
+void copy_channel_offset(
+    ComputeGraph& graph,
+    const std::vector<ValueRef>& args) {
+  ValueRef in = args[0];
+  ValueRef channel_range_ref = args[1];
+  ValueRef src_channel_offset_ref = args[2];
+  ValueRef dst_channel_offset_ref = args[3];
+  ValueRef out = args[4];
+
+  auto channel_range = graph.extract_scalar<int64_t>(channel_range_ref);
+  auto src_channel_offset =
+      graph.extract_scalar<int64_t>(src_channel_offset_ref);
+  auto dst_channel_offset =
+      graph.extract_scalar<int64_t>(dst_channel_offset_ref);
+
+  add_copy_channel_offset_node(
+      graph, in, channel_range, src_channel_offset, dst_channel_offset, out);
+}
+
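+// Register both copies under the "etvk" (ExecuTorch Vulkan) custom-op
+// namespace so they can be resolved by name through the operator registry.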
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(etvk.copy_offset, copy_offset);
+  VK_REGISTER_OP(etvk.copy_channel_offset, copy_channel_offset);
+}
+
 } // namespace vkcompute