
Commit f7300b2

SS-JIA authored and facebook-github-bot committed
Add binary op support for height and width packing GPU layouts (#2516)
Summary:
Pull Request resolved: #2516

## Context

Enable `binary_op` to support inputs that are `HEIGHT_PACKED` and `WIDTH_PACKED`.

ghstack-source-id: 219281504
exported-using-ghexport

Reviewed By: jorgep31415

Differential Revision: D55031044

fbshipit-source-id: 0db40bbdf1ab6a1b6f38c9b67e6dcc10a431ad5f
1 parent 2d9c489 commit f7300b2

16 files changed: +458 -40 lines

backends/vulkan/runtime/graph/ComputeGraph.cpp

Lines changed: 47 additions & 6 deletions
@@ -84,27 +84,68 @@ void ComputeGraph::update_descriptor_counts(
   }
 }
 
+api::StorageType ComputeGraph::suggested_storage_type() {
+  if (config_.enableStorageTypeOverride) {
+    return config_.storageTypeOverride;
+  }
+  return api::StorageType::TEXTURE_3D;
+}
+
+api::GPUMemoryLayout ComputeGraph::suggested_memory_layout(
+    const std::vector<int64_t>& sizes) {
+  if (config_.enableMemoryLayoutOverride) {
+    return config_.memoryLayoutOverride;
+  }
+  if (sizes.size() < 3) {
+    return api::GPUMemoryLayout::TENSOR_WIDTH_PACKED;
+  }
+  // For 3 dimensional tensors that only have a channels dimension of 1, still
+  // prefer width packed.
+  if (api::utils::val_at(-3, sizes) == 1) {
+    return api::GPUMemoryLayout::TENSOR_WIDTH_PACKED;
+  }
+  return api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED;
+}
+
 ValueRef ComputeGraph::add_tensor(
     const std::vector<int64_t>& sizes,
     const api::ScalarType dtype,
+    const api::StorageType storage_type,
+    const api::GPUMemoryLayout memory_layout,
     const int64_t shared_object_idx) {
   bool allocate_memory = shared_object_idx < 0;
 
   ValueRef idx(static_cast<int>(values_.size()));
   values_.emplace_back(vTensor(
-      context(),
-      sizes,
-      dtype,
-      api::StorageType::TEXTURE_3D,
-      api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED,
-      allocate_memory));
+      context(), sizes, dtype, storage_type, memory_layout, allocate_memory));
 
   if (!allocate_memory) {
     get_shared_object(shared_object_idx).add_user(this, idx);
   }
   return idx;
 }
 
+ValueRef ComputeGraph::add_tensor(
+    const std::vector<int64_t>& sizes,
+    const api::ScalarType dtype,
+    const api::GPUMemoryLayout memory_layout,
+    const int64_t shared_object_idx) {
+  return add_tensor(
+      sizes, dtype, suggested_storage_type(), memory_layout, shared_object_idx);
+}
+
+ValueRef ComputeGraph::add_tensor(
+    const std::vector<int64_t>& sizes,
+    const api::ScalarType dtype,
+    const int64_t shared_object_idx) {
+  return add_tensor(
+      sizes,
+      dtype,
+      suggested_storage_type(),
+      suggested_memory_layout(sizes),
+      shared_object_idx);
+}
+
 ValueRef ComputeGraph::add_tensorref(
     const std::vector<int64_t>& sizes,
     const api::ScalarType dtype,
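
For reference, the layout-selection rule added above can be exercised in isolation. The following is a minimal standalone C++ sketch of the same decision tree; the GPUMemoryLayout enum and val_at helper here are local stand-ins for the api:: types, not the backend's own definitions.

  #include <cassert>
  #include <cstdint>
  #include <vector>

  enum class GPUMemoryLayout { TENSOR_WIDTH_PACKED, TENSOR_HEIGHT_PACKED, TENSOR_CHANNELS_PACKED };

  // Stand-in for api::utils::val_at: negative indices count from the end
  // (NCHW order); out-of-range dims are treated as 1 (assumption for this sketch).
  int64_t val_at(int64_t dim, const std::vector<int64_t>& sizes) {
    const int64_t i = static_cast<int64_t>(sizes.size()) + dim;
    return i >= 0 ? sizes[i] : 1;
  }

  // Same decision tree as ComputeGraph::suggested_memory_layout, with no override set.
  GPUMemoryLayout suggest(const std::vector<int64_t>& sizes) {
    if (sizes.size() < 3) {
      return GPUMemoryLayout::TENSOR_WIDTH_PACKED;
    }
    if (val_at(-3, sizes) == 1) {
      return GPUMemoryLayout::TENSOR_WIDTH_PACKED;
    }
    return GPUMemoryLayout::TENSOR_CHANNELS_PACKED;
  }

  int main() {
    assert(suggest({64, 64}) == GPUMemoryLayout::TENSOR_WIDTH_PACKED);       // < 3 dims: width packed
    assert(suggest({1, 32, 32}) == GPUMemoryLayout::TENSOR_WIDTH_PACKED);    // channels == 1: width packed
    assert(suggest({4, 32, 32}) == GPUMemoryLayout::TENSOR_CHANNELS_PACKED); // otherwise: channels packed
    return 0;
  }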

backends/vulkan/runtime/graph/ComputeGraph.h

Lines changed: 89 additions & 2 deletions
@@ -93,10 +93,13 @@ class ComputeGraph final {
       bool execute);
 
   /*
-   * Returns the value at a particular reference
+   * Returns the value at a particular index in the graph. If storing this
+   * function's return value in an lvalue reference, it is imperative that no
+   * values are added to the graph while the reference is in scope, otherwise
+   * the underlying value may have been moved as part of a vector resize.
    */
   inline Value& get_val(ValueRef idx) {
-    return values_[idx];
+    return values_.at(idx);
   }
 
   inline const std::vector<int64_t>& get_val_sizes(ValueRef idx) {
@@ -127,18 +130,88 @@ class ComputeGraph final {
     return execute_nodes_;
   }
 
+  //
+  // Utility functions
+  //
+
+  /*
+   * Returns a suggested storage type (i.e. buffer or texture) that can be used
+   * to construct `vTensor`s. The storage type is typically determined by the
+   * GPU reported by the Vulkan context, unless a storage type override is
+   * defined in the graph configuration. Some GPU architectures work better with
+   * buffer storage, and others with texture storage. Currently only texture
+   * storage is supported.
+   */
+  api::StorageType suggested_storage_type();
+
+  /*
+   * Returns a suggested memory layout (i.e. channels, width, or height packed)
+   * that can be used to construct `vTensor`s. The memory layout impacts which
+   * dimension will be treated as the vectorized dimension. For texture storage,
+   * elements along the vectorized dimension are packed into texels. The
+   * suggested memory layout is determined based on the sizes of the tensor,
+   * unless a memory layout override is defined in the graph configuration.
+   */
+  api::GPUMemoryLayout suggested_memory_layout(
+      const std::vector<int64_t>& sizes);
+
+  /*
+   * Returns the memory layout of a Tensor value at the specified index.
+   */
+  inline api::GPUMemoryLayout memory_layout_of(ValueRef idx) {
+    return get_val(idx).toTensor().gpu_memory_layout();
+  }
+
   //
   // Graph Building
   //
 
+  /*
+   * Add a `vTensor` value to the graph with the specified properties. There are
+   * various convenience overloads of this function that may be used instead.
+   */
+  ValueRef add_tensor(
+      const std::vector<int64_t>& sizes,
+      const api::ScalarType dtype,
+      const api::StorageType storage_type,
+      const api::GPUMemoryLayout memory_layout,
+      const int64_t shared_object_idx);
+
+  /*
+   * Add a `vTensor` value to the graph with the specified properties. The
+   * suggested storage type will be used to construct the `vTensor`.
+   */
+  ValueRef add_tensor(
+      const std::vector<int64_t>& sizes,
+      const api::ScalarType dtype,
+      const api::GPUMemoryLayout memory_layout,
+      const int64_t shared_object_idx = -1);
+
+  /*
+   * Add a `vTensor` value to the graph with the specified properties. The
+   * suggested storage type and memory layout will be used to construct the
+   * `vTensor`.
+   */
   ValueRef add_tensor(
       const std::vector<int64_t>& sizes,
       const api::ScalarType dtype = api::ScalarType::Float,
       const int64_t shared_object_idx = -1);
+
+  /*
+   * Add a `TensorRef` value to the graph with the specified properties. A
+   * `TensorRef` is a reference to a `vTensor` whose data is stored in an
+   * external CPU buffer.
+   */
   ValueRef add_tensorref(
       const std::vector<int64_t>& sizes,
      const api::ScalarType dtype,
      const void* const data);
+
+  /*
+   * Add a staging buffer to the graph. Staging buffers are data buffers that
+   * use memory that is visible to both the CPU and GPU, and therefore are used
+   * as an intermediary when transferring data between the CPU and GPU.
+   */
   ValueRef add_staging(const api::ScalarType dtype, const size_t numel);
 
   ValueRef add_none();
@@ -176,6 +249,20 @@ class ComputeGraph final {
     return {t, staging};
   }
 
+  /*
+   * Convenience function to add an input tensor with a specific memory layout
+   * along with its staging buffer.
+   */
+  inline IOValueRef add_input_tensor(
+      const std::vector<int64_t>& sizes,
+      const api::ScalarType dtype,
+      const api::GPUMemoryLayout memory_layout,
+      const int64_t shared_object_idx = -1) {
+    ValueRef t = add_tensor(sizes, dtype, memory_layout, shared_object_idx);
+    ValueRef staging = set_input_tensor(t);
+    return {t, staging};
+  }
+
   SharedObject& get_shared_object(const int64_t idx);
 
   //
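
A hedged usage sketch of the overloads documented above. It assumes the headers from this commit are on the include path, that a ComputeGraph can be constructed directly from a GraphConfig, and that the code sits inside namespace at::native::vulkan; the sizes and dtype are illustrative only.

  void build_example_graph() {
    GraphConfig config;          // assumption: default-constructed config
    ComputeGraph graph(config);  // assumption: ComputeGraph(GraphConfig) constructor

    // Explicit memory layout: add a width-packed input tensor together with
    // its staging buffer, via the new add_input_tensor overload.
    IOValueRef a = graph.add_input_tensor(
        {2, 3, 4, 8},
        api::ScalarType::Float,
        api::GPUMemoryLayout::TENSOR_WIDTH_PACKED);

    // Sizes-only overload: storage type and memory layout fall back to
    // suggested_storage_type() / suggested_memory_layout(sizes).
    ValueRef b = graph.add_tensor({2, 3, 4, 8}, api::ScalarType::Float);

    // Query the layout actually assigned to a tensor value.
    api::GPUMemoryLayout layout_of_b = graph.memory_layout_of(b);
    (void)a;
    (void)layout_of_b;
  }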

backends/vulkan/runtime/graph/GraphConfig.cpp

Lines changed: 20 additions & 0 deletions
@@ -49,6 +49,26 @@ GraphConfig::GraphConfig() {
   // Empirically selected safety factor. If descriptor pools start running out
   // of memory, increase this safety factor.
   descriptorPoolSafetyFactor = 1.25;
+
+  // For now, force TEXTURE_3D storage as we are still developing shader
+  // support for buffer storage type.
+  enableStorageTypeOverride = true;
+  storageTypeOverride = api::StorageType::TEXTURE_3D;
+
+  // For now, force TENSOR_CHANNELS_PACKED memory layout by default as we are
+  // still developing support for other memory layouts.
+  enableMemoryLayoutOverride = true;
+  memoryLayoutOverride = api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED;
+}
+
+void GraphConfig::setStorageTypeOverride(api::StorageType storage_type) {
+  enableStorageTypeOverride = true;
+  storageTypeOverride = storage_type;
+}
+
+void GraphConfig::setMemoryLayoutOverride(api::GPUMemoryLayout memory_layout) {
+  enableMemoryLayoutOverride = true;
+  memoryLayoutOverride = memory_layout;
 }
 
 } // namespace vulkan

backends/vulkan/runtime/graph/GraphConfig.h

Lines changed: 9 additions & 0 deletions
@@ -26,8 +26,17 @@ struct GraphConfig final {
   // risk.
   float descriptorPoolSafetyFactor;
 
+  bool enableStorageTypeOverride;
+  api::StorageType storageTypeOverride;
+
+  bool enableMemoryLayoutOverride;
+  api::GPUMemoryLayout memoryLayoutOverride;
+
   // Generate a default graph config with pre-configured settings
   explicit GraphConfig();
+
+  void setStorageTypeOverride(api::StorageType storage_type);
+  void setMemoryLayoutOverride(api::GPUMemoryLayout memory_layout);
 };
 
 } // namespace vulkan
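
A short sketch of how the new override knobs compose; the function name is hypothetical and the value chosen is illustrative.

  GraphConfig make_width_packed_config() {
    // The constructor above already enables both overrides by default
    // (TEXTURE_3D storage, TENSOR_CHANNELS_PACKED layout); re-point the layout.
    GraphConfig config;
    config.setMemoryLayoutOverride(api::GPUMemoryLayout::TENSOR_WIDTH_PACKED);
    // With enableMemoryLayoutOverride set, ComputeGraph::suggested_memory_layout()
    // ignores the tensor sizes and returns TENSOR_WIDTH_PACKED.
    return config;
  }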

backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl

Lines changed: 5 additions & 0 deletions
@@ -63,5 +63,10 @@ void main() {
       COORD_TO_POS_${PACKING}(other_coord, other_sizes.data),
       0);
 
+  // Detect broadcasting
+  if (PACKED_DIM_${PACKING}(other_sizes.data) < PACKED_DIM_${PACKING}(in_sizes.data)) {
+    other_texel = other_texel.xxxx;
+  }
+
   imageStore(image_out, pos, OP(in_texel, other_texel, alpha.data));
 }

backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml

Lines changed: 11 additions & 4 deletions
@@ -11,11 +11,18 @@ binary_op:
     DTYPE: float
     PACKING: CHANNELS_PACKED
   generate_variant_forall:
+    PACKING:
+      - VALUE: CHANNELS_PACKED
+        SUFFIX: C_packed
+      - VALUE: WIDTH_PACKED
+        SUFFIX: W_packed
+      - VALUE: HEIGHT_PACKED
+        SUFFIX: H_packed
     DTYPE:
-      - VALUE: "half"
-        SUFFIX: "half"
-      - VALUE: "float"
-        SUFFIX: "float"
+      - VALUE: half
+        SUFFIX: half
+      - VALUE: float
+        SUFFIX: float
   shader_variants:
     - NAME: binary_add
     - NAME: binary_sub

backends/vulkan/runtime/graph/ops/glsl/broadcasting_utils.h

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@
 ivec4 out_coord_to_in_coord(const ivec4 out_coord, const ivec4 in_sizes) {
   ivec4 in_coord = out_coord;
   for (int i = 0; i < 4; ++i) {
-    if (in_sizes[i] == 1) {
+    if (out_coord[i] >= in_sizes[i]) {
       in_coord[i] = 0;
     }
   }
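
With this change the input coordinate is clamped to 0 whenever the output coordinate runs past the input's extent in that dimension, which covers broadcast dimensions without relying on the input size being exactly 1. A C++ transcription of the updated helper, for illustration only:

  #include <array>

  using ivec4 = std::array<int, 4>;

  // Map an output coordinate back to an input coordinate under broadcasting.
  ivec4 out_coord_to_in_coord(const ivec4& out_coord, const ivec4& in_sizes) {
    ivec4 in_coord = out_coord;
    for (int i = 0; i < 4; ++i) {
      // Broadcast: if the output coordinate exceeds the input's extent in this
      // dimension, read the input at index 0 instead.
      if (out_coord[i] >= in_sizes[i]) {
        in_coord[i] = 0;
      }
    }
    return in_coord;
  }

  // Example: if the input has extent 1 in its third component while the output
  // has extent 3 there, output coord {5, 2, 2, 0} maps to input coord {5, 2, 0, 0}.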

backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h

Lines changed: 9 additions & 0 deletions
@@ -24,6 +24,15 @@
 #define COORD_TO_POS_CHANNELS_PACKED(coord, sizes) \
   ivec3(coord.x, coord.y, (coord.z + coord.w * sizes.z) / 4)
 
+#define COORD_TO_POS_WIDTH_PACKED(coord, sizes) \
+  ivec3(coord.x / 4, coord.y, (coord.z + coord.w * sizes.z))
+
+#define COORD_TO_POS_HEIGHT_PACKED(coord, sizes) \
+  ivec3(coord.x, coord.y / 4, (coord.z + coord.w * sizes.z))
+
+#define COORD_TO_POS_CHANNELS_PACKED(coord, sizes) \
+  ivec3(coord.x, coord.y, (coord.z + coord.w * sizes.z) / 4)
+
 #define COORD_TO_BUFFER_IDX(coord, sizes) \
   coord.x + coord.y* sizes.x + coord.z* sizes.y* sizes.x + \
   coord.w* sizes.z* sizes.y* sizes.x;
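
The new macros encode where the division by 4 happens: for width packing, four consecutive elements along x share one texel, so the x coordinate is divided by 4; for height packing it is y; for channels packing it is the combined z/w index. A small C++ sketch of the same arithmetic (types and names are local stand-ins, not the shader's):

  struct Coord { int x, y, z, w; };  // 4-component tensor coordinate
  struct Sizes { int x, y, z, w; };  // tensor sizes, same component order
  struct Pos   { int x, y, z; };     // 3D texture position

  // C++ transcriptions of the macros above, for illustration only.
  Pos coord_to_pos_width_packed(Coord c, Sizes s) {
    return {c.x / 4, c.y, c.z + c.w * s.z};    // 4 neighbours along x share a texel
  }
  Pos coord_to_pos_height_packed(Coord c, Sizes s) {
    return {c.x, c.y / 4, c.z + c.w * s.z};    // 4 neighbours along y share a texel
  }
  Pos coord_to_pos_channels_packed(Coord c, Sizes s) {
    return {c.x, c.y, (c.z + c.w * s.z) / 4};  // 4 neighbours along z share a texel
  }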

backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp

Lines changed: 19 additions & 10 deletions
@@ -19,6 +19,17 @@ namespace at {
 namespace native {
 namespace vulkan {
 
+void check_binary_op_args(
+    const vTensor& self,
+    const vTensor& other,
+    const vTensor& out) {
+  VK_CHECK_COND(check_same_memory_layout(self, other, out));
+  VK_CHECK_COND(check_broadcastable(self, other));
+  std::vector<int64_t> broadcasted_sizes =
+      calculate_broadcasted_output_size(self, other);
+  VK_CHECK_COND(out.sizes() == broadcasted_sizes);
+}
+
 void resize_binary_op_node(
     ComputeGraph* graph,
     const std::vector<ArgGroup>& args,
@@ -28,15 +39,8 @@ void resize_binary_op_node(
   vTensor& self = graph->get_val(args[1].refs[0]).toTensor();
   vTensor& other = graph->get_val(args[1].refs[1]).toTensor();
 
-  std::vector<int64_t> new_out_sizes(
-      std::max(self.sizes().size(), other.sizes().size()));
-
-  // Match the sizes in reverse because sizes are in NCHW order
-  for (int i = -1; i >= -new_out_sizes.size(); --i) {
-    new_out_sizes.at(new_out_sizes.size() + i) = std::max(
-        api::utils::val_at(i, self.sizes()),
-        api::utils::val_at(i, other.sizes()));
-  }
+  std::vector<int64_t> new_out_sizes =
+      calculate_broadcasted_output_size(self, other);
 
   out.virtual_resize(new_out_sizes);
 }
@@ -49,12 +53,16 @@ void add_binary_op_node(
     const ValueRef out,
     const std::string& op_name) {
   ValueRef arg1 = prepack_if_tensor_ref(graph, in1);
-  ValueRef arg2 = prepack_if_tensor_ref(graph, in2);
+  ValueRef arg2 =
+      prepack_if_tensor_ref(graph, in2, graph.memory_layout_of(arg1));
 
   vTensor& t_in1 = graph.get_val(arg1).toTensor();
   vTensor& t_in2 = graph.get_val(arg2).toTensor();
+
   vTensor& t_out = graph.get_val(out).toTensor();
 
+  check_binary_op_args(t_in1, t_in2, t_out);
+
   api::utils::uvec3 global_size = t_out.virtual_extents();
   api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
 
@@ -67,6 +75,7 @@ void add_binary_op_node(
 
   std::stringstream kernel_name;
   kernel_name << "binary_" << op_name;
+  apply_memory_layout_suffix(kernel_name, t_out);
   apply_dtype_suffix(kernel_name, t_out);
 
   graph.execute_nodes().emplace_back(new ExecuteNode(
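
calculate_broadcasted_output_size() is introduced elsewhere in this stack and is not shown here; based on the resize loop it replaces above, it presumably aligns the two size vectors from the right (NCHW order) and takes the per-dimension maximum. A minimal standalone sketch of that presumed behaviour, using plain std::vector rather than vTensor and a hypothetical name:

  #include <algorithm>
  #include <cstdint>
  #include <vector>

  // Sketch of the broadcast-size computation inferred from resize_binary_op_node:
  // align sizes from the right, take the per-dimension max, treat missing dims as 1.
  std::vector<int64_t> broadcast_sizes(
      const std::vector<int64_t>& a,
      const std::vector<int64_t>& b) {
    const size_t ndim = std::max(a.size(), b.size());
    std::vector<int64_t> out(ndim, 1);
    for (size_t i = 0; i < ndim; ++i) {
      const int64_t da = i < a.size() ? a[a.size() - 1 - i] : 1;
      const int64_t db = i < b.size() ? b[b.size() - 1 - i] : 1;
      out[ndim - 1 - i] = std::max(da, db);
    }
    return out;
  }

  // e.g. broadcast_sizes({3, 4, 1}, {4, 8}) == {3, 4, 8}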
