[ET-VK] Enable storage type and memory layout settings to be serialized with Vulkan graph

SS-JIA · SS-JIA · commit 67502f2e853f · 2024-03-20T15:23:18.000-07:00
Pull Request resolved: #2540

## Context

Allow `api::StorageType` and `api::GPUMemoryLayout` settings to be serialized with the flatbuffer. There are two entry points for this:

1. The `VkTensor` table now has two fields that can be set to select particular settings for that tensor.
2. A storage type and memory layout override can be set via the `CompileSpec` API.

ghstack-source-id: 219475440
Differential Revision: [D55154628](https://our.internmc.facebook.com/intern/diff/D55154628/)
diff --git a/backends/vulkan/partitioner/vulkan_partitioner.py b/backends/vulkan/partitioner/vulkan_partitioner.py
@@ -5,7 +5,9 @@
 # LICENSE file in the root directory of this source tree.
 
 import operator
-from typing import final, List, Optional
+from typing import Any, Dict, final, List, Optional
+
+import executorch.backends.vulkan.serialization.vulkan_graph_schema as vk_graph_schema
 
 import torch
 from executorch.backends.vulkan.vulkan_preprocess import VulkanBackend
@@ -45,11 +47,29 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
         return supported
 
 
+def parse_compile_options(
+    compile_options: Optional[Dict[str, Any]] = None
+) -> List[CompileSpec]:
+    """Convert a dict of Vulkan enum settings into a list of CompileSpec.
+
+    Values must be VkStorageType or VkMemoryLayout members; each is encoded as
+    a 4-byte little-endian integer. Any other value type raises RuntimeError.
+    """
+    enum_types = (vk_graph_schema.VkStorageType, vk_graph_schema.VkMemoryLayout)
+    options = {} if compile_options is None else compile_options
+    specs = []
+    for key, value in options.items():
+        if not isinstance(value, enum_types):
+            raise RuntimeError(f"Invalid compile option {key} with type {type(value)}")
+        specs.append(CompileSpec(key, int(value).to_bytes(4, byteorder="little")))
+
+    return specs
+
+
 @final
 class VulkanPartitioner(Partitioner):
-    def __init__(self, compile_spec: Optional[List[CompileSpec]] = None) -> None:
-        if compile_spec is None:
-            compile_spec = []
+    def __init__(self, compile_options: Optional[Dict[str, Any]] = None) -> None:
+        compile_spec = parse_compile_options(compile_options)
         self.delegation_spec = DelegationSpec(VulkanBackend.__name__, compile_spec)
 
     def partition(self, exported_program: ExportedProgram) -> PartitionResult:
diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp
@@ -22,6 +22,7 @@
 
 #include <cstdio>
 #include <cstdlib> /* strtol */
+#include <cstring>
 #include <memory>
 #include <type_traits>
 #include <vector>
@@ -72,6 +73,62 @@ api::ScalarType get_scalar_type(const vkgraph::VkDataType& vk_datatype) {
   }
 }
 
+// Maps a serialized VkStorageType to api::StorageType (UNKNOWN if unmapped).
+api::StorageType get_storage_type(
+    const vkgraph::VkStorageType& vk_storage_type) {
+  if (vk_storage_type == vkgraph::VkStorageType::BUFFER) {
+    return api::StorageType::BUFFER;
+  }
+  if (vk_storage_type == vkgraph::VkStorageType::TEXTURE_3D) {
+    return api::StorageType::TEXTURE_3D;
+  }
+  if (vk_storage_type == vkgraph::VkStorageType::TEXTURE_2D) {
+    return api::StorageType::TEXTURE_2D;
+  }
+  return api::StorageType::UNKNOWN;
+}
+
+// Maps a serialized VkMemoryLayout to api::GPUMemoryLayout, else VK_THROW.
+api::GPUMemoryLayout get_memory_layout(
+    const vkgraph::VkMemoryLayout& vk_memory_layout) {
+  if (vk_memory_layout == vkgraph::VkMemoryLayout::TENSOR_WIDTH_PACKED) {
+    return api::GPUMemoryLayout::TENSOR_WIDTH_PACKED;
+  }
+  if (vk_memory_layout == vkgraph::VkMemoryLayout::TENSOR_HEIGHT_PACKED) {
+    return api::GPUMemoryLayout::TENSOR_HEIGHT_PACKED;
+  }
+  if (vk_memory_layout == vkgraph::VkMemoryLayout::TENSOR_CHANNELS_PACKED) {
+    return api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED;
+  }
+  VK_THROW("Invalid memory layout encountered!");
+}
+
+// Builds a GraphConfig from the delegate's compile specs. Recognized keys
+// carry a 4-byte little-endian enum value; unrecognized keys are ignored.
+// NOTE(review): values are cast unchecked, so a DEFAULT_* (255) sentinel
+// would become an out-of-range override; confirm callers never send it.
+GraphConfig get_graph_config(ArrayRef<CompileSpec> compile_specs) {
+  GraphConfig config = GraphConfig();
+  for (const CompileSpec& spec : compile_specs) {
+    const bool is_storage = strcmp(spec.key, "storage_type_override") == 0;
+    const bool is_layout = strcmp(spec.key, "memory_layout_override") == 0;
+    if (!is_storage && !is_layout) {
+      continue;
+    }
+    // Both overrides share one wire format: validate and decode once.
+    ET_CHECK_MSG(
+        spec.value.nbytes == sizeof(uint32_t), "Unexpected value size!");
+    const uint32_t value = GetUInt32LE((const uint8_t*)spec.value.buffer);
+    if (is_storage) {
+      config.setStorageTypeOverride(static_cast<api::StorageType>(value));
+    } else {
+      config.setMemoryLayoutOverride(
+          static_cast<api::GPUMemoryLayout>(value));
+    }
+  }
+  return config;
+}
+
 class GraphBuilder {
   ComputeGraph* compute_graph_;
   VkGraphPtr flatbuffer_;
@@ -109,10 +166,19 @@ class GraphBuilder {
 
   void add_tensor_to_graph(const uint32_t fb_id, VkTensorPtr tensor_fb) {
     const api::ScalarType& dtype = get_scalar_type(tensor_fb->datatype());
+    api::StorageType storage_type =
+        tensor_fb->storage_type() == vkgraph::VkStorageType::DEFAULT_STORAGE
+        ? compute_graph_->suggested_storage_type()
+        : get_storage_type(tensor_fb->storage_type());
 
     UIntVector dims_fb = tensor_fb->dims();
     const std::vector<int64_t> dims_vector(dims_fb->cbegin(), dims_fb->cend());
 
+    api::GPUMemoryLayout memory_layout =
+        tensor_fb->memory_layout() == vkgraph::VkMemoryLayout::DEFAULT_LAYOUT
+        ? compute_graph_->suggested_memory_layout(dims_vector)
+        : get_memory_layout(tensor_fb->memory_layout());
+
     ValueRef ref;
     if (tensor_fb->constant_id() >= 0) {
       const uint8_t* tensor_data = getConstantDataPtr(
@@ -121,7 +187,11 @@ class GraphBuilder {
       ref = compute_graph_->add_tensorref(dims_vector, dtype, tensor_data);
     } else {
       ref = compute_graph_->add_tensor(
-          dims_vector, dtype, tensor_fb->mem_obj_id());
+          dims_vector,
+          dtype,
+          storage_type,
+          memory_layout,
+          tensor_fb->mem_obj_id());
     }
 
     ref_mapping_[fb_id] = ref;
@@ -371,11 +441,11 @@ class VulkanBackend final : public PyTorchBackendInterface {
   Result<DelegateHandle*> init(
       BackendInitContext& context,
       FreeableBuffer* processed,
-      ArrayRef<CompileSpec>) const override {
+      ArrayRef<CompileSpec> compile_specs) const override {
     ComputeGraph* compute_graph = ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(
         context.get_runtime_allocator(), ComputeGraph);
 
-    new (compute_graph) ComputeGraph(GraphConfig());
+    new (compute_graph) ComputeGraph(get_graph_config(compile_specs));
 
     Error err = compileModel(processed->data(), compute_graph);
 
diff --git a/backends/vulkan/runtime/VulkanDelegateHeader.cpp b/backends/vulkan/runtime/VulkanDelegateHeader.cpp
@@ -36,6 +36,8 @@ constexpr ByteSlice kFlatbufferSize = {14, 4};
 constexpr ByteSlice kBytesOffset = {18, 4};
 constexpr ByteSlice kBytesSize = {22, 8};
 
+} // namespace
+
 /// Interprets the 8 bytes at `data` as a little-endian uint64_t.
 uint64_t GetUInt64LE(const uint8_t* data) {
   return (uint64_t)data[0] | ((uint64_t)data[1] << 8) |
@@ -55,8 +57,6 @@ uint32_t GetUInt16LE(const uint8_t* data) {
   return (uint32_t)data[0] | ((uint32_t)data[1] << 8);
 }
 
-} // namespace
-
 bool VulkanDelegateHeader::is_valid() const {
   if (header_size < kExpectedSize) {
     return false;
diff --git a/backends/vulkan/runtime/VulkanDelegateHeader.h b/backends/vulkan/runtime/VulkanDelegateHeader.h
@@ -14,6 +14,11 @@ namespace torch {
 namespace executor {
 namespace vulkan {
 
+// Little-endian byte decoding helpers, defined in VulkanDelegateHeader.cpp.
+uint64_t GetUInt64LE(const uint8_t* data);
+uint32_t GetUInt32LE(const uint8_t* data);
+uint32_t GetUInt16LE(const uint8_t* data);
+
 struct VulkanDelegateHeader {
   bool is_valid() const;
 
diff --git a/backends/vulkan/serialization/schema.fbs b/backends/vulkan/serialization/schema.fbs
@@ -19,6 +19,26 @@ enum VkDataType : byte {
   FLOAT32 = 5,
 }
 
+// Kind of GPU resource used to represent a tensor. Entry values must match
+// the corresponding api::StorageType entries; DEFAULT_STORAGE is a sentinel
+// that lets the runtime pick its suggested storage type.
+enum VkStorageType : ubyte {
+  BUFFER = 0,
+  TEXTURE_3D = 1,
+  TEXTURE_2D = 2,
+  DEFAULT_STORAGE = 255,
+}
+
+// How a tensor's data is laid out in GPU memory (see the GPUMemoryLayout enum
+// class in PyTorch Vulkan). Entry values must match api::GPUMemoryLayout;
+// DEFAULT_LAYOUT is a sentinel that lets the runtime pick its suggested layout.
+enum VkMemoryLayout : ubyte {
+  TENSOR_WIDTH_PACKED = 0,
+  TENSOR_HEIGHT_PACKED = 1,
+  TENSOR_CHANNELS_PACKED = 2,
+  DEFAULT_LAYOUT = 255,
+}
+
 table VkTensor {
   // Type of the tensor elements.
   datatype:VkDataType;
@@ -28,6 +48,10 @@ table VkTensor {
   constant_id:int;
   // Index to the shared memory object. Negative indicates the tensor doesn't share memory.
   mem_obj_id:int;
+  // Storage type that should be used to represent this tensor
+  storage_type:VkStorageType = DEFAULT_STORAGE;
+  // Memory layout that should be used to represent this tensor
+  memory_layout:VkMemoryLayout = DEFAULT_LAYOUT;
 }
 
 table Null {}
@@ -103,6 +127,17 @@ table VkGraph {
   // Raw Objects (e.g. weight tensors and custom shaders)
   constants:[VkBytes];
   shaders:[VkBytes];
+
+  // Graph configuration
+  // As per flatbuffer BC/FC policy, new fields can be freely added to this
+  // section. It is recommended to provide default values, since older blobs
+  // without the field will be deserialized with the default value.
+
+  // Sets an override for the storage type and memory layout that will be used
+  // to represent a VkTensor if the VkTensor is not serialized with a particular
+  // storage type or memory layout setting
+  storage_type_override:VkStorageType = DEFAULT_STORAGE;
+  memory_layout_override:VkMemoryLayout = DEFAULT_LAYOUT;
 }
 
 root_type VkGraph;
diff --git a/backends/vulkan/serialization/vulkan_graph_schema.py b/backends/vulkan/serialization/vulkan_graph_schema.py
@@ -30,12 +30,28 @@ class VkDataType(IntEnum):
     FLOAT32 = 5
 
 
+class VkStorageType(IntEnum):  # mirrors `enum VkStorageType` in schema.fbs
+    BUFFER = 0
+    TEXTURE_3D = 1
+    TEXTURE_2D = 2
+    DEFAULT_STORAGE = 255  # sentinel: runtime uses its suggested storage type
+
+
+class VkMemoryLayout(IntEnum):  # mirrors `enum VkMemoryLayout` in schema.fbs
+    TENSOR_WIDTH_PACKED = 0
+    TENSOR_HEIGHT_PACKED = 1
+    TENSOR_CHANNELS_PACKED = 2
+    DEFAULT_LAYOUT = 255  # sentinel: runtime uses its suggested memory layout
+
+
 @dataclass
 class VkTensor:
     datatype: VkDataType
     dims: List[int]
     constant_id: int
     mem_obj_id: int
+    storage_type: VkStorageType = VkStorageType.DEFAULT_STORAGE
+    memory_layout: VkMemoryLayout = VkMemoryLayout.DEFAULT_LAYOUT
 
 
 @dataclass
@@ -120,3 +136,6 @@ class VkGraph:
 
     constants: List[VkBytes]
     shaders: List[VkBytes]
+
+    storage_type_override: VkStorageType = VkStorageType.DEFAULT_STORAGE
+    memory_layout_override: VkMemoryLayout = VkMemoryLayout.DEFAULT_LAYOUT
diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py
@@ -8,6 +8,8 @@
 import unittest
 from typing import Tuple
 
+import executorch.backends.vulkan.serialization.vulkan_graph_schema as vk_graph_schema
+
 import torch
 
 from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
@@ -56,46 +58,66 @@ def lower_module_and_test_output(
         rtol=1e-01,
         dynamic_shapes=None,
         test_inputs=None,
+        memory_layouts=None,
     ):
         """
         Helper testing function that takes a torch.nn.Module and lowers it to Vulkan with
         the given sample inputs. It then runs the lowered module and compares its
         outputs with the outputs of the eager module.
         """
-        program: ExportedProgram = export(
-            model, sample_inputs, dynamic_shapes=dynamic_shapes
-        )
-        edge_program: EdgeProgramManager = to_edge(program)
-        edge_program = edge_program.to_backend(VulkanPartitioner())
 
-        executorch_program = edge_program.to_executorch()
+        def run_test(memory_layout):
+            compile_options = {
+                "memory_layout_override": memory_layout,
+            }
+            program: ExportedProgram = export(
+                model, sample_inputs, dynamic_shapes=dynamic_shapes
+            )
+            edge_program: EdgeProgramManager = to_edge(program)
 
-        self.assertEqual(
-            executorch_program.executorch_program.execution_plan[0].delegates[0].id,
-            VulkanBackend.__name__,
-        )
+            edge_program = edge_program.to_backend(VulkanPartitioner(compile_options))
 
-        executorch_module = _load_for_executorch_from_buffer(executorch_program.buffer)
-        # pyre-fixme[16]: Module `pytree` has no attribute `tree_flatten`.
-        inputs_flattened, _ = tree_flatten(sample_inputs)
+            executorch_program = edge_program.to_executorch()
 
-        model_output = executorch_module.run_method("forward", tuple(inputs_flattened))
-        ref_output = model(*sample_inputs)
+            self.assertEqual(
+                executorch_program.executorch_program.execution_plan[0].delegates[0].id,
+                VulkanBackend.__name__,
+            )
 
-        self.assert_outputs_equal(model_output, ref_output, atol=atol, rtol=rtol)
+            executorch_module = _load_for_executorch_from_buffer(
+                executorch_program.buffer
+            )
+            inputs_flattened, _ = tree_flatten(sample_inputs)
 
-        if test_inputs is not None:
-            for test_input in test_inputs:
-                # pyre-fixme[16]: Module `pytree` has no attribute `tree_flatten`.
-                test_inputs_flattened, _ = tree_flatten(test_input)
-                model_output = executorch_module.run_method(
-                    "forward", tuple(test_inputs_flattened)
-                )
-                ref_output = model(*test_input)
+            model_output = executorch_module.run_method(
+                "forward", tuple(inputs_flattened)
+            )
+            ref_output = model(*sample_inputs)
 
-                self.assert_outputs_equal(
-                    model_output, ref_output, atol=atol, rtol=rtol
-                )
+            self.assert_outputs_equal(model_output, ref_output, atol=atol, rtol=rtol)
+
+            if test_inputs is not None:
+                for test_input in test_inputs:
+                    test_inputs_flattened, _ = tree_flatten(test_input)
+                    model_output = executorch_module.run_method(
+                        "forward", tuple(test_inputs_flattened)
+                    )
+                    ref_output = model(*test_input)
+
+                    self.assert_outputs_equal(
+                        model_output, ref_output, atol=atol, rtol=rtol
+                    )
+
+        memory_layouts_to_test = [
+            vk_graph_schema.VkMemoryLayout.TENSOR_WIDTH_PACKED,
+            vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED,
+        ]
+
+        if memory_layouts is not None:
+            memory_layouts_to_test = memory_layouts
+
+        for memory_layout in memory_layouts_to_test:
+            run_test(memory_layout)
 
     def test_vulkan_backend_add(self):
         # This test is the simplest test by manually lowering some submodules, we can use paritioner for auto detecting lowerable parts