
Commit 3483891 (2 parents: c3b8eeb + a73274f)

Update on "Factor out eager eval from eval_llama_lib"

Would like to re-use EagerEvalWrapper and the eval function for quantization calibration.

Differential Revision: [D57881028](https://our.internmc.facebook.com/intern/diff/D57881028/)

File tree: 11 files changed, +286 -14 lines


backends/vulkan/partitioner/supported_ops.py

Lines changed: 1 addition & 0 deletions
@@ -112,6 +112,7 @@ def __contains__(self, op):
 ]
 
 CREATION_OPS = [
+    exir_ops.edge.aten.arange.start_step,
     exir_ops.edge.aten.clone.default,
     exir_ops.edge.aten.full.default,
 ]
backends/vulkan/runtime/graph/ops/glsl/arange.glsl (new file)

Lines changed: 39 additions & 0 deletions

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#version 450 core

#define PRECISION ${PRECISION}

#define VEC4_T ${texel_type(DTYPE)}

layout(std430) buffer;

#include "indexing_utils.h"

${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
${layout_declare_ubo(1, "ivec4", "sizes")}
${layout_declare_ubo(2, "float", "start")}
${layout_declare_ubo(3, "float", "step")}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

layout(constant_id = 3) const int packed_dim = C_DIM;

void main() {
  const ivec3 pos = ivec3(gl_GlobalInvocationID);
  const ivec4 idx = to_tensor_idx(pos, sizes, packed_dim);

  if (pos_out_of_bounds(pos, sizes, packed_dim)) {
    return;
  }

  VEC4_T outtex = VEC4_T(start + pos.x * step, 0, 0, 0);

  imageStore(t_out, pos, outtex);
}
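
The shader fills the x component of each texel with start + pos.x * step, and the resize logic in Arange.cpp below sizes the output as div_up(end - start, step), so together they should reproduce eager torch.arange. A minimal Python reference of that arithmetic (a sketch for intuition, not the runtime code):

import math

def arange_reference(start, end, step):
    # Output length mirrors resize_arange_node: div_up(end - start, step).
    numel = math.ceil((end - start) / step)
    # Each output position x holds start + x * step, which is what the
    # shader computes from gl_GlobalInvocationID.x.
    return [start + x * step for x in range(numel)]

assert arange_reference(3, 15, 3) == [3, 6, 9, 12]       # matches torch.arange(3, 15, 3)
assert arange_reference(13, 1, -1)[:3] == [13, 12, 11]   # negative step also works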
backends/vulkan/runtime/graph/ops/glsl/arange.yaml (new file)

Lines changed: 19 additions & 0 deletions

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

arange:
  parameter_names_with_default_values:
    NDIM: 3
    DTYPE: int
    STORAGE: texture3d
    PACKING: C_packed
  generate_variant_forall:
    DTYPE:
      - VALUE: half
      - VALUE: float
      - VALUE: int
  shader_variants:
    - NAME: arange
backends/vulkan/runtime/graph/ops/glsl/full.glsl

Lines changed: 0 additions & 1 deletion
@@ -12,7 +12,6 @@
 
 #define VEC4_T ${texel_type(DTYPE)}
 
-#include "broadcasting_utils.h"
 #include "indexing_utils.h"
 
 layout(std430) buffer;
backends/vulkan/runtime/graph/ops/impl/Arange.cpp (new file)

Lines changed: 121 additions & 0 deletions

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/vulkan/runtime/api/Utils.h>

#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>

#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>

namespace vkcompute {

void resize_arange_node(
    ComputeGraph* graph,
    const std::vector<ArgGroup>& args,
    const std::vector<ValueRef>& extra_args) {
  vTensorPtr out = graph->get_tensor(args[0].refs[0]);

  int start_val = 0;
  int step_val = 1;
  if (!graph->val_is_none(extra_args[0])) {
    start_val = graph->extract_scalar<int64_t>(extra_args[0]);
  }
  int end_val = graph->extract_scalar<int64_t>(extra_args[1]);
  if (!graph->val_is_none(extra_args[2])) {
    step_val = graph->extract_scalar<int64_t>(extra_args[2]);
  }

  std::vector<int64_t> out_sizes = {
      api::utils::div_up(end_val - start_val, step_val)};

  out->virtual_resize(out_sizes);
}

void check_arange_input(
    ComputeGraph& graph,
    const ValueRef start,
    const ValueRef end,
    const ValueRef step) {
  if (!graph.val_is_none(start) && !graph.val_is_int(start)) {
    VK_THROW("arange: start must be int!");
  }
  if (!graph.val_is_none(end) && !graph.val_is_int(end)) {
    VK_THROW("arange: end must be int!");
  }
  if (!graph.val_is_none(step) && !graph.val_is_int(step)) {
    VK_THROW("arange: step must be int!");
  }
}

void add_arange_node(
    ComputeGraph& graph,
    const ValueRef start,
    const ValueRef end,
    const ValueRef step,
    const ValueRef out) {
  float start_val = 0.0f;
  float step_val = 1.0f;

  if (graph.val_is_none(end)) {
    VK_THROW("arange: end must be specified!");
  }

  if (!graph.val_is_none(start)) {
    if (graph.val_is_int(start)) {
      start_val = static_cast<float>(graph.extract_scalar<int64_t>(start));
    } else {
      start_val = graph.extract_scalar<float>(start);
    }
  }
  if (!graph.val_is_none(step)) {
    if (graph.val_is_int(step)) {
      step_val = static_cast<float>(graph.extract_scalar<int64_t>(step));
    } else {
      step_val = graph.extract_scalar<float>(step);
    }
  }

  vTensorPtr t_out = graph.get_tensor(out);

  api::utils::uvec3 global_size = t_out->image_extents();
  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);

  std::string kernel_name("arange");
  kernel_name.reserve(kShaderNameReserve);

  add_dtype_suffix(kernel_name, *t_out);

  graph.execute_nodes().emplace_back(new ExecuteNode(
      graph,
      VK_KERNEL_FROM_STR(kernel_name),
      global_size,
      local_size,
      // Inputs and Outputs
      {{out, api::MemoryAccessType::WRITE}},
      // Shader params buffers
      {t_out->sizes_ubo(),
       graph.create_params_buffer(start_val),
       graph.create_params_buffer(step_val)},
      // Specialization Constants
      {},
      // Resizing Logic
      resize_arange_node,
      {start, end, step}));
}

void arange(ComputeGraph& graph, const std::vector<ValueRef>& args) {
  return add_arange_node(graph, args[0], args[1], args[2], args[7]);
}

REGISTER_OPERATORS {
  VK_REGISTER_OP(aten.arange.start_step, arange);
}

} // namespace vkcompute
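
One note on the registration: in arange(), the indices into args follow the aten.arange.start_step schema (start, end, step, plus the optional dtype, layout, device, and pin_memory kwargs), with the graph's output ValueRef appended last, which is presumably why out is read from args[7]. Sketched as a Python mapping (an inference from the schema, not something shown in this diff):

# aten.arange.start_step(start, end, step, *, dtype, layout, device, pin_memory)
ARG_INDEX = {
    "start": 0,
    "end": 1,
    "step": 2,
    "dtype": 3,       # optional keyword args occupy indices 3..6
    "layout": 4,
    "device": 5,
    "pin_memory": 6,
    "out": 7,         # output ValueRef appended after the schema args
}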

backends/vulkan/runtime/graph/ops/impl/Upsample.cpp

Lines changed: 20 additions & 7 deletions
@@ -16,10 +16,10 @@
 
 namespace vkcompute {
 
-// Executorch-Vulkan framework to add node
+// ExecuTorch-Vulkan framework to add node
 // Args:
 //   in: will be converted from NCHW input tensor to 3D ARGB representation in
-//   openGL (via Executorch) output_sizes: optional 2D array of targetting
+//   openGL (via ExecuTorch) output_sizes: optional 2D array of targeting
 //   output size of H and W dimensions. >= input sizes;
 
 //   will be computed if only given the scale_factors.
@@ -31,7 +31,6 @@ void add_upsample_nearest2d_node(
     const ValueRef output_sizes,
     const ValueRef scale_factors,
     const ValueRef out) {
-  // TODO(T190297757) add supports for output_sizes
   if (graph.val_is_none(output_sizes) && graph.val_is_none(scale_factors)) {
     VK_THROW(
         "Invalid input, must provide either output_sizes or scale_factors");
@@ -40,7 +39,6 @@ void add_upsample_nearest2d_node(
     VK_THROW(
         "Invalid input, must provide ONLY one of output_sizes or scale_factors");
   }
-  auto scales = graph.get_double_list(scale_factors);
 
   ValueRef arg_in = prepack_if_tensor_ref(graph, in);
 
@@ -50,10 +48,25 @@ void add_upsample_nearest2d_node(
   api::utils::ivec2 input_size = {
       api::utils::safe_downcast<int32_t>(input_sizes.data[0]),
       api::utils::safe_downcast<int32_t>(input_sizes.data[1])};
-  // Reverse scale factors that pre-computed before GLSL.
   api::utils::vec2 rev_scales = {
-      api::utils::safe_downcast<float>(1.0 / scales->at(1)),
-      api::utils::safe_downcast<float>(1.0 / scales->at(0))};
+      api::utils::safe_downcast<float>(1.0),
+      api::utils::safe_downcast<float>(1.0)};
+
+  // Reverse scale factors, pre-computed before the GLSL shader runs.
+  if (!graph.val_is_none(output_sizes)) {
+    auto output_size_ref = graph.get_int_list(output_sizes);
+    rev_scales = {
+        api::utils::safe_downcast<float>(
+            (float)input_size.data[0] / output_size_ref->at(1)),
+        api::utils::safe_downcast<float>(
+            (float)input_size.data[1] / output_size_ref->at(0))};
+
+  } else {
+    auto scales = graph.get_double_list(scale_factors);
+    rev_scales = {
+        api::utils::safe_downcast<float>(1.0 / scales->at(1)),
+        api::utils::safe_downcast<float>(1.0 / scales->at(0))};
+  }
 
   vTensorPtr t_out = graph.get_tensor(out);
   api::utils::uvec3 global_size = t_out->image_extents();
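
The new branch derives the reverse scales either from explicit output sizes or from the given scale factors; the shader presumably multiplies output coordinates by these factors to locate the source texel. A minimal Python sketch of the same arithmetic, assuming output_sizes is ordered [H, W] and scale_factors is [scale_h, scale_w] as the test cases suggest:

def reverse_scales(input_hw, output_hw=None, scale_factors=None):
    # Mirrors the rev_scales computation above (W factor first, H second).
    if output_hw is not None:
        in_h, in_w = input_hw
        out_h, out_w = output_hw
        return (in_w / out_w, in_h / out_h)
    scale_h, scale_w = scale_factors
    return (1.0 / scale_w, 1.0 / scale_h)

assert reverse_scales((2, 2), output_hw=(2, 4)) == (0.5, 1.0)
assert reverse_scales((2, 2), scale_factors=[2, 4]) == (0.25, 0.5)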

backends/vulkan/test/op_tests/cases.py

Lines changed: 26 additions & 1 deletion
@@ -249,7 +249,9 @@ def get_upsample_inputs():
             ((1, 1, 2, 2), None, [2, 2]),
             ((1, 1, 2, 2), None, [2, 4]),
             ((1, 1, 2, 2), None, [4, 2]),
-            # TODO(T190297757) add supports for output_sizes
+            ((1, 1, 2, 2), [2, 2], None),
+            ((1, 1, 2, 2), [2, 4], None),
+            ((1, 1, 2, 2), [3, 2], None),
         ]
     )
     return test_suite
@@ -817,6 +819,28 @@ def get_gelu_inputs():
     return test_suite
 
 
+def get_arange_inputs():
+    test_suite = VkTestSuite(
+        [
+            (1, 13),
+            (1.0, 11),
+            (-13, 3),
+            (-11.0, 2),
+            (3, 15, 3),
+            (3, 23, 2),
+            (3, 23.0, 4),
+            (13, 1, -1),
+            (-3, -13, -2),
+            (13, -2.0, -4),
+        ],
+    )
+
+    test_suite.layouts = [
+        "api::kChannelsPacked",
+    ]
+    return test_suite
+
+
 test_suites = {
     "aten.add.Tensor": get_binary_elementwise_inputs(),
     "aten.sub.Tensor": get_binary_elementwise_inputs(),
@@ -855,4 +879,5 @@ def get_gelu_inputs():
     "aten.sin.default": get_unary_ops_inputs(),
     "aten.neg.default": get_unary_ops_inputs(),
     "aten.cos.default": get_unary_ops_inputs(),
+    "aten.arange.start_step": get_arange_inputs(),
 }
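
Each tuple in get_arange_inputs is splatted into aten.arange.start_step as (start, end[, step]). For reference, the eager equivalents of a few of the new cases:

import torch

torch.arange(3, 15, 3)   # tensor([ 3,  6,  9, 12])
torch.arange(13, 1, -1)  # 12 elements counting down: tensor([13, 12, ..., 2])
torch.arange(-11.0, 2)   # float start promotes the output to a float dtype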

backends/vulkan/test/op_tests/utils/codegen_base.py

Lines changed: 3 additions & 3 deletions
@@ -146,10 +146,10 @@ def create_input_data(self, arg: Argument, data: Any) -> str:  # noqa: C901
 
         if cpp_type == AT_INT_ARRAY_REF:
             ret_str = f"std::vector<int64_t> {arg.name} = "
-        elif (
-            cpp_type == OPT_AT_DOUBLE_ARRAY_REF or cpp_type == OPT_AT_INT_ARRAY_REF
-        ) and str(data) != "None":
+        elif cpp_type == OPT_AT_DOUBLE_ARRAY_REF and str(data) != "None":
             ret_str = f"std::vector<double> {arg.name} = "
+        elif cpp_type == OPT_AT_INT_ARRAY_REF and str(data) != "None":
+            ret_str = f"std::vector<int64_t> {arg.name} = "
         else:
             ret_str = f"{cpp_type} {arg.name} = "
backends/vulkan/test/test_vulkan_delegate.py

Lines changed: 51 additions & 0 deletions
@@ -1308,3 +1308,54 @@ def forward(self, x):
             sample_inputs,
             memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED],
         )
+
+    def test_vulkan_backend_arange_int(self):
+        class ArangeModule(torch.nn.Module):
+            def __init__(self, input):
+                super().__init__()
+                self.input = input
+
+            def forward(self, x):
+                return torch.arange(*self.input, dtype=torch.int32)
+
+        # `torch.arange` can take one, two, or three arguments.
+        # With one argument, it is interpreted as `end`.
+        # With two arguments, they are interpreted as `start` and `end`.
+        # With three, they are interpreted as `start`, `end`, and `step`.
+        inputs = [
+            [1],
+            [-3, 5],
+            [1, 11, 2],
+            [12, 1, -2],
+        ]
+        for input in inputs:
+            self.lower_module_and_test_output(
+                ArangeModule(input),
+                (torch.randn(size=(1,), dtype=torch.float32),),  # dummy input
+                memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED],
+            )
+
+    def test_vulkan_backend_arange_float(self):
+        class ArangeModule(torch.nn.Module):
+            def __init__(self, input):
+                super().__init__()
+                self.input = input
+
+            def forward(self, x):
+                return torch.arange(*self.input)
+
+        inputs = [
+            [1.5],
+            [-3, 5.0],
+            [1.0, 11, 2],
+            [12, 1, -2.0],
+        ]
+        for input in inputs:
+            self.lower_module_and_test_output(
+                ArangeModule(input),
+                (torch.randn(size=(1,), dtype=torch.float32),),  # dummy input
+                memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED],
+            )
