
Commit 1b5184d

Update base for Update on "Remove llama related stuff out of bpe_tokenizer"
We don't need to initialize `vocab_`, `vocab_scores_`, etc. They will be initialized anyway while loading the tokenizer binary. A benefit of removing them is that we can drop these llama-related default values and make `bpe_tokenizer` agnostic to models.

Differential Revision: [D59664556](https://our.internmc.facebook.com/intern/diff/D59664556/)

[ghstack-poisoned]
2 parents 165c38a + f9efb05 commit 1b5184d

File tree: 16 files changed (+203, -20 lines)

backends/transforms/TARGETS

Lines changed: 1 addition & 0 deletions
@@ -120,6 +120,7 @@ runtime.python_library(
         "//executorch/backends/...",
         "//executorch/examples/...",
         "//executorch/extension/llm/...",
+        "@EXECUTORCH_CLIENTS",
     ],
     deps = [
         "//caffe2:torch",

backends/vulkan/partitioner/supported_ops.py

Lines changed: 1 addition & 0 deletions
@@ -45,6 +45,7 @@ def __contains__(self, op):
 BINARY_OPS = [
     exir_ops.edge.aten.add.Tensor,
     exir_ops.edge.aten.sub.Tensor,
+    exir_ops.edge.aten.minimum.default,
     exir_ops.edge.aten.mul.Tensor,
     exir_ops.edge.aten.div.Tensor,
     exir_ops.edge.aten.div.Tensor_mode,

backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml

Lines changed: 2 additions & 0 deletions
@@ -28,3 +28,5 @@ binary_op:
     OPERATOR: pow(X, Y)
   - NAME: binary_floor_divide
     OPERATOR: floor(X / Y)
+  - NAME: binary_minimum
+    OPERATOR: min(X, Y)

backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp

Lines changed: 2 additions & 0 deletions
@@ -118,6 +118,7 @@ DEFINE_BINARY_OP_WITH_ALPHA_FN(floor_divide);
 DEFINE_BINARY_OP_FN(mul);
 DEFINE_BINARY_OP_FN(div);
 DEFINE_BINARY_OP_FN(pow);
+DEFINE_BINARY_OP_FN(minimum);
 
 REGISTER_OPERATORS {
   VK_REGISTER_OP(aten.add.Tensor, add);
@@ -126,6 +127,7 @@ REGISTER_OPERATORS {
   VK_REGISTER_OP(aten.div.Tensor, div);
   VK_REGISTER_OP(aten.div.Tensor_mode, floor_divide);
   VK_REGISTER_OP(aten.pow.Tensor_Tensor, pow);
+  VK_REGISTER_OP(aten.minimum.default, minimum);
 }
 
 } // namespace vkcompute

backends/vulkan/test/op_tests/cases.py

Lines changed: 16 additions & 0 deletions
@@ -1022,3 +1022,19 @@ def get_constant_pad_nd_inputs():
         ]
     )
     return test_suite
+
+
+@register_test_suite("aten.minimum.default")
+def get_minimum_inputs():
+    test_suite = VkTestSuite(
+        [
+            ((M1, M2), (M2)),
+            ((M1, M2), (M1, M2)),
+            ((M1, M2, M), (M2, M)),
+            ((M1, M1, S1, S2), (M1, M1, S1, S2)),
+            ((S1, S1, S2, S), (S1, S2, S)),
+            ((M1, S1, S2), (L, M1, S1, S2)),
+            ((S1, S2), (L, M1, S1, S2)),
+        ]
+    )
+    return test_suite

backends/vulkan/test/test_vulkan_delegate.py

Lines changed: 19 additions & 0 deletions
@@ -1072,6 +1072,25 @@ def forward(self, x):
             memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED],
         )
 
+    def test_vulkan_backend_minimum(self):
+        class MinimumModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x, y):
+                return torch.minimum(x, y)
+
+        sample_inputs = (
+            torch.rand(size=(3, 5, 6, 4), dtype=torch.float32),
+            torch.rand(size=(6, 4), dtype=torch.float32),
+        )
+
+        self.lower_module_and_test_output(
+            MinimumModule(),
+            sample_inputs,
+            memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED],
+        )
+
     def test_vulkan_backend_reshape(self):
         class ReshapeModule(torch.nn.Module):
             def __init__(self):

backends/xnnpack/partition/xnnpack_partitioner.py

Lines changed: 3 additions & 1 deletion
@@ -869,7 +869,9 @@ def __init__(
         self.quant = quant
 
         # TODO(T174256335) - remove this once we have a better way to handle >2d Mask
-        self._lower_recomposed_sdpa: bool = _lower_recomposed_sdpa or True
+        self._lower_recomposed_sdpa: bool = (
+            _lower_recomposed_sdpa if _lower_recomposed_sdpa is not None else True
+        )
 
         self.delegation_spec = DelegationSpec(XnnpackBackend.__name__, [])
         self.partition_tags: Dict[str, DelegationSpec] = {}
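The replaced line was effectively dead logic: `_lower_recomposed_sdpa or True` only keeps the left operand when it is truthy, so both the default `None` and an explicit `False` collapsed to `True`. A minimal sketch of the difference, using hypothetical helper names rather than the real partitioner API:

def resolve_flag_buggy(flag=None):
    # `flag or True` keeps `flag` only when it is truthy, so an explicit
    # False from the caller is silently replaced by True.
    return flag or True


def resolve_flag_fixed(flag=None):
    # Substitute the default only when the caller passed nothing at all.
    return flag if flag is not None else True


assert resolve_flag_buggy(False) is True   # caller's False is ignored
assert resolve_flag_fixed(False) is False  # caller's False is respected
assert resolve_flag_fixed(None) is True    # default still applies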

examples/models/llama2/tokenizer/llama_tiktoken.cpp

Lines changed: 8 additions & 0 deletions
@@ -97,5 +97,13 @@ const Encoder LlamaTiktoken::get_special_tokens(ssize_t num_base_tokens) const {
       return _get_default_special_tokens(num_base_tokens);
   }
 }
+
+const std::string LlamaTiktoken::get_bos_token() const {
+  return "<|begin_of_text|>";
+}
+
+const std::string LlamaTiktoken::get_eos_token() const {
+  return "<|end_of_text|>";
+}
 } // namespace executor
 } // namespace torch

examples/models/llama2/tokenizer/llama_tiktoken.h

Lines changed: 2 additions & 0 deletions
@@ -26,6 +26,8 @@ class LlamaTiktoken : public Tiktoken {
 
  protected:
   const Encoder get_special_tokens(ssize_t num_base_tokens) const override;
+  const std::string get_bos_token() const override;
+  const std::string get_eos_token() const override;
 
  private:
   const Version _version;

examples/models/llama2/tokenizer/tiktoken.cpp

Lines changed: 2 additions & 2 deletions
@@ -346,8 +346,8 @@ Error Tiktoken::load(const std::string& path) {
 
   // initialize vocab_size, bos_tok, eos_tok
   vocab_size_ = _encoder.size() + _special_token_encoder.size();
-  bos_tok_ = _special_token_encoder.at("<|begin_of_text|>");
-  eos_tok_ = _special_token_encoder.at("<|end_of_text|>");
+  bos_tok_ = _special_token_encoder.at(get_bos_token());
+  eos_tok_ = _special_token_encoder.at(get_eos_token());
 
   initialized_ = true;
   return Error::Ok;

examples/models/llama2/tokenizer/tiktoken.h

Lines changed: 4 additions & 0 deletions
@@ -39,6 +39,10 @@ class Tiktoken : public Tokenizer {
  protected:
   // Provide model specific special tokens.
   virtual const Encoder get_special_tokens(ssize_t num_base_tokens) const = 0;
+  // Provide beginning of sentence token.
+  virtual const std::string get_bos_token() const = 0;
+  // Provide end of sentence token.
+  virtual const std::string get_eos_token() const = 0;
 
  private:
   template <typename T>
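Taken together, the four tokenizer files move the hard-coded llama token strings out of the base `Tiktoken::load()` and behind virtual hooks that `LlamaTiktoken` overrides. A rough Python sketch of the same template-method pattern (hypothetical class and method names, not the actual C++ API):

from abc import ABC, abstractmethod


class Tokenizer(ABC):
    """Base loader: knows nothing about any particular model's tokens."""

    def load(self, special_token_encoder: dict) -> None:
        # Resolve model-specific tokens through the subclass hooks instead
        # of hard-coding llama strings in the base class.
        self.bos_tok = special_token_encoder[self.get_bos_token()]
        self.eos_tok = special_token_encoder[self.get_eos_token()]

    @abstractmethod
    def get_bos_token(self) -> str: ...

    @abstractmethod
    def get_eos_token(self) -> str: ...


class LlamaTokenizer(Tokenizer):
    def get_bos_token(self) -> str:
        return "<|begin_of_text|>"

    def get_eos_token(self) -> str:
        return "<|end_of_text|>"


# Placeholder token ids, only to show the lookup going through the hooks.
tok = LlamaTokenizer()
tok.load({"<|begin_of_text|>": 1, "<|end_of_text|>": 2})
assert tok.bos_tok == 1 and tok.eos_tok == 2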

exir/memory_planning.py

Lines changed: 3 additions & 0 deletions
@@ -219,6 +219,9 @@ def verify_graph_input_output(self) -> None:
             if _is_mutable_buffer(nd, self.graph_signature):
                 continue
             assert len(specs) > 0, "Expect tensor specs"
+            specs = list(filter(lambda spec: not spec.const, specs))
+            if len(specs) == 0:
+                continue
             allocated = any(
                 spec is None or spec.mem_offset is not None for spec in specs
             )
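The new lines drop constant tensor specs before the allocation check, presumably because constants are not given memory offsets by the planner; an output made up entirely of constants then has nothing left to verify. A toy illustration of the filter with a stand-in spec type (not the real exir `TensorSpec`):

from dataclasses import dataclass
from typing import Optional


@dataclass
class Spec:                      # stand-in for exir's TensorSpec
    const: bool
    mem_offset: Optional[int] = None


specs = [Spec(const=True), Spec(const=False, mem_offset=64)]

# Constants are excluded first; only the remaining specs must carry offsets.
specs = list(filter(lambda spec: not spec.const, specs))
if specs:
    assert all(spec.mem_offset is not None for spec in specs)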

exir/serde/export_serialize.py

Lines changed: 6 additions & 0 deletions
@@ -1058,12 +1058,18 @@ def serialize(self, exported_program: ep.ExportedProgram) -> SerializedArtifact:
             assert n not in constants
             constants[n] = t
 
+        additional_kwargs = {}
+        if hasattr(exported_program, "verifiers"):
+            additional_kwargs["verifiers"] = [
+                v.dialect for v in exported_program.verifiers
+            ]
         serialized_ep = ExportedProgram(
             graph_module=serialized_graph_module,
             opset_version=self.opset_version,
             range_constraints=serialized_range_constraints,
             schema_version=SchemaVersion(-1, -1),
             dialect=exported_program.dialect,
+            **additional_kwargs,
         )
 
         return SerializedArtifact(

exir/serde/serialize.py

Lines changed: 6 additions & 0 deletions
@@ -344,13 +344,19 @@ def serialize(
             assert n not in constants
             constants[n] = t
 
+        additional_kwargs = {}
+        if hasattr(exported_program, "verifiers"):
+            additional_kwargs["verifiers"] = [
+                v.dialect for v in exported_program.verifiers
+            ]
         return export_serialize.SerializedArtifact(
             schema.ExportedProgram(
                 graph_module=serialized_graph_module,
                 opset_version=self.opset_version,
                 range_constraints=serialized_range_constraints,
                 schema_version=SchemaVersion(-1, -1),
                 dialect=exported_program.dialect,
+                **additional_kwargs,
             ),
             export_serialize.serialize_torch_artifact(exported_program.state_dict),
             export_serialize.serialize_torch_artifact(constants),
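Both serializers guard the new field the same way, so the code still runs against torch releases whose `ExportedProgram` has no `verifiers` attribute: the optional value is gathered into a dict and splatted into the constructor only when present. A self-contained sketch of the pattern with made-up record and program classes:

from dataclasses import dataclass
from typing import Optional


@dataclass
class Record:
    dialect: str
    verifiers: Optional[list] = None  # newer, optional field


class OldProgram:                     # mimics an older exported program
    dialect = "EDGE"


class NewProgram(OldProgram):         # mimics a newer one carrying verifiers
    class _Verifier:
        dialect = "EDGE"

    verifiers = [_Verifier()]


def serialize(program) -> Record:
    additional_kwargs = {}
    if hasattr(program, "verifiers"):
        additional_kwargs["verifiers"] = [v.dialect for v in program.verifiers]
    # The keyword is only passed when the attribute exists, so the same call
    # site works for both old and new program objects.
    return Record(dialect=program.dialect, **additional_kwargs)


assert serialize(OldProgram()).verifiers is None
assert serialize(NewProgram()).verifiers == ["EDGE"]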

kernels/portable/cpu/op_convolution.cpp

Lines changed: 15 additions & 16 deletions
@@ -136,17 +136,6 @@ void conv2d_impl(
       }
     }
   } else { // transposed convolution
-    if (bias_ptr != nullptr) {
-      out_coord[2] = 0;
-      out_coord[3] = 0;
-      size_t out_c_start_idx =
-          calculate_linear_index(out_coord, out_strides.data(), 4);
-      size_t out_c_end_idx = out_c_start_idx + out_H * out_W;
-      for (size_t out_ix = out_c_start_idx; out_ix < out_c_end_idx; out_ix++) {
-        out_ptr[out_ix] = convert<CTYPE, CTYPE_BIAS>(bias_ptr[out_c]);
-      }
-    }
-
     w_coord[1] = out_c - out_c_start;
 
     for (size_t in_y = 0; in_y < in_H; ++in_y) {
@@ -295,12 +284,22 @@ void convolution_wrapper(
       bias.has_value() ? bias.value().const_data_ptr<CTYPE_BIAS>() : nullptr;
 
   size_t out_N = out.size(0);
-  size_t out_C_per_group = out.size(1) / groups;
+  size_t out_C = out.size(1);
+  size_t out_C_per_group = out_C / groups;
 
-  if (transposed && bias_ptr == nullptr) {
-    // If bias is not present, we need to initialize the output to 0
-    // before we can accumulate into it.
-    memset(out_ptr, 0, out.nbytes());
+  if (transposed) {
+    // For transposed convolution, we need to initialize the output before we
+    // can accumulate into it.
+    if (bias_ptr == nullptr) {
+      // If bias is not present, we need to initialize the output to 0
+      memset(out_ptr, 0, out.nbytes());
+    } else {
+      // If bias is present, we initialize the output to the bias value
+      for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) {
+        out_ptr[out_ix] = convert<CTYPE, CTYPE_BIAS>(
+            bias_ptr[(out_ix / out_strides[1]) % out_C]);
+      }
+    }
   }
 
   for (size_t batch = 0; batch < out_N; ++batch) {
kernels/test/op_convolution_test.cpp

Lines changed: 113 additions & 1 deletion
@@ -556,7 +556,7 @@ TEST_F(OpConvCorrectnessTest, TransposedNonDefaultParams) {
   Tensor input = tf.full({2, 6, 4, 5}, 2.0);
   Tensor weight = tf.full({6, 1, 2, 2}, 0.5);
   Tensor bias = tf.make({3}, {1, 2, 3});
-  Tensor out = tf.zeros({2, 3, 3, 6});
+  Tensor out = tf.full({2, 3, 3, 6}, 0.7);
   Tensor expected = tf.make(
       {2, 3, 3, 6},
       {1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 3, 3, 1, 3, 3, 1, 3, 3, 2, 2, 2, 2,
@@ -587,6 +587,118 @@ TEST_F(OpConvCorrectnessTest, TransposedNonDefaultParams) {
   EXPECT_TENSOR_CLOSE(out, expected);
 }
 
+template <typename T>
+std::vector<T> get_channels_last_data(const Tensor& t) {
+  const std::vector<int32_t> sizes(t.sizes().begin(), t.sizes().end());
+  std::vector<T> contiguous_data(
+      t.const_data_ptr<T>(), t.const_data_ptr<T>() + t.numel());
+  std::vector<T> channels_last_data(t.numel());
+  int32_t N = sizes[0];
+  int32_t C = sizes[1];
+  int32_t H = sizes[2];
+  int32_t W = sizes[3];
+  for (int32_t n = 0; n < N; ++n) {
+    for (int32_t c = 0; c < C; ++c) {
+      for (int32_t h = 0; h < H; ++h) {
+        for (int32_t w = 0; w < W; ++w) {
+          // Calculate the index in the original blob
+          int32_t old_index = ((n * C + c) * H + h) * W + w;
+          // Calculate the index in the new blob
+          int32_t new_index = ((n * H + h) * W + w) * C + c;
+          // Copy the data
+          channels_last_data[new_index] = contiguous_data[old_index];
+        }
+      }
+    }
+  }
+  return channels_last_data;
+}
+
+TEST_F(OpConvCorrectnessTest, TransposedDefaultParamsChannelsLast) {
+  TensorFactory<ScalarType::Float> tf;
+
+  Tensor input = tf.full_channels_last({2, 4, 3, 2}, 2.0);
+  Tensor weight = tf.full_channels_last({4, 1, 2, 2}, 0.5);
+  optional<Tensor> bias;
+  Tensor out = tf.full_channels_last({2, 2, 4, 3}, 0.7);
+  Tensor expected =
+      tf.make({2, 2, 4, 3}, {2, 4, 2, 4, 8, 4, 4, 8, 4, 2, 4, 2, 2, 4, 2, 4,
+                             8, 4, 4, 8, 4, 2, 4, 2, 2, 4, 2, 4, 8, 4, 4, 8,
+                             4, 2, 4, 2, 2, 4, 2, 4, 8, 4, 4, 8, 4, 2, 4, 2});
+
+  const std::vector<int32_t> sizes(
+      expected.sizes().begin(), expected.sizes().end());
+  std::vector<float> channels_last_data =
+      get_channels_last_data<float>(expected);
+  Tensor expected_channels_last =
+      tf.make_channels_last(sizes, channels_last_data);
+
+  int64_t stride[1] = {1};
+  int64_t padding[1] = {0};
+  int64_t dilation[1] = {1};
+  bool transposed = true;
+  int64_t output_padding[1] = {0};
+  int64_t groups = 2;
+
+  op_convolution_out(
+      input,
+      weight,
+      exec_aten::optional<Tensor>(bias),
+      exec_aten::ArrayRef<int64_t>{stride, 1},
+      exec_aten::ArrayRef<int64_t>{padding, 1},
+      exec_aten::ArrayRef<int64_t>{dilation, 1},
+      transposed,
+      exec_aten::ArrayRef<int64_t>{output_padding, 1},
+      groups,
+      out);
+
+  EXPECT_TENSOR_CLOSE(out, expected_channels_last);
+}
+
+TEST_F(OpConvCorrectnessTest, TransposedNonDefaultParamsChannelsLast) {
+  TensorFactory<ScalarType::Float> tf;
+
+  Tensor input = tf.full_channels_last({2, 6, 4, 5}, 2.0);
+  Tensor weight = tf.full_channels_last({6, 1, 2, 2}, 0.5);
+  Tensor bias = tf.make({3}, {1, 2, 3});
+  Tensor out = tf.full_channels_last({2, 3, 3, 6}, 0.7);
+  Tensor expected = tf.make(
+      {2, 3, 3, 6},
+      {1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 3, 3, 1, 3, 3, 1, 3, 3, 2, 2, 2, 2,
+       2, 2, 2, 4, 4, 2, 4, 4, 2, 4, 4, 2, 4, 4, 3, 3, 3, 3, 3, 3, 3, 5,
+       5, 3, 5, 5, 3, 5, 5, 3, 5, 5, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 3, 3,
+       1, 3, 3, 1, 3, 3, 2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 4, 4, 2, 4, 4, 2,
+       4, 4, 3, 3, 3, 3, 3, 3, 3, 5, 5, 3, 5, 5, 3, 5, 5, 3, 5, 5});
+
+  const std::vector<int32_t> sizes(
+      expected.sizes().begin(), expected.sizes().end());
+  std::vector<float> channels_last_data =
+      get_channels_last_data<float>(expected);
+  Tensor expected_channels_last =
+      tf.make_channels_last(sizes, channels_last_data);
+
+  int64_t stride[1] = {3};
+  int64_t padding[1] = {7};
+  int64_t dilation[1] = {5};
+  bool transposed = true;
+  int64_t output_padding[1] = {2};
+  int64_t groups = 3;
+
+  op_convolution_out(
+      input,
+      weight,
+      exec_aten::optional<Tensor>(bias),
+      exec_aten::ArrayRef<int64_t>{stride, 1},
+      exec_aten::ArrayRef<int64_t>{padding, 1},
+      exec_aten::ArrayRef<int64_t>{dilation, 1},
+      transposed,
+      exec_aten::ArrayRef<int64_t>{output_padding, 1},
+      groups,
+      out);
+
+  EXPECT_TENSOR_CLOSE(out, expected_channels_last);
+}
+
 TEST_F(OpConvCorrectnessTest, InvalidOutputPadding) {
   TensorFactory<ScalarType::Float> tf;
 
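The `get_channels_last_data` helper added in this test file is just an explicit NCHW-to-NHWC reordering. A short sketch (assuming NumPy is available) confirming that the two index formulas amount to a (0, 2, 3, 1) transpose of the contiguous blob:

import numpy as np

N, C, H, W = 2, 3, 4, 5
contiguous = np.arange(N * C * H * W, dtype=np.float32)  # flat NCHW data

channels_last = np.empty_like(contiguous)
for n in range(N):
    for c in range(C):
        for h in range(H):
            for w in range(W):
                old_index = ((n * C + c) * H + h) * W + w  # NCHW offset
                new_index = ((n * H + h) * W + w) * C + c  # NHWC offset
                channels_last[new_index] = contiguous[old_index]

# Same result as reshaping to NCHW and transposing to NHWC.
reference = contiguous.reshape(N, C, H, W).transpose(0, 2, 3, 1).ravel()
assert np.array_equal(channels_last, reference)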