Skip to content

Commit 84100d1

Browse files
authored
[llava][20/N] Add llava runner using building blocks in e/llm/runner (#4666)
* [llava][18/N] Move token generation loop to a class As titled. This PR moves the token generation loop in llama2 runner into a new class so it can be reused. [ghstack-poisoned] * [llava][19/N] Add multimodal runner base class and build file [ghstack-poisoned] * [llava][20/N] Add llava runner using building blocks in e/llm/runner [ghstack-poisoned] * Update base for Update on "[llava][20/N] Add llava runner using building blocks in e/llm/runner" Add llava runner that uses runner lib in `extension/llm/runner`. [ghstack-poisoned] * Update base for Update on "[llava][20/N] Add llava runner using building blocks in e/llm/runner" Add llava runner that uses runner lib in `extension/llm/runner`. Differential Revision: [D61292846](https://our.internmc.facebook.com/intern/diff/D61292846) [ghstack-poisoned]
1 parent ef56414 commit 84100d1

File tree

5 files changed

+346
-0
lines changed

5 files changed

+346
-0
lines changed
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

#
# Simple CMake build system for LLaVa runner.
#
# ### Editing this file ###
#
# This file should be formatted with
# ~~~
# cmake-format -i CMakeLists.txt
# ~~~
# It should also be cmake-lint clean.
#

if(NOT EXECUTORCH_ROOT)
  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..)
endif()

include(${EXECUTORCH_ROOT}/build/Utils.cmake)
include(${EXECUTORCH_ROOT}/build/Codegen.cmake)

# Let files say "include <executorch/path/to/header.h>".
set(_common_include_directories ${EXECUTORCH_ROOT}/..)

# build llava_runner library
set(_llava_runner__srcs
    "${CMAKE_CURRENT_SOURCE_DIR}/llava_runner.cpp"
    "${EXECUTORCH_ROOT}/extension/llm/sampler/sampler.cpp"
    "${EXECUTORCH_ROOT}/extension/llm/tokenizer/bpe_tokenizer.cpp"
)

# extension llm runner lib. Use EXECUTORCH_ROOT for the source dir (consistent
# with the rest of this file) and keep the binary dir inside the current build
# directory rather than escaping it with ../ segments, which is fragile when
# this example is configured standalone.
add_subdirectory(
  ${EXECUTORCH_ROOT}/extension/llm/runner
  ${CMAKE_CURRENT_BINARY_DIR}/extension_llm_runner
)

add_library(llava_runner STATIC ${_llava_runner__srcs})

set(llava_runner_deps executorch extension_module extension_data_loader
                      extension_llm_runner
)

target_link_libraries(llava_runner PUBLIC ${llava_runner_deps})

target_include_directories(
  llava_runner INTERFACE ${_common_include_directories} ${EXECUTORCH_ROOT}
)
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
// Given a image tensor, prefill the KV cache of LLaVA.
10+
11+
#pragma once
12+
13+
#include <executorch/extension/llm/runner/image_prefiller.h>
14+
#include <executorch/extension/runner_util/managed_tensor.h>
15+
16+
namespace torch::executor {
17+
18+
class LlavaImagePrefiller : public ImagePrefiller {
19+
public:
20+
LlavaImagePrefiller(Module* module) : ImagePrefiller(module){};
21+
/**
22+
* Prefill an LLM Module with the given image input.
23+
* @param image The image input to LLaVa.
24+
* @param start_pos The starting position in KV cache of the input in the LLM
25+
* @return logits of the image prefill.
26+
*/
27+
inline Result<exec_aten::Tensor> prefill(
28+
Image& image,
29+
int64_t start_pos = 0) {
30+
ManagedTensor managed_images(
31+
image.data.data(), {3, image.height, image.width}, ScalarType::Byte);
32+
// Run image encoder
33+
std::vector<EValue> image_encoder_outputs = ET_UNWRAP(module_->execute(
34+
"image_encoder", {managed_images.get_aliasing_tensor()}));
35+
36+
// inputs:[start_pos, embeds]
37+
ManagedTensor managed_start_pos(&start_pos, {1}, ScalarType::Long);
38+
auto start_pos_tensor = managed_start_pos.get_aliasing_tensor();
39+
40+
// Run text model
41+
std::vector<EValue> outputs_res = ET_UNWRAP(module_->execute(
42+
"text_decoder", {start_pos_tensor, image_encoder_outputs[0]}));
43+
ET_CHECK_MSG(
44+
outputs_res[0].isTensor(),
45+
"Non Tensor Output returned from executing image prefill");
46+
47+
return outputs_res[0].toTensor();
48+
}
49+
};
50+
51+
} // namespace torch::executor
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
// A simple LLaVA runner that includes preprocessing and post processing logic.
10+
// The runner takes in a prompt string as well as a list of images as input and
11+
// emits a string as output.
12+
13+
#include <executorch/examples/models/llava/runner/llava_image_prefiller.h>
14+
#include <executorch/examples/models/llava/runner/llava_runner.h>
15+
#include <executorch/examples/models/llava/runner/llava_text_decoder_runner.h>
16+
#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
17+
18+
#include <ctime>
19+
#include <memory>
20+
#include <sstream>
21+
#include <vector>
22+
23+
namespace torch::executor {
24+
25+
bool LlavaRunner::is_loaded() {
26+
Result<std::unordered_set<std::string>> methods_res = module_->method_names();
27+
if (methods_res.error() != Error::Ok) {
28+
ET_LOG(Error, "Failed to get method names");
29+
ET_CHECK_MSG(false, "Failed to get method names");
30+
}
31+
std::unordered_set<std::string> methods = methods_res.get();
32+
bool methods_exist = methods.find("image_encoder") != methods.end() &&
33+
methods.find("token_embedding") != methods.end() &&
34+
methods.find("text_decoder") != methods.end();
35+
if (!methods_exist) {
36+
for (const auto& method : methods) {
37+
ET_LOG(Error, "Method: %s", method.c_str());
38+
}
39+
ET_CHECK_MSG(
40+
methods_exist,
41+
"Missing required methods (image_encoder, token_embedding, text_decoder) in the model");
42+
}
43+
bool methods_loaded = module_->is_method_loaded("image_encoder") &&
44+
module_->is_method_loaded("token_embedding") &&
45+
module_->is_method_loaded("text_decoder");
46+
return methods_loaded && tokenizer_ && text_decoder_runner_ &&
47+
text_prefiller_ && image_prefiller_ && text_token_generator_;
48+
}
49+
50+
Error LlavaRunner::load() {
  // Loading is idempotent: bail out early if everything is already in place.
  if (is_loaded()) {
    return Error::Ok;
  }
  stats_.model_load_start_ms = util::time_in_ms();

  ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("image_encoder"));
  ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("token_embedding"));
  ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("text_decoder"));

  // Load the tokenizer. Fix: the returned Error was previously discarded,
  // which silently left the runner with an unusable tokenizer on failure.
  tokenizer_ = std::make_unique<BPETokenizer>();
  ET_CHECK_OK_OR_RETURN_ERROR(tokenizer_->load(tokenizer_path_));

  // Load the text decoder runner
  text_decoder_runner_ = std::make_unique<LlavaTextDecoderRunner>(
      module_.get(), tokenizer_->vocab_size(), temperature_);

  // Load the text prefiller
  text_prefiller_ = std::make_unique<TextPrefiller>(
      tokenizer_.get(),
      text_decoder_runner_.get(),
      /*use_kv_cache=*/true,
      /*enable_parallel_prefill=*/true);

  // Load the image prefiller
  image_prefiller_ = std::make_unique<LlavaImagePrefiller>(module_.get());

  // Load the text token generator
  text_token_generator_ = std::make_unique<TextTokenGenerator>(
      tokenizer_.get(),
      text_decoder_runner_.get(),
      /*use_kv_cache=*/true,
      tokenizer_->eos_tok(),
      &stats_);

  stats_.model_load_end_ms = util::time_in_ms();
  return Error::Ok;
}
89+
90+
Error LlavaRunner::generate(
    std::vector<Image>& images,
    const std::string& prompt,
    int32_t seq_len,
    std::function<void(const std::string&)> token_callback,
    std::function<void(const Stats&)> stats_callback) {
  ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null");
  if (!is_loaded()) {
    ET_CHECK_OK_OR_RETURN_ERROR(load());
  }

  // Echo every generated piece to stdout, then forward it to the
  // user-provided callback (if any).
  std::function<void(const std::string&)> wrapped_callback =
      [token_callback](const std::string& piece) {
        util::safe_printf(piece.c_str());
        fflush(stdout);
        if (token_callback) {
          token_callback(piece);
        }
      };

  int64_t cache_pos = 0;

  // Phase 1: prefill the preset (system) prompt; it carries the BOS token.
  std::vector<uint64_t> preset_tokens =
      ET_UNWRAP(tokenizer_->encode(kPresetPrompt, /*bos=*/1, /*eos=*/0));
  const size_t num_preset_tokens = preset_tokens.size();

  ET_UNWRAP(text_prefiller_->prefill(preset_tokens, cache_pos));
  cache_pos += num_preset_tokens;

  // Phase 2: prefill each image; the second logits dimension tells us how
  // many KV-cache positions the image embeddings consumed.
  for (auto& img : images) {
    auto img_logits = ET_UNWRAP(image_prefiller_->prefill(img, cache_pos));
    cache_pos += img_logits.size(1);
  }

  // Phase 3: prefill the user prompt. No BOS because preset prompt already
  // has it.
  std::vector<uint64_t> user_tokens =
      ET_UNWRAP(tokenizer_->encode(prompt, /*bos=*/0, /*eos=*/0));
  const size_t num_user_tokens = user_tokens.size();

  uint64_t next_token = ET_UNWRAP(
      text_prefiller_->prefill(user_tokens, cache_pos, wrapped_callback));
  cache_pos += num_user_tokens;

  // Phase 4: autoregressive generation until EOS or seq_len.
  int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate(
      {next_token}, cache_pos, seq_len, wrapped_callback));

  // Bookkeeping
  stats_.num_prompt_tokens = num_preset_tokens + num_user_tokens;
  stats_.num_generated_tokens = num_generated_tokens;
  ::executorch::llm::print_report(stats_);
  if (stats_callback) {
    stats_callback(stats_);
  }

  return Error::Ok;
}
150+
151+
} // namespace torch::executor
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
// A simple multimodal LLM runner that includes preprocessing and post
10+
// processing logic.
11+
#pragma once
12+
13+
#include <cstdint>
14+
#include <functional>
15+
#include <memory>
16+
#include <string>
17+
#include <type_traits>
18+
#include <unordered_map>
19+
20+
#include <executorch/extension/llm/runner/multimodal_runner.h>
21+
22+
namespace torch::executor {
23+
24+
class LlavaRunner : public MultimodalRunner {
25+
public:
26+
explicit LlavaRunner(
27+
const std::string& model_path,
28+
const std::string& tokenizer_path,
29+
const float temperature = 0.8f)
30+
: MultimodalRunner(model_path, tokenizer_path, temperature){};
31+
bool is_loaded();
32+
Error load();
33+
Error generate(
34+
std::vector<Image>& images,
35+
const std::string& prompt,
36+
int32_t seq_len = 1024,
37+
std::function<void(const std::string&)> token_callback = {},
38+
std::function<void(const Stats&)> stats_callback = {});
39+
40+
private:
41+
inline static const std::string kPresetPrompt =
42+
"A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: ";
43+
};
44+
45+
} // namespace torch::executor
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
// Given inputs, run a text decoder in Llava and return the output.
10+
11+
#pragma once
12+
13+
#include <executorch/extension/llm/runner/text_decoder_runner.h>
14+
15+
namespace torch::executor {
16+
17+
class LlavaTextDecoderRunner : public TextDecoderRunner {
18+
public:
19+
LlavaTextDecoderRunner(Module* module, int32_t vocab_size, float temperature)
20+
: TextDecoderRunner(module, true, vocab_size, temperature){};
21+
22+
Result<exec_aten::Tensor> step(
23+
ManagedTensor& managed_tokens,
24+
ManagedTensor& managed_start_pos) {
25+
auto tokens = managed_tokens.get_aliasing_tensor();
26+
auto start_pos = managed_start_pos.get_aliasing_tensor();
27+
28+
// run token embedding
29+
std::vector<EValue> token_embedding_outputs =
30+
ET_UNWRAP(module_->execute("token_embedding", {tokens}));
31+
32+
// run text model
33+
std::vector<EValue> outputs_res = ET_UNWRAP(module_->execute(
34+
"text_decoder", {start_pos, token_embedding_outputs[0]}));
35+
36+
ET_CHECK_MSG(
37+
outputs_res.size() == 1,
38+
"More then one output returned from executing LLM.");
39+
ET_CHECK_MSG(
40+
outputs_res[0].isTensor(),
41+
"Non Tensor Output returned from executing LLM");
42+
43+
// Return the logits tensor
44+
return outputs_res[0].toTensor();
45+
}
46+
};
47+
48+
} // namespace torch::executor

0 commit comments

Comments
 (0)