Skip to content

Commit 9a88484

Browse files
committed
[llava][19/N] Add multimodal runner base class and build file
ghstack-source-id: e768077 Pull Request resolved: #4665
1 parent 6e9dc43 commit 9a88484

File tree

6 files changed

+225
-0
lines changed

6 files changed

+225
-0
lines changed

build/cmake_deps.toml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,19 @@ deps = [
158158
"executorch_no_prim_ops",
159159
]
160160

161+
# Sources for the LLM runner extension (multimodal runner base + helpers).
# buck_targets: the Buck target whose sources are extracted for CMake.
# filters: only compile .cpp files from that target.
# deps: targets whose sources must be excluded (already built elsewhere).
[targets.extension_llm_runner]
buck_targets = [
  "//extension/llm/runner:runner_lib",
]
filters = [
  ".cpp$",
]
deps = [
  "executorch",
  "executorch_no_prim_ops",
  "extension_module",
  "extension_runner_util",
]
161174
# ---------------------------------- extension end ----------------------------------
162175
# ---------------------------------- binary start ----------------------------------
163176

extension/llm/runner/CMakeLists.txt

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
#
8+
# Build llm runner lib.
9+
#
10+
# ### Editing this file ###
11+
#
12+
# This file should be formatted with
13+
# ~~~
14+
# cmake-format -i CMakeLists.txt
15+
# ~~~
16+
# It should also be cmake-lint clean.
17+
#
18+
19+
# Locate the repo root when this directory is built standalone.
if(NOT EXECUTORCH_ROOT)
  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
endif()

include(${EXECUTORCH_ROOT}/build/Utils.cmake)
include(${EXECUTORCH_ROOT}/build/Codegen.cmake)

#
# The `_<target>_srcs` lists are defined by including ${EXECUTORCH_SRCS_FILE}.
#
set(EXECUTORCH_SRCS_FILE
    "${CMAKE_CURRENT_BINARY_DIR}/../../../executorch_srcs.cmake"
)

extract_sources(${EXECUTORCH_SRCS_FILE})

include(${EXECUTORCH_SRCS_FILE})

# Source paths in the generated list are repo-relative; make them absolute.
list(TRANSFORM _extension_llm_runner__srcs PREPEND "${EXECUTORCH_ROOT}/")

# NOTE(review): this mutates the include dirs of the extension_module target
# defined elsewhere -- confirm this belongs here rather than in module's own
# CMakeLists.
target_include_directories(
  extension_module INTERFACE ${_common_include_directories}
)

# Build the llm runner library.
add_library(extension_llm_runner STATIC ${_extension_llm_runner__srcs})

set(runner_deps executorch extension_module extension_data_loader)

target_link_libraries(extension_llm_runner PUBLIC ${runner_deps})

target_include_directories(
  extension_llm_runner INTERFACE ${_common_include_directories}
                                 ${EXECUTORCH_ROOT}
)

extension/llm/runner/image.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
// Plain-data representation of an image input for a multimodal LLM runner.
2+
3+
#pragma once
4+
#include <cstdint>
5+
#include <vector>
6+
7+
namespace torch::executor {
8+
9+
// An image input to a multimodal LLM, stored as raw 8-bit pixel values.
struct Image {
  // Raw pixel data. Assuming NCHW format per the original author's note --
  // expected size is channels * height * width. TODO(review): confirm layout
  // with callers.
  std::vector<uint8_t> data;
  // Dimensions in pixels; zero-initialized so a default-constructed Image has
  // a well-defined empty state instead of indeterminate values.
  int32_t width = 0;
  int32_t height = 0;
  int32_t channels = 0;
};
16+
17+
} // namespace torch::executor
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
// Given an image tensor, prefill the KV cache of a multimodal LLM.
10+
11+
#pragma once
12+
13+
#include <executorch/extension/llm/runner/image.h>
14+
#include <executorch/extension/module/module.h>
15+
16+
namespace torch::executor {
17+
18+
// Assuming kv cache and parallel prefill are enabled.
19+
class ImagePrefiller {
20+
public:
21+
ImagePrefiller(Module* module) : module_(module){};
22+
/**
23+
* Prefill an LLM Module with the given image input.
24+
* @param image The image input to the multimodal LLM.
25+
* @param start_pos The starting position in KV cache of the input in the LLM
26+
* @return The next token of the LLM Module after prefill.
27+
*/
28+
virtual Result<exec_aten::Tensor> prefill(
29+
Image& image,
30+
int64_t start_pos = 0) = 0;
31+
32+
protected:
33+
Module* module_;
34+
};
35+
36+
} // namespace torch::executor
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
// A simple multimodal LLM runner that includes preprocessing and post
10+
// processing logic. The module takes in a string as input and emits a string as
11+
// output.
12+
13+
#pragma once
14+
15+
#include <cstdint>
#include <functional>
#include <memory>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <unordered_set>

#include <executorch/extension/llm/runner/image.h>
#include <executorch/extension/llm/runner/image_prefiller.h>
#include <executorch/extension/llm/runner/stats.h>
#include <executorch/extension/llm/runner/text_decoder_runner.h>
#include <executorch/extension/llm/runner/text_prefiller.h>
#include <executorch/extension/llm/runner/text_token_generator.h>
#include <executorch/extension/llm/sampler/sampler.h>
#include <executorch/extension/llm/tokenizer/tokenizer.h>
#include <executorch/extension/module/module.h>
#include <executorch/extension/runner_util/managed_tensor.h>
32+
33+
namespace torch::executor {
34+
using Stats = ::executorch::llm::Stats;
35+
36+
class MultimodalRunner {
37+
public:
38+
explicit MultimodalRunner(
39+
const std::string& model_path,
40+
const std::string& tokenizer_path,
41+
const float temperature = 0.8f)
42+
: temperature_(temperature),
43+
module_(std::make_unique<Module>(model_path, Module::LoadMode::File)),
44+
tokenizer_path_(tokenizer_path) {
45+
ET_LOG(
46+
Info,
47+
"Creating Multimodal LLM runner: model_path=%s, tokenizer_path=%s",
48+
model_path.c_str(),
49+
tokenizer_path.c_str());
50+
};
51+
52+
virtual bool is_loaded() = 0;
53+
virtual Error load() = 0;
54+
virtual Error generate(
55+
std::vector<Image>& images,
56+
const std::string& prompt,
57+
int32_t seq_len = 1024,
58+
std::function<void(const std::string&)> token_callback = {},
59+
std::function<void(const Stats&)> stats_callback = {}) = 0;
60+
61+
inline void stop() {
62+
text_token_generator_->stop();
63+
}
64+
65+
protected:
66+
// metadata
67+
int32_t vocab_size_;
68+
int32_t bos_id_;
69+
int32_t eos_id_;
70+
int32_t n_bos_;
71+
int32_t n_eos_;
72+
int32_t max_seq_len_;
73+
float temperature_;
74+
75+
// model
76+
std::unordered_set<std::string> model_methods_;
77+
std::unique_ptr<Module> module_;
78+
std::unique_ptr<TextDecoderRunner> text_decoder_runner_;
79+
std::unique_ptr<TextPrefiller> text_prefiller_;
80+
std::unique_ptr<ImagePrefiller> image_prefiller_;
81+
std::unique_ptr<TextTokenGenerator> text_token_generator_;
82+
std::string tokenizer_path_;
83+
std::unique_ptr<Tokenizer> tokenizer_;
84+
85+
// stats
86+
Stats stats_;
87+
};
88+
89+
} // namespace torch::executor

extension/llm/runner/targets.bzl

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ def define_common_targets():
4444
"//executorch/extension/runner_util:managed_tensor" + aten_suffix,
4545
],
4646
)
47+
4748
runtime.cxx_library(
4849
name = "text_token_generator" + aten_suffix,
4950
exported_headers = ["text_token_generator.h"],
@@ -57,3 +58,20 @@ def define_common_targets():
5758
"//executorch/extension/runner_util:managed_tensor" + aten_suffix,
5859
],
5960
)
61+
62+
# Header-only library bundling the multimodal runner base class and its
# supporting headers; re-exports the text runner components it builds on.
runtime.cxx_library(
    name = "runner_lib" + aten_suffix,
    exported_headers = [
        "image.h",
        "image_prefiller.h",
        "multimodal_runner.h",
    ],
    visibility = [
        "@EXECUTORCH_CLIENTS",
    ],
    exported_deps = [
        ":text_decoder_runner" + aten_suffix,
        ":text_prefiller" + aten_suffix,
        ":text_token_generator" + aten_suffix,
    ],
)

0 commit comments

Comments
 (0)