Skip to content

Commit 8929100

Browse files
committed
[llava] Expose prefill image and prompt APIs
Summary: We want to expose prefill_images() and prefill_prompt() for the Llava runner. These APIs will be called by the JNI/demo app so that we can prefill asynchronously. Test Plan: Reviewers: Subscribers: Tasks: Tags:
1 parent ee752f0 commit 8929100

File tree

4 files changed

+57
-26
lines changed

4 files changed

+57
-26
lines changed

examples/models/llava/runner/llava_runner.cpp

Lines changed: 25 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,27 @@ Error LlavaRunner::load() {
7272
return Error::Ok;
7373
}
7474

75+
Error LlavaRunner::prefill_images(
    std::vector<Image>& images,
    int64_t& start_pos) {
  // Feed every image through the image prefiller in order. start_pos is
  // taken by reference and advanced inside each prefill() call, so after
  // this loop it points one past the last image token in the KV cache.
  for (auto& img : images) {
    ET_UNWRAP(image_prefiller_->prefill(img, start_pos));
  }
  return Error::Ok;
}
84+
85+
Result<uint64_t> LlavaRunner::prefill_prompt(
    const std::string& prompt,
    int64_t& start_pos,
    int8_t bos,
    int8_t eos) {
  // Tokenize the prompt with the requested number of BOS/EOS tokens,
  // then hand the token ids to the text prefiller. start_pos is advanced
  // inside prefill(); the returned value is the next predicted token.
  std::vector<uint64_t> tokens = ET_UNWRAP(tokenizer_->encode(prompt, bos, eos));
  return text_prefiller_->prefill(tokens, start_pos);
}
95+
7596
Error LlavaRunner::generate(
7697
std::vector<Image> images,
7798
const std::string& prompt,
@@ -96,36 +117,23 @@ Error LlavaRunner::generate(
96117
int64_t pos = 0;
97118

98119
// prefill preset prompt
99-
std::vector<uint64_t> preset_prompt_tokens =
100-
ET_UNWRAP(tokenizer_->encode(kPresetPrompt, /*bos=*/1, /*eos=*/0));
101-
size_t num_preset_tokens = preset_prompt_tokens.size();
102-
103-
ET_UNWRAP(text_prefiller_->prefill(preset_prompt_tokens, pos));
104-
pos += num_preset_tokens;
120+
prefill_prompt(kPresetPrompt, pos, /*bos=*/1, /*eos*/ 0);
105121

106122
// prefill images
107-
for (auto& image : images) {
108-
// pos is updated inside image prefill.
109-
ET_UNWRAP(image_prefiller_->prefill(image, pos));
110-
}
123+
prefill_images(images, pos);
111124

112125
// prefill user prompt. No BOS because preset prompt already has it.
113126
wrapped_callback(prompt);
114127

115-
std::vector<uint64_t> user_prompt_tokens =
116-
ET_UNWRAP(tokenizer_->encode(prompt, /*bos=*/0, /*eos=*/0));
117-
size_t num_user_tokens = user_prompt_tokens.size();
118-
119128
uint64_t prefill_next_token =
120-
ET_UNWRAP(text_prefiller_->prefill(user_prompt_tokens, pos));
121-
pos += num_user_tokens;
129+
ET_UNWRAP(prefill_prompt(prompt, pos, /*bos=*/0, /*eos*/ 0));
130+
stats_.num_prompt_tokens = pos;
122131

123132
// Generate tokens
124133
int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate(
125134
{prefill_next_token}, pos, seq_len, wrapped_callback));
126135

127136
// Bookkeeping
128-
stats_.num_prompt_tokens = num_preset_tokens + num_user_tokens;
129137
stats_.num_generated_tokens = num_generated_tokens;
130138
::executorch::llm::print_report(stats_);
131139
if (stats_callback) {

examples/models/llava/runner/llava_runner.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,30 @@ class LlavaRunner : public MultimodalRunner {
3838
std::function<void(const ::executorch::extension::llm::Stats&)>
3939
stats_callback = {});
4040

41+
/**
42+
* Prefill an LLaVA Module with the given images input.
43+
* @param images The image input to LLaVA.
44+
* @param start_pos The starting position in KV cache of the input in the LLM.
45+
* It's passed as reference and will be updated inside this function.
46+
* @return The error status of prefilling images.
47+
*/
48+
Error prefill_images(std::vector<Image>& images, int64_t& start_pos);
49+
50+
/**
51+
* Prefill an LLaVA Module with the given text input.
52+
* @param prompt The text prompt to LLaVA.
53+
* @param start_pos The starting position in KV cache of the input in the LLM.
54+
* It's passed as reference and will be updated inside this function.
55+
* @param bos The number of BOS (begin of sequence) tokens.
56+
* @param eos The number of EOS (end of sequence) tokens.
57+
* @return The token generated by the LLaVA Module after prefilling the prompt.
58+
*/
59+
Result<uint64_t> prefill_prompt(
60+
const std::string& prompt,
61+
int64_t& start_pos,
62+
int8_t bos = 0,
63+
int8_t eos = 0);
64+
4165
private:
4266
inline static const std::string kPresetPrompt =
4367
"A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: ";

extension/llm/runner/text_prefiller.cpp

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ TextPrefiller::TextPrefiller(
2525

2626
::executorch::runtime::Result<uint64_t> TextPrefiller::prefill(
2727
std::vector<uint64_t>& prompt_tokens,
28-
int64_t start_pos) {
28+
int64_t& start_pos) {
2929
ET_CHECK_MSG(!prompt_tokens.empty(), "Prompt cannot be null");
3030
if (!text_decoder_runner_->is_method_loaded()) {
3131
ET_CHECK_OK_OR_RETURN_ERROR(text_decoder_runner_->load());
@@ -53,11 +53,10 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill(
5353
ET_LOG(
5454
Info, "Prefill token result numel(): %zu", outputs_res.get().numel());
5555

56+
start_pos += num_prompt_tokens;
5657
cur_token = text_decoder_runner_->logits_to_token(outputs_res.get());
5758
} else { // sequential prefill
5859
int64_t pos = 0; // position in the sequence
59-
// token & pos
60-
int64_t pos_data = 0;
6160
// NOLINTNEXTLINE(facebook-hte-ParameterUncheckedArrayBounds)
6261
cur_token = prompt_tokens[0];
6362

@@ -66,18 +65,17 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill(
6665
&cur_token, {1, 1}, exec_aten::ScalarType::Long);
6766

6867
ManagedTensor managed_start_pos(
69-
&pos_data, {1}, exec_aten::ScalarType::Long);
68+
&start_pos, {1}, exec_aten::ScalarType::Long);
7069

71-
// run the first token and get back logits tensor. Assuming the first token
72-
// is bos so don't callback.
70+
// run the first token and get back logits tensor.
7371
exec_aten::Tensor logits_tensor = ET_UNWRAP(
7472
text_decoder_runner_->step(managed_tokens, managed_start_pos));
7573

76-
pos = 1; // start from index 1
74+
pos += 1; // start the loop from index 1
75+
start_pos += 1;
7776

7877
while (pos < num_prompt_tokens) {
7978
// Run the model
80-
pos_data = start_pos + pos;
8179

8280
// NOLINTNEXTLINE(facebook-hte-ParameterUncheckedArrayBounds)
8381
cur_token = prompt_tokens[pos];
@@ -86,6 +84,7 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill(
8684
text_decoder_runner_->step(managed_tokens, managed_start_pos));
8785

8886
pos++;
87+
start_pos++;
8988
}
9089

9190
cur_token = text_decoder_runner_->logits_to_token(logits_tensor);

extension/llm/runner/text_prefiller.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ class TextPrefiller {
3636
*/
3737
::executorch::runtime::Result<uint64_t> prefill(
3838
std::vector<uint64_t>& prompt_tokens,
39-
int64_t start_pos = 0);
39+
int64_t& start_pos);
4040

4141
private:
4242
TextDecoderRunner* text_decoder_runner_;

0 commit comments

Comments
 (0)