
Commit cf0bfd2

Introduce TextLLMRunner. (#12055)
Summary: .

Reviewed By: mergennachin

Differential Revision: D77416842
1 parent 4374afe commit cf0bfd2

7 files changed: +218 −56 lines changed
.lintrunner.toml

Lines changed: 1 addition & 0 deletions

@@ -76,6 +76,7 @@ exclude_patterns = [
   'examples/demo-apps/apple_ios/**',
   'examples/demo-apps/react-native/rnllama/ios/**',
   'extension/apple/**',
+  'extension/llm/apple/**',
   # File contains @generated
   'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h',
   'extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_special_unstrided_cpu.h',

examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.h

Lines changed: 7 additions & 9 deletions

@@ -10,18 +10,16 @@

 NS_ASSUME_NONNULL_BEGIN

-FOUNDATION_EXPORT NSErrorDomain const LLaMARunnerErrorDomain;
-
 @interface LLaMARunner : NSObject

-- (instancetype)initWithModelPath:(NSString*)filePath
-                    tokenizerPath:(NSString*)tokenizerPath;
+- (instancetype)initWithModelPath:(NSString *)modelPath
+                    tokenizerPath:(NSString *)tokenizerPath;
 - (BOOL)isLoaded;
-- (BOOL)loadWithError:(NSError**)error;
-- (BOOL)generate:(NSString*)prompt
-    sequenceLength:(NSInteger)seq_len
- withTokenCallback:(nullable void (^)(NSString*))callback
-             error:(NSError**)error;
+- (BOOL)loadWithError:(NSError **)error;
+- (BOOL)generate:(NSString *)prompt
+    sequenceLength:(NSInteger)seq_len
+ withTokenCallback:(nullable void (^)(NSString *))callback
+             error:(NSError **)error;
 - (void)stop;

 + (instancetype)new NS_UNAVAILABLE;

examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm

Lines changed: 23 additions & 47 deletions

@@ -9,33 +9,29 @@
 #import "LLaMARunner.h"

 #import <ExecuTorch/ExecuTorchLog.h>
-#import <executorch/extension/llm/runner/text_llm_runner.h>
+#import <ExecuTorchLLM/ExecuTorchLLM.h>
 #import <executorch/examples/models/llama/tokenizer/llama_tiktoken.h>

-using namespace executorch::extension;
-using namespace executorch::runtime;
-
-NSErrorDomain const LLaMARunnerErrorDomain = @"LLaMARunnerErrorDomain";
-
 @interface LLaMARunner ()<ExecuTorchLogSink>
 @end

 @implementation LLaMARunner {
-  std::unique_ptr<llm::TextLLMRunner> _runner;
+  ExecuTorchTextLLMRunner *_runner;
 }

-- (instancetype)initWithModelPath:(NSString*)modelPath
-                    tokenizerPath:(NSString*)tokenizerPath {
+- (instancetype)initWithModelPath:(NSString *)modelPath
+                    tokenizerPath:(NSString *)tokenizerPath {
   self = [super init];
   if (self) {
     [ExecuTorchLog.sharedLog addSink:self];
-    _runner = llm::create_text_llm_runner(
-        modelPath.UTF8String,
-        llm::load_tokenizer(
-            tokenizerPath.UTF8String,
-            example::get_special_tokens(example::Version::Default)
-        )
-    );
+    auto tokens = example::get_special_tokens(example::Version::Default);
+    NSMutableArray<NSString*> *specialTokens = [[NSMutableArray alloc] initWithCapacity:tokens->size()];
+    for (const auto &token : *tokens) {
+      [specialTokens addObject:(NSString *)@(token.c_str())];
+    }
+    _runner = [[ExecuTorchTextLLMRunner alloc] initWithModelPath:modelPath
+                                                   tokenizerPath:tokenizerPath
+                                                   specialTokens:specialTokens];
   }
   return self;
 }
@@ -45,45 +41,25 @@ - (void)dealloc {
 }

 - (BOOL)isLoaded {
-  return _runner->is_loaded();
+  return [_runner isLoaded];
 }

 - (BOOL)loadWithError:(NSError**)error {
-  const auto status = _runner->load();
-  if (status != Error::Ok) {
-    if (error) {
-      *error = [NSError errorWithDomain:LLaMARunnerErrorDomain
-                                   code:(NSInteger)status
-                               userInfo:nil];
-    }
-    return NO;
-  }
-  return YES;
+  return [_runner loadWithError:error];
 }

-- (BOOL)generate:(NSString*)prompt
-    sequenceLength:(NSInteger)seq_len
- withTokenCallback:(nullable void (^)(NSString*))callback
-             error:(NSError**)error {
-  const auto status = _runner->generate(
-      prompt.UTF8String,
-      llm::GenerationConfig{.seq_len = static_cast<int32_t>(seq_len)},
-      [callback](const std::string& token) {
-        callback(@(token.c_str()));
-      });
-  if (status != Error::Ok) {
-    if (error) {
-      *error = [NSError errorWithDomain:LLaMARunnerErrorDomain
-                                   code:(NSInteger)status
-                               userInfo:nil];
-    }
-    return NO;
-  }
-  return YES;
+- (BOOL)generate:(NSString *)prompt
+    sequenceLength:(NSInteger)seq_len
+ withTokenCallback:(nullable void (^)(NSString *))callback
+             error:(NSError **)error {
+  return [_runner generate:prompt
+            sequenceLength:seq_len
+         withTokenCallback:callback
+                     error:error];
 }

 - (void)stop {
-  _runner->stop();
+  [_runner stop];
 }

 #pragma mark - ExecuTorchLogSink
Lines changed: 9 additions & 0 deletions

@@ -0,0 +1,9 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#import "ExecuTorchTextLLMRunner.h"
Lines changed: 74 additions & 0 deletions

@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#import <Foundation/Foundation.h>
+
+NS_ASSUME_NONNULL_BEGIN
+
+FOUNDATION_EXPORT NSErrorDomain const ExecuTorchTextLLMRunnerErrorDomain;
+
+/**
+ A wrapper class for the C++ llm::TextLLMRunner that provides
+ Objective-C APIs to load models, manage tokenization with custom
+ special tokens, generate text sequences, and stop the runner.
+ */
+NS_SWIFT_NAME(TextLLMRunner)
+__attribute__((deprecated("This API is experimental.")))
+@interface ExecuTorchTextLLMRunner : NSObject
+
+/**
+ Initializes a text LLM runner with the given model and tokenizer paths,
+ and a list of special tokens to include in the tokenizer.
+
+ @param modelPath File system path to the serialized model.
+ @param tokenizerPath File system path to the tokenizer data.
+ @param tokens An array of NSString special tokens to use during tokenization.
+ @return An initialized ExecuTorchTextLLMRunner instance.
+ */
+- (instancetype)initWithModelPath:(NSString *)modelPath
+                    tokenizerPath:(NSString *)tokenizerPath
+                    specialTokens:(NSArray<NSString *> *)tokens;
+
+/**
+ Checks whether the underlying model has been successfully loaded.
+
+ @return YES if the model is loaded, NO otherwise.
+ */
+- (BOOL)isLoaded;
+
+/**
+ Loads the model into memory, returning an error if loading fails.
+
+ @param error On failure, populated with an NSError explaining the issue.
+ @return YES if loading succeeds, NO if an error occurred.
+ */
+- (BOOL)loadWithError:(NSError **)error;
+
+/**
+ Generates text given an input prompt, up to a specified sequence length.
+ Invokes the provided callback for each generated token.
+
+ @param prompt The initial text prompt to generate from.
+ @param seq_len The maximum number of tokens to generate.
+ @param callback A block called with each generated token as an NSString.
+ @param error On failure, populated with an NSError explaining the issue.
+ @return YES if generation completes successfully, NO if an error occurred.
+ */
+- (BOOL)generate:(NSString *)prompt
+  sequenceLength:(NSInteger)seq_len
+withTokenCallback:(nullable void (^)(NSString *))callback
+           error:(NSError **)error;
+
+/**
+ Stops any ongoing generation and cleans up internal resources.
+ */
+- (void)stop;
+
+@end
+
+NS_ASSUME_NONNULL_END
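
The header above is the complete public surface of the new wrapper. Below is a minimal Objective-C usage sketch that is not part of this commit: the model path, tokenizer path, special tokens, and the surrounding function are hypothetical placeholders.

#import <ExecuTorchLLM/ExecuTorchLLM.h>

static void RunTextLLMRunnerExample(void) {
  // Hypothetical paths and special tokens, for illustration only.
  NSArray<NSString *> *specialTokens = @[ @"<|begin_of_text|>", @"<|end_of_text|>" ];
  ExecuTorchTextLLMRunner *runner =
      [[ExecuTorchTextLLMRunner alloc] initWithModelPath:@"/path/to/model.pte"
                                            tokenizerPath:@"/path/to/tokenizer.model"
                                            specialTokens:specialTokens];
  NSError *error = nil;
  // An explicit loadWithError: call is optional: in the implementation added
  // below, generate: calls loadWithError: first, so the model loads lazily.
  BOOL ok = [runner generate:@"Once upon a time"
              sequenceLength:128
           withTokenCallback:^(NSString *token) {
             NSLog(@"%@", token); // each generated token streams back here
           }
                       error:&error];
  if (!ok) {
    NSLog(@"Generation failed: %@", error);
  }
}

From Swift, the class is exposed as TextLLMRunner via NS_SWIFT_NAME.
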
Lines changed: 102 additions & 0 deletions

@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#import "ExecuTorchTextLLMRunner.h"
+
+#import <executorch/extension/llm/runner/text_llm_runner.h>
+
+using namespace executorch::extension;
+using namespace executorch::runtime;
+
+NSErrorDomain const ExecuTorchTextLLMRunnerErrorDomain = @"ExecuTorchTextLLMRunnerErrorDomain";
+
+@implementation ExecuTorchTextLLMRunner {
+  NSString *_modelPath;
+  NSString *_tokenizerPath;
+  std::unique_ptr<std::vector<std::string>> _specialTokens;
+  std::unique_ptr<llm::TextLLMRunner> _runner;
+}
+
+- (instancetype)initWithModelPath:(NSString*)modelPath
+                    tokenizerPath:(NSString*)tokenizerPath
+                    specialTokens:(NSArray<NSString*>*)tokens {
+  self = [super init];
+  if (self) {
+    _modelPath = [modelPath copy];
+    _tokenizerPath = [tokenizerPath copy];
+    _specialTokens = std::make_unique<std::vector<std::string>>();
+    for (NSString *token in tokens) {
+      _specialTokens->emplace_back(token.UTF8String);
+    }
+  }
+  return self;
+}
+
+- (BOOL)isLoaded {
+  return _runner && _runner->is_loaded();
+}
+
+- (BOOL)loadWithError:(NSError**)error {
+  if (![self isLoaded]) {
+    _runner = llm::create_text_llm_runner(
+      _modelPath.UTF8String,
+      llm::load_tokenizer(_tokenizerPath.UTF8String, std::move(_specialTokens))
+    );
+    if (!_runner) {
+      if (error) {
+        *error = [NSError errorWithDomain:ExecuTorchTextLLMRunnerErrorDomain
+                                     code:-1
+                                 userInfo:@{NSLocalizedDescriptionKey: @"Failed to create runner"}];
+      }
+      return NO;
+    }
+  }
+  auto status = _runner->load();
+  if (status != Error::Ok) {
+    if (error) {
+      *error = [NSError errorWithDomain:ExecuTorchTextLLMRunnerErrorDomain
+                                   code:(NSInteger)status
+                               userInfo:nil];
+    }
+    return NO;
+  }
+  return YES;
+}
+
+- (BOOL)generate:(NSString*)prompt
+  sequenceLength:(NSInteger)seq_len
+withTokenCallback:(nullable void (^)(NSString*))callback
+           error:(NSError**)error {
+  if (![self loadWithError:error]) {
+    return NO;
+  }
+  auto status = _runner->generate(
+    prompt.UTF8String,
+    llm::GenerationConfig{.seq_len = static_cast<int32_t>(seq_len)},
+    [callback](const std::string& token) {
+      if (callback) callback(@(token.c_str()));
+    }
+  );
+  if (status != Error::Ok) {
+    if (error) {
+      *error = [NSError errorWithDomain:ExecuTorchTextLLMRunnerErrorDomain
+                                   code:(NSInteger)status
+                               userInfo:nil];
+    }
+    return NO;
+  }
+  return YES;
+}
+
+- (void)stop {
+  if (_runner) {
+    _runner->stop();
+  }
+}
+
+@end

extension/llm/runner/text_llm_runner.cpp

Lines changed: 2 additions & 0 deletions

@@ -12,6 +12,7 @@

 #include <executorch/extension/llm/runner/text_llm_runner.h>
 #include <executorch/extension/llm/runner/util.h>
+#include <executorch/runtime/platform/runtime.h>
 #include <pytorch/tokenizers/hf_tokenizer.h>
 #include <pytorch/tokenizers/llama2c_tokenizer.h>
 #include <pytorch/tokenizers/sentencepiece.h>
@@ -256,6 +257,7 @@ std::unique_ptr<tokenizers::Tokenizer> load_tokenizer(
     std::optional<std::string> pattern,
     size_t bos_token_index,
     size_t eos_token_index) {
+  runtime::runtime_init();
  auto json_tokenizer = std::make_unique<tokenizers::HFTokenizer>();
  if (json_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) {
    ET_LOG(Info, "Loaded json tokenizer");