Skip to content

Commit d8fba6e

Browse files
committed
Update base for Update on "[ET-VK] Integrate axis mapping into staging <-> image transfer shaders"
## Context Building on the previous diff, this diff integrates axis mapping into staging <-> image transfer shaders. Alternative versions of indexing utility functions are introduced to account for axis mapping. The impact of shader latency of using axis mapping on transfer shaders is examined in the next diff. Differential Revision: [D62210117](https://our.internmc.facebook.com/intern/diff/D62210117/) [ghstack-poisoned]
2 parents 7535ad3 + 9739609 commit d8fba6e

File tree

112 files changed

+1796
-759
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

112 files changed

+1796
-759
lines changed

.ci/scripts/build-qnn-sdk.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ set_up_aot() {
2929
-DQNN_SDK_ROOT=${QNN_SDK_ROOT} \
3030
-DEXECUTORCH_BUILD_SDK=ON \
3131
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
32+
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
3233
-DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
3334
-DPYTHON_EXECUTABLE=python3 \
3435
-DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF

.ci/scripts/build_llama_android.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,9 @@ install_executorch_and_backend_lib() {
2222
-DANDROID_PLATFORM=android-23 \
2323
-DCMAKE_INSTALL_PREFIX=cmake-android-out \
2424
-DCMAKE_BUILD_TYPE=Release \
25-
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
2625
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
26+
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
27+
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
2728
-DEXECUTORCH_BUILD_XNNPACK=ON \
2829
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
2930
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \

.ci/scripts/test_llama.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,8 +107,9 @@ cmake_install_executorch_libraries() {
107107
retry cmake \
108108
-DCMAKE_INSTALL_PREFIX=cmake-out \
109109
-DCMAKE_BUILD_TYPE=Debug \
110-
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
111110
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
111+
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
112+
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
112113
-DEXECUTORCH_BUILD_KERNELS_CUSTOM="$CUSTOM" \
113114
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
114115
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \

.ci/scripts/test_llava.sh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,9 @@ cmake_install_executorch_libraries() {
2020
cmake \
2121
-DCMAKE_INSTALL_PREFIX=cmake-out \
2222
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
23-
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
2423
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
24+
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
25+
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
2526
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
2627
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
2728
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
@@ -61,7 +62,7 @@ export_llava() {
6162
# Download a new image with different size, to test if the model can handle different image sizes
6263
prepare_image_tensor() {
6364
echo "Downloading image"
64-
curl -o basketball.jpg https://upload.wikimedia.org/wikipedia/commons/7/73/Chicago_Bulls_and_New_Jersey_Nets%2C_March_28%2C_1991.jpg
65+
curl -o basketball.jpg https://upload.wikimedia.org/wikipedia/commons/7/73/Chicago_Bulls_and_New_Jersey_Nets%2C_March_28%2C_1991.jpg
6566
$PYTHON_EXECUTABLE -m executorch.examples.models.llava.image_util --image-path basketball.jpg --output-path image.pt
6667
}
6768

.github/workflows/upload-android-test-specs.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ jobs:
4141
with:
4242
# Just use a small model here with a minimal amount of configuration to test the spec
4343
models: stories110M
44-
devices: samsung_galaxy_s2x
44+
devices: samsung_galaxy_s22
4545
delegates: xnnpack
4646
test_spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/android-llm-device-farm-test-spec.yml
4747

backends/qualcomm/scripts/build.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ if [ "$BUILD_AARCH64" = true ]; then
8181
-DEXECUTORCH_BUILD_QNN=ON \
8282
-DEXECUTORCH_BUILD_SDK=ON \
8383
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
84+
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
8485
-DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
8586
-DQNN_SDK_ROOT=$QNN_SDK_ROOT \
8687
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \
@@ -124,6 +125,7 @@ if [ "$BUILD_X86_64" = true ]; then
124125
-DEXECUTORCH_BUILD_QNN=ON \
125126
-DEXECUTORCH_BUILD_SDK=ON \
126127
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
128+
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
127129
-DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
128130
-DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
129131
-S $PRJ_ROOT \

backends/vulkan/docs/android_demo.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,9 @@ binary using the Android NDK toolchain.
9494
cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \
9595
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
9696
-DANDROID_ABI=$ANDROID_ABI \
97-
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
9897
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
98+
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
99+
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
99100
-DEXECUTORCH_BUILD_VULKAN=ON \
100101
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
101102
-DPYTHON_EXECUTABLE=python \

backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ constexpr DimIndex kChannel4D = DimIndex::DIM_3RD_LAST;
3232
constexpr DimIndex kBatch4D = DimIndex::DIM_4TH_LAST;
3333

3434
inline DimIndex normalize_to_dim_index(const api::vTensor& v_in, int32_t dim) {
35-
return static_cast<DimIndex>(dim - v_in.dim());
35+
return dim < 0 ? static_cast<DimIndex>(dim)
36+
: static_cast<DimIndex>(dim - v_in.dim());
3637
}
3738

3839
/*

backends/xnnpack/README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,9 +105,10 @@ mkdir cmake-out
105105
cmake \
106106
-DCMAKE_INSTALL_PREFIX=cmake-out \
107107
-DCMAKE_BUILD_TYPE=Release \
108+
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
108109
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
110+
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
109111
-DEXECUTORCH_BUILD_XNNPACK=ON \
110-
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
111112
-DEXECUTORCH_ENABLE_LOGGING=ON \
112113
-DPYTHON_EXECUTABLE=python \
113114
-Bcmake-out .

build/build_android_llm_demo.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ build_android_native_library() {
3838
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
3939
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
4040
-DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
41+
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
4142
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
4243
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
4344
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \

docs/source/build-run-qualcomm-ai-engine-direct-backend.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ cmake .. \
136136
-DQNN_SDK_ROOT=${QNN_SDK_ROOT} \
137137
-DEXECUTORCH_BUILD_SDK=ON \
138138
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
139+
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
139140
-DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
140141
-DPYTHON_EXECUTABLE=python3 \
141142
-DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF
@@ -167,6 +168,7 @@ cmake .. \
167168
-DQNN_SDK_ROOT=$QNN_SDK_ROOT \
168169
-DEXECUTORCH_BUILD_SDK=ON \
169170
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
171+
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
170172
-DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
171173
-DPYTHON_EXECUTABLE=python3 \
172174
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \

docs/source/llm/getting-started.md

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -201,9 +201,9 @@ Create a file called main.cpp with the following contents:
201201

202202
#include "basic_sampler.h"
203203
#include "basic_tokenizer.h"
204-
#include "managed_tensor.h"
205204

206205
#include <executorch/extension/module/module.h>
206+
#include <executorch/extension/tensor/tensor.h>
207207
#include <executorch/runtime/core/evalue.h>
208208
#include <executorch/runtime/core/exec_aten/exec_aten.h>
209209
#include <executorch/runtime/core/result.h>
@@ -244,14 +244,13 @@ std::string generate(
244244
for (auto i = 0u; i < max_output_length; i++) {
245245
// Convert the input_tokens from a vector of int64_t to EValue.
246246
// EValue is a unified data type in the ExecuTorch runtime.
247-
ManagedTensor tensor_tokens(
247+
auto inputs = from_blob(
248248
input_tokens.data(),
249249
{1, static_cast<int>(input_tokens.size())},
250250
ScalarType::Long);
251-
std::vector<EValue> inputs = {tensor_tokens.get_tensor()};
252251

253252
// Run the model. It will return a tensor of logits (log-probabilities).
254-
Result<std::vector<EValue>> logits_evalue = llm_model.forward(inputs);
253+
auto logits_evalue = llm_model.forward(inputs);
255254

256255
// Convert the output logits from EValue to std::vector, which is what
257256
// the sampler expects.
@@ -339,7 +338,6 @@ Finally, download the following files into the same directory as main.h:
339338
```
340339
curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/basic_sampler.h
341340
curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/basic_tokenizer.h
342-
curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/managed_tensor.h
343341
```
344342

345343
To learn more, see the [Runtime APIs Tutorial](../extension-module.md).
@@ -364,6 +362,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED True)
364362
# Set options for executorch build.
365363
option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON)
366364
option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON)
365+
option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON)
367366
option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON)
368367
369368
# Include the executorch subdirectory.
@@ -377,6 +376,7 @@ target_link_libraries(
377376
PRIVATE
378377
executorch
379378
extension_module_static # Provides the Module class
379+
extension_tensor # Provides the TensorPtr class
380380
optimized_native_cpu_ops_lib) # Provides baseline cross-platform kernels
381381
```
382382

@@ -386,7 +386,6 @@ At this point, the working directory should contain the following files:
386386
- main.cpp
387387
- basic_tokenizer.h
388388
- basic_sampler.h
389-
- managed_tensor.h
390389
- export_nanogpt.py
391390
- model.py
392391
- vocab.json
@@ -518,6 +517,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED True)
518517
# Set options for executorch build.
519518
option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON)
520519
option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON)
520+
option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON)
521521
option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON)
522522
option(EXECUTORCH_BUILD_XNNPACK "" ON) # Build with Xnnpack backend
523523
@@ -534,6 +534,7 @@ target_link_libraries(
534534
PRIVATE
535535
executorch
536536
extension_module_static # Provides the Module class
537+
extension_tensor # Provides the TensorPtr class
537538
optimized_native_cpu_ops_lib # Provides baseline cross-platform kernels
538539
xnnpack_backend) # Provides the XNNPACK CPU acceleration backend
539540
```
@@ -548,7 +549,6 @@ At this point, the working directory should contain the following files:
548549
- main.cpp
549550
- basic_tokenizer.h
550551
- basic_sampler.h
551-
- managed_tensor.h
552552
- export_nanogpt.py
553553
- model.py
554554
- vocab.json

docs/source/tutorial-xnnpack-delegate-lowering.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,9 +149,10 @@ mkdir cmake-out
149149
cmake \
150150
-DCMAKE_INSTALL_PREFIX=cmake-out \
151151
-DCMAKE_BUILD_TYPE=Release \
152+
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
152153
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
154+
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
153155
-DEXECUTORCH_BUILD_XNNPACK=ON \
154-
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
155156
-DEXECUTORCH_ENABLE_LOGGING=ON \
156157
-DPYTHON_EXECUTABLE=python \
157158
-Bcmake-out .

examples/demo-apps/android/ExecuTorchDemo/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \
7878
-DEXECUTORCH_BUILD_XNNPACK=ON \
7979
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
8080
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
81+
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
8182
-Bcmake-android-out
8283

8384
cmake --build cmake-android-out -j16 --target install
@@ -119,6 +120,7 @@ cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \
119120
-DQNN_SDK_ROOT="${QNN_SDK_ROOT}" \
120121
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
121122
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
123+
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
122124
-Bcmake-android-out
123125

124126
cmake --build cmake-android-out -j16 --target install

examples/demo-apps/android/ExecuTorchDemo/setup.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \
1515
-DEXECUTORCH_BUILD_XNNPACK=ON \
1616
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
1717
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
18+
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
1819
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
1920
-DCMAKE_BUILD_TYPE=Release \
2021
-B"${CMAKE_OUT}"

examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,30 @@ phases:
7373
fi
7474
fi;
7575
76+
# Run the new generic benchmark activity https://developer.android.com/tools/adb#am
77+
- echo "Run LLM benchmark"
78+
- |
79+
adb -s $DEVICEFARM_DEVICE_UDID shell am start -W -n com.example.executorchllamademo/.LlmBenchmarkRunner \
80+
--es "model_dir" "/data/local/tmp/llama" \
81+
--es "tokenizer_path" "/data/local/tmp/llama/tokenizer.bin"
82+
7683
post_test:
7784
commands:
85+
- echo "Gather LLM benchmark results"
86+
- |
87+
BENCHMARK_RESULTS=""
88+
ATTEMPT=0
89+
MAX_ATTEMPT=10
90+
while [ -z "${BENCHMARK_RESULTS}" ] && [ $ATTEMPT -lt $MAX_ATTEMPT ]; do
91+
echo "Waiting for benchmark results..."
92+
BENCHMARK_RESULTS=$(adb -s $DEVICEFARM_DEVICE_UDID shell run-as com.example.executorchllamademo cat files/benchmark_results.json)
93+
sleep 30
94+
((ATTEMPT++))
95+
done
96+
97+
adb -s $DEVICEFARM_DEVICE_UDID shell run-as com.example.executorchllamademo ls -la files/
98+
# Trying to pull the file using adb ends up with permission error, but this works too, so why not
99+
echo "${BENCHMARK_RESULTS}" > $DEVICEFARM_LOG_DIR/benchmark_results.json
78100
79101
artifacts:
80102
# By default, Device Farm will collect your artifacts from the $DEVICEFARM_LOG_DIR directory.

examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,11 @@
1414
import android.util.Log;
1515
import android.widget.TextView;
1616
import androidx.annotation.NonNull;
17+
import com.google.gson.Gson;
18+
import java.io.File;
1719
import java.io.FileWriter;
1820
import java.io.IOException;
21+
import java.util.Arrays;
1922

2023
public class LlmBenchmarkRunner extends Activity implements ModelRunnerCallback {
2124
ModelRunner mModelRunner;
@@ -32,7 +35,12 @@ protected void onCreate(Bundle savedInstanceState) {
3235

3336
Intent intent = getIntent();
3437

35-
String modelPath = intent.getStringExtra("model_path");
38+
File modelDir = new File(intent.getStringExtra("model_dir"));
39+
File model =
40+
Arrays.stream(modelDir.listFiles())
41+
.filter(file -> file.getName().endsWith(".pte"))
42+
.findFirst()
43+
.get();
3644
String tokenizerPath = intent.getStringExtra("tokenizer_path");
3745

3846
float temperature = intent.getFloatExtra("temperature", 0.8f);
@@ -42,7 +50,7 @@ protected void onCreate(Bundle savedInstanceState) {
4250
}
4351

4452
mStatsDump = new StatsDump();
45-
mModelRunner = new ModelRunner(modelPath, tokenizerPath, temperature, this);
53+
mModelRunner = new ModelRunner(model.getPath(), tokenizerPath, temperature, this);
4654
mStatsDump.loadStart = System.currentTimeMillis();
4755
}
4856

@@ -79,11 +87,21 @@ public void onGenerationStopped() {
7987
mTextView.append(mStatsDump.toString());
8088
});
8189

90+
// TODO (huydhn): Remove txt files here once the JSON format is ready
8291
try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.txt")) {
8392
writer.write(mStatsDump.toString());
8493
} catch (IOException e) {
8594
e.printStackTrace();
8695
}
96+
97+
// TODO (huydhn): Figure out on what the final JSON results looks like, we need something
98+
// with the same number of fields as https://github.com/pytorch/pytorch/pull/135042
99+
try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.json")) {
100+
Gson gson = new Gson();
101+
writer.write(gson.toJson(mStatsDump));
102+
} catch (IOException e) {
103+
e.printStackTrace();
104+
}
87105
}
88106
}
89107

examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \
1616
-DEXECUTORCH_BUILD_XNNPACK=ON \
1717
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
1818
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
19+
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
1920
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
2021
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
2122
-DEXECUTORCH_BUILD_QNN=ON \

examples/demo-apps/android/LlamaDemo/setup.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \
1616
-DEXECUTORCH_BUILD_XNNPACK=ON \
1717
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
1818
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
19+
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
1920
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
2021
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
2122
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \

examples/llm_manual/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED True)
1313
# Set options for executorch build.
1414
option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON)
1515
option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON)
16+
option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON)
1617
option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON)
1718
option(EXECUTORCH_BUILD_XNNPACK "" ON) # Build with Xnnpack backend
1819

@@ -29,6 +30,7 @@ target_link_libraries(
2930
nanogpt_runner
3031
PRIVATE executorch
3132
extension_module_static # Provides the Module class
33+
extension_tensor # Provides the TensorPtr class
3234
optimized_native_cpu_ops_lib # Provides baseline cross-platform
3335
# kernels
3436
xnnpack_backend

examples/llm_manual/main.cpp

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@
1010

1111
#include "basic_sampler.h"
1212
#include "basic_tokenizer.h"
13-
#include "managed_tensor.h"
1413

1514
#include <executorch/extension/module/module.h>
15+
#include <executorch/extension/tensor/tensor.h>
1616
#include <executorch/runtime/core/evalue.h>
1717
#include <executorch/runtime/core/exec_aten/exec_aten.h>
1818
#include <executorch/runtime/core/result.h>
@@ -42,14 +42,13 @@ std::string generate(
4242
for (auto i = 0u; i < max_output_length; i++) {
4343
// Convert the input_tokens from a vector of int64_t to EValue.
4444
// EValue is a unified data type in the ExecuTorch runtime.
45-
ManagedTensor tensor_tokens(
45+
auto inputs = from_blob(
4646
input_tokens.data(),
4747
{1, static_cast<int>(input_tokens.size())},
4848
ScalarType::Long);
49-
std::vector<EValue> inputs = {tensor_tokens.get_tensor()};
5049

5150
// Run the model. It will return a tensor of logits (log-probabilities).
52-
Result<std::vector<EValue>> logits_evalue = llm_model.forward(inputs);
51+
auto logits_evalue = llm_model.forward(inputs);
5352

5453
// Convert the output logits from EValue to std::vector, which is what
5554
// the sampler expects.

0 commit comments

Comments
 (0)