
build: DIS-148 use the tensorrt_llm public wheel from pypi by default in container build #1525

Merged · 5 commits · Jun 16, 2025
26 changes: 24 additions & 2 deletions container/build.sh
@@ -88,14 +88,15 @@ TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/"
# TensorRT-LLM commit to use for building the trtllm wheel if not provided.
# Important Note: This commit is not used in our CI pipeline. See the CI
# variables to learn how to run a pipeline with a specific commit.
TRTLLM_COMMIT="137fe35539ea182f1495f5021bfda97c729e50c3"
DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="137fe35539ea182f1495f5021bfda97c729e50c3"
TRTLLM_COMMIT=""

# TensorRT-LLM PyPI index URL
TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple"
DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==0.21.0rc0"
TENSORRTLLM_PIP_WHEEL=""



VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
# FIXME: NCCL will hang with 25.03, so use 25.01 for now
# Please check https://github.com/ai-dynamo/dynamo/pull/1065
@@ -155,6 +156,13 @@ get_options() {
missing_requirement "$1"
fi
;;
--use-default-experimental-tensorrtllm-commit)
if [ -n "$2" ] && [[ "$2" != --* ]]; then
echo "ERROR: --use-default-experimental-tensorrtllm-commit does not take any argument"
exit 1
fi
USE_DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT=true
;;
--tensorrtllm-pip-wheel)
if [ "$2" ]; then
TENSORRTLLM_PIP_WHEEL=$2
@@ -341,6 +349,7 @@ show_help() {
echo " [--framework framework one of ${!FRAMEWORKS[*]}]"
echo " [--tensorrtllm-pip-wheel-dir path to tensorrtllm pip wheel directory]"
echo " [--tensorrtllm-commit tensorrtllm commit to use for building the trtllm wheel if the wheel is not provided]"
echo " [--use-default-experimental-tensorrtllm-commit] Use the default experimental commit (${DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT}) to build TensorRT-LLM. This is a flag (no argument). Do not combine with --tensorrtllm-commit or --tensorrtllm-pip-wheel."
echo " [--tensorrtllm-pip-wheel tensorrtllm pip wheel on artifactory]"
echo " [--tensorrtllm-index-url tensorrtllm PyPI index URL if providing the wheel from artifactory]"
echo " [--build-arg additional build args to pass to docker build]"
@@ -470,6 +479,19 @@ check_wheel_file() {
}

if [[ $FRAMEWORK == "TENSORRTLLM" ]]; then
if [ "$USE_DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT" = true ]; then
if [ -n "$TRTLLM_COMMIT" ] || [ -n "$TENSORRTLLM_PIP_WHEEL" ]; then
echo "ERROR: When using --use-default-experimental-trtllm-commit, do not set --tensorrtllm-commit or --tensorrtllm-pip-wheel."
exit 1
fi
TRTLLM_COMMIT="$DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT"
fi

# If the user set neither the wheel nor the commit, use the default tensorrt_llm pip wheel
if [ -z "$TENSORRTLLM_PIP_WHEEL" ] && [ -z "$TRTLLM_COMMIT" ]; then
TENSORRTLLM_PIP_WHEEL="$DEFAULT_TENSORRTLLM_PIP_WHEEL"
fi

if [ -z "${TENSORRTLLM_PIP_WHEEL}" ]; then
# Use option 1
if [ ! -d "${TENSORRTLLM_PIP_WHEEL_DIR}" ]; then
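With these changes, `build.sh` supports three ways of selecting the TensorRT-LLM source. The invocations below are a hedged sketch assembled from the defaults and flags in this diff; `<commit-sha>` is a placeholder, not a value from this PR:

```bash
# Default: neither a wheel nor a commit is given, so the public PyPI wheel
# (tensorrt-llm==0.21.0rc0) is installed from https://pypi.python.org/simple.
./container/build.sh --framework tensorrtllm

# Build the trtllm wheel from an explicit TensorRT-LLM commit instead of PyPI.
./container/build.sh --framework tensorrtllm --tensorrtllm-commit <commit-sha>

# Use the pinned experimental commit. This is a flag with no argument and is
# rejected if combined with --tensorrtllm-commit or --tensorrtllm-pip-wheel.
./container/build.sh --framework tensorrtllm --use-default-experimental-tensorrtllm-commit
```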
12 changes: 12 additions & 0 deletions examples/tensorrt_llm/README.md
@@ -62,6 +62,11 @@ apt-get update && apt-get -y install git git-lfs

# On an ARM machine:
./container/build.sh --framework tensorrtllm --platform linux/arm64

# Build the container with the default experimental TensorRT-LLM commit
# WARNING: This is for experimental feature testing only.
# The container should not be used in a production environment.
./container/build.sh --framework tensorrtllm --use-default-experimental-tensorrtllm-commit
```

> [!NOTE]
@@ -136,6 +141,10 @@ dynamo serve graphs.agg:Frontend -f configs/deepseek_r1/mtp/mtp_agg.yaml
```

Notes:
- MTP is only available within the container built with the experimental TensorRT-LLM commit. Please add `--use-default-experimental-tensorrtllm-commit` to the arguments of the `build.sh` script.

Example: `./container/build.sh --framework tensorrtllm --use-default-experimental-tensorrtllm-commit`

- There is a noticeable latency for the first two inference requests. Please send warm-up requests before starting the benchmark.
- MTP performance may vary depending on the acceptance rate of predicted tokens, which is dependent on the dataset or queries used while benchmarking. Additionally, `ignore_eos` should generally be omitted or set to `false` when using MTP to avoid speculating garbage outputs and getting unrealistic acceptance rates.

@@ -275,6 +284,9 @@ dynamo serve components.prefill_worker:TensorRTLLMPrefillWorker -f configs/deeps
```

Notes:
- MTP is only available within the container built with the experimental TensorRT-LLM commit. Please add `--use-default-experimental-tensorrtllm-commit` to the arguments of the `build.sh` script.

Example: `./container/build.sh --framework tensorrtllm --use-default-experimental-tensorrtllm-commit`
- There is a noticeable latency for the first two inference requests. Please send warm-up requests before starting the benchmark.
- MTP performance may vary depending on the acceptance rate of predicted tokens, which is dependent on the dataset or queries used while benchmarking. Additionally, `ignore_eos` should generally be omitted or set to `false` when using MTP to avoid speculating garbage outputs and getting unrealistic acceptance rates.
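
As an illustration of the warm-up guidance above, here is a hypothetical warm-up request. It assumes the `dynamo serve` frontend exposes an OpenAI-compatible endpoint on `localhost:8000` and uses a placeholder model name; adjust both to match the actual deployment. `ignore_eos` is intentionally left out of the payload, in line with the note above.

```bash
# Hypothetical warm-up request; verify host, port, model name, and payload
# against your deployment before benchmarking.
curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "deepseek-ai/DeepSeek-R1",
        "messages": [{"role": "user", "content": "Hello"}],
        "max_tokens": 16
      }'
```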
