
break aoti CI jobs separately #500


Merged: 12 commits, Apr 27, 2024

51 changes: 40 additions & 11 deletions .ci/scripts/validate.sh
@@ -25,6 +25,7 @@ function generate_compiled_model_output() {
local MODEL_DIR="${CHECKPOINT_PATH%/*}"
local MODEL_NAME=$(basename "$CHECKPOINT_PATH" | sed 's/\.[^.]*$//')


if [[ $CHECKPOINT_PATH != *"stories"* && $TARGET_DEVICE == "cuda" ]]; then
DTYPES="bfloat16"
EXCLUDE_INT8_QUANT=true
@@ -74,7 +75,7 @@ function generate_compiled_model_output() {
python3 -W ignore generate.py --dtype ${DTYPE} --compile --quant '{"embedding" : {"bitwidth": 4, "groupsize": 8, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
cat "$MODEL_DIR/output_compiled"

if [ "$EXCLUDE_INT8_QUANT" = false ]; then
if [ "${EXCLUDE_INT8_QUANT:-false}" == false ]; then
echo "******************************************"
echo "******* INT8 channel-wise quantized ******"
echo "******************************************"
@@ -109,17 +110,24 @@ function generate_compiled_model_output() {
function generate_aoti_model_output() {
local CHECKPOINT_PATH="$1"
local TARGET_DEVICE="${2:-cpu}"
local DTYPES="${3:-default}"
local MODEL_DIR="${CHECKPOINT_PATH%/*}"
local MODEL_NAME=$(basename "$CHECKPOINT_PATH" | sed 's/\.[^.]*$//')

if [[ $CHECKPOINT_PATH != *"stories"* && $TARGET_DEVICE == "cuda" ]]; then
DTYPES="bfloat16"
EXCLUDE_INT8_QUANT=true
else
DTYPES="float32 bfloat16 float16"
EXCLUDE_INT8_QUANT=false
echo "Local DTYPES=$DTYPES"

if [[ $DTYPES == "default" ]]; then
if [[ $CHECKPOINT_PATH != *"stories"* && $TARGET_DEVICE == "cuda" ]]; then
DTYPES="bfloat16"
EXCLUDE_INT8_QUANT=true
else
DTYPES="float32 bfloat16 float16"
EXCLUDE_INT8_QUANT=false
fi
fi

echo "Local after default DTYPES=$DTYPES"

for DTYPE in $DTYPES; do
echo ""############### Run inference with AOT Inductor for dtype $DTYPE "###############"
echo ""
@@ -158,7 +166,7 @@ function generate_aoti_model_output() {
python3 -W ignore generate.py --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
cat "$MODEL_DIR/output_aoti"

if [ "$EXCLUDE_INT8_QUANT" = false ]; then
if [ "${EXCLUDE_INT8_QUANT:-false}" == false ]; then
echo "******************************************"
echo "******* INT8 channel-wise quantized ******"
echo "******************************************"
@@ -295,11 +303,12 @@ function run_compile() {
}

function run_aoti() {
generate_aoti_model_output "$CHECKPOINT_PATH" "$TARGET_DEVICE" || exit 1
echo "Passing DTYPES=$DTYPES"
generate_aoti_model_output "$CHECKPOINT_PATH" "$TARGET_DEVICE" "$DTYPES" || exit 1
}

function run_executorch() {
if [ "$TARGET_DEVICE" = "cpu" ]; then
if [ "$TARGET_DEVICE" == "cpu" ]; then
generate_executorch_model_output "$CHECKPOINT_PATH" "$TARGET_DEVICE" || exit 1
else
echo "Skipped: Executorch doesn't run on ${TARGET_DEVICE}"
@@ -318,24 +327,44 @@ CHECKPOINT_PATH="$1"
TARGET_DEVICE="${2:-cpu}"
PROMPT="Hello, my name is"


if [ "$#" -gt 2 ]; then
# Additional arguments provided
for arg in "${@:3}"; do
case "$arg" in
"compile")
echo "arg:$arg"
run_compile || exit 1
;;
"aoti")
echo "arg:$arg"
DTYPES="default"
run_aoti || exit 1
;;
"aoti-bfloat16")
echo "arg:$arg"
DTYPES="bfloat16"
run_aoti || exit 1
;;
"aoti-float16")
echo "arg:$arg"
DTYPES="float16"
run_aoti || exit 1
;;
"aoti-float32")
echo "arg:$arg"
DTYPES="float32"
run_aoti || exit 1
;;
"executorch")
echo "arg:$arg"
run_executorch || exit 1
;;
"eval")
echo "arg:$arg"
run_eval || exit 1
;;
"eval_sanity_check")
echo "arg:$arg"
run_eval_sanity_check || exit 1
;;
*)
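For reference, a minimal sketch of how validate.sh is invoked with the new per-dtype AOTI arguments; the checkpoint path mirrors what the CI workflow passes, and REPO_NAME is assumed to be set as it is in pull.yml:

# run all default dtypes for the target device, as before
bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti"

# run a single dtype, as each of the split CI jobs now does
bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti-bfloat16"
bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti-float32"
bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti-float16"
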
90 changes: 86 additions & 4 deletions .github/workflows/pull.yml
@@ -183,9 +183,9 @@ jobs:
bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "compile"
echo "::endgroup::"

test-gpu-aoti:
test-gpu-aoti-bfloat16:
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
name: test-gpu-aoti (${{ matrix.platform }}, ${{ matrix.model_name }})
name: test-gpu-aoti-bfloat16 (${{ matrix.platform }}, ${{ matrix.model_name }})
needs: gather-models-gpu
strategy:
matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
@@ -222,7 +222,89 @@ jobs:
echo "::endgroup::"

echo "::group::Run inference"
bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti"
bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti-bfloat16"
echo "::endgroup::"

test-gpu-aoti-float32:
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
name: test-gpu-aoti-float32 (${{ matrix.platform }}, ${{ matrix.model_name }})
needs: gather-models-gpu
strategy:
matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
fail-fast: false
with:
runner: linux.g5.4xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: "12.1"
script: |
echo "::group::Print machine info"
nvidia-smi
echo "::endgroup::"

echo "::group::Install newer objcopy that supports --set-section-alignment"
yum install -y devtoolset-10-binutils
export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
echo "::endgroup::"

echo "::group::Install required packages"
pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
pip install -r ./requirements.txt
pip list
python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
echo "::endgroup::"

echo "::group::Download checkpoint"
export REPO_NAME=${{ matrix.repo_name }}
bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }}
echo "::endgroup::"

echo "::group::Convert checkpoint"
bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
echo "::endgroup::"

echo "::group::Run inference"
bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti-float32"
echo "::endgroup::"

test-gpu-aoti-float16:
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
name: test-gpu-aoti-float16 (${{ matrix.platform }}, ${{ matrix.model_name }})
needs: gather-models-gpu
strategy:
matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
fail-fast: false
with:
runner: linux.g5.4xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: "12.1"
script: |
echo "::group::Print machine info"
nvidia-smi
echo "::endgroup::"

echo "::group::Install newer objcopy that supports --set-section-alignment"
yum install -y devtoolset-10-binutils
export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
echo "::endgroup::"

echo "::group::Install required packages"
pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
pip install -r ./requirements.txt
pip list
python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
echo "::endgroup::"

echo "::group::Download checkpoint"
export REPO_NAME=${{ matrix.repo_name }}
bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }}
echo "::endgroup::"

echo "::group::Convert checkpoint"
bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
echo "::endgroup::"

echo "::group::Run inference"
bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti-float16"
echo "::endgroup::"

test-gpu-eval-sanity-check:
@@ -685,7 +767,7 @@ jobs:

echo "Running compiled"
python3 torchchat.py generate --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 --compile

echo "******************************************"
echo "******* Emb: channel-wise quantized ******"
echo "******************************************"